// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap over
 * the network namespace the socket is a member of when the socket was
 * created and if the current process has it as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
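/* Example (illustrative only, not used by the code): with the "sk_lock-"
 * prefix above, _sock_locks() expands to string literals such as
 *
 *	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...
 *
 * so a lockdep report names the address family directly, e.g.
 * "sk_lock-AF_INET" for the main lock of a TCP or UDP socket, and
 * "k-sk_lock-AF_INET" for its kernel-socket counterpart.
 */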
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
							 USEC_PER_SEC / HZ);
	return 0;
}
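/* Worked example (illustrative only): how the conversion in
 * sock_set_timeout() above plays out for a typical SO_RCVTIMEO call.
 * With HZ == 1000 and a userspace request of
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * the stored timeout is 2 * HZ + DIV_ROUND_UP(500000, 1000000 / 1000)
 * = 2000 + 500 = 2500 jiffies, i.e. 2.5 seconds. A zero timeval maps to
 * MAX_SCHEDULE_TIMEOUT ("wait forever"), and a negative tv_sec is
 * clamped to an immediate timeout with a rate-limited warning.
 */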
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the RCU protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);
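/* Sketch (illustrative only; my_proto_rcv is a made-up name): a minimal
 * protocol receive handler built on the helper above. Note the contract:
 * sock_queue_rcv_skb() does not free the skb on failure, so the caller
 * must drop it.
 *
 *	static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);	// -ENOMEM/-ENOBUFS: drop, sk_drops
 *					// was already accounted
 *		return err;
 *	}
 */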
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							   u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							    u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);
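/* Sketch (illustrative only): an in-kernel user binding a socket it owns
 * to a device by ifindex; "dev" stands for a net_device the caller
 * already holds a reference to.
 *
 *	int err = sock_bindtoindex(sk, dev->ifindex, true);
 *	if (err)
 *		return err;	// -EPERM when rebinding an already bound
 *				// socket without CAP_NET_RAW
 */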
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	return sock_bindtoindex(sk, index, true);
out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_lingertime = 0;
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);
	sk->sk_priority = priority;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		sk->sk_sndtimeo = secs * HZ;
	else
		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (val)  {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
		sock_set_flag(sk, SOCK_RCVTSTAMP);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	} else {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
	}
}

void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
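/* Example (illustrative only): the four receive-timestamp optnames
 * handled by the switch below map onto __sock_set_timestamps() as
 *
 *	optname			new	ns
 *	SO_TIMESTAMP_OLD	false	false
 *	SO_TIMESTAMP_NEW	true	false
 *	SO_TIMESTAMPNS_OLD	false	true
 *	SO_TIMESTAMPNS_NEW	true	true
 *
 * so a y2038-safe nanosecond request from userspace would be:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, &one, sizeof(one));
 */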
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	switch (optname) {
	case SO_TIMESTAMP_OLD:
		__sock_set_timestamps(sk, valbool, false, false);
		break;
	case SO_TIMESTAMP_NEW:
		__sock_set_timestamps(sk, valbool, true, false);
		break;
	case SO_TIMESTAMPNS_OLD:
		__sock_set_timestamps(sk, valbool, false, true);
		break;
	case SO_TIMESTAMPNS_NEW:
		__sock_set_timestamps(sk, valbool, true, true);
		break;
	}
}

int sock_set_timestamping(struct sock *sk, int optname, int val)
{
	if (val & ~SOF_TIMESTAMPING_MASK)
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_OPT_ID &&
	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM) {
			if ((1 << sk->sk_state) &
			    (TCPF_CLOSE | TCPF_LISTEN))
				return -EINVAL;
			sk->sk_tskey = tcp_sk(sk)->snd_una;
		} else {
			sk->sk_tskey = 0;
		}
	}

	if (val & SOF_TIMESTAMPING_OPT_STATS &&
	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
		return -EINVAL;

	sk->sk_tsflags = val;
	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
		sock_enable_timestamp(sk,
				      SOCK_TIMESTAMPING_RX_SOFTWARE);
	else
		sock_disable_timestamp(sk,
				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
	return 0;
}

void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.   Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		sk->sk_mark = val;
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
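/* Sketch (illustrative only): in-kernel users such as network
 * filesystems configure their sockets through the lock-taking helpers
 * above instead of calling sock_setsockopt() with kernel pointers:
 *
 *	sock_set_reuseaddr(sk);
 *	sock_set_keepalive(sk);
 *	sock_set_rcvbuf(sk, 4 * 1024 * 1024);	// doubled and clamped by
 *						// __sock_set_rcvbuf() above
 */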
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    sockptr_t optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;
	case SO_RCVBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		ret = sock_set_timestamping(sk, optname, val);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
		break;
	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
				       optlen, optname == SO_RCVTIMEO_OLD);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
				       optlen, optname == SO_SNDTIMEO_OLD);
		break;

	case SO_ATTACH_FILTER: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_reuseport_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_REUSEPORT_BPF:
		ret = reuseport_detach_prog(sk);
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		__sock_set_mark(sk, val);
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		break;
	case SO_BUSY_POLL_BUDGET:
		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else {
			if (val < 0 || val > U16_MAX)
				ret = -EINVAL;
			else
				WRITE_ONCE(sk->sk_busy_poll_budget, val);
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
	{
		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
			ret = -EFAULT;
			break;
		}
		if (ulval != ~0UL)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = ulval;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
		break;
	}
	case SO_INCOMING_CPU:
		WRITE_ONCE(sk->sk_incoming_cpu, val);
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
			break;
		} else if (copy_from_sockptr(&sk_txtime, optval,
					     sizeof(struct sock_txtime))) {
			ret = -EFAULT;
			break;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
			break;
		}
		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
		 * scheduler has enough safeguards.
		 */
		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		sock_valbool_flag(sk, SOCK_TXTIME, true);
		sk->sk_clockid = sk_txtime.clockid;
		sk->sk_txtime_deadline_mode =
			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
		sk->sk_txtime_report_errors =
			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		break;

	case SO_BINDTOIFINDEX:
		ret = sock_bindtoindex_locked(sk, val);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}
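/* Example (illustrative only): the ucred filled in by cred_to_ucred()
 * is what userspace reads back through SO_PEERCRED, e.g. on an AF_UNIX
 * connection:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%d gid=%d\n", peer.pid, peer.uid, peer.gid);
 *
 * IDs are translated into the caller's user namespace by
 * from_kuid_munged()/from_kgid_munged(); a missing peer cred reads as -1.
 */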
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
		struct sock_txtime txtime;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;

		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}
	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
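/* Example (illustrative only): reading the 64-bit socket cookie handled
 * above from userspace:
 *
 *	uint64_t cookie;
 *	socklen_t len = sizeof(cookie);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &len) == 0)
 *		// stable per-socket identifier, also visible to
 *		// sock_diag and BPF programs
 *		use_cookie(cookie);
 */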
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));

	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}
/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
		sk_tx_queue_clear(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);

	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
		reuseport_detach_sock(sk);
		use_call_rcu = true;
	}

	if (use_call_rcu)
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc so we can tell whether
	 * some packets are still in some tx queue.
	 * If not zero, sock_wfree() will call __sk_free(sk) later.
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
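/* Sketch (illustrative only; "my_proto" is a made-up struct proto): the
 * allocation lifecycle built on the helpers above, as a protocol's
 * create() path might use it.
 *
 *	struct sock *sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, 0);
 *
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);	// attach to struct socket,
 *					// init queues and callbacks
 *	...
 *	sk_free(sk);	// actually destroyed only once the initial
 *			// sk_wmem_alloc reference drops to zero
 */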
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt))
		get_net(sock_net(newsk));
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();
	if (likely(newsk->sk_net_refcnt))
		sock_inuse_add(sock_net(newsk), 1);

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* Increment the counter in the same struct proto as the master
	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
	 * is the same as sk->sk_prot->socks, as this field was copied
	 * with memcpy).
	 *
	 * This _changes_ the previous behaviour, where
	 * tcp_create_openreq_child always was incrementing the
	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
	 * to be taken into account in all callers. -acme
	 */
	sk_refcnt_debug_inc(newsk);
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
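/* Sketch (illustrative only; the ip_route_connect() call site is a
 * hypothetical simplification): protocols call sk_setup_caps() once a
 * route has been resolved, typically at the end of a connect() handler,
 * so the socket inherits GSO/checksum capabilities from the device:
 *
 *	struct rtable *rt = ip_route_connect(...);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	sk_setup_caps(sk, &rt->dst);
 */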
/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc; it will be released
		 * after the sk_write_space() call.
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets.
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets have completed.
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
#ifdef CONFIG_TLS_DEVICE
	/* Drivers depend on in-order delivery for crypto offload;
	 * a partial orphan breaks the out-of-order-OK logic.
	 */
	if (skb->decrypted)
		return false;
#endif
	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
		return;

	skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
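
/*
 * For symmetry with the write side, a simplified sketch of the
 * receive-path pairing: skb_set_owner_r() (in include/net/sock.h)
 * charges skb->truesize to sk_rmem_alloc and installs sock_rfree()
 * above as the destructor:
 *
 *	skb_set_owner_r(skb, sk);	// charge sk_rmem_alloc
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 *	// when the reader frees the skb, sock_rfree() returns the
 *	// charge via sk_mem_uncharge()
 */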
/* Buffer destructor for the prefetch/receive path, where the reference
 * count may not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (sk_is_refcounted(skb->sk))
		sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate an skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force ||
	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
		struct sk_buff *skb = alloc_skb(size, priority);

		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}

struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    sysctl_optmem_max)
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note: we actually want the inline
 * here, as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
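
/*
 * Usage sketch (hypothetical caller): option memory is charged against
 * sysctl_optmem_max via sk_omem_alloc, and the matching free must pass
 * the same size so the charge is returned in full:
 *
 *	struct foo_opt *opt;			// "foo_opt" is illustrative
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));	// or sock_kzfree_s() for secrets
 */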
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}

/*
 * Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);
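
/*
 * Illustration (user-space sketch, not kernel code): how SO_MARK can
 * arrive here as ancillary data on sendmsg(). The sender needs
 * CAP_NET_ADMIN, matching the check above:
 *
 *	char buf[CMSG_SPACE(sizeof(uint32_t))] = {};
 *	struct msghdr msg = { .msg_control = buf,
 *			      .msg_controllen = sizeof(buf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	memcpy(CMSG_DATA(cmsg), &mark, sizeof(uint32_t));
 *	sendmsg(fd, &msg, 0);
 */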
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);

static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		sk->sk_prot->leave_memory_pressure(sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}

#define SKB_FRAG_PAGE_ORDER	get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
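
/*
 * Usage sketch (simplified from what tcp_sendmsg() does): refill the
 * per-socket page fragment, then copy data into the spare room.
 * sk_page_frag() is defined in include/net/sock.h:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;		// hypothetical label
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	// ...copy into pfrag->page at pfrag->offset, then:
 *	pfrag->offset += copy;
 */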
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 * @skb: last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under the lock,
 * hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 * __sk_mem_raise_allocated - increase memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @amt: pages to allocate
 * @kind: allocation type
 *
 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee a minimum buffer size under pressure. */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);
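
/*
 * Worked illustration (assumes 4 KiB pages, so SK_MEM_QUANTUM == 4096):
 * a request of size = 6000 bytes gives amt = sk_mem_pages(6000) = 2
 * quanta, so __sk_mem_schedule() below grows sk_forward_alloc by
 * 2 << SK_MEM_QUANTUM_SHIFT = 8192 bytes; the 2192 bytes not consumed
 * by this request stay in sk_forward_alloc for later allocations,
 * which then avoid touching memory_allocated at all.
 */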
/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 * __sk_mem_reduce_allocated - reclaim memory_allocated
 * @sk: socket
 * @amount: number of quanta
 *
 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reduce_allocated);

/**
 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 * @sk: socket
 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

int sk_set_peek_off(struct sock *sk, int val)
{
	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */
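
/*
 * Example (hypothetical "foo" family): a protocol that supports only a
 * subset of operations can plug these stubs straight into its
 * proto_ops so unsupported calls fail cleanly with -EOPNOTSUPP:
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family		= PF_FOO,	// illustrative only
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};
 */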
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
void __receive_sock(struct file *file)
{
	struct socket *sock;

	sock = sock_from_file(file);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);

ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
				int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage_locked);

/*
 * Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head = NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk = sk;
		sk->sk_uid = SOCK_INODE(sock)->i_uid;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
		sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG == 32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (see Documentation/RCU/rculist_nulls.rst for details).
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning: release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block. It returns false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * and true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note: BH stays disabled on this fast path.
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	__acquire(&sk->sk_lock.slock);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
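
/*
 * Usage sketch: the return value must be handed back to
 * unlock_sock_fast() (see include/net/sock.h) so it can undo
 * whichever path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *	// short critical section touching sk state
 *	unlock_sock_fast(sk, slow);
 */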
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();

		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one.
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network still does.
	 * Step one: detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and be purged by
	 * the socket destructor.
	 *
	 * Also, we still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy()
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */
	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static void sock_inuse_add(struct net *net, int val)
{
	this_cpu_add(*net->core.sock_inuse, val);
}

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += *per_cpu_ptr(net->core.sock_inuse, cpu);

	return res;
}
EXPORT_SYMBOL_GPL(sock_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;

	net->core.sock_inuse = alloc_percpu(int);
	if (net->core.sock_inuse == NULL)
		goto out;

	return 0;

out:
	free_percpu(net->core.prot_inuse);
	return -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
	free_percpu(net->core.sock_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);

static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
static inline void release_proto_idx(struct proto *prot)
{
}

static void sock_inuse_add(struct net *net, int val)
{
}
#endif

static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
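
/*
 * Example (hypothetical "foo" protocol module): the usual
 * register/unregister pairing from module init/exit:
 *
 *	static struct proto foo_proto = {
 *		.name	  = "FOO",		// illustrative only
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_proto, 1);	// 1: allocate slab caches
 *	if (err)
 *		return err;
 *	...
 *	proto_unregister(&foo_proto);		// from module exit
 */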
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
		proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}
"no" : "yes", 3653 module_name(proto->owner), 3654 proto_method_implemented(proto->close), 3655 proto_method_implemented(proto->connect), 3656 proto_method_implemented(proto->disconnect), 3657 proto_method_implemented(proto->accept), 3658 proto_method_implemented(proto->ioctl), 3659 proto_method_implemented(proto->init), 3660 proto_method_implemented(proto->destroy), 3661 proto_method_implemented(proto->shutdown), 3662 proto_method_implemented(proto->setsockopt), 3663 proto_method_implemented(proto->getsockopt), 3664 proto_method_implemented(proto->sendmsg), 3665 proto_method_implemented(proto->recvmsg), 3666 proto_method_implemented(proto->sendpage), 3667 proto_method_implemented(proto->bind), 3668 proto_method_implemented(proto->backlog_rcv), 3669 proto_method_implemented(proto->hash), 3670 proto_method_implemented(proto->unhash), 3671 proto_method_implemented(proto->get_port), 3672 proto_method_implemented(proto->enter_memory_pressure)); 3673 } 3674 3675 static int proto_seq_show(struct seq_file *seq, void *v) 3676 { 3677 if (v == &proto_list) 3678 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3679 "protocol", 3680 "size", 3681 "sockets", 3682 "memory", 3683 "press", 3684 "maxhdr", 3685 "slab", 3686 "module", 3687 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3688 else 3689 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3690 return 0; 3691 } 3692 3693 static const struct seq_operations proto_seq_ops = { 3694 .start = proto_seq_start, 3695 .next = proto_seq_next, 3696 .stop = proto_seq_stop, 3697 .show = proto_seq_show, 3698 }; 3699 3700 static __net_init int proto_init_net(struct net *net) 3701 { 3702 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3703 sizeof(struct seq_net_private))) 3704 return -ENOMEM; 3705 3706 return 0; 3707 } 3708 3709 static __net_exit void proto_exit_net(struct net *net) 3710 { 3711 remove_proc_entry("protocols", net->proc_net); 3712 } 3713 3714 3715 static __net_initdata struct pernet_operations proto_net_ops = { 3716 .init = proto_init_net, 3717 .exit = proto_exit_net, 3718 }; 3719 3720 static int __init proto_init(void) 3721 { 3722 return register_pernet_subsys(&proto_net_ops); 3723 } 3724 3725 subsys_initcall(proto_init); 3726 3727 #endif /* PROC_FS */ 3728 3729 #ifdef CONFIG_NET_RX_BUSY_POLL 3730 bool sk_busy_loop_end(void *p, unsigned long start_time) 3731 { 3732 struct sock *sk = p; 3733 3734 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3735 sk_busy_loop_timeout(sk, start_time); 3736 } 3737 EXPORT_SYMBOL(sk_busy_loop_end); 3738 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3739 3740 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3741 { 3742 if (!sk->sk_prot->bind_add) 3743 return -EOPNOTSUPP; 3744 return sk->sk_prot->bind_add(sk, addr, addr_len); 3745 } 3746 EXPORT_SYMBOL(sock_bind_add); 3747