// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in all
 * user namespaces.
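 *
 * Return: true only if both the socket-creation-time check and the
 *	   current-process check pass, false otherwise.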
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
	x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
	x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
	x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
	x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
	x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
	x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
	x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
	x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
	x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
	x "27"       ,	x "28"          ,	x "AF_CAN"      , \
	x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
	x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
	x "AF_IEEE802154",	x "AF_CAIF" ,	x "AF_ALG"      , \
	x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
	x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
	x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
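 * These defaults are exposed to userspace as the net.core.wmem_max,
 * net.core.rmem_max, net.core.wmem_default and net.core.rmem_default
 * sysctls (and sysctl_optmem_max below as net.core.optmem_max).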
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the RCU protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
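	 *
	 * Illustrative userspace usage (not part of this file; "eth0" is an
	 * arbitrary interface name):
	 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0",
	 *		   strlen("eth0") + 1);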
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	return sock_bindtoindex(sk, index, true);
out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_lingertime = 0;
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);
	sk->sk_priority = priority;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		sk->sk_sndtimeo = secs * HZ;
	else
		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (val) {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
		sock_set_flag(sk, SOCK_RCVTSTAMP);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	} else {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
	}
}

void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	switch (optname) {
	case SO_TIMESTAMP_OLD:
		__sock_set_timestamps(sk, valbool, false, false);
		break;
	case SO_TIMESTAMP_NEW:
		__sock_set_timestamps(sk, valbool, true, false);
		break;
	case SO_TIMESTAMPNS_OLD:
		__sock_set_timestamps(sk, valbool, false, true);
		break;
	case SO_TIMESTAMPNS_NEW:
		__sock_set_timestamps(sk, valbool, true, true);
		break;
	}
}

int sock_set_timestamping(struct sock *sk, int optname, int val)
{
	if (val & ~SOF_TIMESTAMPING_MASK)
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_OPT_ID &&
	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM) {
			if ((1 << sk->sk_state) &
			    (TCPF_CLOSE | TCPF_LISTEN))
				return -EINVAL;
			sk->sk_tskey = tcp_sk(sk)->snd_una;
		} else {
			sk->sk_tskey = 0;
		}
	}

	if (val & SOF_TIMESTAMPING_OPT_STATS &&
	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
		return -EINVAL;

	sk->sk_tsflags = val;
	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
		sock_enable_timestamp(sk,
				      SOCK_TIMESTAMPING_RX_SOFTWARE);
	else
		sock_disable_timestamp(sk,
				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
	return 0;
}

void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead. Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		sk->sk_mark = val;
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    sockptr_t optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		 * think about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF are
		 * treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		 * think about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF are
		 * treated in BSD as hints.
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		ret = sock_set_timestamping(sk, optname, val);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
				       optlen, optname == SO_RCVTIMEO_OLD);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
				       optlen, optname == SO_SNDTIMEO_OLD);
		break;

	case SO_ATTACH_FILTER: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_reuseport_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_REUSEPORT_BPF:
		ret = reuseport_detach_prog(sk);
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		__sock_set_mark(sk, val);
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		break;
	case SO_BUSY_POLL_BUDGET:
		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else {
			if (val < 0 || val > U16_MAX)
				ret = -EINVAL;
			else
				WRITE_ONCE(sk->sk_busy_poll_budget, val);
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		{
		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
			ret = -EFAULT;
			break;
		}
		if (ulval != ~0UL)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = ulval;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
		break;
		}
	case SO_INCOMING_CPU:
		WRITE_ONCE(sk->sk_incoming_cpu, val);
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
			break;
		} else if (copy_from_sockptr(&sk_txtime, optval,
					     sizeof(struct sock_txtime))) {
			ret = -EFAULT;
			break;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
			break;
		}
		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
		 * scheduler has enough safeguards.
		 */
		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		sock_valbool_flag(sk, SOCK_TXTIME, true);
		sk->sk_clockid = sk_txtime.clockid;
		sk->sk_txtime_deadline_mode =
			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
		sk->sk_txtime_report_errors =
			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		break;

	case SO_BINDTOIFINDEX:
		ret = sock_bindtoindex_locked(sk, val);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
		struct sock_txtime txtime;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	case SO_NETNS_COOKIE:
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 *	Initialize an sk_lock.
 *
 *	(We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));

	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
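 *
 * Return: the new &struct sock on success, or %NULL if allocation failed.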
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
		sk_tx_queue_clear(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);

	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
		reuseport_detach_sock(sk);
		use_call_rcu = true;
	}

	if (use_call_rcu)
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
				   af_rlock_keys + sk->sk_family,
				   af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
				   af_wlock_keys + sk->sk_family,
				   af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
				   af_elock_keys + sk->sk_family,
				   af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);
}

/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt))
		get_net(sock_net(newsk));
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	bh_lock_sock(newsk);
	newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache = NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued = 0;
	newsk->sk_forward_alloc = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head = NULL;
	newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();
	if (likely(newsk->sk_net_refcnt))
		sock_inuse_add(sock_net(newsk), 1);

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* Increment the counter in the same struct proto as the master
	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
	 * is the same as sk->sk_prot->socks, as this field was copied
	 * with memcpy).
	 *
	 * This _changes_ the previous behaviour, where
	 * tcp_create_openreq_child always was incrementing the
	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
	 * to be taken into account in all callers. -acme
	 */
	sk_refcnt_debug_inc(newsk);
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still a raw copy of the parent, so invalidate
	 * the destructor and make a plain sk_free()
	 */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
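 * (installed on transmitted skbs by skb_set_owner_w() below)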
2082 */ 2083 void sock_wfree(struct sk_buff *skb) 2084 { 2085 struct sock *sk = skb->sk; 2086 unsigned int len = skb->truesize; 2087 2088 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2089 /* 2090 * Keep a reference on sk_wmem_alloc, this will be released 2091 * after sk_write_space() call 2092 */ 2093 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2094 sk->sk_write_space(sk); 2095 len = 1; 2096 } 2097 /* 2098 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2099 * could not do because of in-flight packets 2100 */ 2101 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2102 __sk_free(sk); 2103 } 2104 EXPORT_SYMBOL(sock_wfree); 2105 2106 /* This variant of sock_wfree() is used by TCP, 2107 * since it sets SOCK_USE_WRITE_QUEUE. 2108 */ 2109 void __sock_wfree(struct sk_buff *skb) 2110 { 2111 struct sock *sk = skb->sk; 2112 2113 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2114 __sk_free(sk); 2115 } 2116 2117 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2118 { 2119 skb_orphan(skb); 2120 skb->sk = sk; 2121 #ifdef CONFIG_INET 2122 if (unlikely(!sk_fullsock(sk))) { 2123 skb->destructor = sock_edemux; 2124 sock_hold(sk); 2125 return; 2126 } 2127 #endif 2128 skb->destructor = sock_wfree; 2129 skb_set_hash_from_sk(skb, sk); 2130 /* 2131 * We used to take a refcount on sk, but following operation 2132 * is enough to guarantee sk_free() wont free this sock until 2133 * all in-flight packets are completed 2134 */ 2135 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2136 } 2137 EXPORT_SYMBOL(skb_set_owner_w); 2138 2139 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2140 { 2141 #ifdef CONFIG_TLS_DEVICE 2142 /* Drivers depend on in-order delivery for crypto offload, 2143 * partial orphan breaks out-of-order-OK logic. 2144 */ 2145 if (skb->decrypted) 2146 return false; 2147 #endif 2148 return (skb->destructor == sock_wfree || 2149 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2150 } 2151 2152 /* This helper is used by netem, as it can hold packets in its 2153 * delay queue. We want to allow the owner socket to send more 2154 * packets, as if they were already TX completed by a typical driver. 2155 * But we also want to keep skb->sk set because some packet schedulers 2156 * rely on it (sch_fq for example). 2157 */ 2158 void skb_orphan_partial(struct sk_buff *skb) 2159 { 2160 if (skb_is_tcp_pure_ack(skb)) 2161 return; 2162 2163 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2164 return; 2165 2166 skb_orphan(skb); 2167 } 2168 EXPORT_SYMBOL(skb_orphan_partial); 2169 2170 /* 2171 * Read buffer destructor automatically called from kfree_skb. 2172 */ 2173 void sock_rfree(struct sk_buff *skb) 2174 { 2175 struct sock *sk = skb->sk; 2176 unsigned int len = skb->truesize; 2177 2178 atomic_sub(len, &sk->sk_rmem_alloc); 2179 sk_mem_uncharge(sk, len); 2180 } 2181 EXPORT_SYMBOL(sock_rfree); 2182 2183 /* 2184 * Buffer destructor for skbs that are not used directly in read or write 2185 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2186 */ 2187 void sock_efree(struct sk_buff *skb) 2188 { 2189 sock_put(skb->sk); 2190 } 2191 EXPORT_SYMBOL(sock_efree); 2192 2193 /* Buffer destructor for prefetch/receive path where reference count may 2194 * not be held, e.g. for listen sockets. 
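 *
 * Hedged sketch of the intended pairing (hypothetical caller): a lookup
 * path that only took a reference when sk_is_refcounted() was true can
 * hand the socket to the skb as
 *
 *	skb_orphan(skb);
 *	skb->sk = sk;
 *	skb->destructor = sock_pfree;
 *
 * so that the conditional sock_gen_put() below balances that conditional hold.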
2195 */ 2196 #ifdef CONFIG_INET 2197 void sock_pfree(struct sk_buff *skb) 2198 { 2199 if (sk_is_refcounted(skb->sk)) 2200 sock_gen_put(skb->sk); 2201 } 2202 EXPORT_SYMBOL(sock_pfree); 2203 #endif /* CONFIG_INET */ 2204 2205 kuid_t sock_i_uid(struct sock *sk) 2206 { 2207 kuid_t uid; 2208 2209 read_lock_bh(&sk->sk_callback_lock); 2210 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2211 read_unlock_bh(&sk->sk_callback_lock); 2212 return uid; 2213 } 2214 EXPORT_SYMBOL(sock_i_uid); 2215 2216 unsigned long sock_i_ino(struct sock *sk) 2217 { 2218 unsigned long ino; 2219 2220 read_lock_bh(&sk->sk_callback_lock); 2221 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2222 read_unlock_bh(&sk->sk_callback_lock); 2223 return ino; 2224 } 2225 EXPORT_SYMBOL(sock_i_ino); 2226 2227 /* 2228 * Allocate a skb from the socket's send buffer. 2229 */ 2230 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2231 gfp_t priority) 2232 { 2233 if (force || 2234 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2235 struct sk_buff *skb = alloc_skb(size, priority); 2236 2237 if (skb) { 2238 skb_set_owner_w(skb, sk); 2239 return skb; 2240 } 2241 } 2242 return NULL; 2243 } 2244 EXPORT_SYMBOL(sock_wmalloc); 2245 2246 static void sock_ofree(struct sk_buff *skb) 2247 { 2248 struct sock *sk = skb->sk; 2249 2250 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2251 } 2252 2253 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2254 gfp_t priority) 2255 { 2256 struct sk_buff *skb; 2257 2258 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2259 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2260 sysctl_optmem_max) 2261 return NULL; 2262 2263 skb = alloc_skb(size, priority); 2264 if (!skb) 2265 return NULL; 2266 2267 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2268 skb->sk = sk; 2269 skb->destructor = sock_ofree; 2270 return skb; 2271 } 2272 2273 /* 2274 * Allocate a memory block from the socket's option memory buffer. 2275 */ 2276 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2277 { 2278 if ((unsigned int)size <= sysctl_optmem_max && 2279 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2280 void *mem; 2281 /* First do the add, to avoid the race if kmalloc 2282 * might sleep. 2283 */ 2284 atomic_add(size, &sk->sk_omem_alloc); 2285 mem = kmalloc(size, priority); 2286 if (mem) 2287 return mem; 2288 atomic_sub(size, &sk->sk_omem_alloc); 2289 } 2290 return NULL; 2291 } 2292 EXPORT_SYMBOL(sock_kmalloc); 2293 2294 /* Free an option memory block. Note, we actually want the inline 2295 * here as this allows gcc to detect the nullify and fold away the 2296 * condition entirely. 2297 */ 2298 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2299 const bool nullify) 2300 { 2301 if (WARN_ON_ONCE(!mem)) 2302 return; 2303 if (nullify) 2304 kfree_sensitive(mem); 2305 else 2306 kfree(mem); 2307 atomic_sub(size, &sk->sk_omem_alloc); 2308 } 2309 2310 void sock_kfree_s(struct sock *sk, void *mem, int size) 2311 { 2312 __sock_kfree_s(sk, mem, size, false); 2313 } 2314 EXPORT_SYMBOL(sock_kfree_s); 2315 2316 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2317 { 2318 __sock_kfree_s(sk, mem, size, true); 2319 } 2320 EXPORT_SYMBOL(sock_kzfree_s); 2321 2322 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2323 I think, these locks should be removed for datagram sockets. 
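   Returns the remaining timeout. The only caller, sock_alloc_send_pskb()
   below, retries until sk_wmem_alloc drops under sk_sndbuf or the wait is
   ended by an error, shutdown, signal or timeout.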
2324 */ 2325 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2326 { 2327 DEFINE_WAIT(wait); 2328 2329 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2330 for (;;) { 2331 if (!timeo) 2332 break; 2333 if (signal_pending(current)) 2334 break; 2335 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2336 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2337 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2338 break; 2339 if (sk->sk_shutdown & SEND_SHUTDOWN) 2340 break; 2341 if (sk->sk_err) 2342 break; 2343 timeo = schedule_timeout(timeo); 2344 } 2345 finish_wait(sk_sleep(sk), &wait); 2346 return timeo; 2347 } 2348 2349 2350 /* 2351 * Generic send/receive buffer handlers 2352 */ 2353 2354 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2355 unsigned long data_len, int noblock, 2356 int *errcode, int max_page_order) 2357 { 2358 struct sk_buff *skb; 2359 long timeo; 2360 int err; 2361 2362 timeo = sock_sndtimeo(sk, noblock); 2363 for (;;) { 2364 err = sock_error(sk); 2365 if (err != 0) 2366 goto failure; 2367 2368 err = -EPIPE; 2369 if (sk->sk_shutdown & SEND_SHUTDOWN) 2370 goto failure; 2371 2372 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2373 break; 2374 2375 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2376 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2377 err = -EAGAIN; 2378 if (!timeo) 2379 goto failure; 2380 if (signal_pending(current)) 2381 goto interrupted; 2382 timeo = sock_wait_for_wmem(sk, timeo); 2383 } 2384 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2385 errcode, sk->sk_allocation); 2386 if (skb) 2387 skb_set_owner_w(skb, sk); 2388 return skb; 2389 2390 interrupted: 2391 err = sock_intr_errno(timeo); 2392 failure: 2393 *errcode = err; 2394 return NULL; 2395 } 2396 EXPORT_SYMBOL(sock_alloc_send_pskb); 2397 2398 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2399 int noblock, int *errcode) 2400 { 2401 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2402 } 2403 EXPORT_SYMBOL(sock_alloc_send_skb); 2404 2405 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2406 struct sockcm_cookie *sockc) 2407 { 2408 u32 tsflags; 2409 2410 switch (cmsg->cmsg_type) { 2411 case SO_MARK: 2412 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2413 return -EPERM; 2414 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2415 return -EINVAL; 2416 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2417 break; 2418 case SO_TIMESTAMPING_OLD: 2419 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2420 return -EINVAL; 2421 2422 tsflags = *(u32 *)CMSG_DATA(cmsg); 2423 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2424 return -EINVAL; 2425 2426 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2427 sockc->tsflags |= tsflags; 2428 break; 2429 case SCM_TXTIME: 2430 if (!sock_flag(sk, SOCK_TXTIME)) 2431 return -EINVAL; 2432 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2433 return -EINVAL; 2434 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2435 break; 2436 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
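	 * They are accepted and skipped here so that a sendmsg() carrying
	 * them is not rejected with -EINVAL.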
*/ 2437 case SCM_RIGHTS: 2438 case SCM_CREDENTIALS: 2439 break; 2440 default: 2441 return -EINVAL; 2442 } 2443 return 0; 2444 } 2445 EXPORT_SYMBOL(__sock_cmsg_send); 2446 2447 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2448 struct sockcm_cookie *sockc) 2449 { 2450 struct cmsghdr *cmsg; 2451 int ret; 2452 2453 for_each_cmsghdr(cmsg, msg) { 2454 if (!CMSG_OK(msg, cmsg)) 2455 return -EINVAL; 2456 if (cmsg->cmsg_level != SOL_SOCKET) 2457 continue; 2458 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2459 if (ret) 2460 return ret; 2461 } 2462 return 0; 2463 } 2464 EXPORT_SYMBOL(sock_cmsg_send); 2465 2466 static void sk_enter_memory_pressure(struct sock *sk) 2467 { 2468 if (!sk->sk_prot->enter_memory_pressure) 2469 return; 2470 2471 sk->sk_prot->enter_memory_pressure(sk); 2472 } 2473 2474 static void sk_leave_memory_pressure(struct sock *sk) 2475 { 2476 if (sk->sk_prot->leave_memory_pressure) { 2477 sk->sk_prot->leave_memory_pressure(sk); 2478 } else { 2479 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2480 2481 if (memory_pressure && READ_ONCE(*memory_pressure)) 2482 WRITE_ONCE(*memory_pressure, 0); 2483 } 2484 } 2485 2486 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2487 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2488 2489 /** 2490 * skb_page_frag_refill - check that a page_frag contains enough room 2491 * @sz: minimum size of the fragment we want to get 2492 * @pfrag: pointer to page_frag 2493 * @gfp: priority for memory allocation 2494 * 2495 * Note: While this allocator tries to use high order pages, there is 2496 * no guarantee that allocations succeed. Therefore, @sz MUST be 2497 * less or equal than PAGE_SIZE. 2498 */ 2499 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2500 { 2501 if (pfrag->page) { 2502 if (page_ref_count(pfrag->page) == 1) { 2503 pfrag->offset = 0; 2504 return true; 2505 } 2506 if (pfrag->offset + sz <= pfrag->size) 2507 return true; 2508 put_page(pfrag->page); 2509 } 2510 2511 pfrag->offset = 0; 2512 if (SKB_FRAG_PAGE_ORDER && 2513 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2514 /* Avoid direct reclaim but allow kswapd to wake */ 2515 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2516 __GFP_COMP | __GFP_NOWARN | 2517 __GFP_NORETRY, 2518 SKB_FRAG_PAGE_ORDER); 2519 if (likely(pfrag->page)) { 2520 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2521 return true; 2522 } 2523 } 2524 pfrag->page = alloc_page(gfp); 2525 if (likely(pfrag->page)) { 2526 pfrag->size = PAGE_SIZE; 2527 return true; 2528 } 2529 return false; 2530 } 2531 EXPORT_SYMBOL(skb_page_frag_refill); 2532 2533 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2534 { 2535 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2536 return true; 2537 2538 sk_enter_memory_pressure(sk); 2539 sk_stream_moderate_sndbuf(sk); 2540 return false; 2541 } 2542 EXPORT_SYMBOL(sk_page_frag_refill); 2543 2544 void __lock_sock(struct sock *sk) 2545 __releases(&sk->sk_lock.slock) 2546 __acquires(&sk->sk_lock.slock) 2547 { 2548 DEFINE_WAIT(wait); 2549 2550 for (;;) { 2551 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2552 TASK_UNINTERRUPTIBLE); 2553 spin_unlock_bh(&sk->sk_lock.slock); 2554 schedule(); 2555 spin_lock_bh(&sk->sk_lock.slock); 2556 if (!sock_owned_by_user(sk)) 2557 break; 2558 } 2559 finish_wait(&sk->sk_lock.wq, &wait); 2560 } 2561 2562 void __release_sock(struct sock *sk) 2563 __releases(&sk->sk_lock.slock) 2564 __acquires(&sk->sk_lock.slock) 2565 { 2566 struct sk_buff *skb, 
*next; 2567 2568 while ((skb = sk->sk_backlog.head) != NULL) { 2569 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2570 2571 spin_unlock_bh(&sk->sk_lock.slock); 2572 2573 do { 2574 next = skb->next; 2575 prefetch(next); 2576 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2577 skb_mark_not_on_list(skb); 2578 sk_backlog_rcv(sk, skb); 2579 2580 cond_resched(); 2581 2582 skb = next; 2583 } while (skb != NULL); 2584 2585 spin_lock_bh(&sk->sk_lock.slock); 2586 } 2587 2588 /* 2589 * Doing the zeroing here guarantee we can not loop forever 2590 * while a wild producer attempts to flood us. 2591 */ 2592 sk->sk_backlog.len = 0; 2593 } 2594 2595 void __sk_flush_backlog(struct sock *sk) 2596 { 2597 spin_lock_bh(&sk->sk_lock.slock); 2598 __release_sock(sk); 2599 spin_unlock_bh(&sk->sk_lock.slock); 2600 } 2601 2602 /** 2603 * sk_wait_data - wait for data to arrive at sk_receive_queue 2604 * @sk: sock to wait on 2605 * @timeo: for how long 2606 * @skb: last skb seen on sk_receive_queue 2607 * 2608 * Now socket state including sk->sk_err is changed only under lock, 2609 * hence we may omit checks after joining wait queue. 2610 * We check receive queue before schedule() only as optimization; 2611 * it is very likely that release_sock() added new data. 2612 */ 2613 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2614 { 2615 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2616 int rc; 2617 2618 add_wait_queue(sk_sleep(sk), &wait); 2619 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2620 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2621 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2622 remove_wait_queue(sk_sleep(sk), &wait); 2623 return rc; 2624 } 2625 EXPORT_SYMBOL(sk_wait_data); 2626 2627 /** 2628 * __sk_mem_raise_allocated - increase memory_allocated 2629 * @sk: socket 2630 * @size: memory size to allocate 2631 * @amt: pages to allocate 2632 * @kind: allocation type 2633 * 2634 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2635 */ 2636 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2637 { 2638 struct proto *prot = sk->sk_prot; 2639 long allocated = sk_memory_allocated_add(sk, amt); 2640 bool charged = true; 2641 2642 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2643 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) 2644 goto suppress_allocation; 2645 2646 /* Under limit. */ 2647 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2648 sk_leave_memory_pressure(sk); 2649 return 1; 2650 } 2651 2652 /* Under pressure. */ 2653 if (allocated > sk_prot_mem_limits(sk, 1)) 2654 sk_enter_memory_pressure(sk); 2655 2656 /* Over hard limit. 
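	   That is, above the third (hard) sysctl limit. The suppress_allocation
	   path below may still let the charge succeed for a SOCK_STREAM sender
	   after moderating its send buffer, so it can make forward progress.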
*/ 2657 if (allocated > sk_prot_mem_limits(sk, 2)) 2658 goto suppress_allocation; 2659 2660 /* guarantee minimum buffer size under pressure */ 2661 if (kind == SK_MEM_RECV) { 2662 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2663 return 1; 2664 2665 } else { /* SK_MEM_SEND */ 2666 int wmem0 = sk_get_wmem0(sk, prot); 2667 2668 if (sk->sk_type == SOCK_STREAM) { 2669 if (sk->sk_wmem_queued < wmem0) 2670 return 1; 2671 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2672 return 1; 2673 } 2674 } 2675 2676 if (sk_has_memory_pressure(sk)) { 2677 u64 alloc; 2678 2679 if (!sk_under_memory_pressure(sk)) 2680 return 1; 2681 alloc = sk_sockets_allocated_read_positive(sk); 2682 if (sk_prot_mem_limits(sk, 2) > alloc * 2683 sk_mem_pages(sk->sk_wmem_queued + 2684 atomic_read(&sk->sk_rmem_alloc) + 2685 sk->sk_forward_alloc)) 2686 return 1; 2687 } 2688 2689 suppress_allocation: 2690 2691 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2692 sk_stream_moderate_sndbuf(sk); 2693 2694 /* Fail only if socket is _under_ its sndbuf. 2695 * In this case we cannot block, so that we have to fail. 2696 */ 2697 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2698 return 1; 2699 } 2700 2701 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2702 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2703 2704 sk_memory_allocated_sub(sk, amt); 2705 2706 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2707 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2708 2709 return 0; 2710 } 2711 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2712 2713 /** 2714 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2715 * @sk: socket 2716 * @size: memory size to allocate 2717 * @kind: allocation type 2718 * 2719 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2720 * rmem allocation. This function assumes that protocols which have 2721 * memory_pressure use sk_wmem_queued as write buffer accounting. 
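 *
 * Return: 1 if the charge is accepted, 0 if it is refused.
 *
 * Hedged sketch (not from this file) of a typical receive-side user going
 * through the sk_rmem_schedule()/skb_set_owner_r() wrappers built on top
 * of this function:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);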
2722 */ 2723 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2724 { 2725 int ret, amt = sk_mem_pages(size); 2726 2727 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2728 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2729 if (!ret) 2730 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2731 return ret; 2732 } 2733 EXPORT_SYMBOL(__sk_mem_schedule); 2734 2735 /** 2736 * __sk_mem_reduce_allocated - reclaim memory_allocated 2737 * @sk: socket 2738 * @amount: number of quanta 2739 * 2740 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2741 */ 2742 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2743 { 2744 sk_memory_allocated_sub(sk, amount); 2745 2746 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2747 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2748 2749 if (sk_under_memory_pressure(sk) && 2750 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2751 sk_leave_memory_pressure(sk); 2752 } 2753 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2754 2755 /** 2756 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2757 * @sk: socket 2758 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2759 */ 2760 void __sk_mem_reclaim(struct sock *sk, int amount) 2761 { 2762 amount >>= SK_MEM_QUANTUM_SHIFT; 2763 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2764 __sk_mem_reduce_allocated(sk, amount); 2765 } 2766 EXPORT_SYMBOL(__sk_mem_reclaim); 2767 2768 int sk_set_peek_off(struct sock *sk, int val) 2769 { 2770 sk->sk_peek_off = val; 2771 return 0; 2772 } 2773 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2774 2775 /* 2776 * Set of default routines for initialising struct proto_ops when 2777 * the protocol does not support a particular function. In certain 2778 * cases where it makes no sense for a protocol to have a "do nothing" 2779 * function, some default processing is provided. 
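 *
 * Hedged sketch (hypothetical protocol, not part of this file) of how these
 * stubs are typically wired into a struct proto_ops so that unsupported
 * operations fail cleanly (-EOPNOTSUPP, or -ENODEV for mmap):
 *
 *	static const struct proto_ops example_ops = {
 *		.family     = PF_EXAMPLE,
 *		.owner      = THIS_MODULE,
 *		.socketpair = sock_no_socketpair,
 *		.mmap       = sock_no_mmap,
 *		.listen     = sock_no_listen,
 *	};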
2780 */ 2781 2782 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2783 { 2784 return -EOPNOTSUPP; 2785 } 2786 EXPORT_SYMBOL(sock_no_bind); 2787 2788 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2789 int len, int flags) 2790 { 2791 return -EOPNOTSUPP; 2792 } 2793 EXPORT_SYMBOL(sock_no_connect); 2794 2795 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2796 { 2797 return -EOPNOTSUPP; 2798 } 2799 EXPORT_SYMBOL(sock_no_socketpair); 2800 2801 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2802 bool kern) 2803 { 2804 return -EOPNOTSUPP; 2805 } 2806 EXPORT_SYMBOL(sock_no_accept); 2807 2808 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2809 int peer) 2810 { 2811 return -EOPNOTSUPP; 2812 } 2813 EXPORT_SYMBOL(sock_no_getname); 2814 2815 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2816 { 2817 return -EOPNOTSUPP; 2818 } 2819 EXPORT_SYMBOL(sock_no_ioctl); 2820 2821 int sock_no_listen(struct socket *sock, int backlog) 2822 { 2823 return -EOPNOTSUPP; 2824 } 2825 EXPORT_SYMBOL(sock_no_listen); 2826 2827 int sock_no_shutdown(struct socket *sock, int how) 2828 { 2829 return -EOPNOTSUPP; 2830 } 2831 EXPORT_SYMBOL(sock_no_shutdown); 2832 2833 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2834 { 2835 return -EOPNOTSUPP; 2836 } 2837 EXPORT_SYMBOL(sock_no_sendmsg); 2838 2839 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2840 { 2841 return -EOPNOTSUPP; 2842 } 2843 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2844 2845 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2846 int flags) 2847 { 2848 return -EOPNOTSUPP; 2849 } 2850 EXPORT_SYMBOL(sock_no_recvmsg); 2851 2852 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2853 { 2854 /* Mirror missing mmap method error code */ 2855 return -ENODEV; 2856 } 2857 EXPORT_SYMBOL(sock_no_mmap); 2858 2859 /* 2860 * When a file is received (via SCM_RIGHTS, etc), we must bump the 2861 * various sock-based usage counts. 
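 * The helpers run in the context of the task receiving the file, so the
 * socket's cgroup-derived classid and netprio priority are refreshed to
 * reflect the new owner.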
2862 */ 2863 void __receive_sock(struct file *file) 2864 { 2865 struct socket *sock; 2866 2867 sock = sock_from_file(file); 2868 if (sock) { 2869 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 2870 sock_update_classid(&sock->sk->sk_cgrp_data); 2871 } 2872 } 2873 2874 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2875 { 2876 ssize_t res; 2877 struct msghdr msg = {.msg_flags = flags}; 2878 struct kvec iov; 2879 char *kaddr = kmap(page); 2880 iov.iov_base = kaddr + offset; 2881 iov.iov_len = size; 2882 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2883 kunmap(page); 2884 return res; 2885 } 2886 EXPORT_SYMBOL(sock_no_sendpage); 2887 2888 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2889 int offset, size_t size, int flags) 2890 { 2891 ssize_t res; 2892 struct msghdr msg = {.msg_flags = flags}; 2893 struct kvec iov; 2894 char *kaddr = kmap(page); 2895 2896 iov.iov_base = kaddr + offset; 2897 iov.iov_len = size; 2898 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2899 kunmap(page); 2900 return res; 2901 } 2902 EXPORT_SYMBOL(sock_no_sendpage_locked); 2903 2904 /* 2905 * Default Socket Callbacks 2906 */ 2907 2908 static void sock_def_wakeup(struct sock *sk) 2909 { 2910 struct socket_wq *wq; 2911 2912 rcu_read_lock(); 2913 wq = rcu_dereference(sk->sk_wq); 2914 if (skwq_has_sleeper(wq)) 2915 wake_up_interruptible_all(&wq->wait); 2916 rcu_read_unlock(); 2917 } 2918 2919 static void sock_def_error_report(struct sock *sk) 2920 { 2921 struct socket_wq *wq; 2922 2923 rcu_read_lock(); 2924 wq = rcu_dereference(sk->sk_wq); 2925 if (skwq_has_sleeper(wq)) 2926 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2927 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2928 rcu_read_unlock(); 2929 } 2930 2931 void sock_def_readable(struct sock *sk) 2932 { 2933 struct socket_wq *wq; 2934 2935 rcu_read_lock(); 2936 wq = rcu_dereference(sk->sk_wq); 2937 if (skwq_has_sleeper(wq)) 2938 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2939 EPOLLRDNORM | EPOLLRDBAND); 2940 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2941 rcu_read_unlock(); 2942 } 2943 2944 static void sock_def_write_space(struct sock *sk) 2945 { 2946 struct socket_wq *wq; 2947 2948 rcu_read_lock(); 2949 2950 /* Do not wake up a writer until he can make "significant" 2951 * progress. 
--DaveM 2952 */ 2953 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 2954 wq = rcu_dereference(sk->sk_wq); 2955 if (skwq_has_sleeper(wq)) 2956 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2957 EPOLLWRNORM | EPOLLWRBAND); 2958 2959 /* Should agree with poll, otherwise some programs break */ 2960 if (sock_writeable(sk)) 2961 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2962 } 2963 2964 rcu_read_unlock(); 2965 } 2966 2967 static void sock_def_destruct(struct sock *sk) 2968 { 2969 } 2970 2971 void sk_send_sigurg(struct sock *sk) 2972 { 2973 if (sk->sk_socket && sk->sk_socket->file) 2974 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2975 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2976 } 2977 EXPORT_SYMBOL(sk_send_sigurg); 2978 2979 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2980 unsigned long expires) 2981 { 2982 if (!mod_timer(timer, expires)) 2983 sock_hold(sk); 2984 } 2985 EXPORT_SYMBOL(sk_reset_timer); 2986 2987 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2988 { 2989 if (del_timer(timer)) 2990 __sock_put(sk); 2991 } 2992 EXPORT_SYMBOL(sk_stop_timer); 2993 2994 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 2995 { 2996 if (del_timer_sync(timer)) 2997 __sock_put(sk); 2998 } 2999 EXPORT_SYMBOL(sk_stop_timer_sync); 3000 3001 void sock_init_data(struct socket *sock, struct sock *sk) 3002 { 3003 sk_init_common(sk); 3004 sk->sk_send_head = NULL; 3005 3006 timer_setup(&sk->sk_timer, NULL, 0); 3007 3008 sk->sk_allocation = GFP_KERNEL; 3009 sk->sk_rcvbuf = sysctl_rmem_default; 3010 sk->sk_sndbuf = sysctl_wmem_default; 3011 sk->sk_state = TCP_CLOSE; 3012 sk_set_socket(sk, sock); 3013 3014 sock_set_flag(sk, SOCK_ZAPPED); 3015 3016 if (sock) { 3017 sk->sk_type = sock->type; 3018 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3019 sock->sk = sk; 3020 sk->sk_uid = SOCK_INODE(sock)->i_uid; 3021 } else { 3022 RCU_INIT_POINTER(sk->sk_wq, NULL); 3023 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 3024 } 3025 3026 rwlock_init(&sk->sk_callback_lock); 3027 if (sk->sk_kern_sock) 3028 lockdep_set_class_and_name( 3029 &sk->sk_callback_lock, 3030 af_kern_callback_keys + sk->sk_family, 3031 af_family_kern_clock_key_strings[sk->sk_family]); 3032 else 3033 lockdep_set_class_and_name( 3034 &sk->sk_callback_lock, 3035 af_callback_keys + sk->sk_family, 3036 af_family_clock_key_strings[sk->sk_family]); 3037 3038 sk->sk_state_change = sock_def_wakeup; 3039 sk->sk_data_ready = sock_def_readable; 3040 sk->sk_write_space = sock_def_write_space; 3041 sk->sk_error_report = sock_def_error_report; 3042 sk->sk_destruct = sock_def_destruct; 3043 3044 sk->sk_frag.page = NULL; 3045 sk->sk_frag.offset = 0; 3046 sk->sk_peek_off = -1; 3047 3048 sk->sk_peer_pid = NULL; 3049 sk->sk_peer_cred = NULL; 3050 sk->sk_write_pending = 0; 3051 sk->sk_rcvlowat = 1; 3052 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3053 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3054 3055 sk->sk_stamp = SK_DEFAULT_STAMP; 3056 #if BITS_PER_LONG==32 3057 seqlock_init(&sk->sk_stamp_seq); 3058 #endif 3059 atomic_set(&sk->sk_zckey, 0); 3060 3061 #ifdef CONFIG_NET_RX_BUSY_POLL 3062 sk->sk_napi_id = 0; 3063 sk->sk_ll_usec = sysctl_net_busy_read; 3064 #endif 3065 3066 sk->sk_max_pacing_rate = ~0UL; 3067 sk->sk_pacing_rate = ~0UL; 3068 WRITE_ONCE(sk->sk_pacing_shift, 10); 3069 sk->sk_incoming_cpu = -1; 3070 3071 sk_rx_queue_clear(sk); 3072 /* 3073 * Before updating sk_refcnt, we must commit prior changes to memory 3074 * (Documentation/RCU/rculist_nulls.rst for details) 3075 */ 3076 
smp_wmb(); 3077 refcount_set(&sk->sk_refcnt, 1); 3078 atomic_set(&sk->sk_drops, 0); 3079 } 3080 EXPORT_SYMBOL(sock_init_data); 3081 3082 void lock_sock_nested(struct sock *sk, int subclass) 3083 { 3084 might_sleep(); 3085 spin_lock_bh(&sk->sk_lock.slock); 3086 if (sk->sk_lock.owned) 3087 __lock_sock(sk); 3088 sk->sk_lock.owned = 1; 3089 spin_unlock(&sk->sk_lock.slock); 3090 /* 3091 * The sk_lock has mutex_lock() semantics here: 3092 */ 3093 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3094 local_bh_enable(); 3095 } 3096 EXPORT_SYMBOL(lock_sock_nested); 3097 3098 void release_sock(struct sock *sk) 3099 { 3100 spin_lock_bh(&sk->sk_lock.slock); 3101 if (sk->sk_backlog.tail) 3102 __release_sock(sk); 3103 3104 /* Warning : release_cb() might need to release sk ownership, 3105 * ie call sock_release_ownership(sk) before us. 3106 */ 3107 if (sk->sk_prot->release_cb) 3108 sk->sk_prot->release_cb(sk); 3109 3110 sock_release_ownership(sk); 3111 if (waitqueue_active(&sk->sk_lock.wq)) 3112 wake_up(&sk->sk_lock.wq); 3113 spin_unlock_bh(&sk->sk_lock.slock); 3114 } 3115 EXPORT_SYMBOL(release_sock); 3116 3117 /** 3118 * lock_sock_fast - fast version of lock_sock 3119 * @sk: socket 3120 * 3121 * This version should be used for very small section, where process wont block 3122 * return false if fast path is taken: 3123 * 3124 * sk_lock.slock locked, owned = 0, BH disabled 3125 * 3126 * return true if slow path is taken: 3127 * 3128 * sk_lock.slock unlocked, owned = 1, BH enabled 3129 */ 3130 bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3131 { 3132 might_sleep(); 3133 spin_lock_bh(&sk->sk_lock.slock); 3134 3135 if (!sk->sk_lock.owned) 3136 /* 3137 * Note : We must disable BH 3138 */ 3139 return false; 3140 3141 __lock_sock(sk); 3142 sk->sk_lock.owned = 1; 3143 spin_unlock(&sk->sk_lock.slock); 3144 /* 3145 * The sk_lock has mutex_lock() semantics here: 3146 */ 3147 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 3148 __acquire(&sk->sk_lock.slock); 3149 local_bh_enable(); 3150 return true; 3151 } 3152 EXPORT_SYMBOL(lock_sock_fast); 3153 3154 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3155 bool timeval, bool time32) 3156 { 3157 struct sock *sk = sock->sk; 3158 struct timespec64 ts; 3159 3160 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3161 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3162 if (ts.tv_sec == -1) 3163 return -ENOENT; 3164 if (ts.tv_sec == 0) { 3165 ktime_t kt = ktime_get_real(); 3166 sock_write_timestamp(sk, kt); 3167 ts = ktime_to_timespec64(kt); 3168 } 3169 3170 if (timeval) 3171 ts.tv_nsec /= 1000; 3172 3173 #ifdef CONFIG_COMPAT_32BIT_TIME 3174 if (time32) 3175 return put_old_timespec32(&ts, userstamp); 3176 #endif 3177 #ifdef CONFIG_SPARC64 3178 /* beware of padding in sparc64 timeval */ 3179 if (timeval && !in_compat_syscall()) { 3180 struct __kernel_old_timeval __user tv = { 3181 .tv_sec = ts.tv_sec, 3182 .tv_usec = ts.tv_nsec, 3183 }; 3184 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3185 return -EFAULT; 3186 return 0; 3187 } 3188 #endif 3189 return put_timespec64(&ts, userstamp); 3190 } 3191 EXPORT_SYMBOL(sock_gettstamp); 3192 3193 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3194 { 3195 if (!sock_flag(sk, flag)) { 3196 unsigned long previous_flags = sk->sk_flags; 3197 3198 sock_set_flag(sk, flag); 3199 /* 3200 * we just set one of the two flags which require net 3201 * time stamping, but time stamping might have been on 3202 * already because of the other one 3203 */ 3204 if (sock_needs_netstamp(sk) 
&& 3205 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3206 net_enable_timestamp(); 3207 } 3208 } 3209 3210 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3211 int level, int type) 3212 { 3213 struct sock_exterr_skb *serr; 3214 struct sk_buff *skb; 3215 int copied, err; 3216 3217 err = -EAGAIN; 3218 skb = sock_dequeue_err_skb(sk); 3219 if (skb == NULL) 3220 goto out; 3221 3222 copied = skb->len; 3223 if (copied > len) { 3224 msg->msg_flags |= MSG_TRUNC; 3225 copied = len; 3226 } 3227 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3228 if (err) 3229 goto out_free_skb; 3230 3231 sock_recv_timestamp(msg, sk, skb); 3232 3233 serr = SKB_EXT_ERR(skb); 3234 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3235 3236 msg->msg_flags |= MSG_ERRQUEUE; 3237 err = copied; 3238 3239 out_free_skb: 3240 kfree_skb(skb); 3241 out: 3242 return err; 3243 } 3244 EXPORT_SYMBOL(sock_recv_errqueue); 3245 3246 /* 3247 * Get a socket option on a socket. 3248 * 3249 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3250 * asynchronous errors should be reported by getsockopt. We assume 3251 * this means if you specify SO_ERROR (otherwise what's the point of it). 3252 */ 3253 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3254 char __user *optval, int __user *optlen) 3255 { 3256 struct sock *sk = sock->sk; 3257 3258 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3259 } 3260 EXPORT_SYMBOL(sock_common_getsockopt); 3261 3262 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3263 int flags) 3264 { 3265 struct sock *sk = sock->sk; 3266 int addr_len = 0; 3267 int err; 3268 3269 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 3270 flags & ~MSG_DONTWAIT, &addr_len); 3271 if (err >= 0) 3272 msg->msg_namelen = addr_len; 3273 return err; 3274 } 3275 EXPORT_SYMBOL(sock_common_recvmsg); 3276 3277 /* 3278 * Set socket options on an inet socket. 3279 */ 3280 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3281 sockptr_t optval, unsigned int optlen) 3282 { 3283 struct sock *sk = sock->sk; 3284 3285 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3286 } 3287 EXPORT_SYMBOL(sock_common_setsockopt); 3288 3289 void sk_common_release(struct sock *sk) 3290 { 3291 if (sk->sk_prot->destroy) 3292 sk->sk_prot->destroy(sk); 3293 3294 /* 3295 * Observation: when sk_common_release is called, processes have 3296 * no access to the socket, but the network stack still does. 3297 * Step one, detach it from networking: 3298 * 3299 * A. Remove from hash tables. 3300 */ 3301 3302 sk->sk_prot->unhash(sk); 3303 3304 /* 3305 * At this point the socket cannot receive new packets, but it is possible 3306 * that some packets are in flight because another CPU runs the receiver 3307 * and did a hash table lookup before we unhashed the socket. They will 3308 * reach the receive queue and be purged by the socket destructor. 3309 * 3310 * Also we still have packets pending on the receive queue and probably 3311 * our own packets waiting in device queues. sock_destroy will drain the 3312 * receive queue, but transmitted packets will delay socket destruction 3313 * until the last reference is released.
3314 */ 3315 3316 sock_orphan(sk); 3317 3318 xfrm_sk_free_policy(sk); 3319 3320 sk_refcnt_debug_release(sk); 3321 3322 sock_put(sk); 3323 } 3324 EXPORT_SYMBOL(sk_common_release); 3325 3326 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3327 { 3328 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3329 3330 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3331 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3332 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3333 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3334 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3335 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3336 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3337 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3338 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3339 } 3340 3341 #ifdef CONFIG_PROC_FS 3342 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3343 struct prot_inuse { 3344 int val[PROTO_INUSE_NR]; 3345 }; 3346 3347 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3348 3349 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3350 { 3351 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3352 } 3353 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3354 3355 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3356 { 3357 int cpu, idx = prot->inuse_idx; 3358 int res = 0; 3359 3360 for_each_possible_cpu(cpu) 3361 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3362 3363 return res >= 0 ? res : 0; 3364 } 3365 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3366 3367 static void sock_inuse_add(struct net *net, int val) 3368 { 3369 this_cpu_add(*net->core.sock_inuse, val); 3370 } 3371 3372 int sock_inuse_get(struct net *net) 3373 { 3374 int cpu, res = 0; 3375 3376 for_each_possible_cpu(cpu) 3377 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3378 3379 return res; 3380 } 3381 3382 EXPORT_SYMBOL_GPL(sock_inuse_get); 3383 3384 static int __net_init sock_inuse_init_net(struct net *net) 3385 { 3386 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3387 if (net->core.prot_inuse == NULL) 3388 return -ENOMEM; 3389 3390 net->core.sock_inuse = alloc_percpu(int); 3391 if (net->core.sock_inuse == NULL) 3392 goto out; 3393 3394 return 0; 3395 3396 out: 3397 free_percpu(net->core.prot_inuse); 3398 return -ENOMEM; 3399 } 3400 3401 static void __net_exit sock_inuse_exit_net(struct net *net) 3402 { 3403 free_percpu(net->core.prot_inuse); 3404 free_percpu(net->core.sock_inuse); 3405 } 3406 3407 static struct pernet_operations net_inuse_ops = { 3408 .init = sock_inuse_init_net, 3409 .exit = sock_inuse_exit_net, 3410 }; 3411 3412 static __init int net_inuse_init(void) 3413 { 3414 if (register_pernet_subsys(&net_inuse_ops)) 3415 panic("Cannot initialize net inuse counters"); 3416 3417 return 0; 3418 } 3419 3420 core_initcall(net_inuse_init); 3421 3422 static int assign_proto_idx(struct proto *prot) 3423 { 3424 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3425 3426 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3427 pr_err("PROTO_INUSE_NR exhausted\n"); 3428 return -ENOSPC; 3429 } 3430 3431 set_bit(prot->inuse_idx, proto_inuse_idx); 3432 return 0; 3433 } 3434 3435 static void release_proto_idx(struct proto *prot) 3436 { 3437 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3438 clear_bit(prot->inuse_idx, proto_inuse_idx); 3439 } 3440 #else 3441 static inline int assign_proto_idx(struct proto *prot) 3442 { 3443 return 0; 3444 } 3445 3446 static inline 
void release_proto_idx(struct proto *prot) 3447 { 3448 } 3449 3450 static void sock_inuse_add(struct net *net, int val) 3451 { 3452 } 3453 #endif 3454 3455 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3456 { 3457 if (!twsk_prot) 3458 return; 3459 kfree(twsk_prot->twsk_slab_name); 3460 twsk_prot->twsk_slab_name = NULL; 3461 kmem_cache_destroy(twsk_prot->twsk_slab); 3462 twsk_prot->twsk_slab = NULL; 3463 } 3464 3465 static int tw_prot_init(const struct proto *prot) 3466 { 3467 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3468 3469 if (!twsk_prot) 3470 return 0; 3471 3472 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3473 prot->name); 3474 if (!twsk_prot->twsk_slab_name) 3475 return -ENOMEM; 3476 3477 twsk_prot->twsk_slab = 3478 kmem_cache_create(twsk_prot->twsk_slab_name, 3479 twsk_prot->twsk_obj_size, 0, 3480 SLAB_ACCOUNT | prot->slab_flags, 3481 NULL); 3482 if (!twsk_prot->twsk_slab) { 3483 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3484 prot->name); 3485 return -ENOMEM; 3486 } 3487 3488 return 0; 3489 } 3490 3491 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3492 { 3493 if (!rsk_prot) 3494 return; 3495 kfree(rsk_prot->slab_name); 3496 rsk_prot->slab_name = NULL; 3497 kmem_cache_destroy(rsk_prot->slab); 3498 rsk_prot->slab = NULL; 3499 } 3500 3501 static int req_prot_init(const struct proto *prot) 3502 { 3503 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3504 3505 if (!rsk_prot) 3506 return 0; 3507 3508 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3509 prot->name); 3510 if (!rsk_prot->slab_name) 3511 return -ENOMEM; 3512 3513 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3514 rsk_prot->obj_size, 0, 3515 SLAB_ACCOUNT | prot->slab_flags, 3516 NULL); 3517 3518 if (!rsk_prot->slab) { 3519 pr_crit("%s: Can't create request sock SLAB cache!\n", 3520 prot->name); 3521 return -ENOMEM; 3522 } 3523 return 0; 3524 } 3525 3526 int proto_register(struct proto *prot, int alloc_slab) 3527 { 3528 int ret = -ENOBUFS; 3529 3530 if (alloc_slab) { 3531 prot->slab = kmem_cache_create_usercopy(prot->name, 3532 prot->obj_size, 0, 3533 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3534 prot->slab_flags, 3535 prot->useroffset, prot->usersize, 3536 NULL); 3537 3538 if (prot->slab == NULL) { 3539 pr_crit("%s: Can't create sock SLAB cache!\n", 3540 prot->name); 3541 goto out; 3542 } 3543 3544 if (req_prot_init(prot)) 3545 goto out_free_request_sock_slab; 3546 3547 if (tw_prot_init(prot)) 3548 goto out_free_timewait_sock_slab; 3549 } 3550 3551 mutex_lock(&proto_list_mutex); 3552 ret = assign_proto_idx(prot); 3553 if (ret) { 3554 mutex_unlock(&proto_list_mutex); 3555 goto out_free_timewait_sock_slab; 3556 } 3557 list_add(&prot->node, &proto_list); 3558 mutex_unlock(&proto_list_mutex); 3559 return ret; 3560 3561 out_free_timewait_sock_slab: 3562 if (alloc_slab) 3563 tw_prot_cleanup(prot->twsk_prot); 3564 out_free_request_sock_slab: 3565 if (alloc_slab) { 3566 req_prot_cleanup(prot->rsk_prot); 3567 3568 kmem_cache_destroy(prot->slab); 3569 prot->slab = NULL; 3570 } 3571 out: 3572 return ret; 3573 } 3574 EXPORT_SYMBOL(proto_register); 3575 3576 void proto_unregister(struct proto *prot) 3577 { 3578 mutex_lock(&proto_list_mutex); 3579 release_proto_idx(prot); 3580 list_del(&prot->node); 3581 mutex_unlock(&proto_list_mutex); 3582 3583 kmem_cache_destroy(prot->slab); 3584 prot->slab = NULL; 3585 3586 req_prot_cleanup(prot->rsk_prot); 3587 tw_prot_cleanup(prot->twsk_prot); 3588 } 3589 EXPORT_SYMBOL(proto_unregister); 3590 
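/* Hedged usage sketch (hypothetical protocol module, not part of this file):
 * protocols normally pair proto_register() and proto_unregister() in their
 * module init/exit paths, e.g.
 *
 *	static struct proto example_prot = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_prot, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_prot);
 *	}
 */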
3591 int sock_load_diag_module(int family, int protocol) 3592 { 3593 if (!protocol) { 3594 if (!sock_is_registered(family)) 3595 return -ENOENT; 3596 3597 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3598 NETLINK_SOCK_DIAG, family); 3599 } 3600 3601 #ifdef CONFIG_INET 3602 if (family == AF_INET && 3603 protocol != IPPROTO_RAW && 3604 protocol < MAX_INET_PROTOS && 3605 !rcu_access_pointer(inet_protos[protocol])) 3606 return -ENOENT; 3607 #endif 3608 3609 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3610 NETLINK_SOCK_DIAG, family, protocol); 3611 } 3612 EXPORT_SYMBOL(sock_load_diag_module); 3613 3614 #ifdef CONFIG_PROC_FS 3615 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3616 __acquires(proto_list_mutex) 3617 { 3618 mutex_lock(&proto_list_mutex); 3619 return seq_list_start_head(&proto_list, *pos); 3620 } 3621 3622 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3623 { 3624 return seq_list_next(v, &proto_list, pos); 3625 } 3626 3627 static void proto_seq_stop(struct seq_file *seq, void *v) 3628 __releases(proto_list_mutex) 3629 { 3630 mutex_unlock(&proto_list_mutex); 3631 } 3632 3633 static char proto_method_implemented(const void *method) 3634 { 3635 return method == NULL ? 'n' : 'y'; 3636 } 3637 static long sock_prot_memory_allocated(struct proto *proto) 3638 { 3639 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3640 } 3641 3642 static const char *sock_prot_memory_pressure(struct proto *proto) 3643 { 3644 return proto->memory_pressure != NULL ? 3645 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3646 } 3647 3648 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3649 { 3650 3651 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3652 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3653 proto->name, 3654 proto->obj_size, 3655 sock_prot_inuse_get(seq_file_net(seq), proto), 3656 sock_prot_memory_allocated(proto), 3657 sock_prot_memory_pressure(proto), 3658 proto->max_header, 3659 proto->slab == NULL ? 
"no" : "yes", 3660 module_name(proto->owner), 3661 proto_method_implemented(proto->close), 3662 proto_method_implemented(proto->connect), 3663 proto_method_implemented(proto->disconnect), 3664 proto_method_implemented(proto->accept), 3665 proto_method_implemented(proto->ioctl), 3666 proto_method_implemented(proto->init), 3667 proto_method_implemented(proto->destroy), 3668 proto_method_implemented(proto->shutdown), 3669 proto_method_implemented(proto->setsockopt), 3670 proto_method_implemented(proto->getsockopt), 3671 proto_method_implemented(proto->sendmsg), 3672 proto_method_implemented(proto->recvmsg), 3673 proto_method_implemented(proto->sendpage), 3674 proto_method_implemented(proto->bind), 3675 proto_method_implemented(proto->backlog_rcv), 3676 proto_method_implemented(proto->hash), 3677 proto_method_implemented(proto->unhash), 3678 proto_method_implemented(proto->get_port), 3679 proto_method_implemented(proto->enter_memory_pressure)); 3680 } 3681 3682 static int proto_seq_show(struct seq_file *seq, void *v) 3683 { 3684 if (v == &proto_list) 3685 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3686 "protocol", 3687 "size", 3688 "sockets", 3689 "memory", 3690 "press", 3691 "maxhdr", 3692 "slab", 3693 "module", 3694 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3695 else 3696 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3697 return 0; 3698 } 3699 3700 static const struct seq_operations proto_seq_ops = { 3701 .start = proto_seq_start, 3702 .next = proto_seq_next, 3703 .stop = proto_seq_stop, 3704 .show = proto_seq_show, 3705 }; 3706 3707 static __net_init int proto_init_net(struct net *net) 3708 { 3709 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3710 sizeof(struct seq_net_private))) 3711 return -ENOMEM; 3712 3713 return 0; 3714 } 3715 3716 static __net_exit void proto_exit_net(struct net *net) 3717 { 3718 remove_proc_entry("protocols", net->proc_net); 3719 } 3720 3721 3722 static __net_initdata struct pernet_operations proto_net_ops = { 3723 .init = proto_init_net, 3724 .exit = proto_exit_net, 3725 }; 3726 3727 static int __init proto_init(void) 3728 { 3729 return register_pernet_subsys(&proto_net_ops); 3730 } 3731 3732 subsys_initcall(proto_init); 3733 3734 #endif /* PROC_FS */ 3735 3736 #ifdef CONFIG_NET_RX_BUSY_POLL 3737 bool sk_busy_loop_end(void *p, unsigned long start_time) 3738 { 3739 struct sock *sk = p; 3740 3741 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3742 sk_busy_loop_timeout(sk, start_time); 3743 } 3744 EXPORT_SYMBOL(sk_busy_loop_end); 3745 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3746 3747 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3748 { 3749 if (!sk->sk_prot->bind_add) 3750 return -EOPNOTSUPP; 3751 return sk->sk_prot->bind_add(sk, addr, addr_len); 3752 } 3753 EXPORT_SYMBOL(sock_bind_add); 3754