// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in all
 * user namespaces.
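 *
 * A minimal usage sketch (illustrative only, not taken from a particular
 * caller): a protocol that wants to gate a privileged feature on the
 * socket's opener could write
 *
 *	if (!sk_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * comparable to the CAP_NET_ADMIN checks sock_setsockopt() below applies
 * to options such as SO_DEBUG and SO_MARK.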
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and whether the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"	,	x "AF_INET"	, \
  x "AF_AX25"  ,	x "AF_IPX"	,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"	,	x "AF_ATMPVC"	, \
  x "AF_X25"   ,	x "AF_INET6"	,	x "AF_ROSE"	, \
  x "AF_DECnet",	x "AF_NETBEUI"	,	x "AF_SECURITY"	, \
  x "AF_KEY"   ,	x "AF_NETLINK"	,	x "AF_PACKET"	, \
  x "AF_ASH"   ,	x "AF_ECONET"	,	x "AF_ATMSVC"	, \
  x "AF_RDS"   ,	x "AF_SNA"	,	x "AF_IRDA"	, \
  x "AF_PPPOX" ,	x "AF_WANPIPE"	,	x "AF_LLC"	, \
  x "27"       ,	x "28"		,	x "AF_CAN"	, \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"	, \
  x "AF_RXRPC" ,	x "AF_ISDN"	,	x "AF_PHONET"	, \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"	, \
  x "AF_NFC"   ,	x "AF_VSOCK"	,	x "AF_KCM"	, \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
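 * These defaults back the net.core.wmem_max, net.core.rmem_max,
 * net.core.wmem_default and net.core.rmem_default sysctls (registered in
 * net/core/sysctl_net_core.c), so an administrator can raise the
 * per-socket buffer ceiling with, for example:
 *
 *	sysctl -w net.core.rmem_max=4194304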
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen <
		    sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
527 EXPORT_SYMBOL(__sk_receive_skb); 528 529 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 530 { 531 struct dst_entry *dst = __sk_dst_get(sk); 532 533 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 534 sk_tx_queue_clear(sk); 535 sk->sk_dst_pending_confirm = 0; 536 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 537 dst_release(dst); 538 return NULL; 539 } 540 541 return dst; 542 } 543 EXPORT_SYMBOL(__sk_dst_check); 544 545 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 546 { 547 struct dst_entry *dst = sk_dst_get(sk); 548 549 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 550 sk_dst_reset(sk); 551 dst_release(dst); 552 return NULL; 553 } 554 555 return dst; 556 } 557 EXPORT_SYMBOL(sk_dst_check); 558 559 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 560 { 561 int ret = -ENOPROTOOPT; 562 #ifdef CONFIG_NETDEVICES 563 struct net *net = sock_net(sk); 564 565 /* Sorry... */ 566 ret = -EPERM; 567 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 568 goto out; 569 570 ret = -EINVAL; 571 if (ifindex < 0) 572 goto out; 573 574 sk->sk_bound_dev_if = ifindex; 575 if (sk->sk_prot->rehash) 576 sk->sk_prot->rehash(sk); 577 sk_dst_reset(sk); 578 579 ret = 0; 580 581 out: 582 #endif 583 584 return ret; 585 } 586 587 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 588 { 589 int ret; 590 591 if (lock_sk) 592 lock_sock(sk); 593 ret = sock_bindtoindex_locked(sk, ifindex); 594 if (lock_sk) 595 release_sock(sk); 596 597 return ret; 598 } 599 EXPORT_SYMBOL(sock_bindtoindex); 600 601 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 602 { 603 int ret = -ENOPROTOOPT; 604 #ifdef CONFIG_NETDEVICES 605 struct net *net = sock_net(sk); 606 char devname[IFNAMSIZ]; 607 int index; 608 609 ret = -EINVAL; 610 if (optlen < 0) 611 goto out; 612 613 /* Bind this socket to a particular device like "eth0", 614 * as specified in the passed interface name. If the 615 * name is "" or the option length is zero the socket 616 * is not bound. 
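 *
 * From userspace this corresponds to (illustrative):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);	(unbind)
 *
 * and, as sock_bindtoindex_locked() above enforces, changing an existing
 * binding requires CAP_NET_RAW (checked against the socket's network
 * namespace).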
617 */ 618 if (optlen > IFNAMSIZ - 1) 619 optlen = IFNAMSIZ - 1; 620 memset(devname, 0, sizeof(devname)); 621 622 ret = -EFAULT; 623 if (copy_from_sockptr(devname, optval, optlen)) 624 goto out; 625 626 index = 0; 627 if (devname[0] != '\0') { 628 struct net_device *dev; 629 630 rcu_read_lock(); 631 dev = dev_get_by_name_rcu(net, devname); 632 if (dev) 633 index = dev->ifindex; 634 rcu_read_unlock(); 635 ret = -ENODEV; 636 if (!dev) 637 goto out; 638 } 639 640 return sock_bindtoindex(sk, index, true); 641 out: 642 #endif 643 644 return ret; 645 } 646 647 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 648 int __user *optlen, int len) 649 { 650 int ret = -ENOPROTOOPT; 651 #ifdef CONFIG_NETDEVICES 652 struct net *net = sock_net(sk); 653 char devname[IFNAMSIZ]; 654 655 if (sk->sk_bound_dev_if == 0) { 656 len = 0; 657 goto zero; 658 } 659 660 ret = -EINVAL; 661 if (len < IFNAMSIZ) 662 goto out; 663 664 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 665 if (ret) 666 goto out; 667 668 len = strlen(devname) + 1; 669 670 ret = -EFAULT; 671 if (copy_to_user(optval, devname, len)) 672 goto out; 673 674 zero: 675 ret = -EFAULT; 676 if (put_user(len, optlen)) 677 goto out; 678 679 ret = 0; 680 681 out: 682 #endif 683 684 return ret; 685 } 686 687 bool sk_mc_loop(struct sock *sk) 688 { 689 if (dev_recursion_level()) 690 return false; 691 if (!sk) 692 return true; 693 switch (sk->sk_family) { 694 case AF_INET: 695 return inet_sk(sk)->mc_loop; 696 #if IS_ENABLED(CONFIG_IPV6) 697 case AF_INET6: 698 return inet6_sk(sk)->mc_loop; 699 #endif 700 } 701 WARN_ON_ONCE(1); 702 return true; 703 } 704 EXPORT_SYMBOL(sk_mc_loop); 705 706 void sock_set_reuseaddr(struct sock *sk) 707 { 708 lock_sock(sk); 709 sk->sk_reuse = SK_CAN_REUSE; 710 release_sock(sk); 711 } 712 EXPORT_SYMBOL(sock_set_reuseaddr); 713 714 void sock_set_reuseport(struct sock *sk) 715 { 716 lock_sock(sk); 717 sk->sk_reuseport = true; 718 release_sock(sk); 719 } 720 EXPORT_SYMBOL(sock_set_reuseport); 721 722 void sock_no_linger(struct sock *sk) 723 { 724 lock_sock(sk); 725 sk->sk_lingertime = 0; 726 sock_set_flag(sk, SOCK_LINGER); 727 release_sock(sk); 728 } 729 EXPORT_SYMBOL(sock_no_linger); 730 731 void sock_set_priority(struct sock *sk, u32 priority) 732 { 733 lock_sock(sk); 734 sk->sk_priority = priority; 735 release_sock(sk); 736 } 737 EXPORT_SYMBOL(sock_set_priority); 738 739 void sock_set_sndtimeo(struct sock *sk, s64 secs) 740 { 741 lock_sock(sk); 742 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 743 sk->sk_sndtimeo = secs * HZ; 744 else 745 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 746 release_sock(sk); 747 } 748 EXPORT_SYMBOL(sock_set_sndtimeo); 749 750 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 751 { 752 if (val) { 753 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 754 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 755 sock_set_flag(sk, SOCK_RCVTSTAMP); 756 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 757 } else { 758 sock_reset_flag(sk, SOCK_RCVTSTAMP); 759 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 760 sock_reset_flag(sk, SOCK_TSTAMP_NEW); 761 } 762 } 763 764 void sock_enable_timestamps(struct sock *sk) 765 { 766 lock_sock(sk); 767 __sock_set_timestamps(sk, true, false, true); 768 release_sock(sk); 769 } 770 EXPORT_SYMBOL(sock_enable_timestamps); 771 772 void sock_set_keepalive(struct sock *sk) 773 { 774 lock_sock(sk); 775 if (sk->sk_prot->keepalive) 776 sk->sk_prot->keepalive(sk, true); 777 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 778 release_sock(sk); 779 } 780 
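/* The sock_set_*() helpers above are for in-kernel users that own the
 * socket and hold no socket lock; each helper takes and releases the lock
 * itself. A hypothetical kernel caller might do (illustrative sketch):
 *
 *	struct socket *sock;
 *	int err;
 *
 *	err = sock_create_kern(&init_net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	if (!err) {
 *		sock_set_reuseaddr(sock->sk);
 *		sock_set_keepalive(sock->sk);
 *	}
 */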
EXPORT_SYMBOL(sock_set_keepalive); 781 782 static void __sock_set_rcvbuf(struct sock *sk, int val) 783 { 784 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 785 * as a negative value. 786 */ 787 val = min_t(int, val, INT_MAX / 2); 788 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 789 790 /* We double it on the way in to account for "struct sk_buff" etc. 791 * overhead. Applications assume that the SO_RCVBUF setting they make 792 * will allow that much actual data to be received on that socket. 793 * 794 * Applications are unaware that "struct sk_buff" and other overheads 795 * allocate from the receive buffer during socket buffer allocation. 796 * 797 * And after considering the possible alternatives, returning the value 798 * we actually used in getsockopt is the most desirable behavior. 799 */ 800 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 801 } 802 803 void sock_set_rcvbuf(struct sock *sk, int val) 804 { 805 lock_sock(sk); 806 __sock_set_rcvbuf(sk, val); 807 release_sock(sk); 808 } 809 EXPORT_SYMBOL(sock_set_rcvbuf); 810 811 void sock_set_mark(struct sock *sk, u32 val) 812 { 813 lock_sock(sk); 814 sk->sk_mark = val; 815 release_sock(sk); 816 } 817 EXPORT_SYMBOL(sock_set_mark); 818 819 /* 820 * This is meant for all protocols to use and covers goings on 821 * at the socket level. Everything here is generic. 822 */ 823 824 int sock_setsockopt(struct socket *sock, int level, int optname, 825 sockptr_t optval, unsigned int optlen) 826 { 827 struct sock_txtime sk_txtime; 828 struct sock *sk = sock->sk; 829 int val; 830 int valbool; 831 struct linger ling; 832 int ret = 0; 833 834 /* 835 * Options without arguments 836 */ 837 838 if (optname == SO_BINDTODEVICE) 839 return sock_setbindtodevice(sk, optval, optlen); 840 841 if (optlen < sizeof(int)) 842 return -EINVAL; 843 844 if (copy_from_sockptr(&val, optval, sizeof(val))) 845 return -EFAULT; 846 847 valbool = val ? 1 : 0; 848 849 lock_sock(sk); 850 851 switch (optname) { 852 case SO_DEBUG: 853 if (val && !capable(CAP_NET_ADMIN)) 854 ret = -EACCES; 855 else 856 sock_valbool_flag(sk, SOCK_DBG, valbool); 857 break; 858 case SO_REUSEADDR: 859 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 860 break; 861 case SO_REUSEPORT: 862 sk->sk_reuseport = valbool; 863 break; 864 case SO_TYPE: 865 case SO_PROTOCOL: 866 case SO_DOMAIN: 867 case SO_ERROR: 868 ret = -ENOPROTOOPT; 869 break; 870 case SO_DONTROUTE: 871 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 872 sk_dst_reset(sk); 873 break; 874 case SO_BROADCAST: 875 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 876 break; 877 case SO_SNDBUF: 878 /* Don't error on this BSD doesn't and if you think 879 * about it this is right. Otherwise apps have to 880 * play 'guess the biggest size' games. RCVBUF/SNDBUF 881 * are treated in BSD as hints 882 */ 883 val = min_t(u32, val, sysctl_wmem_max); 884 set_sndbuf: 885 /* Ensure val * 2 fits into an int, to prevent max_t() 886 * from treating it as a negative value. 887 */ 888 val = min_t(int, val, INT_MAX / 2); 889 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 890 WRITE_ONCE(sk->sk_sndbuf, 891 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 892 /* Wake up sending tasks if we upped the value. */ 893 sk->sk_write_space(sk); 894 break; 895 896 case SO_SNDBUFFORCE: 897 if (!capable(CAP_NET_ADMIN)) { 898 ret = -EPERM; 899 break; 900 } 901 902 /* No negative values (to prevent underflow, as val will be 903 * multiplied by 2). 
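 *
 * As with SO_SNDBUF, the accepted value is doubled before it is stored,
 * so (illustratively) a caller setting
 *
 *	int v = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &v, sizeof(v));
 *
 * will later read back 131072 through getsockopt(SO_SNDBUF).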
904 */ 905 if (val < 0) 906 val = 0; 907 goto set_sndbuf; 908 909 case SO_RCVBUF: 910 /* Don't error on this BSD doesn't and if you think 911 * about it this is right. Otherwise apps have to 912 * play 'guess the biggest size' games. RCVBUF/SNDBUF 913 * are treated in BSD as hints 914 */ 915 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); 916 break; 917 918 case SO_RCVBUFFORCE: 919 if (!capable(CAP_NET_ADMIN)) { 920 ret = -EPERM; 921 break; 922 } 923 924 /* No negative values (to prevent underflow, as val will be 925 * multiplied by 2). 926 */ 927 __sock_set_rcvbuf(sk, max(val, 0)); 928 break; 929 930 case SO_KEEPALIVE: 931 if (sk->sk_prot->keepalive) 932 sk->sk_prot->keepalive(sk, valbool); 933 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 934 break; 935 936 case SO_OOBINLINE: 937 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 938 break; 939 940 case SO_NO_CHECK: 941 sk->sk_no_check_tx = valbool; 942 break; 943 944 case SO_PRIORITY: 945 if ((val >= 0 && val <= 6) || 946 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 947 sk->sk_priority = val; 948 else 949 ret = -EPERM; 950 break; 951 952 case SO_LINGER: 953 if (optlen < sizeof(ling)) { 954 ret = -EINVAL; /* 1003.1g */ 955 break; 956 } 957 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 958 ret = -EFAULT; 959 break; 960 } 961 if (!ling.l_onoff) 962 sock_reset_flag(sk, SOCK_LINGER); 963 else { 964 #if (BITS_PER_LONG == 32) 965 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 966 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 967 else 968 #endif 969 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 970 sock_set_flag(sk, SOCK_LINGER); 971 } 972 break; 973 974 case SO_BSDCOMPAT: 975 break; 976 977 case SO_PASSCRED: 978 if (valbool) 979 set_bit(SOCK_PASSCRED, &sock->flags); 980 else 981 clear_bit(SOCK_PASSCRED, &sock->flags); 982 break; 983 984 case SO_TIMESTAMP_OLD: 985 __sock_set_timestamps(sk, valbool, false, false); 986 break; 987 case SO_TIMESTAMP_NEW: 988 __sock_set_timestamps(sk, valbool, true, false); 989 break; 990 case SO_TIMESTAMPNS_OLD: 991 __sock_set_timestamps(sk, valbool, false, true); 992 break; 993 case SO_TIMESTAMPNS_NEW: 994 __sock_set_timestamps(sk, valbool, true, true); 995 break; 996 case SO_TIMESTAMPING_NEW: 997 sock_set_flag(sk, SOCK_TSTAMP_NEW); 998 fallthrough; 999 case SO_TIMESTAMPING_OLD: 1000 if (val & ~SOF_TIMESTAMPING_MASK) { 1001 ret = -EINVAL; 1002 break; 1003 } 1004 1005 if (val & SOF_TIMESTAMPING_OPT_ID && 1006 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 1007 if (sk->sk_protocol == IPPROTO_TCP && 1008 sk->sk_type == SOCK_STREAM) { 1009 if ((1 << sk->sk_state) & 1010 (TCPF_CLOSE | TCPF_LISTEN)) { 1011 ret = -EINVAL; 1012 break; 1013 } 1014 sk->sk_tskey = tcp_sk(sk)->snd_una; 1015 } else { 1016 sk->sk_tskey = 0; 1017 } 1018 } 1019 1020 if (val & SOF_TIMESTAMPING_OPT_STATS && 1021 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 1022 ret = -EINVAL; 1023 break; 1024 } 1025 1026 sk->sk_tsflags = val; 1027 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 1028 sock_enable_timestamp(sk, 1029 SOCK_TIMESTAMPING_RX_SOFTWARE); 1030 else { 1031 if (optname == SO_TIMESTAMPING_NEW) 1032 sock_reset_flag(sk, SOCK_TSTAMP_NEW); 1033 1034 sock_disable_timestamp(sk, 1035 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 1036 } 1037 break; 1038 1039 case SO_RCVLOWAT: 1040 if (val < 0) 1041 val = INT_MAX; 1042 if (sock->ops->set_rcvlowat) 1043 ret = sock->ops->set_rcvlowat(sk, val); 1044 else 1045 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 1046 break; 1047 1048 case SO_RCVTIMEO_OLD: 1049 case SO_RCVTIMEO_NEW: 1050 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1051 optlen, optname == SO_RCVTIMEO_OLD); 1052 break; 1053 1054 case SO_SNDTIMEO_OLD: 1055 case SO_SNDTIMEO_NEW: 1056 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1057 optlen, optname == SO_SNDTIMEO_OLD); 1058 break; 1059 1060 case SO_ATTACH_FILTER: { 1061 struct sock_fprog fprog; 1062 1063 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1064 if (!ret) 1065 ret = sk_attach_filter(&fprog, sk); 1066 break; 1067 } 1068 case SO_ATTACH_BPF: 1069 ret = -EINVAL; 1070 if (optlen == sizeof(u32)) { 1071 u32 ufd; 1072 1073 ret = -EFAULT; 1074 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1075 break; 1076 1077 ret = sk_attach_bpf(ufd, sk); 1078 } 1079 break; 1080 1081 case SO_ATTACH_REUSEPORT_CBPF: { 1082 struct sock_fprog fprog; 1083 1084 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1085 if (!ret) 1086 ret = sk_reuseport_attach_filter(&fprog, sk); 1087 break; 1088 } 1089 case SO_ATTACH_REUSEPORT_EBPF: 1090 ret = -EINVAL; 1091 if (optlen == sizeof(u32)) { 1092 u32 ufd; 1093 1094 ret = -EFAULT; 1095 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1096 break; 1097 1098 ret = sk_reuseport_attach_bpf(ufd, sk); 1099 } 1100 break; 1101 1102 case SO_DETACH_REUSEPORT_BPF: 1103 ret = reuseport_detach_prog(sk); 1104 break; 1105 1106 case SO_DETACH_FILTER: 1107 ret = sk_detach_filter(sk); 1108 break; 1109 1110 case SO_LOCK_FILTER: 1111 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1112 ret = -EPERM; 1113 else 1114 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1115 break; 1116 1117 case SO_PASSSEC: 1118 if (valbool) 1119 set_bit(SOCK_PASSSEC, &sock->flags); 1120 else 1121 clear_bit(SOCK_PASSSEC, &sock->flags); 1122 break; 1123 case SO_MARK: 1124 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1125 ret = -EPERM; 1126 } else if (val != sk->sk_mark) { 1127 sk->sk_mark = val; 1128 sk_dst_reset(sk); 1129 } 1130 break; 1131 1132 case SO_RXQ_OVFL: 1133 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1134 break; 1135 1136 case SO_WIFI_STATUS: 1137 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1138 break; 1139 1140 case SO_PEEK_OFF: 1141 if (sock->ops->set_peek_off) 1142 ret = sock->ops->set_peek_off(sk, val); 1143 else 1144 ret = -EOPNOTSUPP; 1145 break; 1146 1147 case SO_NOFCS: 1148 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1149 break; 1150 1151 case SO_SELECT_ERR_QUEUE: 1152 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1153 break; 1154 1155 #ifdef CONFIG_NET_RX_BUSY_POLL 1156 case SO_BUSY_POLL: 1157 /* allow unprivileged users to decrease the value */ 1158 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1159 ret = -EPERM; 1160 else { 1161 if (val < 0) 1162 ret = -EINVAL; 1163 else 1164 sk->sk_ll_usec = val; 1165 } 1166 break; 1167 #endif 1168 1169 case SO_MAX_PACING_RATE: 1170 { 1171 unsigned long ulval = (val == ~0U) ? 
~0UL : val; 1172 1173 if (sizeof(ulval) != sizeof(val) && 1174 optlen >= sizeof(ulval) && 1175 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1176 ret = -EFAULT; 1177 break; 1178 } 1179 if (ulval != ~0UL) 1180 cmpxchg(&sk->sk_pacing_status, 1181 SK_PACING_NONE, 1182 SK_PACING_NEEDED); 1183 sk->sk_max_pacing_rate = ulval; 1184 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1185 break; 1186 } 1187 case SO_INCOMING_CPU: 1188 WRITE_ONCE(sk->sk_incoming_cpu, val); 1189 break; 1190 1191 case SO_CNX_ADVICE: 1192 if (val == 1) 1193 dst_negative_advice(sk); 1194 break; 1195 1196 case SO_ZEROCOPY: 1197 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1198 if (!((sk->sk_type == SOCK_STREAM && 1199 sk->sk_protocol == IPPROTO_TCP) || 1200 (sk->sk_type == SOCK_DGRAM && 1201 sk->sk_protocol == IPPROTO_UDP))) 1202 ret = -ENOTSUPP; 1203 } else if (sk->sk_family != PF_RDS) { 1204 ret = -ENOTSUPP; 1205 } 1206 if (!ret) { 1207 if (val < 0 || val > 1) 1208 ret = -EINVAL; 1209 else 1210 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1211 } 1212 break; 1213 1214 case SO_TXTIME: 1215 if (optlen != sizeof(struct sock_txtime)) { 1216 ret = -EINVAL; 1217 break; 1218 } else if (copy_from_sockptr(&sk_txtime, optval, 1219 sizeof(struct sock_txtime))) { 1220 ret = -EFAULT; 1221 break; 1222 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1223 ret = -EINVAL; 1224 break; 1225 } 1226 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1227 * scheduler has enough safe guards. 1228 */ 1229 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1230 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1231 ret = -EPERM; 1232 break; 1233 } 1234 sock_valbool_flag(sk, SOCK_TXTIME, true); 1235 sk->sk_clockid = sk_txtime.clockid; 1236 sk->sk_txtime_deadline_mode = 1237 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1238 sk->sk_txtime_report_errors = 1239 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1240 break; 1241 1242 case SO_BINDTOIFINDEX: 1243 ret = sock_bindtoindex_locked(sk, val); 1244 break; 1245 1246 default: 1247 ret = -ENOPROTOOPT; 1248 break; 1249 } 1250 release_sock(sk); 1251 return ret; 1252 } 1253 EXPORT_SYMBOL(sock_setsockopt); 1254 1255 1256 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1257 struct ucred *ucred) 1258 { 1259 ucred->pid = pid_vnr(pid); 1260 ucred->uid = ucred->gid = -1; 1261 if (cred) { 1262 struct user_namespace *current_ns = current_user_ns(); 1263 1264 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1265 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1266 } 1267 } 1268 1269 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1270 { 1271 struct user_namespace *user_ns = current_user_ns(); 1272 int i; 1273 1274 for (i = 0; i < src->ngroups; i++) 1275 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1276 return -EFAULT; 1277 1278 return 0; 1279 } 1280 1281 int sock_getsockopt(struct socket *sock, int level, int optname, 1282 char __user *optval, int __user *optlen) 1283 { 1284 struct sock *sk = sock->sk; 1285 1286 union { 1287 int val; 1288 u64 val64; 1289 unsigned long ulval; 1290 struct linger ling; 1291 struct old_timeval32 tm32; 1292 struct __kernel_old_timeval tm; 1293 struct __kernel_sock_timeval stm; 1294 struct sock_txtime txtime; 1295 } v; 1296 1297 int lv = sizeof(int); 1298 int len; 1299 1300 if (get_user(len, optlen)) 1301 return -EFAULT; 1302 if (len < 0) 1303 return -EINVAL; 1304 1305 memset(&v, 0, sizeof(v)); 1306 1307 switch (optname) { 1308 case SO_DEBUG: 1309 v.val = sock_flag(sk, 
SOCK_DBG); 1310 break; 1311 1312 case SO_DONTROUTE: 1313 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1314 break; 1315 1316 case SO_BROADCAST: 1317 v.val = sock_flag(sk, SOCK_BROADCAST); 1318 break; 1319 1320 case SO_SNDBUF: 1321 v.val = sk->sk_sndbuf; 1322 break; 1323 1324 case SO_RCVBUF: 1325 v.val = sk->sk_rcvbuf; 1326 break; 1327 1328 case SO_REUSEADDR: 1329 v.val = sk->sk_reuse; 1330 break; 1331 1332 case SO_REUSEPORT: 1333 v.val = sk->sk_reuseport; 1334 break; 1335 1336 case SO_KEEPALIVE: 1337 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1338 break; 1339 1340 case SO_TYPE: 1341 v.val = sk->sk_type; 1342 break; 1343 1344 case SO_PROTOCOL: 1345 v.val = sk->sk_protocol; 1346 break; 1347 1348 case SO_DOMAIN: 1349 v.val = sk->sk_family; 1350 break; 1351 1352 case SO_ERROR: 1353 v.val = -sock_error(sk); 1354 if (v.val == 0) 1355 v.val = xchg(&sk->sk_err_soft, 0); 1356 break; 1357 1358 case SO_OOBINLINE: 1359 v.val = sock_flag(sk, SOCK_URGINLINE); 1360 break; 1361 1362 case SO_NO_CHECK: 1363 v.val = sk->sk_no_check_tx; 1364 break; 1365 1366 case SO_PRIORITY: 1367 v.val = sk->sk_priority; 1368 break; 1369 1370 case SO_LINGER: 1371 lv = sizeof(v.ling); 1372 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1373 v.ling.l_linger = sk->sk_lingertime / HZ; 1374 break; 1375 1376 case SO_BSDCOMPAT: 1377 break; 1378 1379 case SO_TIMESTAMP_OLD: 1380 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1381 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1382 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1383 break; 1384 1385 case SO_TIMESTAMPNS_OLD: 1386 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1387 break; 1388 1389 case SO_TIMESTAMP_NEW: 1390 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1391 break; 1392 1393 case SO_TIMESTAMPNS_NEW: 1394 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1395 break; 1396 1397 case SO_TIMESTAMPING_OLD: 1398 v.val = sk->sk_tsflags; 1399 break; 1400 1401 case SO_RCVTIMEO_OLD: 1402 case SO_RCVTIMEO_NEW: 1403 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1404 break; 1405 1406 case SO_SNDTIMEO_OLD: 1407 case SO_SNDTIMEO_NEW: 1408 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1409 break; 1410 1411 case SO_RCVLOWAT: 1412 v.val = sk->sk_rcvlowat; 1413 break; 1414 1415 case SO_SNDLOWAT: 1416 v.val = 1; 1417 break; 1418 1419 case SO_PASSCRED: 1420 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1421 break; 1422 1423 case SO_PEERCRED: 1424 { 1425 struct ucred peercred; 1426 if (len > sizeof(peercred)) 1427 len = sizeof(peercred); 1428 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1429 if (copy_to_user(optval, &peercred, len)) 1430 return -EFAULT; 1431 goto lenout; 1432 } 1433 1434 case SO_PEERGROUPS: 1435 { 1436 int ret, n; 1437 1438 if (!sk->sk_peer_cred) 1439 return -ENODATA; 1440 1441 n = sk->sk_peer_cred->group_info->ngroups; 1442 if (len < n * sizeof(gid_t)) { 1443 len = n * sizeof(gid_t); 1444 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1445 } 1446 len = n * sizeof(gid_t); 1447 1448 ret = groups_to_user((gid_t __user *)optval, 1449 sk->sk_peer_cred->group_info); 1450 if (ret) 1451 return ret; 1452 goto lenout; 1453 } 1454 1455 case SO_PEERNAME: 1456 { 1457 char address[128]; 1458 1459 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1460 if (lv < 0) 1461 return -ENOTCONN; 1462 if (lv < len) 1463 return -EINVAL; 1464 if (copy_to_user(optval, address, len)) 1465 return -EFAULT; 1466 goto lenout; 1467 } 1468 1469 /* Dubious BSD thing... 
Probably nobody even uses it, but 1470 * the UNIX standard wants it for whatever reason... -DaveM 1471 */ 1472 case SO_ACCEPTCONN: 1473 v.val = sk->sk_state == TCP_LISTEN; 1474 break; 1475 1476 case SO_PASSSEC: 1477 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1478 break; 1479 1480 case SO_PEERSEC: 1481 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1482 1483 case SO_MARK: 1484 v.val = sk->sk_mark; 1485 break; 1486 1487 case SO_RXQ_OVFL: 1488 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1489 break; 1490 1491 case SO_WIFI_STATUS: 1492 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1493 break; 1494 1495 case SO_PEEK_OFF: 1496 if (!sock->ops->set_peek_off) 1497 return -EOPNOTSUPP; 1498 1499 v.val = sk->sk_peek_off; 1500 break; 1501 case SO_NOFCS: 1502 v.val = sock_flag(sk, SOCK_NOFCS); 1503 break; 1504 1505 case SO_BINDTODEVICE: 1506 return sock_getbindtodevice(sk, optval, optlen, len); 1507 1508 case SO_GET_FILTER: 1509 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1510 if (len < 0) 1511 return len; 1512 1513 goto lenout; 1514 1515 case SO_LOCK_FILTER: 1516 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1517 break; 1518 1519 case SO_BPF_EXTENSIONS: 1520 v.val = bpf_tell_extensions(); 1521 break; 1522 1523 case SO_SELECT_ERR_QUEUE: 1524 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1525 break; 1526 1527 #ifdef CONFIG_NET_RX_BUSY_POLL 1528 case SO_BUSY_POLL: 1529 v.val = sk->sk_ll_usec; 1530 break; 1531 #endif 1532 1533 case SO_MAX_PACING_RATE: 1534 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1535 lv = sizeof(v.ulval); 1536 v.ulval = sk->sk_max_pacing_rate; 1537 } else { 1538 /* 32bit version */ 1539 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1540 } 1541 break; 1542 1543 case SO_INCOMING_CPU: 1544 v.val = READ_ONCE(sk->sk_incoming_cpu); 1545 break; 1546 1547 case SO_MEMINFO: 1548 { 1549 u32 meminfo[SK_MEMINFO_VARS]; 1550 1551 sk_get_meminfo(sk, meminfo); 1552 1553 len = min_t(unsigned int, len, sizeof(meminfo)); 1554 if (copy_to_user(optval, &meminfo, len)) 1555 return -EFAULT; 1556 1557 goto lenout; 1558 } 1559 1560 #ifdef CONFIG_NET_RX_BUSY_POLL 1561 case SO_INCOMING_NAPI_ID: 1562 v.val = READ_ONCE(sk->sk_napi_id); 1563 1564 /* aggregate non-NAPI IDs down to 0 */ 1565 if (v.val < MIN_NAPI_ID) 1566 v.val = 0; 1567 1568 break; 1569 #endif 1570 1571 case SO_COOKIE: 1572 lv = sizeof(u64); 1573 if (len < lv) 1574 return -EINVAL; 1575 v.val64 = sock_gen_cookie(sk); 1576 break; 1577 1578 case SO_ZEROCOPY: 1579 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1580 break; 1581 1582 case SO_TXTIME: 1583 lv = sizeof(v.txtime); 1584 v.txtime.clockid = sk->sk_clockid; 1585 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1586 SOF_TXTIME_DEADLINE_MODE : 0; 1587 v.txtime.flags |= sk->sk_txtime_report_errors ? 1588 SOF_TXTIME_REPORT_ERRORS : 0; 1589 break; 1590 1591 case SO_BINDTOIFINDEX: 1592 v.val = sk->sk_bound_dev_if; 1593 break; 1594 1595 default: 1596 /* We implement the SO_SNDLOWAT etc to not be settable 1597 * (1003.1g 7). 1598 */ 1599 return -ENOPROTOOPT; 1600 } 1601 1602 if (len > lv) 1603 len = lv; 1604 if (copy_to_user(optval, &v, len)) 1605 return -EFAULT; 1606 lenout: 1607 if (put_user(len, optlen)) 1608 return -EFAULT; 1609 return 0; 1610 } 1611 1612 /* 1613 * Initialize an sk_lock. 1614 * 1615 * (We also register the sk_lock with the lock validator.) 
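 *
 * With lockdep enabled, the per-family class names defined earlier are
 * what show up in reports: for example a userspace TCP socket gets
 * "sk_lock-AF_INET"/"slock-AF_INET", while a kernel socket of the same
 * family gets the "k-" prefixed variants.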
1616 */ 1617 static inline void sock_lock_init(struct sock *sk) 1618 { 1619 if (sk->sk_kern_sock) 1620 sock_lock_init_class_and_name( 1621 sk, 1622 af_family_kern_slock_key_strings[sk->sk_family], 1623 af_family_kern_slock_keys + sk->sk_family, 1624 af_family_kern_key_strings[sk->sk_family], 1625 af_family_kern_keys + sk->sk_family); 1626 else 1627 sock_lock_init_class_and_name( 1628 sk, 1629 af_family_slock_key_strings[sk->sk_family], 1630 af_family_slock_keys + sk->sk_family, 1631 af_family_key_strings[sk->sk_family], 1632 af_family_keys + sk->sk_family); 1633 } 1634 1635 /* 1636 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1637 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1638 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1639 */ 1640 static void sock_copy(struct sock *nsk, const struct sock *osk) 1641 { 1642 const struct proto *prot = READ_ONCE(osk->sk_prot); 1643 #ifdef CONFIG_SECURITY_NETWORK 1644 void *sptr = nsk->sk_security; 1645 #endif 1646 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1647 1648 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1649 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1650 1651 #ifdef CONFIG_SECURITY_NETWORK 1652 nsk->sk_security = sptr; 1653 security_sk_clone(osk, nsk); 1654 #endif 1655 } 1656 1657 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1658 int family) 1659 { 1660 struct sock *sk; 1661 struct kmem_cache *slab; 1662 1663 slab = prot->slab; 1664 if (slab != NULL) { 1665 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1666 if (!sk) 1667 return sk; 1668 if (want_init_on_alloc(priority)) 1669 sk_prot_clear_nulls(sk, prot->obj_size); 1670 } else 1671 sk = kmalloc(prot->obj_size, priority); 1672 1673 if (sk != NULL) { 1674 if (security_sk_alloc(sk, family, priority)) 1675 goto out_free; 1676 1677 if (!try_module_get(prot->owner)) 1678 goto out_free_sec; 1679 sk_tx_queue_clear(sk); 1680 } 1681 1682 return sk; 1683 1684 out_free_sec: 1685 security_sk_free(sk); 1686 out_free: 1687 if (slab != NULL) 1688 kmem_cache_free(slab, sk); 1689 else 1690 kfree(sk); 1691 return NULL; 1692 } 1693 1694 static void sk_prot_free(struct proto *prot, struct sock *sk) 1695 { 1696 struct kmem_cache *slab; 1697 struct module *owner; 1698 1699 owner = prot->owner; 1700 slab = prot->slab; 1701 1702 cgroup_sk_free(&sk->sk_cgrp_data); 1703 mem_cgroup_sk_free(sk); 1704 security_sk_free(sk); 1705 if (slab != NULL) 1706 kmem_cache_free(slab, sk); 1707 else 1708 kfree(sk); 1709 module_put(owner); 1710 } 1711 1712 /** 1713 * sk_alloc - All socket objects are allocated here 1714 * @net: the applicable net namespace 1715 * @family: protocol family 1716 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1717 * @prot: struct proto associated with this new sock instance 1718 * @kern: is this to be a kernel socket? 1719 */ 1720 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1721 struct proto *prot, int kern) 1722 { 1723 struct sock *sk; 1724 1725 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1726 if (sk) { 1727 sk->sk_family = family; 1728 /* 1729 * See comment in struct sock definition to understand 1730 * why we need sk_prot_creator -acme 1731 */ 1732 sk->sk_prot = sk->sk_prot_creator = prot; 1733 sk->sk_kern_sock = kern; 1734 sock_lock_init(sk); 1735 sk->sk_net_refcnt = kern ? 
0 : 1; 1736 if (likely(sk->sk_net_refcnt)) { 1737 get_net(net); 1738 sock_inuse_add(net, 1); 1739 } 1740 1741 sock_net_set(sk, net); 1742 refcount_set(&sk->sk_wmem_alloc, 1); 1743 1744 mem_cgroup_sk_alloc(sk); 1745 cgroup_sk_alloc(&sk->sk_cgrp_data); 1746 sock_update_classid(&sk->sk_cgrp_data); 1747 sock_update_netprioidx(&sk->sk_cgrp_data); 1748 sk_tx_queue_clear(sk); 1749 } 1750 1751 return sk; 1752 } 1753 EXPORT_SYMBOL(sk_alloc); 1754 1755 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1756 * grace period. This is the case for UDP sockets and TCP listeners. 1757 */ 1758 static void __sk_destruct(struct rcu_head *head) 1759 { 1760 struct sock *sk = container_of(head, struct sock, sk_rcu); 1761 struct sk_filter *filter; 1762 1763 if (sk->sk_destruct) 1764 sk->sk_destruct(sk); 1765 1766 filter = rcu_dereference_check(sk->sk_filter, 1767 refcount_read(&sk->sk_wmem_alloc) == 0); 1768 if (filter) { 1769 sk_filter_uncharge(sk, filter); 1770 RCU_INIT_POINTER(sk->sk_filter, NULL); 1771 } 1772 1773 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1774 1775 #ifdef CONFIG_BPF_SYSCALL 1776 bpf_sk_storage_free(sk); 1777 #endif 1778 1779 if (atomic_read(&sk->sk_omem_alloc)) 1780 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1781 __func__, atomic_read(&sk->sk_omem_alloc)); 1782 1783 if (sk->sk_frag.page) { 1784 put_page(sk->sk_frag.page); 1785 sk->sk_frag.page = NULL; 1786 } 1787 1788 if (sk->sk_peer_cred) 1789 put_cred(sk->sk_peer_cred); 1790 put_pid(sk->sk_peer_pid); 1791 if (likely(sk->sk_net_refcnt)) 1792 put_net(sock_net(sk)); 1793 sk_prot_free(sk->sk_prot_creator, sk); 1794 } 1795 1796 void sk_destruct(struct sock *sk) 1797 { 1798 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 1799 1800 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 1801 reuseport_detach_sock(sk); 1802 use_call_rcu = true; 1803 } 1804 1805 if (use_call_rcu) 1806 call_rcu(&sk->sk_rcu, __sk_destruct); 1807 else 1808 __sk_destruct(&sk->sk_rcu); 1809 } 1810 1811 static void __sk_free(struct sock *sk) 1812 { 1813 if (likely(sk->sk_net_refcnt)) 1814 sock_inuse_add(sock_net(sk), -1); 1815 1816 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1817 sock_diag_broadcast_destroy(sk); 1818 else 1819 sk_destruct(sk); 1820 } 1821 1822 void sk_free(struct sock *sk) 1823 { 1824 /* 1825 * We subtract one from sk_wmem_alloc and can know if 1826 * some packets are still in some tx queue. 
1827 * If not null, sock_wfree() will call __sk_free(sk) later 1828 */ 1829 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1830 __sk_free(sk); 1831 } 1832 EXPORT_SYMBOL(sk_free); 1833 1834 static void sk_init_common(struct sock *sk) 1835 { 1836 skb_queue_head_init(&sk->sk_receive_queue); 1837 skb_queue_head_init(&sk->sk_write_queue); 1838 skb_queue_head_init(&sk->sk_error_queue); 1839 1840 rwlock_init(&sk->sk_callback_lock); 1841 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1842 af_rlock_keys + sk->sk_family, 1843 af_family_rlock_key_strings[sk->sk_family]); 1844 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1845 af_wlock_keys + sk->sk_family, 1846 af_family_wlock_key_strings[sk->sk_family]); 1847 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1848 af_elock_keys + sk->sk_family, 1849 af_family_elock_key_strings[sk->sk_family]); 1850 lockdep_set_class_and_name(&sk->sk_callback_lock, 1851 af_callback_keys + sk->sk_family, 1852 af_family_clock_key_strings[sk->sk_family]); 1853 } 1854 1855 /** 1856 * sk_clone_lock - clone a socket, and lock its clone 1857 * @sk: the socket to clone 1858 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1859 * 1860 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1861 */ 1862 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1863 { 1864 struct proto *prot = READ_ONCE(sk->sk_prot); 1865 struct sock *newsk; 1866 bool is_charged = true; 1867 1868 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 1869 if (newsk != NULL) { 1870 struct sk_filter *filter; 1871 1872 sock_copy(newsk, sk); 1873 1874 newsk->sk_prot_creator = prot; 1875 1876 /* SANITY */ 1877 if (likely(newsk->sk_net_refcnt)) 1878 get_net(sock_net(newsk)); 1879 sk_node_init(&newsk->sk_node); 1880 sock_lock_init(newsk); 1881 bh_lock_sock(newsk); 1882 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1883 newsk->sk_backlog.len = 0; 1884 1885 atomic_set(&newsk->sk_rmem_alloc, 0); 1886 /* 1887 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1888 */ 1889 refcount_set(&newsk->sk_wmem_alloc, 1); 1890 atomic_set(&newsk->sk_omem_alloc, 0); 1891 sk_init_common(newsk); 1892 1893 newsk->sk_dst_cache = NULL; 1894 newsk->sk_dst_pending_confirm = 0; 1895 newsk->sk_wmem_queued = 0; 1896 newsk->sk_forward_alloc = 0; 1897 atomic_set(&newsk->sk_drops, 0); 1898 newsk->sk_send_head = NULL; 1899 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1900 atomic_set(&newsk->sk_zckey, 0); 1901 1902 sock_reset_flag(newsk, SOCK_DONE); 1903 1904 /* sk->sk_memcg will be populated at accept() time */ 1905 newsk->sk_memcg = NULL; 1906 1907 cgroup_sk_clone(&newsk->sk_cgrp_data); 1908 1909 rcu_read_lock(); 1910 filter = rcu_dereference(sk->sk_filter); 1911 if (filter != NULL) 1912 /* though it's an empty new sock, the charging may fail 1913 * if sysctl_optmem_max was changed between creation of 1914 * original socket and cloning 1915 */ 1916 is_charged = sk_filter_charge(newsk, filter); 1917 RCU_INIT_POINTER(newsk->sk_filter, filter); 1918 rcu_read_unlock(); 1919 1920 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1921 /* We need to make sure that we don't uncharge the new 1922 * socket if we couldn't charge it in the first place 1923 * as otherwise we uncharge the parent's filter. 
1924 */ 1925 if (!is_charged) 1926 RCU_INIT_POINTER(newsk->sk_filter, NULL); 1927 sk_free_unlock_clone(newsk); 1928 newsk = NULL; 1929 goto out; 1930 } 1931 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1932 1933 if (bpf_sk_storage_clone(sk, newsk)) { 1934 sk_free_unlock_clone(newsk); 1935 newsk = NULL; 1936 goto out; 1937 } 1938 1939 /* Clear sk_user_data if parent had the pointer tagged 1940 * as not suitable for copying when cloning. 1941 */ 1942 if (sk_user_data_is_nocopy(newsk)) 1943 newsk->sk_user_data = NULL; 1944 1945 newsk->sk_err = 0; 1946 newsk->sk_err_soft = 0; 1947 newsk->sk_priority = 0; 1948 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1949 if (likely(newsk->sk_net_refcnt)) 1950 sock_inuse_add(sock_net(newsk), 1); 1951 1952 /* 1953 * Before updating sk_refcnt, we must commit prior changes to memory 1954 * (Documentation/RCU/rculist_nulls.rst for details) 1955 */ 1956 smp_wmb(); 1957 refcount_set(&newsk->sk_refcnt, 2); 1958 1959 /* 1960 * Increment the counter in the same struct proto as the master 1961 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1962 * is the same as sk->sk_prot->socks, as this field was copied 1963 * with memcpy). 1964 * 1965 * This _changes_ the previous behaviour, where 1966 * tcp_create_openreq_child always was incrementing the 1967 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1968 * to be taken into account in all callers. -acme 1969 */ 1970 sk_refcnt_debug_inc(newsk); 1971 sk_set_socket(newsk, NULL); 1972 sk_tx_queue_clear(newsk); 1973 RCU_INIT_POINTER(newsk->sk_wq, NULL); 1974 1975 if (newsk->sk_prot->sockets_allocated) 1976 sk_sockets_allocated_inc(newsk); 1977 1978 if (sock_needs_netstamp(sk) && 1979 newsk->sk_flags & SK_FLAGS_TIMESTAMP) 1980 net_enable_timestamp(); 1981 } 1982 out: 1983 return newsk; 1984 } 1985 EXPORT_SYMBOL_GPL(sk_clone_lock); 1986 1987 void sk_free_unlock_clone(struct sock *sk) 1988 { 1989 /* It is still raw copy of parent, so invalidate 1990 * destructor and make plain sk_free() */ 1991 sk->sk_destruct = NULL; 1992 bh_unlock_sock(sk); 1993 sk_free(sk); 1994 } 1995 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 1996 1997 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1998 { 1999 u32 max_segs = 1; 2000 2001 sk_dst_set(sk, dst); 2002 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 2003 if (sk->sk_route_caps & NETIF_F_GSO) 2004 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2005 sk->sk_route_caps &= ~sk->sk_route_nocaps; 2006 if (sk_can_gso(sk)) { 2007 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2008 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2009 } else { 2010 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2011 sk->sk_gso_max_size = dst->dev->gso_max_size; 2012 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 2013 } 2014 } 2015 sk->sk_gso_max_segs = max_segs; 2016 } 2017 EXPORT_SYMBOL_GPL(sk_setup_caps); 2018 2019 /* 2020 * Simple resource managers for sockets. 2021 */ 2022 2023 2024 /* 2025 * Write buffer destructor automatically called from kfree_skb. 
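 *
 * Pairing sketch (descriptive, based on the code below): skb_set_owner_w()
 * charges skb->truesize to sk->sk_wmem_alloc and installs sock_wfree as
 * skb->destructor, so the charge is released here when the skb is freed
 * and writers may be woken through sk_write_space().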
2026 */ 2027 void sock_wfree(struct sk_buff *skb) 2028 { 2029 struct sock *sk = skb->sk; 2030 unsigned int len = skb->truesize; 2031 2032 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2033 /* 2034 * Keep a reference on sk_wmem_alloc, this will be released 2035 * after sk_write_space() call 2036 */ 2037 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2038 sk->sk_write_space(sk); 2039 len = 1; 2040 } 2041 /* 2042 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2043 * could not do because of in-flight packets 2044 */ 2045 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2046 __sk_free(sk); 2047 } 2048 EXPORT_SYMBOL(sock_wfree); 2049 2050 /* This variant of sock_wfree() is used by TCP, 2051 * since it sets SOCK_USE_WRITE_QUEUE. 2052 */ 2053 void __sock_wfree(struct sk_buff *skb) 2054 { 2055 struct sock *sk = skb->sk; 2056 2057 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2058 __sk_free(sk); 2059 } 2060 2061 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2062 { 2063 skb_orphan(skb); 2064 skb->sk = sk; 2065 #ifdef CONFIG_INET 2066 if (unlikely(!sk_fullsock(sk))) { 2067 skb->destructor = sock_edemux; 2068 sock_hold(sk); 2069 return; 2070 } 2071 #endif 2072 skb->destructor = sock_wfree; 2073 skb_set_hash_from_sk(skb, sk); 2074 /* 2075 * We used to take a refcount on sk, but following operation 2076 * is enough to guarantee sk_free() wont free this sock until 2077 * all in-flight packets are completed 2078 */ 2079 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2080 } 2081 EXPORT_SYMBOL(skb_set_owner_w); 2082 2083 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2084 { 2085 #ifdef CONFIG_TLS_DEVICE 2086 /* Drivers depend on in-order delivery for crypto offload, 2087 * partial orphan breaks out-of-order-OK logic. 2088 */ 2089 if (skb->decrypted) 2090 return false; 2091 #endif 2092 return (skb->destructor == sock_wfree || 2093 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2094 } 2095 2096 /* This helper is used by netem, as it can hold packets in its 2097 * delay queue. We want to allow the owner socket to send more 2098 * packets, as if they were already TX completed by a typical driver. 2099 * But we also want to keep skb->sk set because some packet schedulers 2100 * rely on it (sch_fq for example). 2101 */ 2102 void skb_orphan_partial(struct sk_buff *skb) 2103 { 2104 if (skb_is_tcp_pure_ack(skb)) 2105 return; 2106 2107 if (can_skb_orphan_partial(skb)) { 2108 struct sock *sk = skb->sk; 2109 2110 if (refcount_inc_not_zero(&sk->sk_refcnt)) { 2111 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); 2112 skb->destructor = sock_efree; 2113 } 2114 } else { 2115 skb_orphan(skb); 2116 } 2117 } 2118 EXPORT_SYMBOL(skb_orphan_partial); 2119 2120 /* 2121 * Read buffer destructor automatically called from kfree_skb. 2122 */ 2123 void sock_rfree(struct sk_buff *skb) 2124 { 2125 struct sock *sk = skb->sk; 2126 unsigned int len = skb->truesize; 2127 2128 atomic_sub(len, &sk->sk_rmem_alloc); 2129 sk_mem_uncharge(sk, len); 2130 } 2131 EXPORT_SYMBOL(sock_rfree); 2132 2133 /* 2134 * Buffer destructor for skbs that are not used directly in read or write 2135 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2136 */ 2137 void sock_efree(struct sk_buff *skb) 2138 { 2139 sock_put(skb->sk); 2140 } 2141 EXPORT_SYMBOL(sock_efree); 2142 2143 /* Buffer destructor for prefetch/receive path where reference count may 2144 * not be held, e.g. for listen sockets. 
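 *
 * Only a reference that was actually taken is dropped: sk_is_refcounted()
 * filters out sockets (e.g. TCP listeners freed via SOCK_RCU_FREE) that
 * are attached to the skb without holding a refcount.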
2145 */ 2146 #ifdef CONFIG_INET 2147 void sock_pfree(struct sk_buff *skb) 2148 { 2149 if (sk_is_refcounted(skb->sk)) 2150 sock_gen_put(skb->sk); 2151 } 2152 EXPORT_SYMBOL(sock_pfree); 2153 #endif /* CONFIG_INET */ 2154 2155 kuid_t sock_i_uid(struct sock *sk) 2156 { 2157 kuid_t uid; 2158 2159 read_lock_bh(&sk->sk_callback_lock); 2160 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2161 read_unlock_bh(&sk->sk_callback_lock); 2162 return uid; 2163 } 2164 EXPORT_SYMBOL(sock_i_uid); 2165 2166 unsigned long sock_i_ino(struct sock *sk) 2167 { 2168 unsigned long ino; 2169 2170 read_lock_bh(&sk->sk_callback_lock); 2171 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2172 read_unlock_bh(&sk->sk_callback_lock); 2173 return ino; 2174 } 2175 EXPORT_SYMBOL(sock_i_ino); 2176 2177 /* 2178 * Allocate a skb from the socket's send buffer. 2179 */ 2180 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2181 gfp_t priority) 2182 { 2183 if (force || 2184 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2185 struct sk_buff *skb = alloc_skb(size, priority); 2186 2187 if (skb) { 2188 skb_set_owner_w(skb, sk); 2189 return skb; 2190 } 2191 } 2192 return NULL; 2193 } 2194 EXPORT_SYMBOL(sock_wmalloc); 2195 2196 static void sock_ofree(struct sk_buff *skb) 2197 { 2198 struct sock *sk = skb->sk; 2199 2200 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2201 } 2202 2203 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2204 gfp_t priority) 2205 { 2206 struct sk_buff *skb; 2207 2208 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2209 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2210 sysctl_optmem_max) 2211 return NULL; 2212 2213 skb = alloc_skb(size, priority); 2214 if (!skb) 2215 return NULL; 2216 2217 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2218 skb->sk = sk; 2219 skb->destructor = sock_ofree; 2220 return skb; 2221 } 2222 2223 /* 2224 * Allocate a memory block from the socket's option memory buffer. 2225 */ 2226 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2227 { 2228 if ((unsigned int)size <= sysctl_optmem_max && 2229 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2230 void *mem; 2231 /* First do the add, to avoid the race if kmalloc 2232 * might sleep. 2233 */ 2234 atomic_add(size, &sk->sk_omem_alloc); 2235 mem = kmalloc(size, priority); 2236 if (mem) 2237 return mem; 2238 atomic_sub(size, &sk->sk_omem_alloc); 2239 } 2240 return NULL; 2241 } 2242 EXPORT_SYMBOL(sock_kmalloc); 2243 2244 /* Free an option memory block. Note, we actually want the inline 2245 * here as this allows gcc to detect the nullify and fold away the 2246 * condition entirely. 2247 */ 2248 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2249 const bool nullify) 2250 { 2251 if (WARN_ON_ONCE(!mem)) 2252 return; 2253 if (nullify) 2254 kfree_sensitive(mem); 2255 else 2256 kfree(mem); 2257 atomic_sub(size, &sk->sk_omem_alloc); 2258 } 2259 2260 void sock_kfree_s(struct sock *sk, void *mem, int size) 2261 { 2262 __sock_kfree_s(sk, mem, size, false); 2263 } 2264 EXPORT_SYMBOL(sock_kfree_s); 2265 2266 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2267 { 2268 __sock_kfree_s(sk, mem, size, true); 2269 } 2270 EXPORT_SYMBOL(sock_kzfree_s); 2271 2272 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2273 I think, these locks should be removed for datagram sockets. 
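 *
 * The helper below is only used by the generic send path once the
 * socket has filled its send buffer; the caller, sock_alloc_send_pskb(),
 * follows roughly this shape:
 *
 *	err = -EAGAIN;
 *	if (!timeo)
 *		goto failure;
 *	if (signal_pending(current))
 *		goto interrupted;
 *	timeo = sock_wait_for_wmem(sk, timeo);
 *
 * The remaining timeout is returned so the caller can tell "timed out"
 * apart from "woken up with space (or an error) to report".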
2274 */ 2275 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2276 { 2277 DEFINE_WAIT(wait); 2278 2279 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2280 for (;;) { 2281 if (!timeo) 2282 break; 2283 if (signal_pending(current)) 2284 break; 2285 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2286 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2287 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2288 break; 2289 if (sk->sk_shutdown & SEND_SHUTDOWN) 2290 break; 2291 if (sk->sk_err) 2292 break; 2293 timeo = schedule_timeout(timeo); 2294 } 2295 finish_wait(sk_sleep(sk), &wait); 2296 return timeo; 2297 } 2298 2299 2300 /* 2301 * Generic send/receive buffer handlers 2302 */ 2303 2304 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2305 unsigned long data_len, int noblock, 2306 int *errcode, int max_page_order) 2307 { 2308 struct sk_buff *skb; 2309 long timeo; 2310 int err; 2311 2312 timeo = sock_sndtimeo(sk, noblock); 2313 for (;;) { 2314 err = sock_error(sk); 2315 if (err != 0) 2316 goto failure; 2317 2318 err = -EPIPE; 2319 if (sk->sk_shutdown & SEND_SHUTDOWN) 2320 goto failure; 2321 2322 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2323 break; 2324 2325 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2326 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2327 err = -EAGAIN; 2328 if (!timeo) 2329 goto failure; 2330 if (signal_pending(current)) 2331 goto interrupted; 2332 timeo = sock_wait_for_wmem(sk, timeo); 2333 } 2334 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2335 errcode, sk->sk_allocation); 2336 if (skb) 2337 skb_set_owner_w(skb, sk); 2338 return skb; 2339 2340 interrupted: 2341 err = sock_intr_errno(timeo); 2342 failure: 2343 *errcode = err; 2344 return NULL; 2345 } 2346 EXPORT_SYMBOL(sock_alloc_send_pskb); 2347 2348 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2349 int noblock, int *errcode) 2350 { 2351 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2352 } 2353 EXPORT_SYMBOL(sock_alloc_send_skb); 2354 2355 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2356 struct sockcm_cookie *sockc) 2357 { 2358 u32 tsflags; 2359 2360 switch (cmsg->cmsg_type) { 2361 case SO_MARK: 2362 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2363 return -EPERM; 2364 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2365 return -EINVAL; 2366 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2367 break; 2368 case SO_TIMESTAMPING_OLD: 2369 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2370 return -EINVAL; 2371 2372 tsflags = *(u32 *)CMSG_DATA(cmsg); 2373 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2374 return -EINVAL; 2375 2376 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2377 sockc->tsflags |= tsflags; 2378 break; 2379 case SCM_TXTIME: 2380 if (!sock_flag(sk, SOCK_TXTIME)) 2381 return -EINVAL; 2382 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2383 return -EINVAL; 2384 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2385 break; 2386 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
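 *
 * They are accepted here without any action so that a sendmsg() caller
 * may mix them with the SOL_SOCKET options handled above and not hit
 * the -EINVAL default case. For reference, an illustrative control
 * message that this function does act on, SO_MARK, is built by user
 * space roughly as:
 *
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SO_MARK;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(u32));
 *	*(u32 *)CMSG_DATA(cm) = mark;
 *
 * and is rejected with -EPERM above unless the sender has CAP_NET_ADMIN
 * over the socket's network namespace.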
*/ 2387 case SCM_RIGHTS: 2388 case SCM_CREDENTIALS: 2389 break; 2390 default: 2391 return -EINVAL; 2392 } 2393 return 0; 2394 } 2395 EXPORT_SYMBOL(__sock_cmsg_send); 2396 2397 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2398 struct sockcm_cookie *sockc) 2399 { 2400 struct cmsghdr *cmsg; 2401 int ret; 2402 2403 for_each_cmsghdr(cmsg, msg) { 2404 if (!CMSG_OK(msg, cmsg)) 2405 return -EINVAL; 2406 if (cmsg->cmsg_level != SOL_SOCKET) 2407 continue; 2408 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2409 if (ret) 2410 return ret; 2411 } 2412 return 0; 2413 } 2414 EXPORT_SYMBOL(sock_cmsg_send); 2415 2416 static void sk_enter_memory_pressure(struct sock *sk) 2417 { 2418 if (!sk->sk_prot->enter_memory_pressure) 2419 return; 2420 2421 sk->sk_prot->enter_memory_pressure(sk); 2422 } 2423 2424 static void sk_leave_memory_pressure(struct sock *sk) 2425 { 2426 if (sk->sk_prot->leave_memory_pressure) { 2427 sk->sk_prot->leave_memory_pressure(sk); 2428 } else { 2429 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2430 2431 if (memory_pressure && READ_ONCE(*memory_pressure)) 2432 WRITE_ONCE(*memory_pressure, 0); 2433 } 2434 } 2435 2436 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2437 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2438 2439 /** 2440 * skb_page_frag_refill - check that a page_frag contains enough room 2441 * @sz: minimum size of the fragment we want to get 2442 * @pfrag: pointer to page_frag 2443 * @gfp: priority for memory allocation 2444 * 2445 * Note: While this allocator tries to use high order pages, there is 2446 * no guarantee that allocations succeed. Therefore, @sz MUST be 2447 * less or equal than PAGE_SIZE. 2448 */ 2449 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2450 { 2451 if (pfrag->page) { 2452 if (page_ref_count(pfrag->page) == 1) { 2453 pfrag->offset = 0; 2454 return true; 2455 } 2456 if (pfrag->offset + sz <= pfrag->size) 2457 return true; 2458 put_page(pfrag->page); 2459 } 2460 2461 pfrag->offset = 0; 2462 if (SKB_FRAG_PAGE_ORDER && 2463 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2464 /* Avoid direct reclaim but allow kswapd to wake */ 2465 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2466 __GFP_COMP | __GFP_NOWARN | 2467 __GFP_NORETRY, 2468 SKB_FRAG_PAGE_ORDER); 2469 if (likely(pfrag->page)) { 2470 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2471 return true; 2472 } 2473 } 2474 pfrag->page = alloc_page(gfp); 2475 if (likely(pfrag->page)) { 2476 pfrag->size = PAGE_SIZE; 2477 return true; 2478 } 2479 return false; 2480 } 2481 EXPORT_SYMBOL(skb_page_frag_refill); 2482 2483 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2484 { 2485 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2486 return true; 2487 2488 sk_enter_memory_pressure(sk); 2489 sk_stream_moderate_sndbuf(sk); 2490 return false; 2491 } 2492 EXPORT_SYMBOL(sk_page_frag_refill); 2493 2494 static void __lock_sock(struct sock *sk) 2495 __releases(&sk->sk_lock.slock) 2496 __acquires(&sk->sk_lock.slock) 2497 { 2498 DEFINE_WAIT(wait); 2499 2500 for (;;) { 2501 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2502 TASK_UNINTERRUPTIBLE); 2503 spin_unlock_bh(&sk->sk_lock.slock); 2504 schedule(); 2505 spin_lock_bh(&sk->sk_lock.slock); 2506 if (!sock_owned_by_user(sk)) 2507 break; 2508 } 2509 finish_wait(&sk->sk_lock.wq, &wait); 2510 } 2511 2512 void __release_sock(struct sock *sk) 2513 __releases(&sk->sk_lock.slock) 2514 __acquires(&sk->sk_lock.slock) 2515 { 2516 struct sk_buff 
*skb, *next; 2517 2518 while ((skb = sk->sk_backlog.head) != NULL) { 2519 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2520 2521 spin_unlock_bh(&sk->sk_lock.slock); 2522 2523 do { 2524 next = skb->next; 2525 prefetch(next); 2526 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2527 skb_mark_not_on_list(skb); 2528 sk_backlog_rcv(sk, skb); 2529 2530 cond_resched(); 2531 2532 skb = next; 2533 } while (skb != NULL); 2534 2535 spin_lock_bh(&sk->sk_lock.slock); 2536 } 2537 2538 /* 2539 * Doing the zeroing here guarantees we cannot loop forever 2540 * while a wild producer attempts to flood us. 2541 */ 2542 sk->sk_backlog.len = 0; 2543 } 2544 2545 void __sk_flush_backlog(struct sock *sk) 2546 { 2547 spin_lock_bh(&sk->sk_lock.slock); 2548 __release_sock(sk); 2549 spin_unlock_bh(&sk->sk_lock.slock); 2550 } 2551 2552 /** 2553 * sk_wait_data - wait for data to arrive at sk_receive_queue 2554 * @sk: sock to wait on 2555 * @timeo: for how long 2556 * @skb: last skb seen on sk_receive_queue 2557 * 2558 * Now socket state including sk->sk_err is changed only under lock, 2559 * hence we may omit checks after joining the wait queue. 2560 * We check the receive queue before schedule() only as an optimization; 2561 * it is very likely that release_sock() added new data. 2562 */ 2563 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2564 { 2565 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2566 int rc; 2567 2568 add_wait_queue(sk_sleep(sk), &wait); 2569 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2570 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2571 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2572 remove_wait_queue(sk_sleep(sk), &wait); 2573 return rc; 2574 } 2575 EXPORT_SYMBOL(sk_wait_data); 2576 2577 /** 2578 * __sk_mem_raise_allocated - increase memory_allocated 2579 * @sk: socket 2580 * @size: memory size to allocate 2581 * @amt: pages to allocate 2582 * @kind: allocation type 2583 * 2584 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2585 */ 2586 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2587 { 2588 struct proto *prot = sk->sk_prot; 2589 long allocated = sk_memory_allocated_add(sk, amt); 2590 bool charged = true; 2591 2592 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2593 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) 2594 goto suppress_allocation; 2595 2596 /* Under limit. */ 2597 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2598 sk_leave_memory_pressure(sk); 2599 return 1; 2600 } 2601 2602 /* Under pressure. */ 2603 if (allocated > sk_prot_mem_limits(sk, 1)) 2604 sk_enter_memory_pressure(sk); 2605 2606 /* Over hard limit.
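 *
 * sk_prot_mem_limits(sk, 0..2) are the protocol's three global
 * thresholds (for TCP, the three tcp_mem sysctl values); the checks
 * above and below amount to:
 *
 *	allocated <= limits[0]	under limit, allocation succeeds
 *	allocated >  limits[1]	enter memory pressure
 *	allocated >  limits[2]	over hard limit, try to suppress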
*/ 2607 if (allocated > sk_prot_mem_limits(sk, 2)) 2608 goto suppress_allocation; 2609 2610 /* guarantee minimum buffer size under pressure */ 2611 if (kind == SK_MEM_RECV) { 2612 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2613 return 1; 2614 2615 } else { /* SK_MEM_SEND */ 2616 int wmem0 = sk_get_wmem0(sk, prot); 2617 2618 if (sk->sk_type == SOCK_STREAM) { 2619 if (sk->sk_wmem_queued < wmem0) 2620 return 1; 2621 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2622 return 1; 2623 } 2624 } 2625 2626 if (sk_has_memory_pressure(sk)) { 2627 u64 alloc; 2628 2629 if (!sk_under_memory_pressure(sk)) 2630 return 1; 2631 alloc = sk_sockets_allocated_read_positive(sk); 2632 if (sk_prot_mem_limits(sk, 2) > alloc * 2633 sk_mem_pages(sk->sk_wmem_queued + 2634 atomic_read(&sk->sk_rmem_alloc) + 2635 sk->sk_forward_alloc)) 2636 return 1; 2637 } 2638 2639 suppress_allocation: 2640 2641 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2642 sk_stream_moderate_sndbuf(sk); 2643 2644 /* Fail only if socket is _under_ its sndbuf. 2645 * In this case we cannot block, so that we have to fail. 2646 */ 2647 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2648 return 1; 2649 } 2650 2651 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2652 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2653 2654 sk_memory_allocated_sub(sk, amt); 2655 2656 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2657 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2658 2659 return 0; 2660 } 2661 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2662 2663 /** 2664 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2665 * @sk: socket 2666 * @size: memory size to allocate 2667 * @kind: allocation type 2668 * 2669 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2670 * rmem allocation. This function assumes that protocols which have 2671 * memory_pressure use sk_wmem_queued as write buffer accounting. 
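 *
 * Protocols normally do not call this directly but go through the
 * sk_wmem_schedule()/sk_rmem_schedule() wrappers; a typical sender
 * does roughly (illustrative sketch):
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;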
2672 */ 2673 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2674 { 2675 int ret, amt = sk_mem_pages(size); 2676 2677 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2678 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2679 if (!ret) 2680 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2681 return ret; 2682 } 2683 EXPORT_SYMBOL(__sk_mem_schedule); 2684 2685 /** 2686 * __sk_mem_reduce_allocated - reclaim memory_allocated 2687 * @sk: socket 2688 * @amount: number of quanta 2689 * 2690 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2691 */ 2692 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2693 { 2694 sk_memory_allocated_sub(sk, amount); 2695 2696 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2697 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2698 2699 if (sk_under_memory_pressure(sk) && 2700 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2701 sk_leave_memory_pressure(sk); 2702 } 2703 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2704 2705 /** 2706 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2707 * @sk: socket 2708 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2709 */ 2710 void __sk_mem_reclaim(struct sock *sk, int amount) 2711 { 2712 amount >>= SK_MEM_QUANTUM_SHIFT; 2713 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2714 __sk_mem_reduce_allocated(sk, amount); 2715 } 2716 EXPORT_SYMBOL(__sk_mem_reclaim); 2717 2718 int sk_set_peek_off(struct sock *sk, int val) 2719 { 2720 sk->sk_peek_off = val; 2721 return 0; 2722 } 2723 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2724 2725 /* 2726 * Set of default routines for initialising struct proto_ops when 2727 * the protocol does not support a particular function. In certain 2728 * cases where it makes no sense for a protocol to have a "do nothing" 2729 * function, some default processing is provided. 
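 *
 * An illustrative (entirely hypothetical) protocol family would wire
 * these stubs into its proto_ops for whatever it does not implement,
 * e.g.:
 *
 *	static const struct proto_ops foo_proto_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};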
2730 */ 2731 2732 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2733 { 2734 return -EOPNOTSUPP; 2735 } 2736 EXPORT_SYMBOL(sock_no_bind); 2737 2738 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2739 int len, int flags) 2740 { 2741 return -EOPNOTSUPP; 2742 } 2743 EXPORT_SYMBOL(sock_no_connect); 2744 2745 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2746 { 2747 return -EOPNOTSUPP; 2748 } 2749 EXPORT_SYMBOL(sock_no_socketpair); 2750 2751 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2752 bool kern) 2753 { 2754 return -EOPNOTSUPP; 2755 } 2756 EXPORT_SYMBOL(sock_no_accept); 2757 2758 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2759 int peer) 2760 { 2761 return -EOPNOTSUPP; 2762 } 2763 EXPORT_SYMBOL(sock_no_getname); 2764 2765 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2766 { 2767 return -EOPNOTSUPP; 2768 } 2769 EXPORT_SYMBOL(sock_no_ioctl); 2770 2771 int sock_no_listen(struct socket *sock, int backlog) 2772 { 2773 return -EOPNOTSUPP; 2774 } 2775 EXPORT_SYMBOL(sock_no_listen); 2776 2777 int sock_no_shutdown(struct socket *sock, int how) 2778 { 2779 return -EOPNOTSUPP; 2780 } 2781 EXPORT_SYMBOL(sock_no_shutdown); 2782 2783 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2784 { 2785 return -EOPNOTSUPP; 2786 } 2787 EXPORT_SYMBOL(sock_no_sendmsg); 2788 2789 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2790 { 2791 return -EOPNOTSUPP; 2792 } 2793 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2794 2795 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2796 int flags) 2797 { 2798 return -EOPNOTSUPP; 2799 } 2800 EXPORT_SYMBOL(sock_no_recvmsg); 2801 2802 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2803 { 2804 /* Mirror missing mmap method error code */ 2805 return -ENODEV; 2806 } 2807 EXPORT_SYMBOL(sock_no_mmap); 2808 2809 /* 2810 * When a file is received (via SCM_RIGHTS, etc), we must bump the 2811 * various sock-based usage counts. 2812 */ 2813 void __receive_sock(struct file *file) 2814 { 2815 struct socket *sock; 2816 int error; 2817 2818 /* 2819 * The resulting value of "error" is ignored here since we only 2820 * need to take action when the file is a socket and testing 2821 * "sock" for NULL is sufficient. 
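 *
 * (sock_from_file() returns NULL and sets "error" - normally to
 * -ENOTSOCK - when the file is not a socket, so the NULL check below
 * covers the failure case completely.)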
2822 */ 2823 sock = sock_from_file(file, &error); 2824 if (sock) { 2825 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 2826 sock_update_classid(&sock->sk->sk_cgrp_data); 2827 } 2828 } 2829 2830 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2831 { 2832 ssize_t res; 2833 struct msghdr msg = {.msg_flags = flags}; 2834 struct kvec iov; 2835 char *kaddr = kmap(page); 2836 iov.iov_base = kaddr + offset; 2837 iov.iov_len = size; 2838 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2839 kunmap(page); 2840 return res; 2841 } 2842 EXPORT_SYMBOL(sock_no_sendpage); 2843 2844 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2845 int offset, size_t size, int flags) 2846 { 2847 ssize_t res; 2848 struct msghdr msg = {.msg_flags = flags}; 2849 struct kvec iov; 2850 char *kaddr = kmap(page); 2851 2852 iov.iov_base = kaddr + offset; 2853 iov.iov_len = size; 2854 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2855 kunmap(page); 2856 return res; 2857 } 2858 EXPORT_SYMBOL(sock_no_sendpage_locked); 2859 2860 /* 2861 * Default Socket Callbacks 2862 */ 2863 2864 static void sock_def_wakeup(struct sock *sk) 2865 { 2866 struct socket_wq *wq; 2867 2868 rcu_read_lock(); 2869 wq = rcu_dereference(sk->sk_wq); 2870 if (skwq_has_sleeper(wq)) 2871 wake_up_interruptible_all(&wq->wait); 2872 rcu_read_unlock(); 2873 } 2874 2875 static void sock_def_error_report(struct sock *sk) 2876 { 2877 struct socket_wq *wq; 2878 2879 rcu_read_lock(); 2880 wq = rcu_dereference(sk->sk_wq); 2881 if (skwq_has_sleeper(wq)) 2882 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2883 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2884 rcu_read_unlock(); 2885 } 2886 2887 void sock_def_readable(struct sock *sk) 2888 { 2889 struct socket_wq *wq; 2890 2891 rcu_read_lock(); 2892 wq = rcu_dereference(sk->sk_wq); 2893 if (skwq_has_sleeper(wq)) 2894 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2895 EPOLLRDNORM | EPOLLRDBAND); 2896 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2897 rcu_read_unlock(); 2898 } 2899 2900 static void sock_def_write_space(struct sock *sk) 2901 { 2902 struct socket_wq *wq; 2903 2904 rcu_read_lock(); 2905 2906 /* Do not wake up a writer until he can make "significant" 2907 * progress. 
--DaveM 2908 */ 2909 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 2910 wq = rcu_dereference(sk->sk_wq); 2911 if (skwq_has_sleeper(wq)) 2912 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2913 EPOLLWRNORM | EPOLLWRBAND); 2914 2915 /* Should agree with poll, otherwise some programs break */ 2916 if (sock_writeable(sk)) 2917 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2918 } 2919 2920 rcu_read_unlock(); 2921 } 2922 2923 static void sock_def_destruct(struct sock *sk) 2924 { 2925 } 2926 2927 void sk_send_sigurg(struct sock *sk) 2928 { 2929 if (sk->sk_socket && sk->sk_socket->file) 2930 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2931 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2932 } 2933 EXPORT_SYMBOL(sk_send_sigurg); 2934 2935 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2936 unsigned long expires) 2937 { 2938 if (!mod_timer(timer, expires)) 2939 sock_hold(sk); 2940 } 2941 EXPORT_SYMBOL(sk_reset_timer); 2942 2943 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2944 { 2945 if (del_timer(timer)) 2946 __sock_put(sk); 2947 } 2948 EXPORT_SYMBOL(sk_stop_timer); 2949 2950 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 2951 { 2952 if (del_timer_sync(timer)) 2953 __sock_put(sk); 2954 } 2955 EXPORT_SYMBOL(sk_stop_timer_sync); 2956 2957 void sock_init_data(struct socket *sock, struct sock *sk) 2958 { 2959 sk_init_common(sk); 2960 sk->sk_send_head = NULL; 2961 2962 timer_setup(&sk->sk_timer, NULL, 0); 2963 2964 sk->sk_allocation = GFP_KERNEL; 2965 sk->sk_rcvbuf = sysctl_rmem_default; 2966 sk->sk_sndbuf = sysctl_wmem_default; 2967 sk->sk_state = TCP_CLOSE; 2968 sk_set_socket(sk, sock); 2969 2970 sock_set_flag(sk, SOCK_ZAPPED); 2971 2972 if (sock) { 2973 sk->sk_type = sock->type; 2974 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 2975 sock->sk = sk; 2976 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2977 } else { 2978 RCU_INIT_POINTER(sk->sk_wq, NULL); 2979 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2980 } 2981 2982 rwlock_init(&sk->sk_callback_lock); 2983 if (sk->sk_kern_sock) 2984 lockdep_set_class_and_name( 2985 &sk->sk_callback_lock, 2986 af_kern_callback_keys + sk->sk_family, 2987 af_family_kern_clock_key_strings[sk->sk_family]); 2988 else 2989 lockdep_set_class_and_name( 2990 &sk->sk_callback_lock, 2991 af_callback_keys + sk->sk_family, 2992 af_family_clock_key_strings[sk->sk_family]); 2993 2994 sk->sk_state_change = sock_def_wakeup; 2995 sk->sk_data_ready = sock_def_readable; 2996 sk->sk_write_space = sock_def_write_space; 2997 sk->sk_error_report = sock_def_error_report; 2998 sk->sk_destruct = sock_def_destruct; 2999 3000 sk->sk_frag.page = NULL; 3001 sk->sk_frag.offset = 0; 3002 sk->sk_peek_off = -1; 3003 3004 sk->sk_peer_pid = NULL; 3005 sk->sk_peer_cred = NULL; 3006 sk->sk_write_pending = 0; 3007 sk->sk_rcvlowat = 1; 3008 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3009 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3010 3011 sk->sk_stamp = SK_DEFAULT_STAMP; 3012 #if BITS_PER_LONG==32 3013 seqlock_init(&sk->sk_stamp_seq); 3014 #endif 3015 atomic_set(&sk->sk_zckey, 0); 3016 3017 #ifdef CONFIG_NET_RX_BUSY_POLL 3018 sk->sk_napi_id = 0; 3019 sk->sk_ll_usec = sysctl_net_busy_read; 3020 #endif 3021 3022 sk->sk_max_pacing_rate = ~0UL; 3023 sk->sk_pacing_rate = ~0UL; 3024 WRITE_ONCE(sk->sk_pacing_shift, 10); 3025 sk->sk_incoming_cpu = -1; 3026 3027 sk_rx_queue_clear(sk); 3028 /* 3029 * Before updating sk_refcnt, we must commit prior changes to memory 3030 * (Documentation/RCU/rculist_nulls.rst for details) 3031 */ 3032 
smp_wmb(); 3033 refcount_set(&sk->sk_refcnt, 1); 3034 atomic_set(&sk->sk_drops, 0); 3035 } 3036 EXPORT_SYMBOL(sock_init_data); 3037 3038 void lock_sock_nested(struct sock *sk, int subclass) 3039 { 3040 might_sleep(); 3041 spin_lock_bh(&sk->sk_lock.slock); 3042 if (sk->sk_lock.owned) 3043 __lock_sock(sk); 3044 sk->sk_lock.owned = 1; 3045 spin_unlock(&sk->sk_lock.slock); 3046 /* 3047 * The sk_lock has mutex_lock() semantics here: 3048 */ 3049 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3050 local_bh_enable(); 3051 } 3052 EXPORT_SYMBOL(lock_sock_nested); 3053 3054 void release_sock(struct sock *sk) 3055 { 3056 spin_lock_bh(&sk->sk_lock.slock); 3057 if (sk->sk_backlog.tail) 3058 __release_sock(sk); 3059 3060 /* Warning : release_cb() might need to release sk ownership, 3061 * ie call sock_release_ownership(sk) before us. 3062 */ 3063 if (sk->sk_prot->release_cb) 3064 sk->sk_prot->release_cb(sk); 3065 3066 sock_release_ownership(sk); 3067 if (waitqueue_active(&sk->sk_lock.wq)) 3068 wake_up(&sk->sk_lock.wq); 3069 spin_unlock_bh(&sk->sk_lock.slock); 3070 } 3071 EXPORT_SYMBOL(release_sock); 3072 3073 /** 3074 * lock_sock_fast - fast version of lock_sock 3075 * @sk: socket 3076 * 3077 * This version should be used for very small section, where process wont block 3078 * return false if fast path is taken: 3079 * 3080 * sk_lock.slock locked, owned = 0, BH disabled 3081 * 3082 * return true if slow path is taken: 3083 * 3084 * sk_lock.slock unlocked, owned = 1, BH enabled 3085 */ 3086 bool lock_sock_fast(struct sock *sk) 3087 { 3088 might_sleep(); 3089 spin_lock_bh(&sk->sk_lock.slock); 3090 3091 if (!sk->sk_lock.owned) 3092 /* 3093 * Note : We must disable BH 3094 */ 3095 return false; 3096 3097 __lock_sock(sk); 3098 sk->sk_lock.owned = 1; 3099 spin_unlock(&sk->sk_lock.slock); 3100 /* 3101 * The sk_lock has mutex_lock() semantics here: 3102 */ 3103 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 3104 local_bh_enable(); 3105 return true; 3106 } 3107 EXPORT_SYMBOL(lock_sock_fast); 3108 3109 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3110 bool timeval, bool time32) 3111 { 3112 struct sock *sk = sock->sk; 3113 struct timespec64 ts; 3114 3115 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3116 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3117 if (ts.tv_sec == -1) 3118 return -ENOENT; 3119 if (ts.tv_sec == 0) { 3120 ktime_t kt = ktime_get_real(); 3121 sock_write_timestamp(sk, kt); 3122 ts = ktime_to_timespec64(kt); 3123 } 3124 3125 if (timeval) 3126 ts.tv_nsec /= 1000; 3127 3128 #ifdef CONFIG_COMPAT_32BIT_TIME 3129 if (time32) 3130 return put_old_timespec32(&ts, userstamp); 3131 #endif 3132 #ifdef CONFIG_SPARC64 3133 /* beware of padding in sparc64 timeval */ 3134 if (timeval && !in_compat_syscall()) { 3135 struct __kernel_old_timeval __user tv = { 3136 .tv_sec = ts.tv_sec, 3137 .tv_usec = ts.tv_nsec, 3138 }; 3139 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3140 return -EFAULT; 3141 return 0; 3142 } 3143 #endif 3144 return put_timespec64(&ts, userstamp); 3145 } 3146 EXPORT_SYMBOL(sock_gettstamp); 3147 3148 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3149 { 3150 if (!sock_flag(sk, flag)) { 3151 unsigned long previous_flags = sk->sk_flags; 3152 3153 sock_set_flag(sk, flag); 3154 /* 3155 * we just set one of the two flags which require net 3156 * time stamping, but time stamping might have been on 3157 * already because of the other one 3158 */ 3159 if (sock_needs_netstamp(sk) && 3160 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3161 
net_enable_timestamp(); 3162 } 3163 } 3164 3165 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3166 int level, int type) 3167 { 3168 struct sock_exterr_skb *serr; 3169 struct sk_buff *skb; 3170 int copied, err; 3171 3172 err = -EAGAIN; 3173 skb = sock_dequeue_err_skb(sk); 3174 if (skb == NULL) 3175 goto out; 3176 3177 copied = skb->len; 3178 if (copied > len) { 3179 msg->msg_flags |= MSG_TRUNC; 3180 copied = len; 3181 } 3182 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3183 if (err) 3184 goto out_free_skb; 3185 3186 sock_recv_timestamp(msg, sk, skb); 3187 3188 serr = SKB_EXT_ERR(skb); 3189 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3190 3191 msg->msg_flags |= MSG_ERRQUEUE; 3192 err = copied; 3193 3194 out_free_skb: 3195 kfree_skb(skb); 3196 out: 3197 return err; 3198 } 3199 EXPORT_SYMBOL(sock_recv_errqueue); 3200 3201 /* 3202 * Get a socket option on a socket. 3203 * 3204 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3205 * asynchronous errors should be reported by getsockopt. We assume 3206 * this means if you specify SO_ERROR (otherwise what's the point of it). 3207 */ 3208 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3209 char __user *optval, int __user *optlen) 3210 { 3211 struct sock *sk = sock->sk; 3212 3213 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3214 } 3215 EXPORT_SYMBOL(sock_common_getsockopt); 3216 3217 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3218 int flags) 3219 { 3220 struct sock *sk = sock->sk; 3221 int addr_len = 0; 3222 int err; 3223 3224 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 3225 flags & ~MSG_DONTWAIT, &addr_len); 3226 if (err >= 0) 3227 msg->msg_namelen = addr_len; 3228 return err; 3229 } 3230 EXPORT_SYMBOL(sock_common_recvmsg); 3231 3232 /* 3233 * Set socket options on an inet socket. 3234 */ 3235 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3236 sockptr_t optval, unsigned int optlen) 3237 { 3238 struct sock *sk = sock->sk; 3239 3240 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3241 } 3242 EXPORT_SYMBOL(sock_common_setsockopt); 3243 3244 void sk_common_release(struct sock *sk) 3245 { 3246 if (sk->sk_prot->destroy) 3247 sk->sk_prot->destroy(sk); 3248 3249 /* 3250 * Observation: when sk_common_release is called, processes have 3251 * no access to the socket, but the network stack still does. 3252 * Step one, detach it from networking: 3253 * 3254 * A. Remove from hash tables. 3255 */ 3256 3257 sk->sk_prot->unhash(sk); 3258 3259 /* 3260 * At this point the socket cannot receive new packets, but it is possible 3261 * that some packets are still in flight because another CPU is running the 3262 * receiver and did its hash table lookup before we unhashed the socket. They 3263 * will reach the receive queue and be purged by the socket destructor. 3264 * 3265 * We may also still have packets pending on the receive queue and, probably, 3266 * our own packets waiting in device queues. sock_destroy will drain the 3267 * receive queue, but transmitted packets will delay socket destruction 3268 * until the last reference is released.
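 *
 * Step two, below: sock_orphan() marks the sock SOCK_DEAD and detaches
 * it from its struct socket and wait queue, xfrm_sk_free_policy() drops
 * any per-socket IPsec policy, and the final sock_put() releases the
 * caller's reference; the struct sock itself is freed only once the
 * in-flight packets described above are gone.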
3269 */ 3270 3271 sock_orphan(sk); 3272 3273 xfrm_sk_free_policy(sk); 3274 3275 sk_refcnt_debug_release(sk); 3276 3277 sock_put(sk); 3278 } 3279 EXPORT_SYMBOL(sk_common_release); 3280 3281 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3282 { 3283 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3284 3285 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3286 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3287 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3288 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3289 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3290 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3291 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3292 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3293 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3294 } 3295 3296 #ifdef CONFIG_PROC_FS 3297 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3298 struct prot_inuse { 3299 int val[PROTO_INUSE_NR]; 3300 }; 3301 3302 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3303 3304 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3305 { 3306 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3307 } 3308 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3309 3310 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3311 { 3312 int cpu, idx = prot->inuse_idx; 3313 int res = 0; 3314 3315 for_each_possible_cpu(cpu) 3316 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3317 3318 return res >= 0 ? res : 0; 3319 } 3320 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3321 3322 static void sock_inuse_add(struct net *net, int val) 3323 { 3324 this_cpu_add(*net->core.sock_inuse, val); 3325 } 3326 3327 int sock_inuse_get(struct net *net) 3328 { 3329 int cpu, res = 0; 3330 3331 for_each_possible_cpu(cpu) 3332 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3333 3334 return res; 3335 } 3336 3337 EXPORT_SYMBOL_GPL(sock_inuse_get); 3338 3339 static int __net_init sock_inuse_init_net(struct net *net) 3340 { 3341 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3342 if (net->core.prot_inuse == NULL) 3343 return -ENOMEM; 3344 3345 net->core.sock_inuse = alloc_percpu(int); 3346 if (net->core.sock_inuse == NULL) 3347 goto out; 3348 3349 return 0; 3350 3351 out: 3352 free_percpu(net->core.prot_inuse); 3353 return -ENOMEM; 3354 } 3355 3356 static void __net_exit sock_inuse_exit_net(struct net *net) 3357 { 3358 free_percpu(net->core.prot_inuse); 3359 free_percpu(net->core.sock_inuse); 3360 } 3361 3362 static struct pernet_operations net_inuse_ops = { 3363 .init = sock_inuse_init_net, 3364 .exit = sock_inuse_exit_net, 3365 }; 3366 3367 static __init int net_inuse_init(void) 3368 { 3369 if (register_pernet_subsys(&net_inuse_ops)) 3370 panic("Cannot initialize net inuse counters"); 3371 3372 return 0; 3373 } 3374 3375 core_initcall(net_inuse_init); 3376 3377 static int assign_proto_idx(struct proto *prot) 3378 { 3379 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3380 3381 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3382 pr_err("PROTO_INUSE_NR exhausted\n"); 3383 return -ENOSPC; 3384 } 3385 3386 set_bit(prot->inuse_idx, proto_inuse_idx); 3387 return 0; 3388 } 3389 3390 static void release_proto_idx(struct proto *prot) 3391 { 3392 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3393 clear_bit(prot->inuse_idx, proto_inuse_idx); 3394 } 3395 #else 3396 static inline int assign_proto_idx(struct proto *prot) 3397 { 3398 return 0; 3399 } 3400 3401 static inline 
void release_proto_idx(struct proto *prot) 3402 { 3403 } 3404 3405 static void sock_inuse_add(struct net *net, int val) 3406 { 3407 } 3408 #endif 3409 3410 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3411 { 3412 if (!twsk_prot) 3413 return; 3414 kfree(twsk_prot->twsk_slab_name); 3415 twsk_prot->twsk_slab_name = NULL; 3416 kmem_cache_destroy(twsk_prot->twsk_slab); 3417 twsk_prot->twsk_slab = NULL; 3418 } 3419 3420 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3421 { 3422 if (!rsk_prot) 3423 return; 3424 kfree(rsk_prot->slab_name); 3425 rsk_prot->slab_name = NULL; 3426 kmem_cache_destroy(rsk_prot->slab); 3427 rsk_prot->slab = NULL; 3428 } 3429 3430 static int req_prot_init(const struct proto *prot) 3431 { 3432 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3433 3434 if (!rsk_prot) 3435 return 0; 3436 3437 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3438 prot->name); 3439 if (!rsk_prot->slab_name) 3440 return -ENOMEM; 3441 3442 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3443 rsk_prot->obj_size, 0, 3444 SLAB_ACCOUNT | prot->slab_flags, 3445 NULL); 3446 3447 if (!rsk_prot->slab) { 3448 pr_crit("%s: Can't create request sock SLAB cache!\n", 3449 prot->name); 3450 return -ENOMEM; 3451 } 3452 return 0; 3453 } 3454 3455 int proto_register(struct proto *prot, int alloc_slab) 3456 { 3457 int ret = -ENOBUFS; 3458 3459 if (alloc_slab) { 3460 prot->slab = kmem_cache_create_usercopy(prot->name, 3461 prot->obj_size, 0, 3462 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3463 prot->slab_flags, 3464 prot->useroffset, prot->usersize, 3465 NULL); 3466 3467 if (prot->slab == NULL) { 3468 pr_crit("%s: Can't create sock SLAB cache!\n", 3469 prot->name); 3470 goto out; 3471 } 3472 3473 if (req_prot_init(prot)) 3474 goto out_free_request_sock_slab; 3475 3476 if (prot->twsk_prot != NULL) { 3477 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3478 3479 if (prot->twsk_prot->twsk_slab_name == NULL) 3480 goto out_free_request_sock_slab; 3481 3482 prot->twsk_prot->twsk_slab = 3483 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3484 prot->twsk_prot->twsk_obj_size, 3485 0, 3486 SLAB_ACCOUNT | 3487 prot->slab_flags, 3488 NULL); 3489 if (prot->twsk_prot->twsk_slab == NULL) 3490 goto out_free_timewait_sock_slab; 3491 } 3492 } 3493 3494 mutex_lock(&proto_list_mutex); 3495 ret = assign_proto_idx(prot); 3496 if (ret) { 3497 mutex_unlock(&proto_list_mutex); 3498 goto out_free_timewait_sock_slab; 3499 } 3500 list_add(&prot->node, &proto_list); 3501 mutex_unlock(&proto_list_mutex); 3502 return ret; 3503 3504 out_free_timewait_sock_slab: 3505 if (alloc_slab && prot->twsk_prot) 3506 tw_prot_cleanup(prot->twsk_prot); 3507 out_free_request_sock_slab: 3508 if (alloc_slab) { 3509 req_prot_cleanup(prot->rsk_prot); 3510 3511 kmem_cache_destroy(prot->slab); 3512 prot->slab = NULL; 3513 } 3514 out: 3515 return ret; 3516 } 3517 EXPORT_SYMBOL(proto_register); 3518 3519 void proto_unregister(struct proto *prot) 3520 { 3521 mutex_lock(&proto_list_mutex); 3522 release_proto_idx(prot); 3523 list_del(&prot->node); 3524 mutex_unlock(&proto_list_mutex); 3525 3526 kmem_cache_destroy(prot->slab); 3527 prot->slab = NULL; 3528 3529 req_prot_cleanup(prot->rsk_prot); 3530 tw_prot_cleanup(prot->twsk_prot); 3531 } 3532 EXPORT_SYMBOL(proto_unregister); 3533 3534 int sock_load_diag_module(int family, int protocol) 3535 { 3536 if (!protocol) { 3537 if (!sock_is_registered(family)) 3538 return -ENOENT; 3539 3540 return 
request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3541 NETLINK_SOCK_DIAG, family); 3542 } 3543 3544 #ifdef CONFIG_INET 3545 if (family == AF_INET && 3546 protocol != IPPROTO_RAW && 3547 protocol < MAX_INET_PROTOS && 3548 !rcu_access_pointer(inet_protos[protocol])) 3549 return -ENOENT; 3550 #endif 3551 3552 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3553 NETLINK_SOCK_DIAG, family, protocol); 3554 } 3555 EXPORT_SYMBOL(sock_load_diag_module); 3556 3557 #ifdef CONFIG_PROC_FS 3558 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3559 __acquires(proto_list_mutex) 3560 { 3561 mutex_lock(&proto_list_mutex); 3562 return seq_list_start_head(&proto_list, *pos); 3563 } 3564 3565 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3566 { 3567 return seq_list_next(v, &proto_list, pos); 3568 } 3569 3570 static void proto_seq_stop(struct seq_file *seq, void *v) 3571 __releases(proto_list_mutex) 3572 { 3573 mutex_unlock(&proto_list_mutex); 3574 } 3575 3576 static char proto_method_implemented(const void *method) 3577 { 3578 return method == NULL ? 'n' : 'y'; 3579 } 3580 static long sock_prot_memory_allocated(struct proto *proto) 3581 { 3582 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3583 } 3584 3585 static const char *sock_prot_memory_pressure(struct proto *proto) 3586 { 3587 return proto->memory_pressure != NULL ? 3588 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3589 } 3590 3591 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3592 { 3593 3594 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3595 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3596 proto->name, 3597 proto->obj_size, 3598 sock_prot_inuse_get(seq_file_net(seq), proto), 3599 sock_prot_memory_allocated(proto), 3600 sock_prot_memory_pressure(proto), 3601 proto->max_header, 3602 proto->slab == NULL ? 
"no" : "yes", 3603 module_name(proto->owner), 3604 proto_method_implemented(proto->close), 3605 proto_method_implemented(proto->connect), 3606 proto_method_implemented(proto->disconnect), 3607 proto_method_implemented(proto->accept), 3608 proto_method_implemented(proto->ioctl), 3609 proto_method_implemented(proto->init), 3610 proto_method_implemented(proto->destroy), 3611 proto_method_implemented(proto->shutdown), 3612 proto_method_implemented(proto->setsockopt), 3613 proto_method_implemented(proto->getsockopt), 3614 proto_method_implemented(proto->sendmsg), 3615 proto_method_implemented(proto->recvmsg), 3616 proto_method_implemented(proto->sendpage), 3617 proto_method_implemented(proto->bind), 3618 proto_method_implemented(proto->backlog_rcv), 3619 proto_method_implemented(proto->hash), 3620 proto_method_implemented(proto->unhash), 3621 proto_method_implemented(proto->get_port), 3622 proto_method_implemented(proto->enter_memory_pressure)); 3623 } 3624 3625 static int proto_seq_show(struct seq_file *seq, void *v) 3626 { 3627 if (v == &proto_list) 3628 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3629 "protocol", 3630 "size", 3631 "sockets", 3632 "memory", 3633 "press", 3634 "maxhdr", 3635 "slab", 3636 "module", 3637 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3638 else 3639 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3640 return 0; 3641 } 3642 3643 static const struct seq_operations proto_seq_ops = { 3644 .start = proto_seq_start, 3645 .next = proto_seq_next, 3646 .stop = proto_seq_stop, 3647 .show = proto_seq_show, 3648 }; 3649 3650 static __net_init int proto_init_net(struct net *net) 3651 { 3652 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3653 sizeof(struct seq_net_private))) 3654 return -ENOMEM; 3655 3656 return 0; 3657 } 3658 3659 static __net_exit void proto_exit_net(struct net *net) 3660 { 3661 remove_proc_entry("protocols", net->proc_net); 3662 } 3663 3664 3665 static __net_initdata struct pernet_operations proto_net_ops = { 3666 .init = proto_init_net, 3667 .exit = proto_exit_net, 3668 }; 3669 3670 static int __init proto_init(void) 3671 { 3672 return register_pernet_subsys(&proto_net_ops); 3673 } 3674 3675 subsys_initcall(proto_init); 3676 3677 #endif /* PROC_FS */ 3678 3679 #ifdef CONFIG_NET_RX_BUSY_POLL 3680 bool sk_busy_loop_end(void *p, unsigned long start_time) 3681 { 3682 struct sock *sk = p; 3683 3684 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3685 sk_busy_loop_timeout(sk, start_time); 3686 } 3687 EXPORT_SYMBOL(sk_busy_loop_end); 3688 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3689 3690 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3691 { 3692 if (!sk->sk_prot->bind_add) 3693 return -EOPNOTSUPP; 3694 return sk->sk_prot->bind_add(sk, addr, addr_len); 3695 } 3696 EXPORT_SYMBOL(sock_bind_add); 3697