1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 *	To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

#include <linux/ethtool.h>

#include "dev.h"

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it in all
 * user namespaces.
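 *
 * Illustrative use (a hypothetical caller, not quoted from this file):
 * a handler that should only run for privileged sockets might gate on
 *
 *	if (!sk_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * which passes only when both the socket opener and the current task
 * hold CAP_NET_ADMIN in the initial user namespace.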
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and whether the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time so that runtime initialization of socket
 * locks is fast.)
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF" ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
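 *
 * These are exposed to userspace as net.core sysctls (registered in
 * net/core/sysctl_net_core.c); sysctl_rmem_max, for example, backs
 * net.core.rmem_max. An administrator raising the buffer ceilings would
 * do something like (values are arbitrary, for illustration only):
 *
 *	sysctl -w net.core.rmem_max=4194304
 *	sysctl -w net.core.wmem_max=4194304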
*/ 275 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 276 EXPORT_SYMBOL(sysctl_wmem_max); 277 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 278 EXPORT_SYMBOL(sysctl_rmem_max); 279 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 280 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 281 282 /* Maximal space eaten by iovec or ancillary data plus some space */ 283 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 284 EXPORT_SYMBOL(sysctl_optmem_max); 285 286 int sysctl_tstamp_allow_data __read_mostly = 1; 287 288 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 289 EXPORT_SYMBOL_GPL(memalloc_socks_key); 290 291 /** 292 * sk_set_memalloc - sets %SOCK_MEMALLOC 293 * @sk: socket to set it on 294 * 295 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 296 * It's the responsibility of the admin to adjust min_free_kbytes 297 * to meet the requirements 298 */ 299 void sk_set_memalloc(struct sock *sk) 300 { 301 sock_set_flag(sk, SOCK_MEMALLOC); 302 sk->sk_allocation |= __GFP_MEMALLOC; 303 static_branch_inc(&memalloc_socks_key); 304 } 305 EXPORT_SYMBOL_GPL(sk_set_memalloc); 306 307 void sk_clear_memalloc(struct sock *sk) 308 { 309 sock_reset_flag(sk, SOCK_MEMALLOC); 310 sk->sk_allocation &= ~__GFP_MEMALLOC; 311 static_branch_dec(&memalloc_socks_key); 312 313 /* 314 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 315 * progress of swapping. SOCK_MEMALLOC may be cleared while 316 * it has rmem allocations due to the last swapfile being deactivated 317 * but there is a risk that the socket is unusable due to exceeding 318 * the rmem limits. Reclaim the reserves and obey rmem limits again. 319 */ 320 sk_mem_reclaim(sk); 321 } 322 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 323 324 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 325 { 326 int ret; 327 unsigned int noreclaim_flag; 328 329 /* these should have been dropped before queueing */ 330 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 331 332 noreclaim_flag = memalloc_noreclaim_save(); 333 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 334 tcp_v6_do_rcv, 335 tcp_v4_do_rcv, 336 sk, skb); 337 memalloc_noreclaim_restore(noreclaim_flag); 338 339 return ret; 340 } 341 EXPORT_SYMBOL(__sk_backlog_rcv); 342 343 void sk_error_report(struct sock *sk) 344 { 345 sk->sk_error_report(sk); 346 347 switch (sk->sk_family) { 348 case AF_INET: 349 fallthrough; 350 case AF_INET6: 351 trace_inet_sk_error_report(sk); 352 break; 353 default: 354 break; 355 } 356 } 357 EXPORT_SYMBOL(sk_error_report); 358 359 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 360 { 361 struct __kernel_sock_timeval tv; 362 363 if (timeo == MAX_SCHEDULE_TIMEOUT) { 364 tv.tv_sec = 0; 365 tv.tv_usec = 0; 366 } else { 367 tv.tv_sec = timeo / HZ; 368 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 369 } 370 371 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 372 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 373 *(struct old_timeval32 *)optval = tv32; 374 return sizeof(tv32); 375 } 376 377 if (old_timeval) { 378 struct __kernel_old_timeval old_tv; 379 old_tv.tv_sec = tv.tv_sec; 380 old_tv.tv_usec = tv.tv_usec; 381 *(struct __kernel_old_timeval *)optval = old_tv; 382 return sizeof(old_tv); 383 } 384 385 *(struct __kernel_sock_timeval *)optval = tv; 386 return sizeof(tv); 387 } 388 EXPORT_SYMBOL(sock_get_timeout); 389 390 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 391 sockptr_t optval, int optlen, bool old_timeval) 392 { 393 if (old_timeval && in_compat_syscall() 
&& !COMPAT_USE_64BIT_TIME) { 394 struct old_timeval32 tv32; 395 396 if (optlen < sizeof(tv32)) 397 return -EINVAL; 398 399 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 400 return -EFAULT; 401 tv->tv_sec = tv32.tv_sec; 402 tv->tv_usec = tv32.tv_usec; 403 } else if (old_timeval) { 404 struct __kernel_old_timeval old_tv; 405 406 if (optlen < sizeof(old_tv)) 407 return -EINVAL; 408 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 409 return -EFAULT; 410 tv->tv_sec = old_tv.tv_sec; 411 tv->tv_usec = old_tv.tv_usec; 412 } else { 413 if (optlen < sizeof(*tv)) 414 return -EINVAL; 415 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 416 return -EFAULT; 417 } 418 419 return 0; 420 } 421 EXPORT_SYMBOL(sock_copy_user_timeval); 422 423 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 424 bool old_timeval) 425 { 426 struct __kernel_sock_timeval tv; 427 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 428 429 if (err) 430 return err; 431 432 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 433 return -EDOM; 434 435 if (tv.tv_sec < 0) { 436 static int warned __read_mostly; 437 438 *timeo_p = 0; 439 if (warned < 10 && net_ratelimit()) { 440 warned++; 441 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 442 __func__, current->comm, task_pid_nr(current)); 443 } 444 return 0; 445 } 446 *timeo_p = MAX_SCHEDULE_TIMEOUT; 447 if (tv.tv_sec == 0 && tv.tv_usec == 0) 448 return 0; 449 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) 450 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); 451 return 0; 452 } 453 454 static bool sock_needs_netstamp(const struct sock *sk) 455 { 456 switch (sk->sk_family) { 457 case AF_UNSPEC: 458 case AF_UNIX: 459 return false; 460 default: 461 return true; 462 } 463 } 464 465 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 466 { 467 if (sk->sk_flags & flags) { 468 sk->sk_flags &= ~flags; 469 if (sock_needs_netstamp(sk) && 470 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 471 net_disable_timestamp(); 472 } 473 } 474 475 476 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 477 { 478 unsigned long flags; 479 struct sk_buff_head *list = &sk->sk_receive_queue; 480 481 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 482 atomic_inc(&sk->sk_drops); 483 trace_sock_rcvqueue_full(sk, skb); 484 return -ENOMEM; 485 } 486 487 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 488 atomic_inc(&sk->sk_drops); 489 return -ENOBUFS; 490 } 491 492 skb->dev = NULL; 493 skb_set_owner_r(skb, sk); 494 495 /* we escape from rcu protected region, make sure we dont leak 496 * a norefcounted dst 497 */ 498 skb_dst_force(skb); 499 500 spin_lock_irqsave(&list->lock, flags); 501 sock_skb_set_dropcount(sk, skb); 502 __skb_queue_tail(list, skb); 503 spin_unlock_irqrestore(&list->lock, flags); 504 505 if (!sock_flag(sk, SOCK_DEAD)) 506 sk->sk_data_ready(sk); 507 return 0; 508 } 509 EXPORT_SYMBOL(__sock_queue_rcv_skb); 510 511 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 512 enum skb_drop_reason *reason) 513 { 514 enum skb_drop_reason drop_reason; 515 int err; 516 517 err = sk_filter(sk, skb); 518 if (err) { 519 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 520 goto out; 521 } 522 err = __sock_queue_rcv_skb(sk, skb); 523 switch (err) { 524 case -ENOMEM: 525 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 526 break; 527 case -ENOBUFS: 528 drop_reason = SKB_DROP_REASON_PROTO_MEM; 529 break; 530 default: 531 drop_reason = SKB_NOT_DROPPED_YET; 532 break; 533 } 534 
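	/* Illustrative caller pattern (a sketch, not taken from a specific
	 * protocol): the reported reason lets a caller that decides to drop
	 * the skb keep the drop attributable:
	 *
	 *	enum skb_drop_reason reason;
	 *
	 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
	 *		kfree_skb_reason(skb, reason);
	 */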
out: 535 if (reason) 536 *reason = drop_reason; 537 return err; 538 } 539 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 540 541 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 542 const int nested, unsigned int trim_cap, bool refcounted) 543 { 544 int rc = NET_RX_SUCCESS; 545 546 if (sk_filter_trim_cap(sk, skb, trim_cap)) 547 goto discard_and_relse; 548 549 skb->dev = NULL; 550 551 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 552 atomic_inc(&sk->sk_drops); 553 goto discard_and_relse; 554 } 555 if (nested) 556 bh_lock_sock_nested(sk); 557 else 558 bh_lock_sock(sk); 559 if (!sock_owned_by_user(sk)) { 560 /* 561 * trylock + unlock semantics: 562 */ 563 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 564 565 rc = sk_backlog_rcv(sk, skb); 566 567 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 568 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 569 bh_unlock_sock(sk); 570 atomic_inc(&sk->sk_drops); 571 goto discard_and_relse; 572 } 573 574 bh_unlock_sock(sk); 575 out: 576 if (refcounted) 577 sock_put(sk); 578 return rc; 579 discard_and_relse: 580 kfree_skb(skb); 581 goto out; 582 } 583 EXPORT_SYMBOL(__sk_receive_skb); 584 585 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 586 u32)); 587 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 588 u32)); 589 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 590 { 591 struct dst_entry *dst = __sk_dst_get(sk); 592 593 if (dst && dst->obsolete && 594 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 595 dst, cookie) == NULL) { 596 sk_tx_queue_clear(sk); 597 sk->sk_dst_pending_confirm = 0; 598 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 599 dst_release(dst); 600 return NULL; 601 } 602 603 return dst; 604 } 605 EXPORT_SYMBOL(__sk_dst_check); 606 607 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 608 { 609 struct dst_entry *dst = sk_dst_get(sk); 610 611 if (dst && dst->obsolete && 612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 613 dst, cookie) == NULL) { 614 sk_dst_reset(sk); 615 dst_release(dst); 616 return NULL; 617 } 618 619 return dst; 620 } 621 EXPORT_SYMBOL(sk_dst_check); 622 623 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 624 { 625 int ret = -ENOPROTOOPT; 626 #ifdef CONFIG_NETDEVICES 627 struct net *net = sock_net(sk); 628 629 /* Sorry... */ 630 ret = -EPERM; 631 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 632 goto out; 633 634 ret = -EINVAL; 635 if (ifindex < 0) 636 goto out; 637 638 sk->sk_bound_dev_if = ifindex; 639 if (sk->sk_prot->rehash) 640 sk->sk_prot->rehash(sk); 641 sk_dst_reset(sk); 642 643 ret = 0; 644 645 out: 646 #endif 647 648 return ret; 649 } 650 651 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 652 { 653 int ret; 654 655 if (lock_sk) 656 lock_sock(sk); 657 ret = sock_bindtoindex_locked(sk, ifindex); 658 if (lock_sk) 659 release_sock(sk); 660 661 return ret; 662 } 663 EXPORT_SYMBOL(sock_bindtoindex); 664 665 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 666 { 667 int ret = -ENOPROTOOPT; 668 #ifdef CONFIG_NETDEVICES 669 struct net *net = sock_net(sk); 670 char devname[IFNAMSIZ]; 671 int index; 672 673 ret = -EINVAL; 674 if (optlen < 0) 675 goto out; 676 677 /* Bind this socket to a particular device like "eth0", 678 * as specified in the passed interface name. If the 679 * name is "" or the option length is zero the socket 680 * is not bound. 
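 *
 * Illustrative userspace sketch (assumes an interface named "eth0"
 * exists; CAP_NET_RAW is needed to change or clear an existing binding):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0",
 *		   strlen("eth0") + 1);
 *
 * Passing an empty name (or a zero option length) removes the binding.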
681 */ 682 if (optlen > IFNAMSIZ - 1) 683 optlen = IFNAMSIZ - 1; 684 memset(devname, 0, sizeof(devname)); 685 686 ret = -EFAULT; 687 if (copy_from_sockptr(devname, optval, optlen)) 688 goto out; 689 690 index = 0; 691 if (devname[0] != '\0') { 692 struct net_device *dev; 693 694 rcu_read_lock(); 695 dev = dev_get_by_name_rcu(net, devname); 696 if (dev) 697 index = dev->ifindex; 698 rcu_read_unlock(); 699 ret = -ENODEV; 700 if (!dev) 701 goto out; 702 } 703 704 return sock_bindtoindex(sk, index, true); 705 out: 706 #endif 707 708 return ret; 709 } 710 711 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 712 int __user *optlen, int len) 713 { 714 int ret = -ENOPROTOOPT; 715 #ifdef CONFIG_NETDEVICES 716 struct net *net = sock_net(sk); 717 char devname[IFNAMSIZ]; 718 719 if (sk->sk_bound_dev_if == 0) { 720 len = 0; 721 goto zero; 722 } 723 724 ret = -EINVAL; 725 if (len < IFNAMSIZ) 726 goto out; 727 728 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 729 if (ret) 730 goto out; 731 732 len = strlen(devname) + 1; 733 734 ret = -EFAULT; 735 if (copy_to_user(optval, devname, len)) 736 goto out; 737 738 zero: 739 ret = -EFAULT; 740 if (put_user(len, optlen)) 741 goto out; 742 743 ret = 0; 744 745 out: 746 #endif 747 748 return ret; 749 } 750 751 bool sk_mc_loop(struct sock *sk) 752 { 753 if (dev_recursion_level()) 754 return false; 755 if (!sk) 756 return true; 757 switch (sk->sk_family) { 758 case AF_INET: 759 return inet_sk(sk)->mc_loop; 760 #if IS_ENABLED(CONFIG_IPV6) 761 case AF_INET6: 762 return inet6_sk(sk)->mc_loop; 763 #endif 764 } 765 WARN_ON_ONCE(1); 766 return true; 767 } 768 EXPORT_SYMBOL(sk_mc_loop); 769 770 void sock_set_reuseaddr(struct sock *sk) 771 { 772 lock_sock(sk); 773 sk->sk_reuse = SK_CAN_REUSE; 774 release_sock(sk); 775 } 776 EXPORT_SYMBOL(sock_set_reuseaddr); 777 778 void sock_set_reuseport(struct sock *sk) 779 { 780 lock_sock(sk); 781 sk->sk_reuseport = true; 782 release_sock(sk); 783 } 784 EXPORT_SYMBOL(sock_set_reuseport); 785 786 void sock_no_linger(struct sock *sk) 787 { 788 lock_sock(sk); 789 sk->sk_lingertime = 0; 790 sock_set_flag(sk, SOCK_LINGER); 791 release_sock(sk); 792 } 793 EXPORT_SYMBOL(sock_no_linger); 794 795 void sock_set_priority(struct sock *sk, u32 priority) 796 { 797 lock_sock(sk); 798 sk->sk_priority = priority; 799 release_sock(sk); 800 } 801 EXPORT_SYMBOL(sock_set_priority); 802 803 void sock_set_sndtimeo(struct sock *sk, s64 secs) 804 { 805 lock_sock(sk); 806 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 807 sk->sk_sndtimeo = secs * HZ; 808 else 809 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 810 release_sock(sk); 811 } 812 EXPORT_SYMBOL(sock_set_sndtimeo); 813 814 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 815 { 816 if (val) { 817 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 818 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 819 sock_set_flag(sk, SOCK_RCVTSTAMP); 820 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 821 } else { 822 sock_reset_flag(sk, SOCK_RCVTSTAMP); 823 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 824 } 825 } 826 827 void sock_enable_timestamps(struct sock *sk) 828 { 829 lock_sock(sk); 830 __sock_set_timestamps(sk, true, false, true); 831 release_sock(sk); 832 } 833 EXPORT_SYMBOL(sock_enable_timestamps); 834 835 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 836 { 837 switch (optname) { 838 case SO_TIMESTAMP_OLD: 839 __sock_set_timestamps(sk, valbool, false, false); 840 break; 841 case SO_TIMESTAMP_NEW: 842 __sock_set_timestamps(sk, valbool, true, false); 
843 break; 844 case SO_TIMESTAMPNS_OLD: 845 __sock_set_timestamps(sk, valbool, false, true); 846 break; 847 case SO_TIMESTAMPNS_NEW: 848 __sock_set_timestamps(sk, valbool, true, true); 849 break; 850 } 851 } 852 853 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 854 { 855 struct net *net = sock_net(sk); 856 struct net_device *dev = NULL; 857 bool match = false; 858 int *vclock_index; 859 int i, num; 860 861 if (sk->sk_bound_dev_if) 862 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 863 864 if (!dev) { 865 pr_err("%s: sock not bind to device\n", __func__); 866 return -EOPNOTSUPP; 867 } 868 869 num = ethtool_get_phc_vclocks(dev, &vclock_index); 870 dev_put(dev); 871 872 for (i = 0; i < num; i++) { 873 if (*(vclock_index + i) == phc_index) { 874 match = true; 875 break; 876 } 877 } 878 879 if (num > 0) 880 kfree(vclock_index); 881 882 if (!match) 883 return -EINVAL; 884 885 sk->sk_bind_phc = phc_index; 886 887 return 0; 888 } 889 890 int sock_set_timestamping(struct sock *sk, int optname, 891 struct so_timestamping timestamping) 892 { 893 int val = timestamping.flags; 894 int ret; 895 896 if (val & ~SOF_TIMESTAMPING_MASK) 897 return -EINVAL; 898 899 if (val & SOF_TIMESTAMPING_OPT_ID && 900 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 901 if (sk_is_tcp(sk)) { 902 if ((1 << sk->sk_state) & 903 (TCPF_CLOSE | TCPF_LISTEN)) 904 return -EINVAL; 905 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 906 } else { 907 atomic_set(&sk->sk_tskey, 0); 908 } 909 } 910 911 if (val & SOF_TIMESTAMPING_OPT_STATS && 912 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 913 return -EINVAL; 914 915 if (val & SOF_TIMESTAMPING_BIND_PHC) { 916 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 917 if (ret) 918 return ret; 919 } 920 921 sk->sk_tsflags = val; 922 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 923 924 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 925 sock_enable_timestamp(sk, 926 SOCK_TIMESTAMPING_RX_SOFTWARE); 927 else 928 sock_disable_timestamp(sk, 929 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 930 return 0; 931 } 932 933 void sock_set_keepalive(struct sock *sk) 934 { 935 lock_sock(sk); 936 if (sk->sk_prot->keepalive) 937 sk->sk_prot->keepalive(sk, true); 938 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 939 release_sock(sk); 940 } 941 EXPORT_SYMBOL(sock_set_keepalive); 942 943 static void __sock_set_rcvbuf(struct sock *sk, int val) 944 { 945 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 946 * as a negative value. 947 */ 948 val = min_t(int, val, INT_MAX / 2); 949 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 950 951 /* We double it on the way in to account for "struct sk_buff" etc. 952 * overhead. Applications assume that the SO_RCVBUF setting they make 953 * will allow that much actual data to be received on that socket. 954 * 955 * Applications are unaware that "struct sk_buff" and other overheads 956 * allocate from the receive buffer during socket buffer allocation. 957 * 958 * And after considering the possible alternatives, returning the value 959 * we actually used in getsockopt is the most desirable behavior. 
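 *
 * Worked example of the arithmetic only (illustrative numbers): a
 * setsockopt(SO_RCVBUF) request of 65536 stores 131072 in sk_rcvbuf,
 * and a later getsockopt(SO_RCVBUF) reports 131072. Requests whose
 * doubled value falls below SOCK_MIN_RCVBUF are raised to that floor.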
960 */ 961 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 962 } 963 964 void sock_set_rcvbuf(struct sock *sk, int val) 965 { 966 lock_sock(sk); 967 __sock_set_rcvbuf(sk, val); 968 release_sock(sk); 969 } 970 EXPORT_SYMBOL(sock_set_rcvbuf); 971 972 static void __sock_set_mark(struct sock *sk, u32 val) 973 { 974 if (val != sk->sk_mark) { 975 sk->sk_mark = val; 976 sk_dst_reset(sk); 977 } 978 } 979 980 void sock_set_mark(struct sock *sk, u32 val) 981 { 982 lock_sock(sk); 983 __sock_set_mark(sk, val); 984 release_sock(sk); 985 } 986 EXPORT_SYMBOL(sock_set_mark); 987 988 static void sock_release_reserved_memory(struct sock *sk, int bytes) 989 { 990 /* Round down bytes to multiple of pages */ 991 bytes &= ~(SK_MEM_QUANTUM - 1); 992 993 WARN_ON(bytes > sk->sk_reserved_mem); 994 sk->sk_reserved_mem -= bytes; 995 sk_mem_reclaim(sk); 996 } 997 998 static int sock_reserve_memory(struct sock *sk, int bytes) 999 { 1000 long allocated; 1001 bool charged; 1002 int pages; 1003 1004 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) 1005 return -EOPNOTSUPP; 1006 1007 if (!bytes) 1008 return 0; 1009 1010 pages = sk_mem_pages(bytes); 1011 1012 /* pre-charge to memcg */ 1013 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, 1014 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1015 if (!charged) 1016 return -ENOMEM; 1017 1018 /* pre-charge to forward_alloc */ 1019 allocated = sk_memory_allocated_add(sk, pages); 1020 /* If the system goes into memory pressure with this 1021 * precharge, give up and return error. 1022 */ 1023 if (allocated > sk_prot_mem_limits(sk, 1)) { 1024 sk_memory_allocated_sub(sk, pages); 1025 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); 1026 return -ENOMEM; 1027 } 1028 sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; 1029 1030 sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; 1031 1032 return 0; 1033 } 1034 1035 /* 1036 * This is meant for all protocols to use and covers goings on 1037 * at the socket level. Everything here is generic. 1038 */ 1039 1040 int sock_setsockopt(struct socket *sock, int level, int optname, 1041 sockptr_t optval, unsigned int optlen) 1042 { 1043 struct so_timestamping timestamping; 1044 struct sock_txtime sk_txtime; 1045 struct sock *sk = sock->sk; 1046 int val; 1047 int valbool; 1048 struct linger ling; 1049 int ret = 0; 1050 1051 /* 1052 * Options without arguments 1053 */ 1054 1055 if (optname == SO_BINDTODEVICE) 1056 return sock_setbindtodevice(sk, optval, optlen); 1057 1058 if (optlen < sizeof(int)) 1059 return -EINVAL; 1060 1061 if (copy_from_sockptr(&val, optval, sizeof(val))) 1062 return -EFAULT; 1063 1064 valbool = val ? 1 : 0; 1065 1066 lock_sock(sk); 1067 1068 switch (optname) { 1069 case SO_DEBUG: 1070 if (val && !capable(CAP_NET_ADMIN)) 1071 ret = -EACCES; 1072 else 1073 sock_valbool_flag(sk, SOCK_DBG, valbool); 1074 break; 1075 case SO_REUSEADDR: 1076 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1077 break; 1078 case SO_REUSEPORT: 1079 sk->sk_reuseport = valbool; 1080 break; 1081 case SO_TYPE: 1082 case SO_PROTOCOL: 1083 case SO_DOMAIN: 1084 case SO_ERROR: 1085 ret = -ENOPROTOOPT; 1086 break; 1087 case SO_DONTROUTE: 1088 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1089 sk_dst_reset(sk); 1090 break; 1091 case SO_BROADCAST: 1092 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1093 break; 1094 case SO_SNDBUF: 1095 /* Don't error on this BSD doesn't and if you think 1096 * about it this is right. Otherwise apps have to 1097 * play 'guess the biggest size' games. 
RCVBUF/SNDBUF 1098 * are treated in BSD as hints 1099 */ 1100 val = min_t(u32, val, sysctl_wmem_max); 1101 set_sndbuf: 1102 /* Ensure val * 2 fits into an int, to prevent max_t() 1103 * from treating it as a negative value. 1104 */ 1105 val = min_t(int, val, INT_MAX / 2); 1106 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1107 WRITE_ONCE(sk->sk_sndbuf, 1108 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1109 /* Wake up sending tasks if we upped the value. */ 1110 sk->sk_write_space(sk); 1111 break; 1112 1113 case SO_SNDBUFFORCE: 1114 if (!capable(CAP_NET_ADMIN)) { 1115 ret = -EPERM; 1116 break; 1117 } 1118 1119 /* No negative values (to prevent underflow, as val will be 1120 * multiplied by 2). 1121 */ 1122 if (val < 0) 1123 val = 0; 1124 goto set_sndbuf; 1125 1126 case SO_RCVBUF: 1127 /* Don't error on this BSD doesn't and if you think 1128 * about it this is right. Otherwise apps have to 1129 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1130 * are treated in BSD as hints 1131 */ 1132 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); 1133 break; 1134 1135 case SO_RCVBUFFORCE: 1136 if (!capable(CAP_NET_ADMIN)) { 1137 ret = -EPERM; 1138 break; 1139 } 1140 1141 /* No negative values (to prevent underflow, as val will be 1142 * multiplied by 2). 1143 */ 1144 __sock_set_rcvbuf(sk, max(val, 0)); 1145 break; 1146 1147 case SO_KEEPALIVE: 1148 if (sk->sk_prot->keepalive) 1149 sk->sk_prot->keepalive(sk, valbool); 1150 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1151 break; 1152 1153 case SO_OOBINLINE: 1154 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1155 break; 1156 1157 case SO_NO_CHECK: 1158 sk->sk_no_check_tx = valbool; 1159 break; 1160 1161 case SO_PRIORITY: 1162 if ((val >= 0 && val <= 6) || 1163 ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 1164 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1165 sk->sk_priority = val; 1166 else 1167 ret = -EPERM; 1168 break; 1169 1170 case SO_LINGER: 1171 if (optlen < sizeof(ling)) { 1172 ret = -EINVAL; /* 1003.1g */ 1173 break; 1174 } 1175 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1176 ret = -EFAULT; 1177 break; 1178 } 1179 if (!ling.l_onoff) 1180 sock_reset_flag(sk, SOCK_LINGER); 1181 else { 1182 #if (BITS_PER_LONG == 32) 1183 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 1184 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 1185 else 1186 #endif 1187 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 1188 sock_set_flag(sk, SOCK_LINGER); 1189 } 1190 break; 1191 1192 case SO_BSDCOMPAT: 1193 break; 1194 1195 case SO_PASSCRED: 1196 if (valbool) 1197 set_bit(SOCK_PASSCRED, &sock->flags); 1198 else 1199 clear_bit(SOCK_PASSCRED, &sock->flags); 1200 break; 1201 1202 case SO_TIMESTAMP_OLD: 1203 case SO_TIMESTAMP_NEW: 1204 case SO_TIMESTAMPNS_OLD: 1205 case SO_TIMESTAMPNS_NEW: 1206 sock_set_timestamp(sk, optname, valbool); 1207 break; 1208 1209 case SO_TIMESTAMPING_NEW: 1210 case SO_TIMESTAMPING_OLD: 1211 if (optlen == sizeof(timestamping)) { 1212 if (copy_from_sockptr(×tamping, optval, 1213 sizeof(timestamping))) { 1214 ret = -EFAULT; 1215 break; 1216 } 1217 } else { 1218 memset(×tamping, 0, sizeof(timestamping)); 1219 timestamping.flags = val; 1220 } 1221 ret = sock_set_timestamping(sk, optname, timestamping); 1222 break; 1223 1224 case SO_RCVLOWAT: 1225 if (val < 0) 1226 val = INT_MAX; 1227 if (sock->ops->set_rcvlowat) 1228 ret = sock->ops->set_rcvlowat(sk, val); 1229 else 1230 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 1231 break; 1232 1233 case SO_RCVTIMEO_OLD: 1234 case SO_RCVTIMEO_NEW: 1235 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1236 optlen, optname == SO_RCVTIMEO_OLD); 1237 break; 1238 1239 case SO_SNDTIMEO_OLD: 1240 case SO_SNDTIMEO_NEW: 1241 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1242 optlen, optname == SO_SNDTIMEO_OLD); 1243 break; 1244 1245 case SO_ATTACH_FILTER: { 1246 struct sock_fprog fprog; 1247 1248 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1249 if (!ret) 1250 ret = sk_attach_filter(&fprog, sk); 1251 break; 1252 } 1253 case SO_ATTACH_BPF: 1254 ret = -EINVAL; 1255 if (optlen == sizeof(u32)) { 1256 u32 ufd; 1257 1258 ret = -EFAULT; 1259 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1260 break; 1261 1262 ret = sk_attach_bpf(ufd, sk); 1263 } 1264 break; 1265 1266 case SO_ATTACH_REUSEPORT_CBPF: { 1267 struct sock_fprog fprog; 1268 1269 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1270 if (!ret) 1271 ret = sk_reuseport_attach_filter(&fprog, sk); 1272 break; 1273 } 1274 case SO_ATTACH_REUSEPORT_EBPF: 1275 ret = -EINVAL; 1276 if (optlen == sizeof(u32)) { 1277 u32 ufd; 1278 1279 ret = -EFAULT; 1280 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1281 break; 1282 1283 ret = sk_reuseport_attach_bpf(ufd, sk); 1284 } 1285 break; 1286 1287 case SO_DETACH_REUSEPORT_BPF: 1288 ret = reuseport_detach_prog(sk); 1289 break; 1290 1291 case SO_DETACH_FILTER: 1292 ret = sk_detach_filter(sk); 1293 break; 1294 1295 case SO_LOCK_FILTER: 1296 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1297 ret = -EPERM; 1298 else 1299 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1300 break; 1301 1302 case SO_PASSSEC: 1303 if (valbool) 1304 set_bit(SOCK_PASSSEC, &sock->flags); 1305 else 1306 clear_bit(SOCK_PASSSEC, &sock->flags); 1307 break; 1308 case SO_MARK: 1309 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1310 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1311 ret = -EPERM; 1312 break; 1313 } 1314 1315 __sock_set_mark(sk, val); 1316 break; 1317 case SO_RCVMARK: 1318 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1319 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1320 ret = -EPERM; 1321 break; 1322 } 1323 1324 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1325 break; 1326 1327 case SO_RXQ_OVFL: 1328 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1329 break; 1330 1331 case SO_WIFI_STATUS: 1332 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1333 break; 1334 1335 case SO_PEEK_OFF: 1336 if (sock->ops->set_peek_off) 1337 ret = sock->ops->set_peek_off(sk, val); 1338 else 1339 ret = -EOPNOTSUPP; 1340 break; 1341 1342 case SO_NOFCS: 1343 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1344 break; 1345 1346 case SO_SELECT_ERR_QUEUE: 1347 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1348 break; 1349 1350 #ifdef CONFIG_NET_RX_BUSY_POLL 1351 case SO_BUSY_POLL: 1352 /* allow unprivileged users to decrease the value */ 1353 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1354 ret = -EPERM; 1355 else { 1356 if (val < 0) 1357 ret = -EINVAL; 1358 else 1359 WRITE_ONCE(sk->sk_ll_usec, val); 1360 } 1361 break; 1362 case SO_PREFER_BUSY_POLL: 1363 if (valbool && !capable(CAP_NET_ADMIN)) 1364 ret = -EPERM; 1365 else 1366 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1367 break; 1368 case SO_BUSY_POLL_BUDGET: 1369 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { 1370 ret = -EPERM; 1371 } else { 1372 if (val < 0 || val > U16_MAX) 1373 ret = -EINVAL; 1374 else 1375 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1376 } 
1377 break; 1378 #endif 1379 1380 case SO_MAX_PACING_RATE: 1381 { 1382 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1383 1384 if (sizeof(ulval) != sizeof(val) && 1385 optlen >= sizeof(ulval) && 1386 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1387 ret = -EFAULT; 1388 break; 1389 } 1390 if (ulval != ~0UL) 1391 cmpxchg(&sk->sk_pacing_status, 1392 SK_PACING_NONE, 1393 SK_PACING_NEEDED); 1394 sk->sk_max_pacing_rate = ulval; 1395 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1396 break; 1397 } 1398 case SO_INCOMING_CPU: 1399 WRITE_ONCE(sk->sk_incoming_cpu, val); 1400 break; 1401 1402 case SO_CNX_ADVICE: 1403 if (val == 1) 1404 dst_negative_advice(sk); 1405 break; 1406 1407 case SO_ZEROCOPY: 1408 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1409 if (!(sk_is_tcp(sk) || 1410 (sk->sk_type == SOCK_DGRAM && 1411 sk->sk_protocol == IPPROTO_UDP))) 1412 ret = -EOPNOTSUPP; 1413 } else if (sk->sk_family != PF_RDS) { 1414 ret = -EOPNOTSUPP; 1415 } 1416 if (!ret) { 1417 if (val < 0 || val > 1) 1418 ret = -EINVAL; 1419 else 1420 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1421 } 1422 break; 1423 1424 case SO_TXTIME: 1425 if (optlen != sizeof(struct sock_txtime)) { 1426 ret = -EINVAL; 1427 break; 1428 } else if (copy_from_sockptr(&sk_txtime, optval, 1429 sizeof(struct sock_txtime))) { 1430 ret = -EFAULT; 1431 break; 1432 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1433 ret = -EINVAL; 1434 break; 1435 } 1436 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1437 * scheduler has enough safe guards. 1438 */ 1439 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1440 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1441 ret = -EPERM; 1442 break; 1443 } 1444 sock_valbool_flag(sk, SOCK_TXTIME, true); 1445 sk->sk_clockid = sk_txtime.clockid; 1446 sk->sk_txtime_deadline_mode = 1447 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1448 sk->sk_txtime_report_errors = 1449 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1450 break; 1451 1452 case SO_BINDTOIFINDEX: 1453 ret = sock_bindtoindex_locked(sk, val); 1454 break; 1455 1456 case SO_BUF_LOCK: 1457 if (val & ~SOCK_BUF_LOCK_MASK) { 1458 ret = -EINVAL; 1459 break; 1460 } 1461 sk->sk_userlocks = val | (sk->sk_userlocks & 1462 ~SOCK_BUF_LOCK_MASK); 1463 break; 1464 1465 case SO_RESERVE_MEM: 1466 { 1467 int delta; 1468 1469 if (val < 0) { 1470 ret = -EINVAL; 1471 break; 1472 } 1473 1474 delta = val - sk->sk_reserved_mem; 1475 if (delta < 0) 1476 sock_release_reserved_memory(sk, -delta); 1477 else 1478 ret = sock_reserve_memory(sk, delta); 1479 break; 1480 } 1481 1482 case SO_TXREHASH: 1483 if (val < -1 || val > 1) { 1484 ret = -EINVAL; 1485 break; 1486 } 1487 /* Paired with READ_ONCE() in tcp_rtx_synack() */ 1488 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1489 break; 1490 1491 default: 1492 ret = -ENOPROTOOPT; 1493 break; 1494 } 1495 release_sock(sk); 1496 return ret; 1497 } 1498 EXPORT_SYMBOL(sock_setsockopt); 1499 1500 static const struct cred *sk_get_peer_cred(struct sock *sk) 1501 { 1502 const struct cred *cred; 1503 1504 spin_lock(&sk->sk_peer_lock); 1505 cred = get_cred(sk->sk_peer_cred); 1506 spin_unlock(&sk->sk_peer_lock); 1507 1508 return cred; 1509 } 1510 1511 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1512 struct ucred *ucred) 1513 { 1514 ucred->pid = pid_vnr(pid); 1515 ucred->uid = ucred->gid = -1; 1516 if (cred) { 1517 struct user_namespace *current_ns = current_user_ns(); 1518 1519 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1520 ucred->gid = 
from_kgid_munged(current_ns, cred->egid); 1521 } 1522 } 1523 1524 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1525 { 1526 struct user_namespace *user_ns = current_user_ns(); 1527 int i; 1528 1529 for (i = 0; i < src->ngroups; i++) 1530 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1531 return -EFAULT; 1532 1533 return 0; 1534 } 1535 1536 int sock_getsockopt(struct socket *sock, int level, int optname, 1537 char __user *optval, int __user *optlen) 1538 { 1539 struct sock *sk = sock->sk; 1540 1541 union { 1542 int val; 1543 u64 val64; 1544 unsigned long ulval; 1545 struct linger ling; 1546 struct old_timeval32 tm32; 1547 struct __kernel_old_timeval tm; 1548 struct __kernel_sock_timeval stm; 1549 struct sock_txtime txtime; 1550 struct so_timestamping timestamping; 1551 } v; 1552 1553 int lv = sizeof(int); 1554 int len; 1555 1556 if (get_user(len, optlen)) 1557 return -EFAULT; 1558 if (len < 0) 1559 return -EINVAL; 1560 1561 memset(&v, 0, sizeof(v)); 1562 1563 switch (optname) { 1564 case SO_DEBUG: 1565 v.val = sock_flag(sk, SOCK_DBG); 1566 break; 1567 1568 case SO_DONTROUTE: 1569 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1570 break; 1571 1572 case SO_BROADCAST: 1573 v.val = sock_flag(sk, SOCK_BROADCAST); 1574 break; 1575 1576 case SO_SNDBUF: 1577 v.val = sk->sk_sndbuf; 1578 break; 1579 1580 case SO_RCVBUF: 1581 v.val = sk->sk_rcvbuf; 1582 break; 1583 1584 case SO_REUSEADDR: 1585 v.val = sk->sk_reuse; 1586 break; 1587 1588 case SO_REUSEPORT: 1589 v.val = sk->sk_reuseport; 1590 break; 1591 1592 case SO_KEEPALIVE: 1593 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1594 break; 1595 1596 case SO_TYPE: 1597 v.val = sk->sk_type; 1598 break; 1599 1600 case SO_PROTOCOL: 1601 v.val = sk->sk_protocol; 1602 break; 1603 1604 case SO_DOMAIN: 1605 v.val = sk->sk_family; 1606 break; 1607 1608 case SO_ERROR: 1609 v.val = -sock_error(sk); 1610 if (v.val == 0) 1611 v.val = xchg(&sk->sk_err_soft, 0); 1612 break; 1613 1614 case SO_OOBINLINE: 1615 v.val = sock_flag(sk, SOCK_URGINLINE); 1616 break; 1617 1618 case SO_NO_CHECK: 1619 v.val = sk->sk_no_check_tx; 1620 break; 1621 1622 case SO_PRIORITY: 1623 v.val = sk->sk_priority; 1624 break; 1625 1626 case SO_LINGER: 1627 lv = sizeof(v.ling); 1628 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1629 v.ling.l_linger = sk->sk_lingertime / HZ; 1630 break; 1631 1632 case SO_BSDCOMPAT: 1633 break; 1634 1635 case SO_TIMESTAMP_OLD: 1636 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1637 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1638 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1639 break; 1640 1641 case SO_TIMESTAMPNS_OLD: 1642 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1643 break; 1644 1645 case SO_TIMESTAMP_NEW: 1646 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1647 break; 1648 1649 case SO_TIMESTAMPNS_NEW: 1650 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1651 break; 1652 1653 case SO_TIMESTAMPING_OLD: 1654 lv = sizeof(v.timestamping); 1655 v.timestamping.flags = sk->sk_tsflags; 1656 v.timestamping.bind_phc = sk->sk_bind_phc; 1657 break; 1658 1659 case SO_RCVTIMEO_OLD: 1660 case SO_RCVTIMEO_NEW: 1661 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1662 break; 1663 1664 case SO_SNDTIMEO_OLD: 1665 case SO_SNDTIMEO_NEW: 1666 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1667 break; 1668 1669 case SO_RCVLOWAT: 1670 v.val = sk->sk_rcvlowat; 1671 break; 1672 1673 case SO_SNDLOWAT: 1674 v.val = 1; 1675 break; 1676 1677 case 
SO_PASSCRED: 1678 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1679 break; 1680 1681 case SO_PEERCRED: 1682 { 1683 struct ucred peercred; 1684 if (len > sizeof(peercred)) 1685 len = sizeof(peercred); 1686 1687 spin_lock(&sk->sk_peer_lock); 1688 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1689 spin_unlock(&sk->sk_peer_lock); 1690 1691 if (copy_to_user(optval, &peercred, len)) 1692 return -EFAULT; 1693 goto lenout; 1694 } 1695 1696 case SO_PEERGROUPS: 1697 { 1698 const struct cred *cred; 1699 int ret, n; 1700 1701 cred = sk_get_peer_cred(sk); 1702 if (!cred) 1703 return -ENODATA; 1704 1705 n = cred->group_info->ngroups; 1706 if (len < n * sizeof(gid_t)) { 1707 len = n * sizeof(gid_t); 1708 put_cred(cred); 1709 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1710 } 1711 len = n * sizeof(gid_t); 1712 1713 ret = groups_to_user((gid_t __user *)optval, cred->group_info); 1714 put_cred(cred); 1715 if (ret) 1716 return ret; 1717 goto lenout; 1718 } 1719 1720 case SO_PEERNAME: 1721 { 1722 char address[128]; 1723 1724 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1725 if (lv < 0) 1726 return -ENOTCONN; 1727 if (lv < len) 1728 return -EINVAL; 1729 if (copy_to_user(optval, address, len)) 1730 return -EFAULT; 1731 goto lenout; 1732 } 1733 1734 /* Dubious BSD thing... Probably nobody even uses it, but 1735 * the UNIX standard wants it for whatever reason... -DaveM 1736 */ 1737 case SO_ACCEPTCONN: 1738 v.val = sk->sk_state == TCP_LISTEN; 1739 break; 1740 1741 case SO_PASSSEC: 1742 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1743 break; 1744 1745 case SO_PEERSEC: 1746 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1747 1748 case SO_MARK: 1749 v.val = sk->sk_mark; 1750 break; 1751 1752 case SO_RCVMARK: 1753 v.val = sock_flag(sk, SOCK_RCVMARK); 1754 break; 1755 1756 case SO_RXQ_OVFL: 1757 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1758 break; 1759 1760 case SO_WIFI_STATUS: 1761 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1762 break; 1763 1764 case SO_PEEK_OFF: 1765 if (!sock->ops->set_peek_off) 1766 return -EOPNOTSUPP; 1767 1768 v.val = sk->sk_peek_off; 1769 break; 1770 case SO_NOFCS: 1771 v.val = sock_flag(sk, SOCK_NOFCS); 1772 break; 1773 1774 case SO_BINDTODEVICE: 1775 return sock_getbindtodevice(sk, optval, optlen, len); 1776 1777 case SO_GET_FILTER: 1778 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1779 if (len < 0) 1780 return len; 1781 1782 goto lenout; 1783 1784 case SO_LOCK_FILTER: 1785 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1786 break; 1787 1788 case SO_BPF_EXTENSIONS: 1789 v.val = bpf_tell_extensions(); 1790 break; 1791 1792 case SO_SELECT_ERR_QUEUE: 1793 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1794 break; 1795 1796 #ifdef CONFIG_NET_RX_BUSY_POLL 1797 case SO_BUSY_POLL: 1798 v.val = sk->sk_ll_usec; 1799 break; 1800 case SO_PREFER_BUSY_POLL: 1801 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1802 break; 1803 #endif 1804 1805 case SO_MAX_PACING_RATE: 1806 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1807 lv = sizeof(v.ulval); 1808 v.ulval = sk->sk_max_pacing_rate; 1809 } else { 1810 /* 32bit version */ 1811 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1812 } 1813 break; 1814 1815 case SO_INCOMING_CPU: 1816 v.val = READ_ONCE(sk->sk_incoming_cpu); 1817 break; 1818 1819 case SO_MEMINFO: 1820 { 1821 u32 meminfo[SK_MEMINFO_VARS]; 1822 1823 sk_get_meminfo(sk, meminfo); 1824 1825 len = min_t(unsigned int, len, sizeof(meminfo)); 1826 if (copy_to_user(optval, &meminfo, len)) 1827 return 
-EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	case SO_NETNS_COOKIE:
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	case SO_BUF_LOCK:
		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
		break;

	case SO_RESERVE_MEM:
		v.val = sk->sk_reserved_mem;
		break;

	case SO_TXREHASH:
		v.val = sk->sk_txrehash;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left
 * as-is. We must not copy fields between sk_dontcopy_begin and
 * sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
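	 *
	 * Rough sketch of what the two memcpy() calls below implement
	 * (offsets within struct sock; the skipped window holds the fields
	 * called out in the comment above, such as the refcount and node):
	 *
	 *	[0, sk_dontcopy_begin)          copied
	 *	[sk_dontcopy_begin,
	 *	 sk_dontcopy_end)               left untouched in nsk
	 *	[sk_dontcopy_end,
	 *	 prot->obj_size)                copied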
1941 */ 1942 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 1943 offsetof(struct sock, sk_dontcopy_begin) || 1944 offsetof(struct sock, sk_tx_queue_mapping) >= 1945 offsetof(struct sock, sk_dontcopy_end)); 1946 1947 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1948 1949 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1950 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1951 1952 #ifdef CONFIG_SECURITY_NETWORK 1953 nsk->sk_security = sptr; 1954 security_sk_clone(osk, nsk); 1955 #endif 1956 } 1957 1958 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1959 int family) 1960 { 1961 struct sock *sk; 1962 struct kmem_cache *slab; 1963 1964 slab = prot->slab; 1965 if (slab != NULL) { 1966 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1967 if (!sk) 1968 return sk; 1969 if (want_init_on_alloc(priority)) 1970 sk_prot_clear_nulls(sk, prot->obj_size); 1971 } else 1972 sk = kmalloc(prot->obj_size, priority); 1973 1974 if (sk != NULL) { 1975 if (security_sk_alloc(sk, family, priority)) 1976 goto out_free; 1977 1978 if (!try_module_get(prot->owner)) 1979 goto out_free_sec; 1980 } 1981 1982 return sk; 1983 1984 out_free_sec: 1985 security_sk_free(sk); 1986 out_free: 1987 if (slab != NULL) 1988 kmem_cache_free(slab, sk); 1989 else 1990 kfree(sk); 1991 return NULL; 1992 } 1993 1994 static void sk_prot_free(struct proto *prot, struct sock *sk) 1995 { 1996 struct kmem_cache *slab; 1997 struct module *owner; 1998 1999 owner = prot->owner; 2000 slab = prot->slab; 2001 2002 cgroup_sk_free(&sk->sk_cgrp_data); 2003 mem_cgroup_sk_free(sk); 2004 security_sk_free(sk); 2005 if (slab != NULL) 2006 kmem_cache_free(slab, sk); 2007 else 2008 kfree(sk); 2009 module_put(owner); 2010 } 2011 2012 /** 2013 * sk_alloc - All socket objects are allocated here 2014 * @net: the applicable net namespace 2015 * @family: protocol family 2016 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2017 * @prot: struct proto associated with this new sock instance 2018 * @kern: is this to be a kernel socket? 2019 */ 2020 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2021 struct proto *prot, int kern) 2022 { 2023 struct sock *sk; 2024 2025 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2026 if (sk) { 2027 sk->sk_family = family; 2028 /* 2029 * See comment in struct sock definition to understand 2030 * why we need sk_prot_creator -acme 2031 */ 2032 sk->sk_prot = sk->sk_prot_creator = prot; 2033 sk->sk_kern_sock = kern; 2034 sock_lock_init(sk); 2035 sk->sk_net_refcnt = kern ? 0 : 1; 2036 if (likely(sk->sk_net_refcnt)) { 2037 get_net_track(net, &sk->ns_tracker, priority); 2038 sock_inuse_add(net, 1); 2039 } 2040 2041 sock_net_set(sk, net); 2042 refcount_set(&sk->sk_wmem_alloc, 1); 2043 2044 mem_cgroup_sk_alloc(sk); 2045 cgroup_sk_alloc(&sk->sk_cgrp_data); 2046 sock_update_classid(&sk->sk_cgrp_data); 2047 sock_update_netprioidx(&sk->sk_cgrp_data); 2048 sk_tx_queue_clear(sk); 2049 } 2050 2051 return sk; 2052 } 2053 EXPORT_SYMBOL(sk_alloc); 2054 2055 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2056 * grace period. This is the case for UDP sockets and TCP listeners. 
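 *
 * A protocol opts in by setting the flag once the socket is visible to
 * RCU readers; an illustrative sketch of the opt-in (not a quote of any
 * particular protocol's code):
 *
 *	sock_set_flag(sk, SOCK_RCU_FREE);
 *
 * sk_destruct() then defers the actual free by one grace period via
 * call_rcu(), as implemented below.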
2057 */ 2058 static void __sk_destruct(struct rcu_head *head) 2059 { 2060 struct sock *sk = container_of(head, struct sock, sk_rcu); 2061 struct sk_filter *filter; 2062 2063 if (sk->sk_destruct) 2064 sk->sk_destruct(sk); 2065 2066 filter = rcu_dereference_check(sk->sk_filter, 2067 refcount_read(&sk->sk_wmem_alloc) == 0); 2068 if (filter) { 2069 sk_filter_uncharge(sk, filter); 2070 RCU_INIT_POINTER(sk->sk_filter, NULL); 2071 } 2072 2073 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2074 2075 #ifdef CONFIG_BPF_SYSCALL 2076 bpf_sk_storage_free(sk); 2077 #endif 2078 2079 if (atomic_read(&sk->sk_omem_alloc)) 2080 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2081 __func__, atomic_read(&sk->sk_omem_alloc)); 2082 2083 if (sk->sk_frag.page) { 2084 put_page(sk->sk_frag.page); 2085 sk->sk_frag.page = NULL; 2086 } 2087 2088 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2089 put_cred(sk->sk_peer_cred); 2090 put_pid(sk->sk_peer_pid); 2091 2092 if (likely(sk->sk_net_refcnt)) 2093 put_net_track(sock_net(sk), &sk->ns_tracker); 2094 sk_prot_free(sk->sk_prot_creator, sk); 2095 } 2096 2097 void sk_destruct(struct sock *sk) 2098 { 2099 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2100 2101 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2102 reuseport_detach_sock(sk); 2103 use_call_rcu = true; 2104 } 2105 2106 if (use_call_rcu) 2107 call_rcu(&sk->sk_rcu, __sk_destruct); 2108 else 2109 __sk_destruct(&sk->sk_rcu); 2110 } 2111 2112 static void __sk_free(struct sock *sk) 2113 { 2114 if (likely(sk->sk_net_refcnt)) 2115 sock_inuse_add(sock_net(sk), -1); 2116 2117 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2118 sock_diag_broadcast_destroy(sk); 2119 else 2120 sk_destruct(sk); 2121 } 2122 2123 void sk_free(struct sock *sk) 2124 { 2125 /* 2126 * We subtract one from sk_wmem_alloc and can know if 2127 * some packets are still in some tx queue. 
2128 * If not null, sock_wfree() will call __sk_free(sk) later 2129 */ 2130 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2131 __sk_free(sk); 2132 } 2133 EXPORT_SYMBOL(sk_free); 2134 2135 static void sk_init_common(struct sock *sk) 2136 { 2137 skb_queue_head_init(&sk->sk_receive_queue); 2138 skb_queue_head_init(&sk->sk_write_queue); 2139 skb_queue_head_init(&sk->sk_error_queue); 2140 2141 rwlock_init(&sk->sk_callback_lock); 2142 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2143 af_rlock_keys + sk->sk_family, 2144 af_family_rlock_key_strings[sk->sk_family]); 2145 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2146 af_wlock_keys + sk->sk_family, 2147 af_family_wlock_key_strings[sk->sk_family]); 2148 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2149 af_elock_keys + sk->sk_family, 2150 af_family_elock_key_strings[sk->sk_family]); 2151 lockdep_set_class_and_name(&sk->sk_callback_lock, 2152 af_callback_keys + sk->sk_family, 2153 af_family_clock_key_strings[sk->sk_family]); 2154 } 2155 2156 /** 2157 * sk_clone_lock - clone a socket, and lock its clone 2158 * @sk: the socket to clone 2159 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2160 * 2161 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2162 */ 2163 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2164 { 2165 struct proto *prot = READ_ONCE(sk->sk_prot); 2166 struct sk_filter *filter; 2167 bool is_charged = true; 2168 struct sock *newsk; 2169 2170 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2171 if (!newsk) 2172 goto out; 2173 2174 sock_copy(newsk, sk); 2175 2176 newsk->sk_prot_creator = prot; 2177 2178 /* SANITY */ 2179 if (likely(newsk->sk_net_refcnt)) { 2180 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2181 sock_inuse_add(sock_net(newsk), 1); 2182 } 2183 sk_node_init(&newsk->sk_node); 2184 sock_lock_init(newsk); 2185 bh_lock_sock(newsk); 2186 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2187 newsk->sk_backlog.len = 0; 2188 2189 atomic_set(&newsk->sk_rmem_alloc, 0); 2190 2191 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2192 refcount_set(&newsk->sk_wmem_alloc, 1); 2193 2194 atomic_set(&newsk->sk_omem_alloc, 0); 2195 sk_init_common(newsk); 2196 2197 newsk->sk_dst_cache = NULL; 2198 newsk->sk_dst_pending_confirm = 0; 2199 newsk->sk_wmem_queued = 0; 2200 newsk->sk_forward_alloc = 0; 2201 newsk->sk_reserved_mem = 0; 2202 atomic_set(&newsk->sk_drops, 0); 2203 newsk->sk_send_head = NULL; 2204 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2205 atomic_set(&newsk->sk_zckey, 0); 2206 2207 sock_reset_flag(newsk, SOCK_DONE); 2208 2209 /* sk->sk_memcg will be populated at accept() time */ 2210 newsk->sk_memcg = NULL; 2211 2212 cgroup_sk_clone(&newsk->sk_cgrp_data); 2213 2214 rcu_read_lock(); 2215 filter = rcu_dereference(sk->sk_filter); 2216 if (filter != NULL) 2217 /* though it's an empty new sock, the charging may fail 2218 * if sysctl_optmem_max was changed between creation of 2219 * original socket and cloning 2220 */ 2221 is_charged = sk_filter_charge(newsk, filter); 2222 RCU_INIT_POINTER(newsk->sk_filter, filter); 2223 rcu_read_unlock(); 2224 2225 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2226 /* We need to make sure that we don't uncharge the new 2227 * socket if we couldn't charge it in the first place 2228 * as otherwise we uncharge the parent's filter. 
2229 */ 2230 if (!is_charged) 2231 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2232 sk_free_unlock_clone(newsk); 2233 newsk = NULL; 2234 goto out; 2235 } 2236 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2237 2238 if (bpf_sk_storage_clone(sk, newsk)) { 2239 sk_free_unlock_clone(newsk); 2240 newsk = NULL; 2241 goto out; 2242 } 2243 2244 /* Clear sk_user_data if parent had the pointer tagged 2245 * as not suitable for copying when cloning. 2246 */ 2247 if (sk_user_data_is_nocopy(newsk)) 2248 newsk->sk_user_data = NULL; 2249 2250 newsk->sk_err = 0; 2251 newsk->sk_err_soft = 0; 2252 newsk->sk_priority = 0; 2253 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2254 2255 /* Before updating sk_refcnt, we must commit prior changes to memory 2256 * (Documentation/RCU/rculist_nulls.rst for details) 2257 */ 2258 smp_wmb(); 2259 refcount_set(&newsk->sk_refcnt, 2); 2260 2261 /* Increment the counter in the same struct proto as the master 2262 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 2263 * is the same as sk->sk_prot->socks, as this field was copied 2264 * with memcpy). 2265 * 2266 * This _changes_ the previous behaviour, where 2267 * tcp_create_openreq_child always was incrementing the 2268 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 2269 * to be taken into account in all callers. -acme 2270 */ 2271 sk_refcnt_debug_inc(newsk); 2272 sk_set_socket(newsk, NULL); 2273 sk_tx_queue_clear(newsk); 2274 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2275 2276 if (newsk->sk_prot->sockets_allocated) 2277 sk_sockets_allocated_inc(newsk); 2278 2279 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2280 net_enable_timestamp(); 2281 out: 2282 return newsk; 2283 } 2284 EXPORT_SYMBOL_GPL(sk_clone_lock); 2285 2286 void sk_free_unlock_clone(struct sock *sk) 2287 { 2288 /* It is still raw copy of parent, so invalidate 2289 * destructor and make plain sk_free() */ 2290 sk->sk_destruct = NULL; 2291 bh_unlock_sock(sk); 2292 sk_free(sk); 2293 } 2294 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2295 2296 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2297 { 2298 u32 max_segs = 1; 2299 2300 sk_dst_set(sk, dst); 2301 sk->sk_route_caps = dst->dev->features; 2302 if (sk_is_tcp(sk)) 2303 sk->sk_route_caps |= NETIF_F_GSO; 2304 if (sk->sk_route_caps & NETIF_F_GSO) 2305 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2306 if (unlikely(sk->sk_gso_disabled)) 2307 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2308 if (sk_can_gso(sk)) { 2309 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2310 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2311 } else { 2312 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2313 /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ 2314 sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); 2315 sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); 2316 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2317 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); 2318 } 2319 } 2320 sk->sk_gso_max_segs = max_segs; 2321 } 2322 EXPORT_SYMBOL_GPL(sk_setup_caps); 2323 2324 /* 2325 * Simple resource managers for sockets. 2326 */ 2327 2328 2329 /* 2330 * Write buffer destructor automatically called from kfree_skb. 
2331 */ 2332 void sock_wfree(struct sk_buff *skb) 2333 { 2334 struct sock *sk = skb->sk; 2335 unsigned int len = skb->truesize; 2336 bool free; 2337 2338 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2339 if (sock_flag(sk, SOCK_RCU_FREE) && 2340 sk->sk_write_space == sock_def_write_space) { 2341 rcu_read_lock(); 2342 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); 2343 sock_def_write_space_wfree(sk); 2344 rcu_read_unlock(); 2345 if (unlikely(free)) 2346 __sk_free(sk); 2347 return; 2348 } 2349 2350 /* 2351 * Keep a reference on sk_wmem_alloc, this will be released 2352 * after sk_write_space() call 2353 */ 2354 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2355 sk->sk_write_space(sk); 2356 len = 1; 2357 } 2358 /* 2359 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2360 * could not do because of in-flight packets 2361 */ 2362 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2363 __sk_free(sk); 2364 } 2365 EXPORT_SYMBOL(sock_wfree); 2366 2367 /* This variant of sock_wfree() is used by TCP, 2368 * since it sets SOCK_USE_WRITE_QUEUE. 2369 */ 2370 void __sock_wfree(struct sk_buff *skb) 2371 { 2372 struct sock *sk = skb->sk; 2373 2374 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2375 __sk_free(sk); 2376 } 2377 2378 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2379 { 2380 skb_orphan(skb); 2381 skb->sk = sk; 2382 #ifdef CONFIG_INET 2383 if (unlikely(!sk_fullsock(sk))) { 2384 skb->destructor = sock_edemux; 2385 sock_hold(sk); 2386 return; 2387 } 2388 #endif 2389 skb->destructor = sock_wfree; 2390 skb_set_hash_from_sk(skb, sk); 2391 /* 2392 * We used to take a refcount on sk, but following operation 2393 * is enough to guarantee sk_free() wont free this sock until 2394 * all in-flight packets are completed 2395 */ 2396 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2397 } 2398 EXPORT_SYMBOL(skb_set_owner_w); 2399 2400 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2401 { 2402 #ifdef CONFIG_TLS_DEVICE 2403 /* Drivers depend on in-order delivery for crypto offload, 2404 * partial orphan breaks out-of-order-OK logic. 2405 */ 2406 if (skb->decrypted) 2407 return false; 2408 #endif 2409 return (skb->destructor == sock_wfree || 2410 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2411 } 2412 2413 /* This helper is used by netem, as it can hold packets in its 2414 * delay queue. We want to allow the owner socket to send more 2415 * packets, as if they were already TX completed by a typical driver. 2416 * But we also want to keep skb->sk set because some packet schedulers 2417 * rely on it (sch_fq for example). 2418 */ 2419 void skb_orphan_partial(struct sk_buff *skb) 2420 { 2421 if (skb_is_tcp_pure_ack(skb)) 2422 return; 2423 2424 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2425 return; 2426 2427 skb_orphan(skb); 2428 } 2429 EXPORT_SYMBOL(skb_orphan_partial); 2430 2431 /* 2432 * Read buffer destructor automatically called from kfree_skb. 2433 */ 2434 void sock_rfree(struct sk_buff *skb) 2435 { 2436 struct sock *sk = skb->sk; 2437 unsigned int len = skb->truesize; 2438 2439 atomic_sub(len, &sk->sk_rmem_alloc); 2440 sk_mem_uncharge(sk, len); 2441 } 2442 EXPORT_SYMBOL(sock_rfree); 2443 2444 /* 2445 * Buffer destructor for skbs that are not used directly in read or write 2446 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 
2447 */ 2448 void sock_efree(struct sk_buff *skb) 2449 { 2450 sock_put(skb->sk); 2451 } 2452 EXPORT_SYMBOL(sock_efree); 2453 2454 /* Buffer destructor for prefetch/receive path where reference count may 2455 * not be held, e.g. for listen sockets. 2456 */ 2457 #ifdef CONFIG_INET 2458 void sock_pfree(struct sk_buff *skb) 2459 { 2460 if (sk_is_refcounted(skb->sk)) 2461 sock_gen_put(skb->sk); 2462 } 2463 EXPORT_SYMBOL(sock_pfree); 2464 #endif /* CONFIG_INET */ 2465 2466 kuid_t sock_i_uid(struct sock *sk) 2467 { 2468 kuid_t uid; 2469 2470 read_lock_bh(&sk->sk_callback_lock); 2471 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2472 read_unlock_bh(&sk->sk_callback_lock); 2473 return uid; 2474 } 2475 EXPORT_SYMBOL(sock_i_uid); 2476 2477 unsigned long sock_i_ino(struct sock *sk) 2478 { 2479 unsigned long ino; 2480 2481 read_lock_bh(&sk->sk_callback_lock); 2482 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2483 read_unlock_bh(&sk->sk_callback_lock); 2484 return ino; 2485 } 2486 EXPORT_SYMBOL(sock_i_ino); 2487 2488 /* 2489 * Allocate a skb from the socket's send buffer. 2490 */ 2491 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2492 gfp_t priority) 2493 { 2494 if (force || 2495 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2496 struct sk_buff *skb = alloc_skb(size, priority); 2497 2498 if (skb) { 2499 skb_set_owner_w(skb, sk); 2500 return skb; 2501 } 2502 } 2503 return NULL; 2504 } 2505 EXPORT_SYMBOL(sock_wmalloc); 2506 2507 static void sock_ofree(struct sk_buff *skb) 2508 { 2509 struct sock *sk = skb->sk; 2510 2511 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2512 } 2513 2514 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2515 gfp_t priority) 2516 { 2517 struct sk_buff *skb; 2518 2519 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2520 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2521 sysctl_optmem_max) 2522 return NULL; 2523 2524 skb = alloc_skb(size, priority); 2525 if (!skb) 2526 return NULL; 2527 2528 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2529 skb->sk = sk; 2530 skb->destructor = sock_ofree; 2531 return skb; 2532 } 2533 2534 /* 2535 * Allocate a memory block from the socket's option memory buffer. 2536 */ 2537 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2538 { 2539 if ((unsigned int)size <= sysctl_optmem_max && 2540 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2541 void *mem; 2542 /* First do the add, to avoid the race if kmalloc 2543 * might sleep. 2544 */ 2545 atomic_add(size, &sk->sk_omem_alloc); 2546 mem = kmalloc(size, priority); 2547 if (mem) 2548 return mem; 2549 atomic_sub(size, &sk->sk_omem_alloc); 2550 } 2551 return NULL; 2552 } 2553 EXPORT_SYMBOL(sock_kmalloc); 2554 2555 /* Free an option memory block. Note, we actually want the inline 2556 * here as this allows gcc to detect the nullify and fold away the 2557 * condition entirely. 
2558 */ 2559 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2560 const bool nullify) 2561 { 2562 if (WARN_ON_ONCE(!mem)) 2563 return; 2564 if (nullify) 2565 kfree_sensitive(mem); 2566 else 2567 kfree(mem); 2568 atomic_sub(size, &sk->sk_omem_alloc); 2569 } 2570 2571 void sock_kfree_s(struct sock *sk, void *mem, int size) 2572 { 2573 __sock_kfree_s(sk, mem, size, false); 2574 } 2575 EXPORT_SYMBOL(sock_kfree_s); 2576 2577 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2578 { 2579 __sock_kfree_s(sk, mem, size, true); 2580 } 2581 EXPORT_SYMBOL(sock_kzfree_s); 2582 2583 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2584 I think, these locks should be removed for datagram sockets. 2585 */ 2586 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2587 { 2588 DEFINE_WAIT(wait); 2589 2590 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2591 for (;;) { 2592 if (!timeo) 2593 break; 2594 if (signal_pending(current)) 2595 break; 2596 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2597 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2598 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2599 break; 2600 if (sk->sk_shutdown & SEND_SHUTDOWN) 2601 break; 2602 if (sk->sk_err) 2603 break; 2604 timeo = schedule_timeout(timeo); 2605 } 2606 finish_wait(sk_sleep(sk), &wait); 2607 return timeo; 2608 } 2609 2610 2611 /* 2612 * Generic send/receive buffer handlers 2613 */ 2614 2615 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2616 unsigned long data_len, int noblock, 2617 int *errcode, int max_page_order) 2618 { 2619 struct sk_buff *skb; 2620 long timeo; 2621 int err; 2622 2623 timeo = sock_sndtimeo(sk, noblock); 2624 for (;;) { 2625 err = sock_error(sk); 2626 if (err != 0) 2627 goto failure; 2628 2629 err = -EPIPE; 2630 if (sk->sk_shutdown & SEND_SHUTDOWN) 2631 goto failure; 2632 2633 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2634 break; 2635 2636 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2637 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2638 err = -EAGAIN; 2639 if (!timeo) 2640 goto failure; 2641 if (signal_pending(current)) 2642 goto interrupted; 2643 timeo = sock_wait_for_wmem(sk, timeo); 2644 } 2645 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2646 errcode, sk->sk_allocation); 2647 if (skb) 2648 skb_set_owner_w(skb, sk); 2649 return skb; 2650 2651 interrupted: 2652 err = sock_intr_errno(timeo); 2653 failure: 2654 *errcode = err; 2655 return NULL; 2656 } 2657 EXPORT_SYMBOL(sock_alloc_send_pskb); 2658 2659 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2660 struct sockcm_cookie *sockc) 2661 { 2662 u32 tsflags; 2663 2664 switch (cmsg->cmsg_type) { 2665 case SO_MARK: 2666 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 2667 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2668 return -EPERM; 2669 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2670 return -EINVAL; 2671 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2672 break; 2673 case SO_TIMESTAMPING_OLD: 2674 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2675 return -EINVAL; 2676 2677 tsflags = *(u32 *)CMSG_DATA(cmsg); 2678 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2679 return -EINVAL; 2680 2681 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2682 sockc->tsflags |= tsflags; 2683 break; 2684 case SCM_TXTIME: 2685 if (!sock_flag(sk, SOCK_TXTIME)) 2686 return -EINVAL; 2687 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2688 return -EINVAL; 2689 sockc->transmit_time = get_unaligned((u64 
*)CMSG_DATA(cmsg)); 2690 break; 2691 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2692 case SCM_RIGHTS: 2693 case SCM_CREDENTIALS: 2694 break; 2695 default: 2696 return -EINVAL; 2697 } 2698 return 0; 2699 } 2700 EXPORT_SYMBOL(__sock_cmsg_send); 2701 2702 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2703 struct sockcm_cookie *sockc) 2704 { 2705 struct cmsghdr *cmsg; 2706 int ret; 2707 2708 for_each_cmsghdr(cmsg, msg) { 2709 if (!CMSG_OK(msg, cmsg)) 2710 return -EINVAL; 2711 if (cmsg->cmsg_level != SOL_SOCKET) 2712 continue; 2713 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2714 if (ret) 2715 return ret; 2716 } 2717 return 0; 2718 } 2719 EXPORT_SYMBOL(sock_cmsg_send); 2720 2721 static void sk_enter_memory_pressure(struct sock *sk) 2722 { 2723 if (!sk->sk_prot->enter_memory_pressure) 2724 return; 2725 2726 sk->sk_prot->enter_memory_pressure(sk); 2727 } 2728 2729 static void sk_leave_memory_pressure(struct sock *sk) 2730 { 2731 if (sk->sk_prot->leave_memory_pressure) { 2732 sk->sk_prot->leave_memory_pressure(sk); 2733 } else { 2734 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2735 2736 if (memory_pressure && READ_ONCE(*memory_pressure)) 2737 WRITE_ONCE(*memory_pressure, 0); 2738 } 2739 } 2740 2741 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2742 2743 /** 2744 * skb_page_frag_refill - check that a page_frag contains enough room 2745 * @sz: minimum size of the fragment we want to get 2746 * @pfrag: pointer to page_frag 2747 * @gfp: priority for memory allocation 2748 * 2749 * Note: While this allocator tries to use high order pages, there is 2750 * no guarantee that allocations succeed. Therefore, @sz MUST be 2751 * less or equal than PAGE_SIZE. 2752 */ 2753 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2754 { 2755 if (pfrag->page) { 2756 if (page_ref_count(pfrag->page) == 1) { 2757 pfrag->offset = 0; 2758 return true; 2759 } 2760 if (pfrag->offset + sz <= pfrag->size) 2761 return true; 2762 put_page(pfrag->page); 2763 } 2764 2765 pfrag->offset = 0; 2766 if (SKB_FRAG_PAGE_ORDER && 2767 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2768 /* Avoid direct reclaim but allow kswapd to wake */ 2769 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2770 __GFP_COMP | __GFP_NOWARN | 2771 __GFP_NORETRY, 2772 SKB_FRAG_PAGE_ORDER); 2773 if (likely(pfrag->page)) { 2774 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2775 return true; 2776 } 2777 } 2778 pfrag->page = alloc_page(gfp); 2779 if (likely(pfrag->page)) { 2780 pfrag->size = PAGE_SIZE; 2781 return true; 2782 } 2783 return false; 2784 } 2785 EXPORT_SYMBOL(skb_page_frag_refill); 2786 2787 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2788 { 2789 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2790 return true; 2791 2792 sk_enter_memory_pressure(sk); 2793 sk_stream_moderate_sndbuf(sk); 2794 return false; 2795 } 2796 EXPORT_SYMBOL(sk_page_frag_refill); 2797 2798 void __lock_sock(struct sock *sk) 2799 __releases(&sk->sk_lock.slock) 2800 __acquires(&sk->sk_lock.slock) 2801 { 2802 DEFINE_WAIT(wait); 2803 2804 for (;;) { 2805 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2806 TASK_UNINTERRUPTIBLE); 2807 spin_unlock_bh(&sk->sk_lock.slock); 2808 schedule(); 2809 spin_lock_bh(&sk->sk_lock.slock); 2810 if (!sock_owned_by_user(sk)) 2811 break; 2812 } 2813 finish_wait(&sk->sk_lock.wq, &wait); 2814 } 2815 2816 void __release_sock(struct sock *sk) 2817 __releases(&sk->sk_lock.slock) 2818 
__acquires(&sk->sk_lock.slock)
2819 {
2820 struct sk_buff *skb, *next;
2821 
2822 while ((skb = sk->sk_backlog.head) != NULL) {
2823 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2824 
2825 spin_unlock_bh(&sk->sk_lock.slock);
2826 
2827 do {
2828 next = skb->next;
2829 prefetch(next);
2830 WARN_ON_ONCE(skb_dst_is_noref(skb));
2831 skb_mark_not_on_list(skb);
2832 sk_backlog_rcv(sk, skb);
2833 
2834 cond_resched();
2835 
2836 skb = next;
2837 } while (skb != NULL);
2838 
2839 spin_lock_bh(&sk->sk_lock.slock);
2840 }
2841 
2842 /*
2843 * Doing the zeroing here guarantees we cannot loop forever
2844 * while a wild producer attempts to flood us.
2845 */
2846 sk->sk_backlog.len = 0;
2847 }
2848 
2849 void __sk_flush_backlog(struct sock *sk)
2850 {
2851 spin_lock_bh(&sk->sk_lock.slock);
2852 __release_sock(sk);
2853 spin_unlock_bh(&sk->sk_lock.slock);
2854 }
2855 
2856 /**
2857 * sk_wait_data - wait for data to arrive at sk_receive_queue
2858 * @sk: sock to wait on
2859 * @timeo: for how long
2860 * @skb: last skb seen on sk_receive_queue
2861 *
2862 * Now the socket state, including sk->sk_err, is changed only under the lock,
2863 * hence we may omit checks after joining the wait queue.
2864 * We check the receive queue before schedule() only as an optimization;
2865 * it is very likely that release_sock() added new data.
2866 */
2867 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2868 {
2869 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2870 int rc;
2871 
2872 add_wait_queue(sk_sleep(sk), &wait);
2873 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2874 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2875 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2876 remove_wait_queue(sk_sleep(sk), &wait);
2877 return rc;
2878 }
2879 EXPORT_SYMBOL(sk_wait_data);
2880 
2881 /**
2882 * __sk_mem_raise_allocated - increase memory_allocated
2883 * @sk: socket
2884 * @size: memory size to allocate
2885 * @amt: pages to allocate
2886 * @kind: allocation type
2887 *
2888 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2889 */
2890 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2891 {
2892 struct proto *prot = sk->sk_prot;
2893 long allocated = sk_memory_allocated_add(sk, amt);
2894 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2895 bool charged = true;
2896 
2897 if (memcg_charge &&
2898 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2899 gfp_memcg_charge())))
2900 goto suppress_allocation;
2901 
2902 /* Under limit. */
2903 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2904 sk_leave_memory_pressure(sk);
2905 return 1;
2906 }
2907 
2908 /* Under pressure. */
2909 if (allocated > sk_prot_mem_limits(sk, 1))
2910 sk_enter_memory_pressure(sk);
2911 
2912 /* Over hard limit.
*/ 2913 if (allocated > sk_prot_mem_limits(sk, 2)) 2914 goto suppress_allocation; 2915 2916 /* guarantee minimum buffer size under pressure */ 2917 if (kind == SK_MEM_RECV) { 2918 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2919 return 1; 2920 2921 } else { /* SK_MEM_SEND */ 2922 int wmem0 = sk_get_wmem0(sk, prot); 2923 2924 if (sk->sk_type == SOCK_STREAM) { 2925 if (sk->sk_wmem_queued < wmem0) 2926 return 1; 2927 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2928 return 1; 2929 } 2930 } 2931 2932 if (sk_has_memory_pressure(sk)) { 2933 u64 alloc; 2934 2935 if (!sk_under_memory_pressure(sk)) 2936 return 1; 2937 alloc = sk_sockets_allocated_read_positive(sk); 2938 if (sk_prot_mem_limits(sk, 2) > alloc * 2939 sk_mem_pages(sk->sk_wmem_queued + 2940 atomic_read(&sk->sk_rmem_alloc) + 2941 sk->sk_forward_alloc)) 2942 return 1; 2943 } 2944 2945 suppress_allocation: 2946 2947 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2948 sk_stream_moderate_sndbuf(sk); 2949 2950 /* Fail only if socket is _under_ its sndbuf. 2951 * In this case we cannot block, so that we have to fail. 2952 */ 2953 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 2954 /* Force charge with __GFP_NOFAIL */ 2955 if (memcg_charge && !charged) { 2956 mem_cgroup_charge_skmem(sk->sk_memcg, amt, 2957 gfp_memcg_charge() | __GFP_NOFAIL); 2958 } 2959 return 1; 2960 } 2961 } 2962 2963 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2964 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2965 2966 sk_memory_allocated_sub(sk, amt); 2967 2968 if (memcg_charge && charged) 2969 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2970 2971 return 0; 2972 } 2973 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2974 2975 /** 2976 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2977 * @sk: socket 2978 * @size: memory size to allocate 2979 * @kind: allocation type 2980 * 2981 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2982 * rmem allocation. This function assumes that protocols which have 2983 * memory_pressure use sk_wmem_queued as write buffer accounting. 
2984 */ 2985 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2986 { 2987 int ret, amt = sk_mem_pages(size); 2988 2989 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2990 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2991 if (!ret) 2992 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2993 return ret; 2994 } 2995 EXPORT_SYMBOL(__sk_mem_schedule); 2996 2997 /** 2998 * __sk_mem_reduce_allocated - reclaim memory_allocated 2999 * @sk: socket 3000 * @amount: number of quanta 3001 * 3002 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3003 */ 3004 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3005 { 3006 sk_memory_allocated_sub(sk, amount); 3007 3008 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 3009 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 3010 3011 if (sk_under_memory_pressure(sk) && 3012 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3013 sk_leave_memory_pressure(sk); 3014 } 3015 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 3016 3017 /** 3018 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3019 * @sk: socket 3020 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 3021 */ 3022 void __sk_mem_reclaim(struct sock *sk, int amount) 3023 { 3024 amount >>= SK_MEM_QUANTUM_SHIFT; 3025 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 3026 __sk_mem_reduce_allocated(sk, amount); 3027 } 3028 EXPORT_SYMBOL(__sk_mem_reclaim); 3029 3030 int sk_set_peek_off(struct sock *sk, int val) 3031 { 3032 sk->sk_peek_off = val; 3033 return 0; 3034 } 3035 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3036 3037 /* 3038 * Set of default routines for initialising struct proto_ops when 3039 * the protocol does not support a particular function. In certain 3040 * cases where it makes no sense for a protocol to have a "do nothing" 3041 * function, some default processing is provided. 
3042 */ 3043 3044 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 3045 { 3046 return -EOPNOTSUPP; 3047 } 3048 EXPORT_SYMBOL(sock_no_bind); 3049 3050 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 3051 int len, int flags) 3052 { 3053 return -EOPNOTSUPP; 3054 } 3055 EXPORT_SYMBOL(sock_no_connect); 3056 3057 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3058 { 3059 return -EOPNOTSUPP; 3060 } 3061 EXPORT_SYMBOL(sock_no_socketpair); 3062 3063 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 3064 bool kern) 3065 { 3066 return -EOPNOTSUPP; 3067 } 3068 EXPORT_SYMBOL(sock_no_accept); 3069 3070 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3071 int peer) 3072 { 3073 return -EOPNOTSUPP; 3074 } 3075 EXPORT_SYMBOL(sock_no_getname); 3076 3077 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3078 { 3079 return -EOPNOTSUPP; 3080 } 3081 EXPORT_SYMBOL(sock_no_ioctl); 3082 3083 int sock_no_listen(struct socket *sock, int backlog) 3084 { 3085 return -EOPNOTSUPP; 3086 } 3087 EXPORT_SYMBOL(sock_no_listen); 3088 3089 int sock_no_shutdown(struct socket *sock, int how) 3090 { 3091 return -EOPNOTSUPP; 3092 } 3093 EXPORT_SYMBOL(sock_no_shutdown); 3094 3095 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3096 { 3097 return -EOPNOTSUPP; 3098 } 3099 EXPORT_SYMBOL(sock_no_sendmsg); 3100 3101 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3102 { 3103 return -EOPNOTSUPP; 3104 } 3105 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3106 3107 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3108 int flags) 3109 { 3110 return -EOPNOTSUPP; 3111 } 3112 EXPORT_SYMBOL(sock_no_recvmsg); 3113 3114 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3115 { 3116 /* Mirror missing mmap method error code */ 3117 return -ENODEV; 3118 } 3119 EXPORT_SYMBOL(sock_no_mmap); 3120 3121 /* 3122 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3123 * various sock-based usage counts. 
3124 */ 3125 void __receive_sock(struct file *file) 3126 { 3127 struct socket *sock; 3128 3129 sock = sock_from_file(file); 3130 if (sock) { 3131 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3132 sock_update_classid(&sock->sk->sk_cgrp_data); 3133 } 3134 } 3135 3136 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 3137 { 3138 ssize_t res; 3139 struct msghdr msg = {.msg_flags = flags}; 3140 struct kvec iov; 3141 char *kaddr = kmap(page); 3142 iov.iov_base = kaddr + offset; 3143 iov.iov_len = size; 3144 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 3145 kunmap(page); 3146 return res; 3147 } 3148 EXPORT_SYMBOL(sock_no_sendpage); 3149 3150 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 3151 int offset, size_t size, int flags) 3152 { 3153 ssize_t res; 3154 struct msghdr msg = {.msg_flags = flags}; 3155 struct kvec iov; 3156 char *kaddr = kmap(page); 3157 3158 iov.iov_base = kaddr + offset; 3159 iov.iov_len = size; 3160 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 3161 kunmap(page); 3162 return res; 3163 } 3164 EXPORT_SYMBOL(sock_no_sendpage_locked); 3165 3166 /* 3167 * Default Socket Callbacks 3168 */ 3169 3170 static void sock_def_wakeup(struct sock *sk) 3171 { 3172 struct socket_wq *wq; 3173 3174 rcu_read_lock(); 3175 wq = rcu_dereference(sk->sk_wq); 3176 if (skwq_has_sleeper(wq)) 3177 wake_up_interruptible_all(&wq->wait); 3178 rcu_read_unlock(); 3179 } 3180 3181 static void sock_def_error_report(struct sock *sk) 3182 { 3183 struct socket_wq *wq; 3184 3185 rcu_read_lock(); 3186 wq = rcu_dereference(sk->sk_wq); 3187 if (skwq_has_sleeper(wq)) 3188 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3189 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 3190 rcu_read_unlock(); 3191 } 3192 3193 void sock_def_readable(struct sock *sk) 3194 { 3195 struct socket_wq *wq; 3196 3197 rcu_read_lock(); 3198 wq = rcu_dereference(sk->sk_wq); 3199 if (skwq_has_sleeper(wq)) 3200 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3201 EPOLLRDNORM | EPOLLRDBAND); 3202 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 3203 rcu_read_unlock(); 3204 } 3205 3206 static void sock_def_write_space(struct sock *sk) 3207 { 3208 struct socket_wq *wq; 3209 3210 rcu_read_lock(); 3211 3212 /* Do not wake up a writer until he can make "significant" 3213 * progress. --DaveM 3214 */ 3215 if (sock_writeable(sk)) { 3216 wq = rcu_dereference(sk->sk_wq); 3217 if (skwq_has_sleeper(wq)) 3218 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3219 EPOLLWRNORM | EPOLLWRBAND); 3220 3221 /* Should agree with poll, otherwise some programs break */ 3222 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3223 } 3224 3225 rcu_read_unlock(); 3226 } 3227 3228 /* An optimised version of sock_def_write_space(), should only be called 3229 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3230 * ->sk_wmem_alloc. 3231 */ 3232 static void sock_def_write_space_wfree(struct sock *sk) 3233 { 3234 /* Do not wake up a writer until he can make "significant" 3235 * progress. 
--DaveM 3236 */ 3237 if (sock_writeable(sk)) { 3238 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3239 3240 /* rely on refcount_sub from sock_wfree() */ 3241 smp_mb__after_atomic(); 3242 if (wq && waitqueue_active(&wq->wait)) 3243 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3244 EPOLLWRNORM | EPOLLWRBAND); 3245 3246 /* Should agree with poll, otherwise some programs break */ 3247 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3248 } 3249 } 3250 3251 static void sock_def_destruct(struct sock *sk) 3252 { 3253 } 3254 3255 void sk_send_sigurg(struct sock *sk) 3256 { 3257 if (sk->sk_socket && sk->sk_socket->file) 3258 if (send_sigurg(&sk->sk_socket->file->f_owner)) 3259 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3260 } 3261 EXPORT_SYMBOL(sk_send_sigurg); 3262 3263 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3264 unsigned long expires) 3265 { 3266 if (!mod_timer(timer, expires)) 3267 sock_hold(sk); 3268 } 3269 EXPORT_SYMBOL(sk_reset_timer); 3270 3271 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3272 { 3273 if (del_timer(timer)) 3274 __sock_put(sk); 3275 } 3276 EXPORT_SYMBOL(sk_stop_timer); 3277 3278 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3279 { 3280 if (del_timer_sync(timer)) 3281 __sock_put(sk); 3282 } 3283 EXPORT_SYMBOL(sk_stop_timer_sync); 3284 3285 void sock_init_data(struct socket *sock, struct sock *sk) 3286 { 3287 sk_init_common(sk); 3288 sk->sk_send_head = NULL; 3289 3290 timer_setup(&sk->sk_timer, NULL, 0); 3291 3292 sk->sk_allocation = GFP_KERNEL; 3293 sk->sk_rcvbuf = sysctl_rmem_default; 3294 sk->sk_sndbuf = sysctl_wmem_default; 3295 sk->sk_state = TCP_CLOSE; 3296 sk_set_socket(sk, sock); 3297 3298 sock_set_flag(sk, SOCK_ZAPPED); 3299 3300 if (sock) { 3301 sk->sk_type = sock->type; 3302 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3303 sock->sk = sk; 3304 sk->sk_uid = SOCK_INODE(sock)->i_uid; 3305 } else { 3306 RCU_INIT_POINTER(sk->sk_wq, NULL); 3307 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 3308 } 3309 3310 rwlock_init(&sk->sk_callback_lock); 3311 if (sk->sk_kern_sock) 3312 lockdep_set_class_and_name( 3313 &sk->sk_callback_lock, 3314 af_kern_callback_keys + sk->sk_family, 3315 af_family_kern_clock_key_strings[sk->sk_family]); 3316 else 3317 lockdep_set_class_and_name( 3318 &sk->sk_callback_lock, 3319 af_callback_keys + sk->sk_family, 3320 af_family_clock_key_strings[sk->sk_family]); 3321 3322 sk->sk_state_change = sock_def_wakeup; 3323 sk->sk_data_ready = sock_def_readable; 3324 sk->sk_write_space = sock_def_write_space; 3325 sk->sk_error_report = sock_def_error_report; 3326 sk->sk_destruct = sock_def_destruct; 3327 3328 sk->sk_frag.page = NULL; 3329 sk->sk_frag.offset = 0; 3330 sk->sk_peek_off = -1; 3331 3332 sk->sk_peer_pid = NULL; 3333 sk->sk_peer_cred = NULL; 3334 spin_lock_init(&sk->sk_peer_lock); 3335 3336 sk->sk_write_pending = 0; 3337 sk->sk_rcvlowat = 1; 3338 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3339 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3340 3341 sk->sk_stamp = SK_DEFAULT_STAMP; 3342 #if BITS_PER_LONG==32 3343 seqlock_init(&sk->sk_stamp_seq); 3344 #endif 3345 atomic_set(&sk->sk_zckey, 0); 3346 3347 #ifdef CONFIG_NET_RX_BUSY_POLL 3348 sk->sk_napi_id = 0; 3349 sk->sk_ll_usec = sysctl_net_busy_read; 3350 #endif 3351 3352 sk->sk_max_pacing_rate = ~0UL; 3353 sk->sk_pacing_rate = ~0UL; 3354 WRITE_ONCE(sk->sk_pacing_shift, 10); 3355 sk->sk_incoming_cpu = -1; 3356 sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; 3357 3358 sk_rx_queue_clear(sk); 3359 /* 3360 * Before updating sk_refcnt, we must commit 
prior changes to memory
3361 * (Documentation/RCU/rculist_nulls.rst for details)
3362 */
3363 smp_wmb();
3364 refcount_set(&sk->sk_refcnt, 1);
3365 atomic_set(&sk->sk_drops, 0);
3366 }
3367 EXPORT_SYMBOL(sock_init_data);
3368 
3369 void lock_sock_nested(struct sock *sk, int subclass)
3370 {
3371 /* The sk_lock has mutex_lock() semantics here. */
3372 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3373 
3374 might_sleep();
3375 spin_lock_bh(&sk->sk_lock.slock);
3376 if (sock_owned_by_user_nocheck(sk))
3377 __lock_sock(sk);
3378 sk->sk_lock.owned = 1;
3379 spin_unlock_bh(&sk->sk_lock.slock);
3380 }
3381 EXPORT_SYMBOL(lock_sock_nested);
3382 
3383 void release_sock(struct sock *sk)
3384 {
3385 spin_lock_bh(&sk->sk_lock.slock);
3386 if (sk->sk_backlog.tail)
3387 __release_sock(sk);
3388 
3389 /* Warning: release_cb() might need to release sk ownership,
3390 * i.e. call sock_release_ownership(sk) before us.
3391 */
3392 if (sk->sk_prot->release_cb)
3393 sk->sk_prot->release_cb(sk);
3394 
3395 sock_release_ownership(sk);
3396 if (waitqueue_active(&sk->sk_lock.wq))
3397 wake_up(&sk->sk_lock.wq);
3398 spin_unlock_bh(&sk->sk_lock.slock);
3399 }
3400 EXPORT_SYMBOL(release_sock);
3401 
3402 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3403 {
3404 might_sleep();
3405 spin_lock_bh(&sk->sk_lock.slock);
3406 
3407 if (!sock_owned_by_user_nocheck(sk)) {
3408 /*
3409 * Fast path return with bottom halves disabled and
3410 * sock::sk_lock.slock held.
3411 *
3412 * The 'mutex' is not contended and holding
3413 * sock::sk_lock.slock prevents all other lockers from
3414 * proceeding, so the corresponding unlock_sock_fast() can
3415 * avoid the slow path of release_sock() completely and
3416 * just release slock.
3417 *
3418 * From a semantic POV this is equivalent to 'acquiring'
3419 * the 'mutex', hence the corresponding lockdep
3420 * mutex_release() has to happen in the fast path of
3421 * unlock_sock_fast().
3422 */
3423 return false;
3424 }
3425 
3426 __lock_sock(sk);
3427 sk->sk_lock.owned = 1;
3428 __acquire(&sk->sk_lock.slock);
3429 spin_unlock_bh(&sk->sk_lock.slock);
3430 return true;
3431 }
3432 EXPORT_SYMBOL(__lock_sock_fast);
3433 
3434 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3435 bool timeval, bool time32)
3436 {
3437 struct sock *sk = sock->sk;
3438 struct timespec64 ts;
3439 
3440 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3441 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3442 if (ts.tv_sec == -1)
3443 return -ENOENT;
3444 if (ts.tv_sec == 0) {
3445 ktime_t kt = ktime_get_real();
3446 sock_write_timestamp(sk, kt);
3447 ts = ktime_to_timespec64(kt);
3448 }
3449 
3450 if (timeval)
3451 ts.tv_nsec /= 1000;
3452 
3453 #ifdef CONFIG_COMPAT_32BIT_TIME
3454 if (time32)
3455 return put_old_timespec32(&ts, userstamp);
3456 #endif
3457 #ifdef CONFIG_SPARC64
3458 /* beware of padding in sparc64 timeval */
3459 if (timeval && !in_compat_syscall()) {
3460 struct __kernel_old_timeval __user tv = {
3461 .tv_sec = ts.tv_sec,
3462 .tv_usec = ts.tv_nsec,
3463 };
3464 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3465 return -EFAULT;
3466 return 0;
3467 }
3468 #endif
3469 return put_timespec64(&ts, userstamp);
3470 }
3471 EXPORT_SYMBOL(sock_gettstamp);
3472 
3473 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3474 {
3475 if (!sock_flag(sk, flag)) {
3476 unsigned long previous_flags = sk->sk_flags;
3477 
3478 sock_set_flag(sk, flag);
3479 /*
3480 * we just set one of the two flags which require net
3481 * time stamping, but time stamping might have been on
3482 * already because of the other one
3483 */
3484 if (sock_needs_netstamp(sk) &&
3485 !(previous_flags & SK_FLAGS_TIMESTAMP))
3486 net_enable_timestamp();
3487 }
3488 }
3489 
3490 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3491 int level, int type)
3492 {
3493 struct sock_exterr_skb *serr;
3494 struct sk_buff *skb;
3495 int copied, err;
3496 
3497 err = -EAGAIN;
3498 skb = sock_dequeue_err_skb(sk);
3499 if (skb == NULL)
3500 goto out;
3501 
3502 copied = skb->len;
3503 if (copied > len) {
3504 msg->msg_flags |= MSG_TRUNC;
3505 copied = len;
3506 }
3507 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3508 if (err)
3509 goto out_free_skb;
3510 
3511 sock_recv_timestamp(msg, sk, skb);
3512 
3513 serr = SKB_EXT_ERR(skb);
3514 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3515 
3516 msg->msg_flags |= MSG_ERRQUEUE;
3517 err = copied;
3518 
3519 out_free_skb:
3520 kfree_skb(skb);
3521 out:
3522 return err;
3523 }
3524 EXPORT_SYMBOL(sock_recv_errqueue);
3525 
3526 /*
3527 * Get a socket option on a socket.
3528 *
3529 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3530 * asynchronous errors should be reported by getsockopt. We assume
3531 * this means if you specify SO_ERROR (otherwise what's the point of it).
3532 */
3533 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3534 char __user *optval, int __user *optlen)
3535 {
3536 struct sock *sk = sock->sk;
3537 
3538 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3539 }
3540 EXPORT_SYMBOL(sock_common_getsockopt);
3541 
3542 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3543 int flags)
3544 {
3545 struct sock *sk = sock->sk;
3546 int addr_len = 0;
3547 int err;
3548 
3549 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3550 if (err >= 0)
3551 msg->msg_namelen = addr_len;
3552 return err;
3553 }
3554 EXPORT_SYMBOL(sock_common_recvmsg);
3555 
3556 /*
3557 * Set socket options on an inet socket.
3558 */
3559 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3560 sockptr_t optval, unsigned int optlen)
3561 {
3562 struct sock *sk = sock->sk;
3563 
3564 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3565 }
3566 EXPORT_SYMBOL(sock_common_setsockopt);
3567 
3568 void sk_common_release(struct sock *sk)
3569 {
3570 if (sk->sk_prot->destroy)
3571 sk->sk_prot->destroy(sk);
3572 
3573 /*
3574 * Observation: when sk_common_release is called, processes have
3575 * no access to the socket, but the network stack still does.
3576 * Step one, detach it from networking:
3577 *
3578 * A. Remove from hash tables.
3579 */
3580 
3581 sk->sk_prot->unhash(sk);
3582 
3583 /*
3584 * At this point the socket cannot receive new packets, but it is possible
3585 * that some packets are still in flight because some CPU is running the
3586 * receiver and did the hash table lookup before we unhashed the socket.
3587 * They will reach the receive queue and will be purged by the socket destructor.
3588 *
3589 * Also, we still have packets pending on the receive queue and probably
3590 * our own packets waiting in device queues. sock_destroy will drain the
3591 * receive queue, but transmitted packets will delay socket destruction
3592 * until the last reference is released.
3593 */
3594 
3595 sock_orphan(sk);
3596 
3597 xfrm_sk_free_policy(sk);
3598 
3599 sk_refcnt_debug_release(sk);
3600 
3601 sock_put(sk);
3602 }
3603 EXPORT_SYMBOL(sk_common_release);
3604 
3605 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3606 {
3607 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3608 
3609 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3610 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3611 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3612 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3613 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3614 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3615 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3616 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3617 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3618 }
3619 
3620 #ifdef CONFIG_PROC_FS
3621 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3622 
3623 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3624 {
3625 int cpu, idx = prot->inuse_idx;
3626 int res = 0;
3627 
3628 for_each_possible_cpu(cpu)
3629 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3630 
3631 return res >= 0 ?
res : 0; 3632 } 3633 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3634 3635 int sock_inuse_get(struct net *net) 3636 { 3637 int cpu, res = 0; 3638 3639 for_each_possible_cpu(cpu) 3640 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 3641 3642 return res; 3643 } 3644 3645 EXPORT_SYMBOL_GPL(sock_inuse_get); 3646 3647 static int __net_init sock_inuse_init_net(struct net *net) 3648 { 3649 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3650 if (net->core.prot_inuse == NULL) 3651 return -ENOMEM; 3652 return 0; 3653 } 3654 3655 static void __net_exit sock_inuse_exit_net(struct net *net) 3656 { 3657 free_percpu(net->core.prot_inuse); 3658 } 3659 3660 static struct pernet_operations net_inuse_ops = { 3661 .init = sock_inuse_init_net, 3662 .exit = sock_inuse_exit_net, 3663 }; 3664 3665 static __init int net_inuse_init(void) 3666 { 3667 if (register_pernet_subsys(&net_inuse_ops)) 3668 panic("Cannot initialize net inuse counters"); 3669 3670 return 0; 3671 } 3672 3673 core_initcall(net_inuse_init); 3674 3675 static int assign_proto_idx(struct proto *prot) 3676 { 3677 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3678 3679 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3680 pr_err("PROTO_INUSE_NR exhausted\n"); 3681 return -ENOSPC; 3682 } 3683 3684 set_bit(prot->inuse_idx, proto_inuse_idx); 3685 return 0; 3686 } 3687 3688 static void release_proto_idx(struct proto *prot) 3689 { 3690 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3691 clear_bit(prot->inuse_idx, proto_inuse_idx); 3692 } 3693 #else 3694 static inline int assign_proto_idx(struct proto *prot) 3695 { 3696 return 0; 3697 } 3698 3699 static inline void release_proto_idx(struct proto *prot) 3700 { 3701 } 3702 3703 #endif 3704 3705 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3706 { 3707 if (!twsk_prot) 3708 return; 3709 kfree(twsk_prot->twsk_slab_name); 3710 twsk_prot->twsk_slab_name = NULL; 3711 kmem_cache_destroy(twsk_prot->twsk_slab); 3712 twsk_prot->twsk_slab = NULL; 3713 } 3714 3715 static int tw_prot_init(const struct proto *prot) 3716 { 3717 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3718 3719 if (!twsk_prot) 3720 return 0; 3721 3722 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3723 prot->name); 3724 if (!twsk_prot->twsk_slab_name) 3725 return -ENOMEM; 3726 3727 twsk_prot->twsk_slab = 3728 kmem_cache_create(twsk_prot->twsk_slab_name, 3729 twsk_prot->twsk_obj_size, 0, 3730 SLAB_ACCOUNT | prot->slab_flags, 3731 NULL); 3732 if (!twsk_prot->twsk_slab) { 3733 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3734 prot->name); 3735 return -ENOMEM; 3736 } 3737 3738 return 0; 3739 } 3740 3741 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3742 { 3743 if (!rsk_prot) 3744 return; 3745 kfree(rsk_prot->slab_name); 3746 rsk_prot->slab_name = NULL; 3747 kmem_cache_destroy(rsk_prot->slab); 3748 rsk_prot->slab = NULL; 3749 } 3750 3751 static int req_prot_init(const struct proto *prot) 3752 { 3753 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3754 3755 if (!rsk_prot) 3756 return 0; 3757 3758 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3759 prot->name); 3760 if (!rsk_prot->slab_name) 3761 return -ENOMEM; 3762 3763 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3764 rsk_prot->obj_size, 0, 3765 SLAB_ACCOUNT | prot->slab_flags, 3766 NULL); 3767 3768 if (!rsk_prot->slab) { 3769 pr_crit("%s: Can't create request sock SLAB cache!\n", 3770 prot->name); 3771 return -ENOMEM; 3772 } 3773 return 0; 3774 } 3775 3776 int 
proto_register(struct proto *prot, int alloc_slab) 3777 { 3778 int ret = -ENOBUFS; 3779 3780 if (prot->memory_allocated && !prot->sysctl_mem) { 3781 pr_err("%s: missing sysctl_mem\n", prot->name); 3782 return -EINVAL; 3783 } 3784 if (alloc_slab) { 3785 prot->slab = kmem_cache_create_usercopy(prot->name, 3786 prot->obj_size, 0, 3787 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3788 prot->slab_flags, 3789 prot->useroffset, prot->usersize, 3790 NULL); 3791 3792 if (prot->slab == NULL) { 3793 pr_crit("%s: Can't create sock SLAB cache!\n", 3794 prot->name); 3795 goto out; 3796 } 3797 3798 if (req_prot_init(prot)) 3799 goto out_free_request_sock_slab; 3800 3801 if (tw_prot_init(prot)) 3802 goto out_free_timewait_sock_slab; 3803 } 3804 3805 mutex_lock(&proto_list_mutex); 3806 ret = assign_proto_idx(prot); 3807 if (ret) { 3808 mutex_unlock(&proto_list_mutex); 3809 goto out_free_timewait_sock_slab; 3810 } 3811 list_add(&prot->node, &proto_list); 3812 mutex_unlock(&proto_list_mutex); 3813 return ret; 3814 3815 out_free_timewait_sock_slab: 3816 if (alloc_slab) 3817 tw_prot_cleanup(prot->twsk_prot); 3818 out_free_request_sock_slab: 3819 if (alloc_slab) { 3820 req_prot_cleanup(prot->rsk_prot); 3821 3822 kmem_cache_destroy(prot->slab); 3823 prot->slab = NULL; 3824 } 3825 out: 3826 return ret; 3827 } 3828 EXPORT_SYMBOL(proto_register); 3829 3830 void proto_unregister(struct proto *prot) 3831 { 3832 mutex_lock(&proto_list_mutex); 3833 release_proto_idx(prot); 3834 list_del(&prot->node); 3835 mutex_unlock(&proto_list_mutex); 3836 3837 kmem_cache_destroy(prot->slab); 3838 prot->slab = NULL; 3839 3840 req_prot_cleanup(prot->rsk_prot); 3841 tw_prot_cleanup(prot->twsk_prot); 3842 } 3843 EXPORT_SYMBOL(proto_unregister); 3844 3845 int sock_load_diag_module(int family, int protocol) 3846 { 3847 if (!protocol) { 3848 if (!sock_is_registered(family)) 3849 return -ENOENT; 3850 3851 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3852 NETLINK_SOCK_DIAG, family); 3853 } 3854 3855 #ifdef CONFIG_INET 3856 if (family == AF_INET && 3857 protocol != IPPROTO_RAW && 3858 protocol < MAX_INET_PROTOS && 3859 !rcu_access_pointer(inet_protos[protocol])) 3860 return -ENOENT; 3861 #endif 3862 3863 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3864 NETLINK_SOCK_DIAG, family, protocol); 3865 } 3866 EXPORT_SYMBOL(sock_load_diag_module); 3867 3868 #ifdef CONFIG_PROC_FS 3869 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3870 __acquires(proto_list_mutex) 3871 { 3872 mutex_lock(&proto_list_mutex); 3873 return seq_list_start_head(&proto_list, *pos); 3874 } 3875 3876 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3877 { 3878 return seq_list_next(v, &proto_list, pos); 3879 } 3880 3881 static void proto_seq_stop(struct seq_file *seq, void *v) 3882 __releases(proto_list_mutex) 3883 { 3884 mutex_unlock(&proto_list_mutex); 3885 } 3886 3887 static char proto_method_implemented(const void *method) 3888 { 3889 return method == NULL ? 'n' : 'y'; 3890 } 3891 static long sock_prot_memory_allocated(struct proto *proto) 3892 { 3893 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3894 } 3895 3896 static const char *sock_prot_memory_pressure(struct proto *proto) 3897 { 3898 return proto->memory_pressure != NULL ? 3899 proto_memory_pressure(proto) ? 
"yes" : "no" : "NI"; 3900 } 3901 3902 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3903 { 3904 3905 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3906 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3907 proto->name, 3908 proto->obj_size, 3909 sock_prot_inuse_get(seq_file_net(seq), proto), 3910 sock_prot_memory_allocated(proto), 3911 sock_prot_memory_pressure(proto), 3912 proto->max_header, 3913 proto->slab == NULL ? "no" : "yes", 3914 module_name(proto->owner), 3915 proto_method_implemented(proto->close), 3916 proto_method_implemented(proto->connect), 3917 proto_method_implemented(proto->disconnect), 3918 proto_method_implemented(proto->accept), 3919 proto_method_implemented(proto->ioctl), 3920 proto_method_implemented(proto->init), 3921 proto_method_implemented(proto->destroy), 3922 proto_method_implemented(proto->shutdown), 3923 proto_method_implemented(proto->setsockopt), 3924 proto_method_implemented(proto->getsockopt), 3925 proto_method_implemented(proto->sendmsg), 3926 proto_method_implemented(proto->recvmsg), 3927 proto_method_implemented(proto->sendpage), 3928 proto_method_implemented(proto->bind), 3929 proto_method_implemented(proto->backlog_rcv), 3930 proto_method_implemented(proto->hash), 3931 proto_method_implemented(proto->unhash), 3932 proto_method_implemented(proto->get_port), 3933 proto_method_implemented(proto->enter_memory_pressure)); 3934 } 3935 3936 static int proto_seq_show(struct seq_file *seq, void *v) 3937 { 3938 if (v == &proto_list) 3939 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3940 "protocol", 3941 "size", 3942 "sockets", 3943 "memory", 3944 "press", 3945 "maxhdr", 3946 "slab", 3947 "module", 3948 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3949 else 3950 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3951 return 0; 3952 } 3953 3954 static const struct seq_operations proto_seq_ops = { 3955 .start = proto_seq_start, 3956 .next = proto_seq_next, 3957 .stop = proto_seq_stop, 3958 .show = proto_seq_show, 3959 }; 3960 3961 static __net_init int proto_init_net(struct net *net) 3962 { 3963 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3964 sizeof(struct seq_net_private))) 3965 return -ENOMEM; 3966 3967 return 0; 3968 } 3969 3970 static __net_exit void proto_exit_net(struct net *net) 3971 { 3972 remove_proc_entry("protocols", net->proc_net); 3973 } 3974 3975 3976 static __net_initdata struct pernet_operations proto_net_ops = { 3977 .init = proto_init_net, 3978 .exit = proto_exit_net, 3979 }; 3980 3981 static int __init proto_init(void) 3982 { 3983 return register_pernet_subsys(&proto_net_ops); 3984 } 3985 3986 subsys_initcall(proto_init); 3987 3988 #endif /* PROC_FS */ 3989 3990 #ifdef CONFIG_NET_RX_BUSY_POLL 3991 bool sk_busy_loop_end(void *p, unsigned long start_time) 3992 { 3993 struct sock *sk = p; 3994 3995 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3996 sk_busy_loop_timeout(sk, start_time); 3997 } 3998 EXPORT_SYMBOL(sk_busy_loop_end); 3999 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4000 4001 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 4002 { 4003 if (!sk->sk_prot->bind_add) 4004 return -EOPNOTSUPP; 4005 return sk->sk_prot->bind_add(sk, addr, addr_len); 4006 } 4007 EXPORT_SYMBOL(sock_bind_add); 4008