/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in all
 * user namespaces.
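 *
 * For illustration only (a sketch, not a caller in this file): a protocol
 * that wants to gate a privileged operation on the socket opener's
 * capabilities, rather than only on the current task, could do
 *
 *	if (!sk_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;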
177 */ 178 bool sk_capable(const struct sock *sk, int cap) 179 { 180 return sk_ns_capable(sk, &init_user_ns, cap); 181 } 182 EXPORT_SYMBOL(sk_capable); 183 184 /** 185 * sk_net_capable - Network namespace socket capability test 186 * @sk: Socket to use a capability on or through 187 * @cap: The capability to use 188 * 189 * Test to see if the opener of the socket had when the socket was created 190 * and the current process has the capability @cap over the network namespace 191 * the socket is a member of. 192 */ 193 bool sk_net_capable(const struct sock *sk, int cap) 194 { 195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 196 } 197 EXPORT_SYMBOL(sk_net_capable); 198 199 /* 200 * Each address family might have different locking rules, so we have 201 * one slock key per address family and separate keys for internal and 202 * userspace sockets. 203 */ 204 static struct lock_class_key af_family_keys[AF_MAX]; 205 static struct lock_class_key af_family_kern_keys[AF_MAX]; 206 static struct lock_class_key af_family_slock_keys[AF_MAX]; 207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 208 209 /* 210 * Make lock validator output more readable. (we pre-construct these 211 * strings build-time, so that runtime initialization of socket 212 * locks is fast): 213 */ 214 215 #define _sock_locks(x) \ 216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 225 x "27" , x "28" , x "AF_CAN" , \ 226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 231 x "AF_MAX" 232 233 static const char *const af_family_key_strings[AF_MAX+1] = { 234 _sock_locks("sk_lock-") 235 }; 236 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 237 _sock_locks("slock-") 238 }; 239 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 240 _sock_locks("clock-") 241 }; 242 243 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 244 _sock_locks("k-sk_lock-") 245 }; 246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 247 _sock_locks("k-slock-") 248 }; 249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 250 _sock_locks("k-clock-") 251 }; 252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 253 "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , 254 "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", 255 "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , 256 "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , 257 "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , 258 "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , 259 "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , 260 "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , 261 "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , 262 "rlock-27" , "rlock-28" , "rlock-AF_CAN" , 263 "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , 264 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , 265 "rlock-AF_IEEE802154", 
"rlock-AF_CAIF" , "rlock-AF_ALG" , 266 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , 267 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , 268 "rlock-AF_MAX" 269 }; 270 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 271 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , 272 "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", 273 "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , 274 "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , 275 "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , 276 "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , 277 "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , 278 "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , 279 "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , 280 "wlock-27" , "wlock-28" , "wlock-AF_CAN" , 281 "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , 282 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , 283 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , 284 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , 285 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , 286 "wlock-AF_MAX" 287 }; 288 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 289 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , 290 "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", 291 "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , 292 "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , 293 "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , 294 "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , 295 "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , 296 "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , 297 "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , 298 "elock-27" , "elock-28" , "elock-AF_CAN" , 299 "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , 300 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , 301 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , 302 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , 303 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , 304 "elock-AF_MAX" 305 }; 306 307 /* 308 * sk_callback_lock and sk queues locking rules are per-address-family, 309 * so split the lock classes by using a per-AF key: 310 */ 311 static struct lock_class_key af_callback_keys[AF_MAX]; 312 static struct lock_class_key af_rlock_keys[AF_MAX]; 313 static struct lock_class_key af_wlock_keys[AF_MAX]; 314 static struct lock_class_key af_elock_keys[AF_MAX]; 315 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 316 317 /* Run time adjustable parameters. */ 318 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 319 EXPORT_SYMBOL(sysctl_wmem_max); 320 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 321 EXPORT_SYMBOL(sysctl_rmem_max); 322 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 323 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 324 325 /* Maximal space eaten by iovec or ancillary data plus some space */ 326 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 327 EXPORT_SYMBOL(sysctl_optmem_max); 328 329 int sysctl_tstamp_allow_data __read_mostly = 1; 330 331 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 332 EXPORT_SYMBOL_GPL(memalloc_socks_key); 333 334 /** 335 * sk_set_memalloc - sets %SOCK_MEMALLOC 336 * @sk: socket to set it on 337 * 338 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 
339 * It's the responsibility of the admin to adjust min_free_kbytes 340 * to meet the requirements 341 */ 342 void sk_set_memalloc(struct sock *sk) 343 { 344 sock_set_flag(sk, SOCK_MEMALLOC); 345 sk->sk_allocation |= __GFP_MEMALLOC; 346 static_branch_inc(&memalloc_socks_key); 347 } 348 EXPORT_SYMBOL_GPL(sk_set_memalloc); 349 350 void sk_clear_memalloc(struct sock *sk) 351 { 352 sock_reset_flag(sk, SOCK_MEMALLOC); 353 sk->sk_allocation &= ~__GFP_MEMALLOC; 354 static_branch_dec(&memalloc_socks_key); 355 356 /* 357 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 358 * progress of swapping. SOCK_MEMALLOC may be cleared while 359 * it has rmem allocations due to the last swapfile being deactivated 360 * but there is a risk that the socket is unusable due to exceeding 361 * the rmem limits. Reclaim the reserves and obey rmem limits again. 362 */ 363 sk_mem_reclaim(sk); 364 } 365 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 366 367 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 368 { 369 int ret; 370 unsigned int noreclaim_flag; 371 372 /* these should have been dropped before queueing */ 373 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 374 375 noreclaim_flag = memalloc_noreclaim_save(); 376 ret = sk->sk_backlog_rcv(sk, skb); 377 memalloc_noreclaim_restore(noreclaim_flag); 378 379 return ret; 380 } 381 EXPORT_SYMBOL(__sk_backlog_rcv); 382 383 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 384 { 385 struct timeval tv; 386 387 if (optlen < sizeof(tv)) 388 return -EINVAL; 389 if (copy_from_user(&tv, optval, sizeof(tv))) 390 return -EFAULT; 391 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 392 return -EDOM; 393 394 if (tv.tv_sec < 0) { 395 static int warned __read_mostly; 396 397 *timeo_p = 0; 398 if (warned < 10 && net_ratelimit()) { 399 warned++; 400 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 401 __func__, current->comm, task_pid_nr(current)); 402 } 403 return 0; 404 } 405 *timeo_p = MAX_SCHEDULE_TIMEOUT; 406 if (tv.tv_sec == 0 && tv.tv_usec == 0) 407 return 0; 408 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) 409 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); 410 return 0; 411 } 412 413 static void sock_warn_obsolete_bsdism(const char *name) 414 { 415 static int warned; 416 static char warncomm[TASK_COMM_LEN]; 417 if (strcmp(warncomm, current->comm) && warned < 5) { 418 strcpy(warncomm, current->comm); 419 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", 420 warncomm, name); 421 warned++; 422 } 423 } 424 425 static bool sock_needs_netstamp(const struct sock *sk) 426 { 427 switch (sk->sk_family) { 428 case AF_UNSPEC: 429 case AF_UNIX: 430 return false; 431 default: 432 return true; 433 } 434 } 435 436 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 437 { 438 if (sk->sk_flags & flags) { 439 sk->sk_flags &= ~flags; 440 if (sock_needs_netstamp(sk) && 441 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 442 net_disable_timestamp(); 443 } 444 } 445 446 447 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 448 { 449 unsigned long flags; 450 struct sk_buff_head *list = &sk->sk_receive_queue; 451 452 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 453 atomic_inc(&sk->sk_drops); 454 trace_sock_rcvqueue_full(sk, skb); 455 return -ENOMEM; 456 } 457 458 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 459 atomic_inc(&sk->sk_drops); 460 return -ENOBUFS; 461 } 462 463 skb->dev = NULL; 464 skb_set_owner_r(skb, sk); 465 466 /* we escape from rcu protected region, make sure 
we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
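	 *
	 * Illustrative userspace sketch (not part of the original comment):
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 5);
	 *
	 * binds the socket to eth0, and
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
	 *
	 * removes the binding again.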
590 */ 591 if (optlen > IFNAMSIZ - 1) 592 optlen = IFNAMSIZ - 1; 593 memset(devname, 0, sizeof(devname)); 594 595 ret = -EFAULT; 596 if (copy_from_user(devname, optval, optlen)) 597 goto out; 598 599 index = 0; 600 if (devname[0] != '\0') { 601 struct net_device *dev; 602 603 rcu_read_lock(); 604 dev = dev_get_by_name_rcu(net, devname); 605 if (dev) 606 index = dev->ifindex; 607 rcu_read_unlock(); 608 ret = -ENODEV; 609 if (!dev) 610 goto out; 611 } 612 613 lock_sock(sk); 614 sk->sk_bound_dev_if = index; 615 sk_dst_reset(sk); 616 release_sock(sk); 617 618 ret = 0; 619 620 out: 621 #endif 622 623 return ret; 624 } 625 626 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 627 int __user *optlen, int len) 628 { 629 int ret = -ENOPROTOOPT; 630 #ifdef CONFIG_NETDEVICES 631 struct net *net = sock_net(sk); 632 char devname[IFNAMSIZ]; 633 634 if (sk->sk_bound_dev_if == 0) { 635 len = 0; 636 goto zero; 637 } 638 639 ret = -EINVAL; 640 if (len < IFNAMSIZ) 641 goto out; 642 643 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 644 if (ret) 645 goto out; 646 647 len = strlen(devname) + 1; 648 649 ret = -EFAULT; 650 if (copy_to_user(optval, devname, len)) 651 goto out; 652 653 zero: 654 ret = -EFAULT; 655 if (put_user(len, optlen)) 656 goto out; 657 658 ret = 0; 659 660 out: 661 #endif 662 663 return ret; 664 } 665 666 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) 667 { 668 if (valbool) 669 sock_set_flag(sk, bit); 670 else 671 sock_reset_flag(sk, bit); 672 } 673 674 bool sk_mc_loop(struct sock *sk) 675 { 676 if (dev_recursion_level()) 677 return false; 678 if (!sk) 679 return true; 680 switch (sk->sk_family) { 681 case AF_INET: 682 return inet_sk(sk)->mc_loop; 683 #if IS_ENABLED(CONFIG_IPV6) 684 case AF_INET6: 685 return inet6_sk(sk)->mc_loop; 686 #endif 687 } 688 WARN_ON(1); 689 return true; 690 } 691 EXPORT_SYMBOL(sk_mc_loop); 692 693 /* 694 * This is meant for all protocols to use and covers goings on 695 * at the socket level. Everything here is generic. 696 */ 697 698 int sock_setsockopt(struct socket *sock, int level, int optname, 699 char __user *optval, unsigned int optlen) 700 { 701 struct sock_txtime sk_txtime; 702 struct sock *sk = sock->sk; 703 int val; 704 int valbool; 705 struct linger ling; 706 int ret = 0; 707 708 /* 709 * Options without arguments 710 */ 711 712 if (optname == SO_BINDTODEVICE) 713 return sock_setbindtodevice(sk, optval, optlen); 714 715 if (optlen < sizeof(int)) 716 return -EINVAL; 717 718 if (get_user(val, (int __user *)optval)) 719 return -EFAULT; 720 721 valbool = val ? 1 : 0; 722 723 lock_sock(sk); 724 725 switch (optname) { 726 case SO_DEBUG: 727 if (val && !capable(CAP_NET_ADMIN)) 728 ret = -EACCES; 729 else 730 sock_valbool_flag(sk, SOCK_DBG, valbool); 731 break; 732 case SO_REUSEADDR: 733 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 734 break; 735 case SO_REUSEPORT: 736 sk->sk_reuseport = valbool; 737 break; 738 case SO_TYPE: 739 case SO_PROTOCOL: 740 case SO_DOMAIN: 741 case SO_ERROR: 742 ret = -ENOPROTOOPT; 743 break; 744 case SO_DONTROUTE: 745 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 746 break; 747 case SO_BROADCAST: 748 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 749 break; 750 case SO_SNDBUF: 751 /* Don't error on this BSD doesn't and if you think 752 * about it this is right. Otherwise apps have to 753 * play 'guess the biggest size' games. 
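		 * For illustration (userspace sketch, not from this file): after
		 *	int v = 65536;
		 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v));
		 * a later getsockopt(SO_SNDBUF) reports roughly twice the
		 * requested value, because the value is doubled below to cover
		 * struct sk_buff and other bookkeeping overhead.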
RCVBUF/SNDBUF 754 * are treated in BSD as hints 755 */ 756 val = min_t(u32, val, sysctl_wmem_max); 757 set_sndbuf: 758 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 759 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); 760 /* Wake up sending tasks if we upped the value. */ 761 sk->sk_write_space(sk); 762 break; 763 764 case SO_SNDBUFFORCE: 765 if (!capable(CAP_NET_ADMIN)) { 766 ret = -EPERM; 767 break; 768 } 769 goto set_sndbuf; 770 771 case SO_RCVBUF: 772 /* Don't error on this BSD doesn't and if you think 773 * about it this is right. Otherwise apps have to 774 * play 'guess the biggest size' games. RCVBUF/SNDBUF 775 * are treated in BSD as hints 776 */ 777 val = min_t(u32, val, sysctl_rmem_max); 778 set_rcvbuf: 779 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 780 /* 781 * We double it on the way in to account for 782 * "struct sk_buff" etc. overhead. Applications 783 * assume that the SO_RCVBUF setting they make will 784 * allow that much actual data to be received on that 785 * socket. 786 * 787 * Applications are unaware that "struct sk_buff" and 788 * other overheads allocate from the receive buffer 789 * during socket buffer allocation. 790 * 791 * And after considering the possible alternatives, 792 * returning the value we actually used in getsockopt 793 * is the most desirable behavior. 794 */ 795 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); 796 break; 797 798 case SO_RCVBUFFORCE: 799 if (!capable(CAP_NET_ADMIN)) { 800 ret = -EPERM; 801 break; 802 } 803 goto set_rcvbuf; 804 805 case SO_KEEPALIVE: 806 if (sk->sk_prot->keepalive) 807 sk->sk_prot->keepalive(sk, valbool); 808 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 809 break; 810 811 case SO_OOBINLINE: 812 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 813 break; 814 815 case SO_NO_CHECK: 816 sk->sk_no_check_tx = valbool; 817 break; 818 819 case SO_PRIORITY: 820 if ((val >= 0 && val <= 6) || 821 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 822 sk->sk_priority = val; 823 else 824 ret = -EPERM; 825 break; 826 827 case SO_LINGER: 828 if (optlen < sizeof(ling)) { 829 ret = -EINVAL; /* 1003.1g */ 830 break; 831 } 832 if (copy_from_user(&ling, optval, sizeof(ling))) { 833 ret = -EFAULT; 834 break; 835 } 836 if (!ling.l_onoff) 837 sock_reset_flag(sk, SOCK_LINGER); 838 else { 839 #if (BITS_PER_LONG == 32) 840 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 841 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 842 else 843 #endif 844 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 845 sock_set_flag(sk, SOCK_LINGER); 846 } 847 break; 848 849 case SO_BSDCOMPAT: 850 sock_warn_obsolete_bsdism("setsockopt"); 851 break; 852 853 case SO_PASSCRED: 854 if (valbool) 855 set_bit(SOCK_PASSCRED, &sock->flags); 856 else 857 clear_bit(SOCK_PASSCRED, &sock->flags); 858 break; 859 860 case SO_TIMESTAMP: 861 case SO_TIMESTAMPNS: 862 if (valbool) { 863 if (optname == SO_TIMESTAMP) 864 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 865 else 866 sock_set_flag(sk, SOCK_RCVTSTAMPNS); 867 sock_set_flag(sk, SOCK_RCVTSTAMP); 868 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 869 } else { 870 sock_reset_flag(sk, SOCK_RCVTSTAMP); 871 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 872 } 873 break; 874 875 case SO_TIMESTAMPING: 876 if (val & ~SOF_TIMESTAMPING_MASK) { 877 ret = -EINVAL; 878 break; 879 } 880 881 if (val & SOF_TIMESTAMPING_OPT_ID && 882 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 883 if (sk->sk_protocol == IPPROTO_TCP && 884 sk->sk_type == SOCK_STREAM) { 885 if ((1 << sk->sk_state) & 886 (TCPF_CLOSE | TCPF_LISTEN)) { 887 ret = -EINVAL; 888 break; 889 } 
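				/* Note: the state check above rules out CLOSE
				 * and LISTEN sockets, so seeding the timestamp
				 * key from tcp_sk(sk)->snd_una (rather than
				 * from 0 as in the non-TCP branch below) is
				 * well defined here.
				 */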
890 sk->sk_tskey = tcp_sk(sk)->snd_una; 891 } else { 892 sk->sk_tskey = 0; 893 } 894 } 895 896 if (val & SOF_TIMESTAMPING_OPT_STATS && 897 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 898 ret = -EINVAL; 899 break; 900 } 901 902 sk->sk_tsflags = val; 903 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 904 sock_enable_timestamp(sk, 905 SOCK_TIMESTAMPING_RX_SOFTWARE); 906 else 907 sock_disable_timestamp(sk, 908 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 909 break; 910 911 case SO_RCVLOWAT: 912 if (val < 0) 913 val = INT_MAX; 914 if (sock->ops->set_rcvlowat) 915 ret = sock->ops->set_rcvlowat(sk, val); 916 else 917 sk->sk_rcvlowat = val ? : 1; 918 break; 919 920 case SO_RCVTIMEO: 921 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); 922 break; 923 924 case SO_SNDTIMEO: 925 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); 926 break; 927 928 case SO_ATTACH_FILTER: 929 ret = -EINVAL; 930 if (optlen == sizeof(struct sock_fprog)) { 931 struct sock_fprog fprog; 932 933 ret = -EFAULT; 934 if (copy_from_user(&fprog, optval, sizeof(fprog))) 935 break; 936 937 ret = sk_attach_filter(&fprog, sk); 938 } 939 break; 940 941 case SO_ATTACH_BPF: 942 ret = -EINVAL; 943 if (optlen == sizeof(u32)) { 944 u32 ufd; 945 946 ret = -EFAULT; 947 if (copy_from_user(&ufd, optval, sizeof(ufd))) 948 break; 949 950 ret = sk_attach_bpf(ufd, sk); 951 } 952 break; 953 954 case SO_ATTACH_REUSEPORT_CBPF: 955 ret = -EINVAL; 956 if (optlen == sizeof(struct sock_fprog)) { 957 struct sock_fprog fprog; 958 959 ret = -EFAULT; 960 if (copy_from_user(&fprog, optval, sizeof(fprog))) 961 break; 962 963 ret = sk_reuseport_attach_filter(&fprog, sk); 964 } 965 break; 966 967 case SO_ATTACH_REUSEPORT_EBPF: 968 ret = -EINVAL; 969 if (optlen == sizeof(u32)) { 970 u32 ufd; 971 972 ret = -EFAULT; 973 if (copy_from_user(&ufd, optval, sizeof(ufd))) 974 break; 975 976 ret = sk_reuseport_attach_bpf(ufd, sk); 977 } 978 break; 979 980 case SO_DETACH_FILTER: 981 ret = sk_detach_filter(sk); 982 break; 983 984 case SO_LOCK_FILTER: 985 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 986 ret = -EPERM; 987 else 988 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 989 break; 990 991 case SO_PASSSEC: 992 if (valbool) 993 set_bit(SOCK_PASSSEC, &sock->flags); 994 else 995 clear_bit(SOCK_PASSSEC, &sock->flags); 996 break; 997 case SO_MARK: 998 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 999 ret = -EPERM; 1000 else 1001 sk->sk_mark = val; 1002 break; 1003 1004 case SO_RXQ_OVFL: 1005 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1006 break; 1007 1008 case SO_WIFI_STATUS: 1009 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1010 break; 1011 1012 case SO_PEEK_OFF: 1013 if (sock->ops->set_peek_off) 1014 ret = sock->ops->set_peek_off(sk, val); 1015 else 1016 ret = -EOPNOTSUPP; 1017 break; 1018 1019 case SO_NOFCS: 1020 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1021 break; 1022 1023 case SO_SELECT_ERR_QUEUE: 1024 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1025 break; 1026 1027 #ifdef CONFIG_NET_RX_BUSY_POLL 1028 case SO_BUSY_POLL: 1029 /* allow unprivileged users to decrease the value */ 1030 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1031 ret = -EPERM; 1032 else { 1033 if (val < 0) 1034 ret = -EINVAL; 1035 else 1036 sk->sk_ll_usec = val; 1037 } 1038 break; 1039 #endif 1040 1041 case SO_MAX_PACING_RATE: 1042 if (val != ~0U) 1043 cmpxchg(&sk->sk_pacing_status, 1044 SK_PACING_NONE, 1045 SK_PACING_NEEDED); 1046 sk->sk_max_pacing_rate = val; 1047 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 1048 sk->sk_max_pacing_rate); 1049 
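		/* Note: the current pacing rate is clamped right away so a
		 * previously higher rate does not outlive the new, lower
		 * SO_MAX_PACING_RATE cap.
		 */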
break; 1050 1051 case SO_INCOMING_CPU: 1052 sk->sk_incoming_cpu = val; 1053 break; 1054 1055 case SO_CNX_ADVICE: 1056 if (val == 1) 1057 dst_negative_advice(sk); 1058 break; 1059 1060 case SO_ZEROCOPY: 1061 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1062 if (sk->sk_protocol != IPPROTO_TCP) 1063 ret = -ENOTSUPP; 1064 } else if (sk->sk_family != PF_RDS) { 1065 ret = -ENOTSUPP; 1066 } 1067 if (!ret) { 1068 if (val < 0 || val > 1) 1069 ret = -EINVAL; 1070 else 1071 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1072 } 1073 break; 1074 1075 case SO_TXTIME: 1076 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1077 ret = -EPERM; 1078 } else if (optlen != sizeof(struct sock_txtime)) { 1079 ret = -EINVAL; 1080 } else if (copy_from_user(&sk_txtime, optval, 1081 sizeof(struct sock_txtime))) { 1082 ret = -EFAULT; 1083 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1084 ret = -EINVAL; 1085 } else { 1086 sock_valbool_flag(sk, SOCK_TXTIME, true); 1087 sk->sk_clockid = sk_txtime.clockid; 1088 sk->sk_txtime_deadline_mode = 1089 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1090 sk->sk_txtime_report_errors = 1091 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1092 } 1093 break; 1094 1095 default: 1096 ret = -ENOPROTOOPT; 1097 break; 1098 } 1099 release_sock(sk); 1100 return ret; 1101 } 1102 EXPORT_SYMBOL(sock_setsockopt); 1103 1104 1105 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1106 struct ucred *ucred) 1107 { 1108 ucred->pid = pid_vnr(pid); 1109 ucred->uid = ucred->gid = -1; 1110 if (cred) { 1111 struct user_namespace *current_ns = current_user_ns(); 1112 1113 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1114 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1115 } 1116 } 1117 1118 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1119 { 1120 struct user_namespace *user_ns = current_user_ns(); 1121 int i; 1122 1123 for (i = 0; i < src->ngroups; i++) 1124 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1125 return -EFAULT; 1126 1127 return 0; 1128 } 1129 1130 int sock_getsockopt(struct socket *sock, int level, int optname, 1131 char __user *optval, int __user *optlen) 1132 { 1133 struct sock *sk = sock->sk; 1134 1135 union { 1136 int val; 1137 u64 val64; 1138 struct linger ling; 1139 struct timeval tm; 1140 struct sock_txtime txtime; 1141 } v; 1142 1143 int lv = sizeof(int); 1144 int len; 1145 1146 if (get_user(len, optlen)) 1147 return -EFAULT; 1148 if (len < 0) 1149 return -EINVAL; 1150 1151 memset(&v, 0, sizeof(v)); 1152 1153 switch (optname) { 1154 case SO_DEBUG: 1155 v.val = sock_flag(sk, SOCK_DBG); 1156 break; 1157 1158 case SO_DONTROUTE: 1159 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1160 break; 1161 1162 case SO_BROADCAST: 1163 v.val = sock_flag(sk, SOCK_BROADCAST); 1164 break; 1165 1166 case SO_SNDBUF: 1167 v.val = sk->sk_sndbuf; 1168 break; 1169 1170 case SO_RCVBUF: 1171 v.val = sk->sk_rcvbuf; 1172 break; 1173 1174 case SO_REUSEADDR: 1175 v.val = sk->sk_reuse; 1176 break; 1177 1178 case SO_REUSEPORT: 1179 v.val = sk->sk_reuseport; 1180 break; 1181 1182 case SO_KEEPALIVE: 1183 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1184 break; 1185 1186 case SO_TYPE: 1187 v.val = sk->sk_type; 1188 break; 1189 1190 case SO_PROTOCOL: 1191 v.val = sk->sk_protocol; 1192 break; 1193 1194 case SO_DOMAIN: 1195 v.val = sk->sk_family; 1196 break; 1197 1198 case SO_ERROR: 1199 v.val = -sock_error(sk); 1200 if (v.val == 0) 1201 v.val = xchg(&sk->sk_err_soft, 0); 1202 break; 1203 1204 case SO_OOBINLINE: 1205 
v.val = sock_flag(sk, SOCK_URGINLINE); 1206 break; 1207 1208 case SO_NO_CHECK: 1209 v.val = sk->sk_no_check_tx; 1210 break; 1211 1212 case SO_PRIORITY: 1213 v.val = sk->sk_priority; 1214 break; 1215 1216 case SO_LINGER: 1217 lv = sizeof(v.ling); 1218 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1219 v.ling.l_linger = sk->sk_lingertime / HZ; 1220 break; 1221 1222 case SO_BSDCOMPAT: 1223 sock_warn_obsolete_bsdism("getsockopt"); 1224 break; 1225 1226 case SO_TIMESTAMP: 1227 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1228 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1229 break; 1230 1231 case SO_TIMESTAMPNS: 1232 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); 1233 break; 1234 1235 case SO_TIMESTAMPING: 1236 v.val = sk->sk_tsflags; 1237 break; 1238 1239 case SO_RCVTIMEO: 1240 lv = sizeof(struct timeval); 1241 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { 1242 v.tm.tv_sec = 0; 1243 v.tm.tv_usec = 0; 1244 } else { 1245 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 1246 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; 1247 } 1248 break; 1249 1250 case SO_SNDTIMEO: 1251 lv = sizeof(struct timeval); 1252 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { 1253 v.tm.tv_sec = 0; 1254 v.tm.tv_usec = 0; 1255 } else { 1256 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 1257 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; 1258 } 1259 break; 1260 1261 case SO_RCVLOWAT: 1262 v.val = sk->sk_rcvlowat; 1263 break; 1264 1265 case SO_SNDLOWAT: 1266 v.val = 1; 1267 break; 1268 1269 case SO_PASSCRED: 1270 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1271 break; 1272 1273 case SO_PEERCRED: 1274 { 1275 struct ucred peercred; 1276 if (len > sizeof(peercred)) 1277 len = sizeof(peercred); 1278 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1279 if (copy_to_user(optval, &peercred, len)) 1280 return -EFAULT; 1281 goto lenout; 1282 } 1283 1284 case SO_PEERGROUPS: 1285 { 1286 int ret, n; 1287 1288 if (!sk->sk_peer_cred) 1289 return -ENODATA; 1290 1291 n = sk->sk_peer_cred->group_info->ngroups; 1292 if (len < n * sizeof(gid_t)) { 1293 len = n * sizeof(gid_t); 1294 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1295 } 1296 len = n * sizeof(gid_t); 1297 1298 ret = groups_to_user((gid_t __user *)optval, 1299 sk->sk_peer_cred->group_info); 1300 if (ret) 1301 return ret; 1302 goto lenout; 1303 } 1304 1305 case SO_PEERNAME: 1306 { 1307 char address[128]; 1308 1309 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1310 if (lv < 0) 1311 return -ENOTCONN; 1312 if (lv < len) 1313 return -EINVAL; 1314 if (copy_to_user(optval, address, len)) 1315 return -EFAULT; 1316 goto lenout; 1317 } 1318 1319 /* Dubious BSD thing... Probably nobody even uses it, but 1320 * the UNIX standard wants it for whatever reason... 
-DaveM 1321 */ 1322 case SO_ACCEPTCONN: 1323 v.val = sk->sk_state == TCP_LISTEN; 1324 break; 1325 1326 case SO_PASSSEC: 1327 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1328 break; 1329 1330 case SO_PEERSEC: 1331 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1332 1333 case SO_MARK: 1334 v.val = sk->sk_mark; 1335 break; 1336 1337 case SO_RXQ_OVFL: 1338 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1339 break; 1340 1341 case SO_WIFI_STATUS: 1342 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1343 break; 1344 1345 case SO_PEEK_OFF: 1346 if (!sock->ops->set_peek_off) 1347 return -EOPNOTSUPP; 1348 1349 v.val = sk->sk_peek_off; 1350 break; 1351 case SO_NOFCS: 1352 v.val = sock_flag(sk, SOCK_NOFCS); 1353 break; 1354 1355 case SO_BINDTODEVICE: 1356 return sock_getbindtodevice(sk, optval, optlen, len); 1357 1358 case SO_GET_FILTER: 1359 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1360 if (len < 0) 1361 return len; 1362 1363 goto lenout; 1364 1365 case SO_LOCK_FILTER: 1366 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1367 break; 1368 1369 case SO_BPF_EXTENSIONS: 1370 v.val = bpf_tell_extensions(); 1371 break; 1372 1373 case SO_SELECT_ERR_QUEUE: 1374 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1375 break; 1376 1377 #ifdef CONFIG_NET_RX_BUSY_POLL 1378 case SO_BUSY_POLL: 1379 v.val = sk->sk_ll_usec; 1380 break; 1381 #endif 1382 1383 case SO_MAX_PACING_RATE: 1384 v.val = sk->sk_max_pacing_rate; 1385 break; 1386 1387 case SO_INCOMING_CPU: 1388 v.val = sk->sk_incoming_cpu; 1389 break; 1390 1391 case SO_MEMINFO: 1392 { 1393 u32 meminfo[SK_MEMINFO_VARS]; 1394 1395 if (get_user(len, optlen)) 1396 return -EFAULT; 1397 1398 sk_get_meminfo(sk, meminfo); 1399 1400 len = min_t(unsigned int, len, sizeof(meminfo)); 1401 if (copy_to_user(optval, &meminfo, len)) 1402 return -EFAULT; 1403 1404 goto lenout; 1405 } 1406 1407 #ifdef CONFIG_NET_RX_BUSY_POLL 1408 case SO_INCOMING_NAPI_ID: 1409 v.val = READ_ONCE(sk->sk_napi_id); 1410 1411 /* aggregate non-NAPI IDs down to 0 */ 1412 if (v.val < MIN_NAPI_ID) 1413 v.val = 0; 1414 1415 break; 1416 #endif 1417 1418 case SO_COOKIE: 1419 lv = sizeof(u64); 1420 if (len < lv) 1421 return -EINVAL; 1422 v.val64 = sock_gen_cookie(sk); 1423 break; 1424 1425 case SO_ZEROCOPY: 1426 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1427 break; 1428 1429 case SO_TXTIME: 1430 lv = sizeof(v.txtime); 1431 v.txtime.clockid = sk->sk_clockid; 1432 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1433 SOF_TXTIME_DEADLINE_MODE : 0; 1434 v.txtime.flags |= sk->sk_txtime_report_errors ? 1435 SOF_TXTIME_REPORT_ERRORS : 0; 1436 break; 1437 1438 default: 1439 /* We implement the SO_SNDLOWAT etc to not be settable 1440 * (1003.1g 7). 1441 */ 1442 return -ENOPROTOOPT; 1443 } 1444 1445 if (len > lv) 1446 len = lv; 1447 if (copy_to_user(optval, &v, len)) 1448 return -EFAULT; 1449 lenout: 1450 if (put_user(len, optlen)) 1451 return -EFAULT; 1452 return 0; 1453 } 1454 1455 /* 1456 * Initialize an sk_lock. 1457 * 1458 * (We also register the sk_lock with the lock validator.) 
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ?
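			/* kernel sockets (kern != 0) must not pin the network
			 * namespace, so they take no netns reference and
			 * sk_net_refcnt stays 0 for them.
			 */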
0 : 1; 1578 if (likely(sk->sk_net_refcnt)) { 1579 get_net(net); 1580 sock_inuse_add(net, 1); 1581 } 1582 1583 sock_net_set(sk, net); 1584 refcount_set(&sk->sk_wmem_alloc, 1); 1585 1586 mem_cgroup_sk_alloc(sk); 1587 cgroup_sk_alloc(&sk->sk_cgrp_data); 1588 sock_update_classid(&sk->sk_cgrp_data); 1589 sock_update_netprioidx(&sk->sk_cgrp_data); 1590 } 1591 1592 return sk; 1593 } 1594 EXPORT_SYMBOL(sk_alloc); 1595 1596 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1597 * grace period. This is the case for UDP sockets and TCP listeners. 1598 */ 1599 static void __sk_destruct(struct rcu_head *head) 1600 { 1601 struct sock *sk = container_of(head, struct sock, sk_rcu); 1602 struct sk_filter *filter; 1603 1604 if (sk->sk_destruct) 1605 sk->sk_destruct(sk); 1606 1607 filter = rcu_dereference_check(sk->sk_filter, 1608 refcount_read(&sk->sk_wmem_alloc) == 0); 1609 if (filter) { 1610 sk_filter_uncharge(sk, filter); 1611 RCU_INIT_POINTER(sk->sk_filter, NULL); 1612 } 1613 if (rcu_access_pointer(sk->sk_reuseport_cb)) 1614 reuseport_detach_sock(sk); 1615 1616 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1617 1618 if (atomic_read(&sk->sk_omem_alloc)) 1619 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1620 __func__, atomic_read(&sk->sk_omem_alloc)); 1621 1622 if (sk->sk_frag.page) { 1623 put_page(sk->sk_frag.page); 1624 sk->sk_frag.page = NULL; 1625 } 1626 1627 if (sk->sk_peer_cred) 1628 put_cred(sk->sk_peer_cred); 1629 put_pid(sk->sk_peer_pid); 1630 if (likely(sk->sk_net_refcnt)) 1631 put_net(sock_net(sk)); 1632 sk_prot_free(sk->sk_prot_creator, sk); 1633 } 1634 1635 void sk_destruct(struct sock *sk) 1636 { 1637 if (sock_flag(sk, SOCK_RCU_FREE)) 1638 call_rcu(&sk->sk_rcu, __sk_destruct); 1639 else 1640 __sk_destruct(&sk->sk_rcu); 1641 } 1642 1643 static void __sk_free(struct sock *sk) 1644 { 1645 if (likely(sk->sk_net_refcnt)) 1646 sock_inuse_add(sock_net(sk), -1); 1647 1648 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1649 sock_diag_broadcast_destroy(sk); 1650 else 1651 sk_destruct(sk); 1652 } 1653 1654 void sk_free(struct sock *sk) 1655 { 1656 /* 1657 * We subtract one from sk_wmem_alloc and can know if 1658 * some packets are still in some tx queue. 
1659 * If not null, sock_wfree() will call __sk_free(sk) later 1660 */ 1661 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1662 __sk_free(sk); 1663 } 1664 EXPORT_SYMBOL(sk_free); 1665 1666 static void sk_init_common(struct sock *sk) 1667 { 1668 skb_queue_head_init(&sk->sk_receive_queue); 1669 skb_queue_head_init(&sk->sk_write_queue); 1670 skb_queue_head_init(&sk->sk_error_queue); 1671 1672 rwlock_init(&sk->sk_callback_lock); 1673 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1674 af_rlock_keys + sk->sk_family, 1675 af_family_rlock_key_strings[sk->sk_family]); 1676 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1677 af_wlock_keys + sk->sk_family, 1678 af_family_wlock_key_strings[sk->sk_family]); 1679 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1680 af_elock_keys + sk->sk_family, 1681 af_family_elock_key_strings[sk->sk_family]); 1682 lockdep_set_class_and_name(&sk->sk_callback_lock, 1683 af_callback_keys + sk->sk_family, 1684 af_family_clock_key_strings[sk->sk_family]); 1685 } 1686 1687 /** 1688 * sk_clone_lock - clone a socket, and lock its clone 1689 * @sk: the socket to clone 1690 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1691 * 1692 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1693 */ 1694 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1695 { 1696 struct sock *newsk; 1697 bool is_charged = true; 1698 1699 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); 1700 if (newsk != NULL) { 1701 struct sk_filter *filter; 1702 1703 sock_copy(newsk, sk); 1704 1705 newsk->sk_prot_creator = sk->sk_prot; 1706 1707 /* SANITY */ 1708 if (likely(newsk->sk_net_refcnt)) 1709 get_net(sock_net(newsk)); 1710 sk_node_init(&newsk->sk_node); 1711 sock_lock_init(newsk); 1712 bh_lock_sock(newsk); 1713 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1714 newsk->sk_backlog.len = 0; 1715 1716 atomic_set(&newsk->sk_rmem_alloc, 0); 1717 /* 1718 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1719 */ 1720 refcount_set(&newsk->sk_wmem_alloc, 1); 1721 atomic_set(&newsk->sk_omem_alloc, 0); 1722 sk_init_common(newsk); 1723 1724 newsk->sk_dst_cache = NULL; 1725 newsk->sk_dst_pending_confirm = 0; 1726 newsk->sk_wmem_queued = 0; 1727 newsk->sk_forward_alloc = 0; 1728 atomic_set(&newsk->sk_drops, 0); 1729 newsk->sk_send_head = NULL; 1730 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1731 atomic_set(&newsk->sk_zckey, 0); 1732 1733 sock_reset_flag(newsk, SOCK_DONE); 1734 mem_cgroup_sk_alloc(newsk); 1735 cgroup_sk_alloc(&newsk->sk_cgrp_data); 1736 1737 rcu_read_lock(); 1738 filter = rcu_dereference(sk->sk_filter); 1739 if (filter != NULL) 1740 /* though it's an empty new sock, the charging may fail 1741 * if sysctl_optmem_max was changed between creation of 1742 * original socket and cloning 1743 */ 1744 is_charged = sk_filter_charge(newsk, filter); 1745 RCU_INIT_POINTER(newsk->sk_filter, filter); 1746 rcu_read_unlock(); 1747 1748 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1749 /* We need to make sure that we don't uncharge the new 1750 * socket if we couldn't charge it in the first place 1751 * as otherwise we uncharge the parent's filter. 
1752 */ 1753 if (!is_charged) 1754 RCU_INIT_POINTER(newsk->sk_filter, NULL); 1755 sk_free_unlock_clone(newsk); 1756 newsk = NULL; 1757 goto out; 1758 } 1759 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1760 1761 newsk->sk_err = 0; 1762 newsk->sk_err_soft = 0; 1763 newsk->sk_priority = 0; 1764 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1765 atomic64_set(&newsk->sk_cookie, 0); 1766 if (likely(newsk->sk_net_refcnt)) 1767 sock_inuse_add(sock_net(newsk), 1); 1768 1769 /* 1770 * Before updating sk_refcnt, we must commit prior changes to memory 1771 * (Documentation/RCU/rculist_nulls.txt for details) 1772 */ 1773 smp_wmb(); 1774 refcount_set(&newsk->sk_refcnt, 2); 1775 1776 /* 1777 * Increment the counter in the same struct proto as the master 1778 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1779 * is the same as sk->sk_prot->socks, as this field was copied 1780 * with memcpy). 1781 * 1782 * This _changes_ the previous behaviour, where 1783 * tcp_create_openreq_child always was incrementing the 1784 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1785 * to be taken into account in all callers. -acme 1786 */ 1787 sk_refcnt_debug_inc(newsk); 1788 sk_set_socket(newsk, NULL); 1789 newsk->sk_wq = NULL; 1790 1791 if (newsk->sk_prot->sockets_allocated) 1792 sk_sockets_allocated_inc(newsk); 1793 1794 if (sock_needs_netstamp(sk) && 1795 newsk->sk_flags & SK_FLAGS_TIMESTAMP) 1796 net_enable_timestamp(); 1797 } 1798 out: 1799 return newsk; 1800 } 1801 EXPORT_SYMBOL_GPL(sk_clone_lock); 1802 1803 void sk_free_unlock_clone(struct sock *sk) 1804 { 1805 /* It is still raw copy of parent, so invalidate 1806 * destructor and make plain sk_free() */ 1807 sk->sk_destruct = NULL; 1808 bh_unlock_sock(sk); 1809 sk_free(sk); 1810 } 1811 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 1812 1813 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1814 { 1815 u32 max_segs = 1; 1816 1817 sk_dst_set(sk, dst); 1818 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 1819 if (sk->sk_route_caps & NETIF_F_GSO) 1820 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1821 sk->sk_route_caps &= ~sk->sk_route_nocaps; 1822 if (sk_can_gso(sk)) { 1823 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 1824 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1825 } else { 1826 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1827 sk->sk_gso_max_size = dst->dev->gso_max_size; 1828 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 1829 } 1830 } 1831 sk->sk_gso_max_segs = max_segs; 1832 } 1833 EXPORT_SYMBOL_GPL(sk_setup_caps); 1834 1835 /* 1836 * Simple resource managers for sockets. 1837 */ 1838 1839 1840 /* 1841 * Write buffer destructor automatically called from kfree_skb. 1842 */ 1843 void sock_wfree(struct sk_buff *skb) 1844 { 1845 struct sock *sk = skb->sk; 1846 unsigned int len = skb->truesize; 1847 1848 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 1849 /* 1850 * Keep a reference on sk_wmem_alloc, this will be released 1851 * after sk_write_space() call 1852 */ 1853 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 1854 sk->sk_write_space(sk); 1855 len = 1; 1856 } 1857 /* 1858 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 1859 * could not do because of in-flight packets 1860 */ 1861 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 1862 __sk_free(sk); 1863 } 1864 EXPORT_SYMBOL(sock_wfree); 1865 1866 /* This variant of sock_wfree() is used by TCP, 1867 * since it sets SOCK_USE_WRITE_QUEUE. 
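 * Because TCP does its own write-space handling, this destructor only drops
 * the truesize from sk_wmem_alloc and, if that reaches zero, finishes a
 * __sk_free() that sk_free() could not complete earlier.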
1868 */ 1869 void __sock_wfree(struct sk_buff *skb) 1870 { 1871 struct sock *sk = skb->sk; 1872 1873 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 1874 __sk_free(sk); 1875 } 1876 1877 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1878 { 1879 skb_orphan(skb); 1880 skb->sk = sk; 1881 #ifdef CONFIG_INET 1882 if (unlikely(!sk_fullsock(sk))) { 1883 skb->destructor = sock_edemux; 1884 sock_hold(sk); 1885 return; 1886 } 1887 #endif 1888 skb->destructor = sock_wfree; 1889 skb_set_hash_from_sk(skb, sk); 1890 /* 1891 * We used to take a refcount on sk, but following operation 1892 * is enough to guarantee sk_free() wont free this sock until 1893 * all in-flight packets are completed 1894 */ 1895 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 1896 } 1897 EXPORT_SYMBOL(skb_set_owner_w); 1898 1899 /* This helper is used by netem, as it can hold packets in its 1900 * delay queue. We want to allow the owner socket to send more 1901 * packets, as if they were already TX completed by a typical driver. 1902 * But we also want to keep skb->sk set because some packet schedulers 1903 * rely on it (sch_fq for example). 1904 */ 1905 void skb_orphan_partial(struct sk_buff *skb) 1906 { 1907 if (skb_is_tcp_pure_ack(skb)) 1908 return; 1909 1910 if (skb->destructor == sock_wfree 1911 #ifdef CONFIG_INET 1912 || skb->destructor == tcp_wfree 1913 #endif 1914 ) { 1915 struct sock *sk = skb->sk; 1916 1917 if (refcount_inc_not_zero(&sk->sk_refcnt)) { 1918 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); 1919 skb->destructor = sock_efree; 1920 } 1921 } else { 1922 skb_orphan(skb); 1923 } 1924 } 1925 EXPORT_SYMBOL(skb_orphan_partial); 1926 1927 /* 1928 * Read buffer destructor automatically called from kfree_skb. 1929 */ 1930 void sock_rfree(struct sk_buff *skb) 1931 { 1932 struct sock *sk = skb->sk; 1933 unsigned int len = skb->truesize; 1934 1935 atomic_sub(len, &sk->sk_rmem_alloc); 1936 sk_mem_uncharge(sk, len); 1937 } 1938 EXPORT_SYMBOL(sock_rfree); 1939 1940 /* 1941 * Buffer destructor for skbs that are not used directly in read or write 1942 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 1943 */ 1944 void sock_efree(struct sk_buff *skb) 1945 { 1946 sock_put(skb->sk); 1947 } 1948 EXPORT_SYMBOL(sock_efree); 1949 1950 kuid_t sock_i_uid(struct sock *sk) 1951 { 1952 kuid_t uid; 1953 1954 read_lock_bh(&sk->sk_callback_lock); 1955 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 1956 read_unlock_bh(&sk->sk_callback_lock); 1957 return uid; 1958 } 1959 EXPORT_SYMBOL(sock_i_uid); 1960 1961 unsigned long sock_i_ino(struct sock *sk) 1962 { 1963 unsigned long ino; 1964 1965 read_lock_bh(&sk->sk_callback_lock); 1966 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 1967 read_unlock_bh(&sk->sk_callback_lock); 1968 return ino; 1969 } 1970 EXPORT_SYMBOL(sock_i_ino); 1971 1972 /* 1973 * Allocate a skb from the socket's send buffer. 
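 *
 * Illustrative sketch (not a caller in this file): a protocol sendmsg
 * implementation might try
 *
 *	skb = sock_wmalloc(sk, size + hlen, 0, GFP_KERNEL);
 *
 * and fall back to waiting for write space when NULL comes back because
 * sk_wmem_alloc has already reached sk_sndbuf.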
1974 */ 1975 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 1976 gfp_t priority) 1977 { 1978 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 1979 struct sk_buff *skb = alloc_skb(size, priority); 1980 if (skb) { 1981 skb_set_owner_w(skb, sk); 1982 return skb; 1983 } 1984 } 1985 return NULL; 1986 } 1987 EXPORT_SYMBOL(sock_wmalloc); 1988 1989 static void sock_ofree(struct sk_buff *skb) 1990 { 1991 struct sock *sk = skb->sk; 1992 1993 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 1994 } 1995 1996 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 1997 gfp_t priority) 1998 { 1999 struct sk_buff *skb; 2000 2001 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2002 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2003 sysctl_optmem_max) 2004 return NULL; 2005 2006 skb = alloc_skb(size, priority); 2007 if (!skb) 2008 return NULL; 2009 2010 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2011 skb->sk = sk; 2012 skb->destructor = sock_ofree; 2013 return skb; 2014 } 2015 2016 /* 2017 * Allocate a memory block from the socket's option memory buffer. 2018 */ 2019 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2020 { 2021 if ((unsigned int)size <= sysctl_optmem_max && 2022 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2023 void *mem; 2024 /* First do the add, to avoid the race if kmalloc 2025 * might sleep. 2026 */ 2027 atomic_add(size, &sk->sk_omem_alloc); 2028 mem = kmalloc(size, priority); 2029 if (mem) 2030 return mem; 2031 atomic_sub(size, &sk->sk_omem_alloc); 2032 } 2033 return NULL; 2034 } 2035 EXPORT_SYMBOL(sock_kmalloc); 2036 2037 /* Free an option memory block. Note, we actually want the inline 2038 * here as this allows gcc to detect the nullify and fold away the 2039 * condition entirely. 2040 */ 2041 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2042 const bool nullify) 2043 { 2044 if (WARN_ON_ONCE(!mem)) 2045 return; 2046 if (nullify) 2047 kzfree(mem); 2048 else 2049 kfree(mem); 2050 atomic_sub(size, &sk->sk_omem_alloc); 2051 } 2052 2053 void sock_kfree_s(struct sock *sk, void *mem, int size) 2054 { 2055 __sock_kfree_s(sk, mem, size, false); 2056 } 2057 EXPORT_SYMBOL(sock_kfree_s); 2058 2059 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2060 { 2061 __sock_kfree_s(sk, mem, size, true); 2062 } 2063 EXPORT_SYMBOL(sock_kzfree_s); 2064 2065 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2066 I think, these locks should be removed for datagram sockets. 
2067 */ 2068 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2069 { 2070 DEFINE_WAIT(wait); 2071 2072 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2073 for (;;) { 2074 if (!timeo) 2075 break; 2076 if (signal_pending(current)) 2077 break; 2078 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2079 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2080 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 2081 break; 2082 if (sk->sk_shutdown & SEND_SHUTDOWN) 2083 break; 2084 if (sk->sk_err) 2085 break; 2086 timeo = schedule_timeout(timeo); 2087 } 2088 finish_wait(sk_sleep(sk), &wait); 2089 return timeo; 2090 } 2091 2092 2093 /* 2094 * Generic send/receive buffer handlers 2095 */ 2096 2097 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2098 unsigned long data_len, int noblock, 2099 int *errcode, int max_page_order) 2100 { 2101 struct sk_buff *skb; 2102 long timeo; 2103 int err; 2104 2105 timeo = sock_sndtimeo(sk, noblock); 2106 for (;;) { 2107 err = sock_error(sk); 2108 if (err != 0) 2109 goto failure; 2110 2111 err = -EPIPE; 2112 if (sk->sk_shutdown & SEND_SHUTDOWN) 2113 goto failure; 2114 2115 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) 2116 break; 2117 2118 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2119 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2120 err = -EAGAIN; 2121 if (!timeo) 2122 goto failure; 2123 if (signal_pending(current)) 2124 goto interrupted; 2125 timeo = sock_wait_for_wmem(sk, timeo); 2126 } 2127 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2128 errcode, sk->sk_allocation); 2129 if (skb) 2130 skb_set_owner_w(skb, sk); 2131 return skb; 2132 2133 interrupted: 2134 err = sock_intr_errno(timeo); 2135 failure: 2136 *errcode = err; 2137 return NULL; 2138 } 2139 EXPORT_SYMBOL(sock_alloc_send_pskb); 2140 2141 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2142 int noblock, int *errcode) 2143 { 2144 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2145 } 2146 EXPORT_SYMBOL(sock_alloc_send_skb); 2147 2148 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2149 struct sockcm_cookie *sockc) 2150 { 2151 u32 tsflags; 2152 2153 switch (cmsg->cmsg_type) { 2154 case SO_MARK: 2155 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2156 return -EPERM; 2157 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2158 return -EINVAL; 2159 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2160 break; 2161 case SO_TIMESTAMPING: 2162 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2163 return -EINVAL; 2164 2165 tsflags = *(u32 *)CMSG_DATA(cmsg); 2166 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2167 return -EINVAL; 2168 2169 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2170 sockc->tsflags |= tsflags; 2171 break; 2172 case SCM_TXTIME: 2173 if (!sock_flag(sk, SOCK_TXTIME)) 2174 return -EINVAL; 2175 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2176 return -EINVAL; 2177 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2178 break; 2179 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
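 * Accept them here without further processing instead of failing with -EINVAL.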
*/ 2180 case SCM_RIGHTS: 2181 case SCM_CREDENTIALS: 2182 break; 2183 default: 2184 return -EINVAL; 2185 } 2186 return 0; 2187 } 2188 EXPORT_SYMBOL(__sock_cmsg_send); 2189 2190 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2191 struct sockcm_cookie *sockc) 2192 { 2193 struct cmsghdr *cmsg; 2194 int ret; 2195 2196 for_each_cmsghdr(cmsg, msg) { 2197 if (!CMSG_OK(msg, cmsg)) 2198 return -EINVAL; 2199 if (cmsg->cmsg_level != SOL_SOCKET) 2200 continue; 2201 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2202 if (ret) 2203 return ret; 2204 } 2205 return 0; 2206 } 2207 EXPORT_SYMBOL(sock_cmsg_send); 2208 2209 static void sk_enter_memory_pressure(struct sock *sk) 2210 { 2211 if (!sk->sk_prot->enter_memory_pressure) 2212 return; 2213 2214 sk->sk_prot->enter_memory_pressure(sk); 2215 } 2216 2217 static void sk_leave_memory_pressure(struct sock *sk) 2218 { 2219 if (sk->sk_prot->leave_memory_pressure) { 2220 sk->sk_prot->leave_memory_pressure(sk); 2221 } else { 2222 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2223 2224 if (memory_pressure && *memory_pressure) 2225 *memory_pressure = 0; 2226 } 2227 } 2228 2229 /* On 32bit arches, an skb frag is limited to 2^15 */ 2230 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2231 2232 /** 2233 * skb_page_frag_refill - check that a page_frag contains enough room 2234 * @sz: minimum size of the fragment we want to get 2235 * @pfrag: pointer to page_frag 2236 * @gfp: priority for memory allocation 2237 * 2238 * Note: While this allocator tries to use high order pages, there is 2239 * no guarantee that allocations succeed. Therefore, @sz MUST be 2240 * less or equal than PAGE_SIZE. 2241 */ 2242 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2243 { 2244 if (pfrag->page) { 2245 if (page_ref_count(pfrag->page) == 1) { 2246 pfrag->offset = 0; 2247 return true; 2248 } 2249 if (pfrag->offset + sz <= pfrag->size) 2250 return true; 2251 put_page(pfrag->page); 2252 } 2253 2254 pfrag->offset = 0; 2255 if (SKB_FRAG_PAGE_ORDER) { 2256 /* Avoid direct reclaim but allow kswapd to wake */ 2257 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2258 __GFP_COMP | __GFP_NOWARN | 2259 __GFP_NORETRY, 2260 SKB_FRAG_PAGE_ORDER); 2261 if (likely(pfrag->page)) { 2262 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2263 return true; 2264 } 2265 } 2266 pfrag->page = alloc_page(gfp); 2267 if (likely(pfrag->page)) { 2268 pfrag->size = PAGE_SIZE; 2269 return true; 2270 } 2271 return false; 2272 } 2273 EXPORT_SYMBOL(skb_page_frag_refill); 2274 2275 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2276 { 2277 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2278 return true; 2279 2280 sk_enter_memory_pressure(sk); 2281 sk_stream_moderate_sndbuf(sk); 2282 return false; 2283 } 2284 EXPORT_SYMBOL(sk_page_frag_refill); 2285 2286 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, 2287 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, 2288 int first_coalesce) 2289 { 2290 int sg_curr = *sg_curr_index, use = 0, rc = 0; 2291 unsigned int size = *sg_curr_size; 2292 struct page_frag *pfrag; 2293 struct scatterlist *sge; 2294 2295 len -= size; 2296 pfrag = sk_page_frag(sk); 2297 2298 while (len > 0) { 2299 unsigned int orig_offset; 2300 2301 if (!sk_page_frag_refill(sk, pfrag)) { 2302 rc = -ENOMEM; 2303 goto out; 2304 } 2305 2306 use = min_t(int, len, pfrag->size - pfrag->offset); 2307 2308 if (!sk_wmem_schedule(sk, use)) { 2309 rc = -ENOMEM; 2310 goto out; 2311 } 2312 2313 
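		/* charge the bytes we are about to use against sk_forward_alloc */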
sk_mem_charge(sk, use); 2314 size += use; 2315 orig_offset = pfrag->offset; 2316 pfrag->offset += use; 2317 2318 sge = sg + sg_curr - 1; 2319 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && 2320 sg->offset + sg->length == orig_offset) { 2321 sg->length += use; 2322 } else { 2323 sge = sg + sg_curr; 2324 sg_unmark_end(sge); 2325 sg_set_page(sge, pfrag->page, use, orig_offset); 2326 get_page(pfrag->page); 2327 sg_curr++; 2328 2329 if (sg_curr == MAX_SKB_FRAGS) 2330 sg_curr = 0; 2331 2332 if (sg_curr == sg_start) { 2333 rc = -ENOSPC; 2334 break; 2335 } 2336 } 2337 2338 len -= use; 2339 } 2340 out: 2341 *sg_curr_size = size; 2342 *sg_curr_index = sg_curr; 2343 return rc; 2344 } 2345 EXPORT_SYMBOL(sk_alloc_sg); 2346 2347 static void __lock_sock(struct sock *sk) 2348 __releases(&sk->sk_lock.slock) 2349 __acquires(&sk->sk_lock.slock) 2350 { 2351 DEFINE_WAIT(wait); 2352 2353 for (;;) { 2354 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2355 TASK_UNINTERRUPTIBLE); 2356 spin_unlock_bh(&sk->sk_lock.slock); 2357 schedule(); 2358 spin_lock_bh(&sk->sk_lock.slock); 2359 if (!sock_owned_by_user(sk)) 2360 break; 2361 } 2362 finish_wait(&sk->sk_lock.wq, &wait); 2363 } 2364 2365 static void __release_sock(struct sock *sk) 2366 __releases(&sk->sk_lock.slock) 2367 __acquires(&sk->sk_lock.slock) 2368 { 2369 struct sk_buff *skb, *next; 2370 2371 while ((skb = sk->sk_backlog.head) != NULL) { 2372 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2373 2374 spin_unlock_bh(&sk->sk_lock.slock); 2375 2376 do { 2377 next = skb->next; 2378 prefetch(next); 2379 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2380 skb->next = NULL; 2381 sk_backlog_rcv(sk, skb); 2382 2383 cond_resched(); 2384 2385 skb = next; 2386 } while (skb != NULL); 2387 2388 spin_lock_bh(&sk->sk_lock.slock); 2389 } 2390 2391 /* 2392 * Doing the zeroing here guarantees we cannot loop forever 2393 * while a wild producer attempts to flood us. 2394 */ 2395 sk->sk_backlog.len = 0; 2396 } 2397 2398 void __sk_flush_backlog(struct sock *sk) 2399 { 2400 spin_lock_bh(&sk->sk_lock.slock); 2401 __release_sock(sk); 2402 spin_unlock_bh(&sk->sk_lock.slock); 2403 } 2404 2405 /** 2406 * sk_wait_data - wait for data to arrive at sk_receive_queue 2407 * @sk: sock to wait on 2408 * @timeo: for how long 2409 * @skb: last skb seen on sk_receive_queue 2410 * 2411 * Now socket state including sk->sk_err is changed only under lock, 2412 * hence we may omit checks after joining the wait queue. 2413 * We check the receive queue before schedule() only as an optimization; 2414 * it is very likely that release_sock() added new data.
2415 */ 2416 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2417 { 2418 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2419 int rc; 2420 2421 add_wait_queue(sk_sleep(sk), &wait); 2422 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2423 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2424 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2425 remove_wait_queue(sk_sleep(sk), &wait); 2426 return rc; 2427 } 2428 EXPORT_SYMBOL(sk_wait_data); 2429 2430 /** 2431 * __sk_mem_raise_allocated - increase memory_allocated 2432 * @sk: socket 2433 * @size: memory size to allocate 2434 * @amt: pages to allocate 2435 * @kind: allocation type 2436 * 2437 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2438 */ 2439 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2440 { 2441 struct proto *prot = sk->sk_prot; 2442 long allocated = sk_memory_allocated_add(sk, amt); 2443 bool charged = true; 2444 2445 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2446 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) 2447 goto suppress_allocation; 2448 2449 /* Under limit. */ 2450 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2451 sk_leave_memory_pressure(sk); 2452 return 1; 2453 } 2454 2455 /* Under pressure. */ 2456 if (allocated > sk_prot_mem_limits(sk, 1)) 2457 sk_enter_memory_pressure(sk); 2458 2459 /* Over hard limit. */ 2460 if (allocated > sk_prot_mem_limits(sk, 2)) 2461 goto suppress_allocation; 2462 2463 /* guarantee minimum buffer size under pressure */ 2464 if (kind == SK_MEM_RECV) { 2465 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2466 return 1; 2467 2468 } else { /* SK_MEM_SEND */ 2469 int wmem0 = sk_get_wmem0(sk, prot); 2470 2471 if (sk->sk_type == SOCK_STREAM) { 2472 if (sk->sk_wmem_queued < wmem0) 2473 return 1; 2474 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2475 return 1; 2476 } 2477 } 2478 2479 if (sk_has_memory_pressure(sk)) { 2480 int alloc; 2481 2482 if (!sk_under_memory_pressure(sk)) 2483 return 1; 2484 alloc = sk_sockets_allocated_read_positive(sk); 2485 if (sk_prot_mem_limits(sk, 2) > alloc * 2486 sk_mem_pages(sk->sk_wmem_queued + 2487 atomic_read(&sk->sk_rmem_alloc) + 2488 sk->sk_forward_alloc)) 2489 return 1; 2490 } 2491 2492 suppress_allocation: 2493 2494 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2495 sk_stream_moderate_sndbuf(sk); 2496 2497 /* Fail only if socket is _under_ its sndbuf. 2498 * In this case we cannot block, so that we have to fail. 2499 */ 2500 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2501 return 1; 2502 } 2503 2504 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2505 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2506 2507 sk_memory_allocated_sub(sk, amt); 2508 2509 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2510 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2511 2512 return 0; 2513 } 2514 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2515 2516 /** 2517 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2518 * @sk: socket 2519 * @size: memory size to allocate 2520 * @kind: allocation type 2521 * 2522 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2523 * rmem allocation. This function assumes that protocols which have 2524 * memory_pressure use sk_wmem_queued as write buffer accounting. 
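 * Return: 1 if the requested amount of memory was granted, 0 if not.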
2525 */ 2526 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2527 { 2528 int ret, amt = sk_mem_pages(size); 2529 2530 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2531 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2532 if (!ret) 2533 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2534 return ret; 2535 } 2536 EXPORT_SYMBOL(__sk_mem_schedule); 2537 2538 /** 2539 * __sk_mem_reduce_allocated - reclaim memory_allocated 2540 * @sk: socket 2541 * @amount: number of quanta 2542 * 2543 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2544 */ 2545 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2546 { 2547 sk_memory_allocated_sub(sk, amount); 2548 2549 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2550 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2551 2552 if (sk_under_memory_pressure(sk) && 2553 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2554 sk_leave_memory_pressure(sk); 2555 } 2556 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2557 2558 /** 2559 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2560 * @sk: socket 2561 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2562 */ 2563 void __sk_mem_reclaim(struct sock *sk, int amount) 2564 { 2565 amount >>= SK_MEM_QUANTUM_SHIFT; 2566 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2567 __sk_mem_reduce_allocated(sk, amount); 2568 } 2569 EXPORT_SYMBOL(__sk_mem_reclaim); 2570 2571 int sk_set_peek_off(struct sock *sk, int val) 2572 { 2573 sk->sk_peek_off = val; 2574 return 0; 2575 } 2576 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2577 2578 /* 2579 * Set of default routines for initialising struct proto_ops when 2580 * the protocol does not support a particular function. In certain 2581 * cases where it makes no sense for a protocol to have a "do nothing" 2582 * function, some default processing is provided. 
2583 */ 2584 2585 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2586 { 2587 return -EOPNOTSUPP; 2588 } 2589 EXPORT_SYMBOL(sock_no_bind); 2590 2591 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2592 int len, int flags) 2593 { 2594 return -EOPNOTSUPP; 2595 } 2596 EXPORT_SYMBOL(sock_no_connect); 2597 2598 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2599 { 2600 return -EOPNOTSUPP; 2601 } 2602 EXPORT_SYMBOL(sock_no_socketpair); 2603 2604 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2605 bool kern) 2606 { 2607 return -EOPNOTSUPP; 2608 } 2609 EXPORT_SYMBOL(sock_no_accept); 2610 2611 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2612 int peer) 2613 { 2614 return -EOPNOTSUPP; 2615 } 2616 EXPORT_SYMBOL(sock_no_getname); 2617 2618 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2619 { 2620 return -EOPNOTSUPP; 2621 } 2622 EXPORT_SYMBOL(sock_no_ioctl); 2623 2624 int sock_no_listen(struct socket *sock, int backlog) 2625 { 2626 return -EOPNOTSUPP; 2627 } 2628 EXPORT_SYMBOL(sock_no_listen); 2629 2630 int sock_no_shutdown(struct socket *sock, int how) 2631 { 2632 return -EOPNOTSUPP; 2633 } 2634 EXPORT_SYMBOL(sock_no_shutdown); 2635 2636 int sock_no_setsockopt(struct socket *sock, int level, int optname, 2637 char __user *optval, unsigned int optlen) 2638 { 2639 return -EOPNOTSUPP; 2640 } 2641 EXPORT_SYMBOL(sock_no_setsockopt); 2642 2643 int sock_no_getsockopt(struct socket *sock, int level, int optname, 2644 char __user *optval, int __user *optlen) 2645 { 2646 return -EOPNOTSUPP; 2647 } 2648 EXPORT_SYMBOL(sock_no_getsockopt); 2649 2650 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2651 { 2652 return -EOPNOTSUPP; 2653 } 2654 EXPORT_SYMBOL(sock_no_sendmsg); 2655 2656 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2657 { 2658 return -EOPNOTSUPP; 2659 } 2660 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2661 2662 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2663 int flags) 2664 { 2665 return -EOPNOTSUPP; 2666 } 2667 EXPORT_SYMBOL(sock_no_recvmsg); 2668 2669 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2670 { 2671 /* Mirror missing mmap method error code */ 2672 return -ENODEV; 2673 } 2674 EXPORT_SYMBOL(sock_no_mmap); 2675 2676 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2677 { 2678 ssize_t res; 2679 struct msghdr msg = {.msg_flags = flags}; 2680 struct kvec iov; 2681 char *kaddr = kmap(page); 2682 iov.iov_base = kaddr + offset; 2683 iov.iov_len = size; 2684 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2685 kunmap(page); 2686 return res; 2687 } 2688 EXPORT_SYMBOL(sock_no_sendpage); 2689 2690 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2691 int offset, size_t size, int flags) 2692 { 2693 ssize_t res; 2694 struct msghdr msg = {.msg_flags = flags}; 2695 struct kvec iov; 2696 char *kaddr = kmap(page); 2697 2698 iov.iov_base = kaddr + offset; 2699 iov.iov_len = size; 2700 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2701 kunmap(page); 2702 return res; 2703 } 2704 EXPORT_SYMBOL(sock_no_sendpage_locked); 2705 2706 /* 2707 * Default Socket Callbacks 2708 */ 2709 2710 static void sock_def_wakeup(struct sock *sk) 2711 { 2712 struct socket_wq *wq; 2713 2714 rcu_read_lock(); 2715 wq = rcu_dereference(sk->sk_wq); 2716 if (skwq_has_sleeper(wq)) 2717 
wake_up_interruptible_all(&wq->wait); 2718 rcu_read_unlock(); 2719 } 2720 2721 static void sock_def_error_report(struct sock *sk) 2722 { 2723 struct socket_wq *wq; 2724 2725 rcu_read_lock(); 2726 wq = rcu_dereference(sk->sk_wq); 2727 if (skwq_has_sleeper(wq)) 2728 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2729 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2730 rcu_read_unlock(); 2731 } 2732 2733 static void sock_def_readable(struct sock *sk) 2734 { 2735 struct socket_wq *wq; 2736 2737 rcu_read_lock(); 2738 wq = rcu_dereference(sk->sk_wq); 2739 if (skwq_has_sleeper(wq)) 2740 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2741 EPOLLRDNORM | EPOLLRDBAND); 2742 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2743 rcu_read_unlock(); 2744 } 2745 2746 static void sock_def_write_space(struct sock *sk) 2747 { 2748 struct socket_wq *wq; 2749 2750 rcu_read_lock(); 2751 2752 /* Do not wake up a writer until he can make "significant" 2753 * progress. --DaveM 2754 */ 2755 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 2756 wq = rcu_dereference(sk->sk_wq); 2757 if (skwq_has_sleeper(wq)) 2758 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2759 EPOLLWRNORM | EPOLLWRBAND); 2760 2761 /* Should agree with poll, otherwise some programs break */ 2762 if (sock_writeable(sk)) 2763 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2764 } 2765 2766 rcu_read_unlock(); 2767 } 2768 2769 static void sock_def_destruct(struct sock *sk) 2770 { 2771 } 2772 2773 void sk_send_sigurg(struct sock *sk) 2774 { 2775 if (sk->sk_socket && sk->sk_socket->file) 2776 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2777 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2778 } 2779 EXPORT_SYMBOL(sk_send_sigurg); 2780 2781 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2782 unsigned long expires) 2783 { 2784 if (!mod_timer(timer, expires)) 2785 sock_hold(sk); 2786 } 2787 EXPORT_SYMBOL(sk_reset_timer); 2788 2789 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2790 { 2791 if (del_timer(timer)) 2792 __sock_put(sk); 2793 } 2794 EXPORT_SYMBOL(sk_stop_timer); 2795 2796 void sock_init_data(struct socket *sock, struct sock *sk) 2797 { 2798 sk_init_common(sk); 2799 sk->sk_send_head = NULL; 2800 2801 timer_setup(&sk->sk_timer, NULL, 0); 2802 2803 sk->sk_allocation = GFP_KERNEL; 2804 sk->sk_rcvbuf = sysctl_rmem_default; 2805 sk->sk_sndbuf = sysctl_wmem_default; 2806 sk->sk_state = TCP_CLOSE; 2807 sk_set_socket(sk, sock); 2808 2809 sock_set_flag(sk, SOCK_ZAPPED); 2810 2811 if (sock) { 2812 sk->sk_type = sock->type; 2813 sk->sk_wq = sock->wq; 2814 sock->sk = sk; 2815 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2816 } else { 2817 sk->sk_wq = NULL; 2818 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2819 } 2820 2821 rwlock_init(&sk->sk_callback_lock); 2822 if (sk->sk_kern_sock) 2823 lockdep_set_class_and_name( 2824 &sk->sk_callback_lock, 2825 af_kern_callback_keys + sk->sk_family, 2826 af_family_kern_clock_key_strings[sk->sk_family]); 2827 else 2828 lockdep_set_class_and_name( 2829 &sk->sk_callback_lock, 2830 af_callback_keys + sk->sk_family, 2831 af_family_clock_key_strings[sk->sk_family]); 2832 2833 sk->sk_state_change = sock_def_wakeup; 2834 sk->sk_data_ready = sock_def_readable; 2835 sk->sk_write_space = sock_def_write_space; 2836 sk->sk_error_report = sock_def_error_report; 2837 sk->sk_destruct = sock_def_destruct; 2838 2839 sk->sk_frag.page = NULL; 2840 sk->sk_frag.offset = 0; 2841 sk->sk_peek_off = -1; 2842 2843 sk->sk_peer_pid = NULL; 2844 sk->sk_peer_cred = NULL; 2845 
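	/* Defaults: no pending writes, a one-byte receive low-water mark,
	 * and no send/receive timeouts (MAX_SCHEDULE_TIMEOUT = block forever).
	 */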
sk->sk_write_pending = 0; 2846 sk->sk_rcvlowat = 1; 2847 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 2848 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 2849 2850 sk->sk_stamp = SK_DEFAULT_STAMP; 2851 atomic_set(&sk->sk_zckey, 0); 2852 2853 #ifdef CONFIG_NET_RX_BUSY_POLL 2854 sk->sk_napi_id = 0; 2855 sk->sk_ll_usec = sysctl_net_busy_read; 2856 #endif 2857 2858 sk->sk_max_pacing_rate = ~0U; 2859 sk->sk_pacing_rate = ~0U; 2860 sk->sk_pacing_shift = 10; 2861 sk->sk_incoming_cpu = -1; 2862 2863 sk_rx_queue_clear(sk); 2864 /* 2865 * Before updating sk_refcnt, we must commit prior changes to memory 2866 * (Documentation/RCU/rculist_nulls.txt for details) 2867 */ 2868 smp_wmb(); 2869 refcount_set(&sk->sk_refcnt, 1); 2870 atomic_set(&sk->sk_drops, 0); 2871 } 2872 EXPORT_SYMBOL(sock_init_data); 2873 2874 void lock_sock_nested(struct sock *sk, int subclass) 2875 { 2876 might_sleep(); 2877 spin_lock_bh(&sk->sk_lock.slock); 2878 if (sk->sk_lock.owned) 2879 __lock_sock(sk); 2880 sk->sk_lock.owned = 1; 2881 spin_unlock(&sk->sk_lock.slock); 2882 /* 2883 * The sk_lock has mutex_lock() semantics here: 2884 */ 2885 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 2886 local_bh_enable(); 2887 } 2888 EXPORT_SYMBOL(lock_sock_nested); 2889 2890 void release_sock(struct sock *sk) 2891 { 2892 spin_lock_bh(&sk->sk_lock.slock); 2893 if (sk->sk_backlog.tail) 2894 __release_sock(sk); 2895 2896 /* Warning : release_cb() might need to release sk ownership, 2897 * i.e. call sock_release_ownership(sk) before us. 2898 */ 2899 if (sk->sk_prot->release_cb) 2900 sk->sk_prot->release_cb(sk); 2901 2902 sock_release_ownership(sk); 2903 if (waitqueue_active(&sk->sk_lock.wq)) 2904 wake_up(&sk->sk_lock.wq); 2905 spin_unlock_bh(&sk->sk_lock.slock); 2906 } 2907 EXPORT_SYMBOL(release_sock); 2908 2909 /** 2910 * lock_sock_fast - fast version of lock_sock 2911 * @sk: socket 2912 * 2913 * This version should be used for very small sections, where the process won't block. 2914 * Returns false if the fast path is taken: 2915 * 2916 * sk_lock.slock locked, owned = 0, BH disabled 2917 * 2918 * Returns true if the slow path is taken: 2919 * 2920 * sk_lock.slock unlocked, owned = 1, BH enabled 2921 */ 2922 bool lock_sock_fast(struct sock *sk) 2923 { 2924 might_sleep(); 2925 spin_lock_bh(&sk->sk_lock.slock); 2926 2927 if (!sk->sk_lock.owned) 2928 /* 2929 * Note : We must disable BH 2930 */ 2931 return false; 2932 2933 __lock_sock(sk); 2934 sk->sk_lock.owned = 1; 2935 spin_unlock(&sk->sk_lock.slock); 2936 /* 2937 * The sk_lock has mutex_lock() semantics here: 2938 */ 2939 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 2940 local_bh_enable(); 2941 return true; 2942 } 2943 EXPORT_SYMBOL(lock_sock_fast); 2944 2945 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) 2946 { 2947 struct timeval tv; 2948 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2949 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2950 tv = ktime_to_timeval(sk->sk_stamp); 2951 if (tv.tv_sec == -1) 2952 return -ENOENT; 2953 if (tv.tv_sec == 0) { 2954 sk->sk_stamp = ktime_get_real(); 2955 tv = ktime_to_timeval(sk->sk_stamp); 2956 } 2957 return copy_to_user(userstamp, &tv, sizeof(tv)) ?
-EFAULT : 0; 2958 } 2959 EXPORT_SYMBOL(sock_get_timestamp); 2960 2961 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) 2962 { 2963 struct timespec ts; 2964 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2965 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2966 ts = ktime_to_timespec(sk->sk_stamp); 2967 if (ts.tv_sec == -1) 2968 return -ENOENT; 2969 if (ts.tv_sec == 0) { 2970 sk->sk_stamp = ktime_get_real(); 2971 ts = ktime_to_timespec(sk->sk_stamp); 2972 } 2973 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; 2974 } 2975 EXPORT_SYMBOL(sock_get_timestampns); 2976 2977 void sock_enable_timestamp(struct sock *sk, int flag) 2978 { 2979 if (!sock_flag(sk, flag)) { 2980 unsigned long previous_flags = sk->sk_flags; 2981 2982 sock_set_flag(sk, flag); 2983 /* 2984 * We just set one of the two flags which require net 2985 * time stamping, but time stamping might have been on 2986 * already because of the other one. 2987 */ 2988 if (sock_needs_netstamp(sk) && 2989 !(previous_flags & SK_FLAGS_TIMESTAMP)) 2990 net_enable_timestamp(); 2991 } 2992 } 2993 2994 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 2995 int level, int type) 2996 { 2997 struct sock_exterr_skb *serr; 2998 struct sk_buff *skb; 2999 int copied, err; 3000 3001 err = -EAGAIN; 3002 skb = sock_dequeue_err_skb(sk); 3003 if (skb == NULL) 3004 goto out; 3005 3006 copied = skb->len; 3007 if (copied > len) { 3008 msg->msg_flags |= MSG_TRUNC; 3009 copied = len; 3010 } 3011 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3012 if (err) 3013 goto out_free_skb; 3014 3015 sock_recv_timestamp(msg, sk, skb); 3016 3017 serr = SKB_EXT_ERR(skb); 3018 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3019 3020 msg->msg_flags |= MSG_ERRQUEUE; 3021 err = copied; 3022 3023 out_free_skb: 3024 kfree_skb(skb); 3025 out: 3026 return err; 3027 } 3028 EXPORT_SYMBOL(sock_recv_errqueue); 3029 3030 /* 3031 * Get a socket option on a socket. 3032 * 3033 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3034 * asynchronous errors should be reported by getsockopt. We assume 3035 * this means if you specify SO_ERROR (otherwise what's the point of it). 3036 */ 3037 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3038 char __user *optval, int __user *optlen) 3039 { 3040 struct sock *sk = sock->sk; 3041 3042 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3043 } 3044 EXPORT_SYMBOL(sock_common_getsockopt); 3045 3046 #ifdef CONFIG_COMPAT 3047 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, 3048 char __user *optval, int __user *optlen) 3049 { 3050 struct sock *sk = sock->sk; 3051 3052 if (sk->sk_prot->compat_getsockopt != NULL) 3053 return sk->sk_prot->compat_getsockopt(sk, level, optname, 3054 optval, optlen); 3055 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3056 } 3057 EXPORT_SYMBOL(compat_sock_common_getsockopt); 3058 #endif 3059 3060 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3061 int flags) 3062 { 3063 struct sock *sk = sock->sk; 3064 int addr_len = 0; 3065 int err; 3066 3067 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 3068 flags & ~MSG_DONTWAIT, &addr_len); 3069 if (err >= 0) 3070 msg->msg_namelen = addr_len; 3071 return err; 3072 } 3073 EXPORT_SYMBOL(sock_common_recvmsg); 3074 3075 /* 3076 * Set socket options on an inet socket.
3077 */ 3078 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3079 char __user *optval, unsigned int optlen) 3080 { 3081 struct sock *sk = sock->sk; 3082 3083 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3084 } 3085 EXPORT_SYMBOL(sock_common_setsockopt); 3086 3087 #ifdef CONFIG_COMPAT 3088 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, 3089 char __user *optval, unsigned int optlen) 3090 { 3091 struct sock *sk = sock->sk; 3092 3093 if (sk->sk_prot->compat_setsockopt != NULL) 3094 return sk->sk_prot->compat_setsockopt(sk, level, optname, 3095 optval, optlen); 3096 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3097 } 3098 EXPORT_SYMBOL(compat_sock_common_setsockopt); 3099 #endif 3100 3101 void sk_common_release(struct sock *sk) 3102 { 3103 if (sk->sk_prot->destroy) 3104 sk->sk_prot->destroy(sk); 3105 3106 /* 3107 * Observation: when sk_common_release is called, processes have 3108 * no access to the socket, but the network stack still has. 3109 * Step one, detach it from networking: 3110 * 3111 * A. Remove from hash tables. 3112 */ 3113 3114 sk->sk_prot->unhash(sk); 3115 3116 /* 3117 * At this point the socket cannot receive new packets, but it is possible 3118 * that some packets are in flight because some CPU is running the receiver 3119 * and did the hash table lookup before we unhashed the socket. They will 3120 * reach the receive queue and be purged by the socket destructor. 3121 * 3122 * Also, we still have packets pending on the receive queue and probably 3123 * our own packets waiting in device queues. sock_destroy will drain the 3124 * receive queue, but transmitted packets will delay socket destruction 3125 * until the last reference is released. 3126 */ 3127 3128 sock_orphan(sk); 3129 3130 xfrm_sk_free_policy(sk); 3131 3132 sk_refcnt_debug_release(sk); 3133 3134 sock_put(sk); 3135 } 3136 EXPORT_SYMBOL(sk_common_release); 3137 3138 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3139 { 3140 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3141 3142 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3143 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; 3144 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3145 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; 3146 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3147 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; 3148 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3149 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; 3150 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3151 } 3152 3153 #ifdef CONFIG_PROC_FS 3154 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3155 struct prot_inuse { 3156 int val[PROTO_INUSE_NR]; 3157 }; 3158 3159 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3160 3161 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3162 { 3163 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3164 } 3165 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3166 3167 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3168 { 3169 int cpu, idx = prot->inuse_idx; 3170 int res = 0; 3171 3172 for_each_possible_cpu(cpu) 3173 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3174 3175 return res >= 0 ?
res : 0; 3176 } 3177 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3178 3179 static void sock_inuse_add(struct net *net, int val) 3180 { 3181 this_cpu_add(*net->core.sock_inuse, val); 3182 } 3183 3184 int sock_inuse_get(struct net *net) 3185 { 3186 int cpu, res = 0; 3187 3188 for_each_possible_cpu(cpu) 3189 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3190 3191 return res; 3192 } 3193 3194 EXPORT_SYMBOL_GPL(sock_inuse_get); 3195 3196 static int __net_init sock_inuse_init_net(struct net *net) 3197 { 3198 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3199 if (net->core.prot_inuse == NULL) 3200 return -ENOMEM; 3201 3202 net->core.sock_inuse = alloc_percpu(int); 3203 if (net->core.sock_inuse == NULL) 3204 goto out; 3205 3206 return 0; 3207 3208 out: 3209 free_percpu(net->core.prot_inuse); 3210 return -ENOMEM; 3211 } 3212 3213 static void __net_exit sock_inuse_exit_net(struct net *net) 3214 { 3215 free_percpu(net->core.prot_inuse); 3216 free_percpu(net->core.sock_inuse); 3217 } 3218 3219 static struct pernet_operations net_inuse_ops = { 3220 .init = sock_inuse_init_net, 3221 .exit = sock_inuse_exit_net, 3222 }; 3223 3224 static __init int net_inuse_init(void) 3225 { 3226 if (register_pernet_subsys(&net_inuse_ops)) 3227 panic("Cannot initialize net inuse counters"); 3228 3229 return 0; 3230 } 3231 3232 core_initcall(net_inuse_init); 3233 3234 static void assign_proto_idx(struct proto *prot) 3235 { 3236 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3237 3238 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3239 pr_err("PROTO_INUSE_NR exhausted\n"); 3240 return; 3241 } 3242 3243 set_bit(prot->inuse_idx, proto_inuse_idx); 3244 } 3245 3246 static void release_proto_idx(struct proto *prot) 3247 { 3248 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3249 clear_bit(prot->inuse_idx, proto_inuse_idx); 3250 } 3251 #else 3252 static inline void assign_proto_idx(struct proto *prot) 3253 { 3254 } 3255 3256 static inline void release_proto_idx(struct proto *prot) 3257 { 3258 } 3259 3260 static void sock_inuse_add(struct net *net, int val) 3261 { 3262 } 3263 #endif 3264 3265 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3266 { 3267 if (!rsk_prot) 3268 return; 3269 kfree(rsk_prot->slab_name); 3270 rsk_prot->slab_name = NULL; 3271 kmem_cache_destroy(rsk_prot->slab); 3272 rsk_prot->slab = NULL; 3273 } 3274 3275 static int req_prot_init(const struct proto *prot) 3276 { 3277 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3278 3279 if (!rsk_prot) 3280 return 0; 3281 3282 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3283 prot->name); 3284 if (!rsk_prot->slab_name) 3285 return -ENOMEM; 3286 3287 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3288 rsk_prot->obj_size, 0, 3289 SLAB_ACCOUNT | prot->slab_flags, 3290 NULL); 3291 3292 if (!rsk_prot->slab) { 3293 pr_crit("%s: Can't create request sock SLAB cache!\n", 3294 prot->name); 3295 return -ENOMEM; 3296 } 3297 return 0; 3298 } 3299 3300 int proto_register(struct proto *prot, int alloc_slab) 3301 { 3302 if (alloc_slab) { 3303 prot->slab = kmem_cache_create_usercopy(prot->name, 3304 prot->obj_size, 0, 3305 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3306 prot->slab_flags, 3307 prot->useroffset, prot->usersize, 3308 NULL); 3309 3310 if (prot->slab == NULL) { 3311 pr_crit("%s: Can't create sock SLAB cache!\n", 3312 prot->name); 3313 goto out; 3314 } 3315 3316 if (req_prot_init(prot)) 3317 goto out_free_request_sock_slab; 3318 3319 if (prot->twsk_prot != NULL) { 3320 prot->twsk_prot->twsk_slab_name 
= kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3321 3322 if (prot->twsk_prot->twsk_slab_name == NULL) 3323 goto out_free_request_sock_slab; 3324 3325 prot->twsk_prot->twsk_slab = 3326 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3327 prot->twsk_prot->twsk_obj_size, 3328 0, 3329 SLAB_ACCOUNT | 3330 prot->slab_flags, 3331 NULL); 3332 if (prot->twsk_prot->twsk_slab == NULL) 3333 goto out_free_timewait_sock_slab_name; 3334 } 3335 } 3336 3337 mutex_lock(&proto_list_mutex); 3338 list_add(&prot->node, &proto_list); 3339 assign_proto_idx(prot); 3340 mutex_unlock(&proto_list_mutex); 3341 return 0; 3342 3343 out_free_timewait_sock_slab_name: 3344 kfree(prot->twsk_prot->twsk_slab_name); 3345 out_free_request_sock_slab: 3346 req_prot_cleanup(prot->rsk_prot); 3347 3348 kmem_cache_destroy(prot->slab); 3349 prot->slab = NULL; 3350 out: 3351 return -ENOBUFS; 3352 } 3353 EXPORT_SYMBOL(proto_register); 3354 3355 void proto_unregister(struct proto *prot) 3356 { 3357 mutex_lock(&proto_list_mutex); 3358 release_proto_idx(prot); 3359 list_del(&prot->node); 3360 mutex_unlock(&proto_list_mutex); 3361 3362 kmem_cache_destroy(prot->slab); 3363 prot->slab = NULL; 3364 3365 req_prot_cleanup(prot->rsk_prot); 3366 3367 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 3368 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 3369 kfree(prot->twsk_prot->twsk_slab_name); 3370 prot->twsk_prot->twsk_slab = NULL; 3371 } 3372 } 3373 EXPORT_SYMBOL(proto_unregister); 3374 3375 int sock_load_diag_module(int family, int protocol) 3376 { 3377 if (!protocol) { 3378 if (!sock_is_registered(family)) 3379 return -ENOENT; 3380 3381 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3382 NETLINK_SOCK_DIAG, family); 3383 } 3384 3385 #ifdef CONFIG_INET 3386 if (family == AF_INET && 3387 !rcu_access_pointer(inet_protos[protocol])) 3388 return -ENOENT; 3389 #endif 3390 3391 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3392 NETLINK_SOCK_DIAG, family, protocol); 3393 } 3394 EXPORT_SYMBOL(sock_load_diag_module); 3395 3396 #ifdef CONFIG_PROC_FS 3397 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3398 __acquires(proto_list_mutex) 3399 { 3400 mutex_lock(&proto_list_mutex); 3401 return seq_list_start_head(&proto_list, *pos); 3402 } 3403 3404 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3405 { 3406 return seq_list_next(v, &proto_list, pos); 3407 } 3408 3409 static void proto_seq_stop(struct seq_file *seq, void *v) 3410 __releases(proto_list_mutex) 3411 { 3412 mutex_unlock(&proto_list_mutex); 3413 } 3414 3415 static char proto_method_implemented(const void *method) 3416 { 3417 return method == NULL ? 'n' : 'y'; 3418 } 3419 static long sock_prot_memory_allocated(struct proto *proto) 3420 { 3421 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3422 } 3423 3424 static char *sock_prot_memory_pressure(struct proto *proto) 3425 { 3426 return proto->memory_pressure != NULL ? 3427 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3428 } 3429 3430 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3431 { 3432 3433 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3434 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3435 proto->name, 3436 proto->obj_size, 3437 sock_prot_inuse_get(seq_file_net(seq), proto), 3438 sock_prot_memory_allocated(proto), 3439 sock_prot_memory_pressure(proto), 3440 proto->max_header, 3441 proto->slab == NULL ? 
"no" : "yes", 3442 module_name(proto->owner), 3443 proto_method_implemented(proto->close), 3444 proto_method_implemented(proto->connect), 3445 proto_method_implemented(proto->disconnect), 3446 proto_method_implemented(proto->accept), 3447 proto_method_implemented(proto->ioctl), 3448 proto_method_implemented(proto->init), 3449 proto_method_implemented(proto->destroy), 3450 proto_method_implemented(proto->shutdown), 3451 proto_method_implemented(proto->setsockopt), 3452 proto_method_implemented(proto->getsockopt), 3453 proto_method_implemented(proto->sendmsg), 3454 proto_method_implemented(proto->recvmsg), 3455 proto_method_implemented(proto->sendpage), 3456 proto_method_implemented(proto->bind), 3457 proto_method_implemented(proto->backlog_rcv), 3458 proto_method_implemented(proto->hash), 3459 proto_method_implemented(proto->unhash), 3460 proto_method_implemented(proto->get_port), 3461 proto_method_implemented(proto->enter_memory_pressure)); 3462 } 3463 3464 static int proto_seq_show(struct seq_file *seq, void *v) 3465 { 3466 if (v == &proto_list) 3467 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3468 "protocol", 3469 "size", 3470 "sockets", 3471 "memory", 3472 "press", 3473 "maxhdr", 3474 "slab", 3475 "module", 3476 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3477 else 3478 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3479 return 0; 3480 } 3481 3482 static const struct seq_operations proto_seq_ops = { 3483 .start = proto_seq_start, 3484 .next = proto_seq_next, 3485 .stop = proto_seq_stop, 3486 .show = proto_seq_show, 3487 }; 3488 3489 static __net_init int proto_init_net(struct net *net) 3490 { 3491 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3492 sizeof(struct seq_net_private))) 3493 return -ENOMEM; 3494 3495 return 0; 3496 } 3497 3498 static __net_exit void proto_exit_net(struct net *net) 3499 { 3500 remove_proc_entry("protocols", net->proc_net); 3501 } 3502 3503 3504 static __net_initdata struct pernet_operations proto_net_ops = { 3505 .init = proto_init_net, 3506 .exit = proto_exit_net, 3507 }; 3508 3509 static int __init proto_init(void) 3510 { 3511 return register_pernet_subsys(&proto_net_ops); 3512 } 3513 3514 subsys_initcall(proto_init); 3515 3516 #endif /* PROC_FS */ 3517 3518 #ifdef CONFIG_NET_RX_BUSY_POLL 3519 bool sk_busy_loop_end(void *p, unsigned long start_time) 3520 { 3521 struct sock *sk = p; 3522 3523 return !skb_queue_empty(&sk->sk_receive_queue) || 3524 sk_busy_loop_timeout(sk, start_time); 3525 } 3526 EXPORT_SYMBOL(sk_busy_loop_end); 3527 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3528