/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
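
/*
 * Illustrative sketch, not part of this file: a protocol handler would
 * typically gate a privileged operation on one of these helpers, e.g.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() below performs the same check against the initial user
 * namespace, sk_net_capable() against the socket's network namespace.
 */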

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	"rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
	"rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
	"rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
	"rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
	"rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
	"rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
	"rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
	"rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
	"rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
	"rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
	"rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
	"rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
	"rlock-AF_IEEE802154",
"rlock-AF_CAIF" , "rlock-AF_ALG" , 265 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , 266 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , 267 "rlock-AF_MAX" 268 }; 269 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 270 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , 271 "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", 272 "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , 273 "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , 274 "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , 275 "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , 276 "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , 277 "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , 278 "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , 279 "wlock-27" , "wlock-28" , "wlock-AF_CAN" , 280 "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , 281 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , 282 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , 283 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , 284 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , 285 "wlock-AF_MAX" 286 }; 287 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 288 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , 289 "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", 290 "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , 291 "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , 292 "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , 293 "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , 294 "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , 295 "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , 296 "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , 297 "elock-27" , "elock-28" , "elock-AF_CAN" , 298 "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , 299 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , 300 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , 301 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , 302 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , 303 "elock-AF_MAX" 304 }; 305 306 /* 307 * sk_callback_lock and sk queues locking rules are per-address-family, 308 * so split the lock classes by using a per-AF key: 309 */ 310 static struct lock_class_key af_callback_keys[AF_MAX]; 311 static struct lock_class_key af_rlock_keys[AF_MAX]; 312 static struct lock_class_key af_wlock_keys[AF_MAX]; 313 static struct lock_class_key af_elock_keys[AF_MAX]; 314 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 315 316 /* Run time adjustable parameters. */ 317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 318 EXPORT_SYMBOL(sysctl_wmem_max); 319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 320 EXPORT_SYMBOL(sysctl_rmem_max); 321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 323 324 /* Maximal space eaten by iovec or ancillary data plus some space */ 325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 326 EXPORT_SYMBOL(sysctl_optmem_max); 327 328 int sysctl_tstamp_allow_data __read_mostly = 1; 329 330 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 331 EXPORT_SYMBOL_GPL(memalloc_socks_key); 332 333 /** 334 * sk_set_memalloc - sets %SOCK_MEMALLOC 335 * @sk: socket to set it on 336 * 337 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure
	 * we don't leak a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
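
/*
 * Illustrative sketch, not part of this file: a datagram protocol's
 * delivery path typically hands the skb to the socket with
 * sock_queue_rcv_skb() above and frees it itself on failure, e.g.
 *
 *	static int example_proto_deliver(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 *
 * example_proto_deliver() is a hypothetical name; real protocols add
 * their own statistics and error accounting around this call.
 */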

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 * Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games.
		 * RCVBUF/SNDBUF are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (sk->sk_protocol != IPPROTO_TCP)
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
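
/*
 * Illustrative user-space sketch, not part of this file (assumption: "fd"
 * is any open socket descriptor). It shows a timeout set via struct
 * timeval and the SO_RCVBUF doubling described in the comments above:
 *
 *	int rcvbuf = 65536;
 *	socklen_t len = sizeof(rcvbuf);
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len);
 *
 * After these calls, rcvbuf reads back as roughly twice the requested
 * value (capped by sysctl_rmem_max), matching the SO_RCVBUF comment.
 */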

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason...
	 * -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		newsk->sk_prot_creator = sk->sk_prot;

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		refcount_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache = NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
		atomic_set(&newsk->sk_zckey, 0);

		sock_reset_flag(newsk, SOCK_DONE);
		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		rcu_read_lock();
		filter = rcu_dereference(sk->sk_filter);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);
		RCU_INIT_POINTER(newsk->sk_filter, filter);
		rcu_read_unlock();

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		if (likely(newsk->sk_net_refcnt))
			sock_inuse_add(sock_net(newsk), 1);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers.
		 * -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
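
/*
 * Illustrative sketch, not part of this file: a sender that builds its
 * own skb charges it to the socket so that sk_wmem_alloc tracks the
 * memory and sock_wfree() releases it when the skb is freed:
 *
 *	skb = alloc_skb(size, sk->sk_allocation);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *
 * sock_wmalloc() and sock_alloc_send_skb() below wrap this pattern
 * together with send-buffer limit checks.
 */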

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
		) {
		struct sock *sk = skb->sk;

		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
			skb->destructor = sock_efree;
		}
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}

struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    sysctl_optmem_max)
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);
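
/*
 * Illustrative sketch, not part of this file: option memory is charged
 * against sk_omem_alloc, so callers free with the same size they
 * allocated, e.g. for a hypothetical per-socket option blob:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 *
 * Use sock_kzfree_s() instead when the buffer held sensitive data.
 */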

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kzfree(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
			~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);

static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		sk->sk_prot->leave_memory_pressure(sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && *memory_pressure)
			*memory_pressure = 0;
	}
}

/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
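 *
 * A typical caller pattern is roughly the sketch below (an illustration,
 * not code taken from this file; it assumes a lowmem allocation so that
 * page_address() can be used directly, and it takes an extra page
 * reference because the consumer keeps one of its own):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!skb_page_frag_refill(bytes, pfrag, sk->sk_allocation))
 *		return -ENOMEM;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, bytes);
 *	get_page(pfrag->page);
 *	pfrag->offset += bytes;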
2202 */ 2203 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2204 { 2205 if (pfrag->page) { 2206 if (page_ref_count(pfrag->page) == 1) { 2207 pfrag->offset = 0; 2208 return true; 2209 } 2210 if (pfrag->offset + sz <= pfrag->size) 2211 return true; 2212 put_page(pfrag->page); 2213 } 2214 2215 pfrag->offset = 0; 2216 if (SKB_FRAG_PAGE_ORDER) { 2217 /* Avoid direct reclaim but allow kswapd to wake */ 2218 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2219 __GFP_COMP | __GFP_NOWARN | 2220 __GFP_NORETRY, 2221 SKB_FRAG_PAGE_ORDER); 2222 if (likely(pfrag->page)) { 2223 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2224 return true; 2225 } 2226 } 2227 pfrag->page = alloc_page(gfp); 2228 if (likely(pfrag->page)) { 2229 pfrag->size = PAGE_SIZE; 2230 return true; 2231 } 2232 return false; 2233 } 2234 EXPORT_SYMBOL(skb_page_frag_refill); 2235 2236 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2237 { 2238 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2239 return true; 2240 2241 sk_enter_memory_pressure(sk); 2242 sk_stream_moderate_sndbuf(sk); 2243 return false; 2244 } 2245 EXPORT_SYMBOL(sk_page_frag_refill); 2246 2247 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, 2248 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, 2249 int first_coalesce) 2250 { 2251 int sg_curr = *sg_curr_index, use = 0, rc = 0; 2252 unsigned int size = *sg_curr_size; 2253 struct page_frag *pfrag; 2254 struct scatterlist *sge; 2255 2256 len -= size; 2257 pfrag = sk_page_frag(sk); 2258 2259 while (len > 0) { 2260 unsigned int orig_offset; 2261 2262 if (!sk_page_frag_refill(sk, pfrag)) { 2263 rc = -ENOMEM; 2264 goto out; 2265 } 2266 2267 use = min_t(int, len, pfrag->size - pfrag->offset); 2268 2269 if (!sk_wmem_schedule(sk, use)) { 2270 rc = -ENOMEM; 2271 goto out; 2272 } 2273 2274 sk_mem_charge(sk, use); 2275 size += use; 2276 orig_offset = pfrag->offset; 2277 pfrag->offset += use; 2278 2279 sge = sg + sg_curr - 1; 2280 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && 2281 sg->offset + sg->length == orig_offset) { 2282 sg->length += use; 2283 } else { 2284 sge = sg + sg_curr; 2285 sg_unmark_end(sge); 2286 sg_set_page(sge, pfrag->page, use, orig_offset); 2287 get_page(pfrag->page); 2288 sg_curr++; 2289 2290 if (sg_curr == MAX_SKB_FRAGS) 2291 sg_curr = 0; 2292 2293 if (sg_curr == sg_start) { 2294 rc = -ENOSPC; 2295 break; 2296 } 2297 } 2298 2299 len -= use; 2300 } 2301 out: 2302 *sg_curr_size = size; 2303 *sg_curr_index = sg_curr; 2304 return rc; 2305 } 2306 EXPORT_SYMBOL(sk_alloc_sg); 2307 2308 static void __lock_sock(struct sock *sk) 2309 __releases(&sk->sk_lock.slock) 2310 __acquires(&sk->sk_lock.slock) 2311 { 2312 DEFINE_WAIT(wait); 2313 2314 for (;;) { 2315 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2316 TASK_UNINTERRUPTIBLE); 2317 spin_unlock_bh(&sk->sk_lock.slock); 2318 schedule(); 2319 spin_lock_bh(&sk->sk_lock.slock); 2320 if (!sock_owned_by_user(sk)) 2321 break; 2322 } 2323 finish_wait(&sk->sk_lock.wq, &wait); 2324 } 2325 2326 static void __release_sock(struct sock *sk) 2327 __releases(&sk->sk_lock.slock) 2328 __acquires(&sk->sk_lock.slock) 2329 { 2330 struct sk_buff *skb, *next; 2331 2332 while ((skb = sk->sk_backlog.head) != NULL) { 2333 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2334 2335 spin_unlock_bh(&sk->sk_lock.slock); 2336 2337 do { 2338 next = skb->next; 2339 prefetch(next); 2340 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2341 skb->next = NULL; 2342 sk_backlog_rcv(sk, skb); 2343 
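			/* sk_backlog_rcv() hands the skb to the protocol's
			 * backlog handler (sk->sk_backlog_rcv, e.g.
			 * tcp_v4_do_rcv() for TCP); the cond_resched() below
			 * keeps a long backlog from monopolising the CPU
			 * while we own the socket.
			 */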
			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees that we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 * @skb: last skb seen on sk_receive_queue
 *
 * Socket state, including sk->sk_err, is changed only under the socket
 * lock, hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 * __sk_mem_raise_allocated - increase memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @amt: pages to allocate
 * @kind: allocation type
 *
 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In that case we cannot block, so we have to fail.
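		 *
		 * After the sk_stream_moderate_sndbuf() call above, sk_sndbuf
		 * has been moderated down towards what is already queued: a
		 * writer at or above that limit is allowed to complete this
		 * charge and will block in sk_stream_wait_memory() on its
		 * next attempt, while a writer still below it gets the
		 * failure here instead.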
2459 */ 2460 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2461 return 1; 2462 } 2463 2464 trace_sock_exceed_buf_limit(sk, prot, allocated); 2465 2466 sk_memory_allocated_sub(sk, amt); 2467 2468 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2469 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2470 2471 return 0; 2472 } 2473 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2474 2475 /** 2476 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2477 * @sk: socket 2478 * @size: memory size to allocate 2479 * @kind: allocation type 2480 * 2481 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2482 * rmem allocation. This function assumes that protocols which have 2483 * memory_pressure use sk_wmem_queued as write buffer accounting. 2484 */ 2485 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2486 { 2487 int ret, amt = sk_mem_pages(size); 2488 2489 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2490 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2491 if (!ret) 2492 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2493 return ret; 2494 } 2495 EXPORT_SYMBOL(__sk_mem_schedule); 2496 2497 /** 2498 * __sk_mem_reduce_allocated - reclaim memory_allocated 2499 * @sk: socket 2500 * @amount: number of quanta 2501 * 2502 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2503 */ 2504 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2505 { 2506 sk_memory_allocated_sub(sk, amount); 2507 2508 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2509 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2510 2511 if (sk_under_memory_pressure(sk) && 2512 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2513 sk_leave_memory_pressure(sk); 2514 } 2515 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2516 2517 /** 2518 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2519 * @sk: socket 2520 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2521 */ 2522 void __sk_mem_reclaim(struct sock *sk, int amount) 2523 { 2524 amount >>= SK_MEM_QUANTUM_SHIFT; 2525 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2526 __sk_mem_reduce_allocated(sk, amount); 2527 } 2528 EXPORT_SYMBOL(__sk_mem_reclaim); 2529 2530 int sk_set_peek_off(struct sock *sk, int val) 2531 { 2532 sk->sk_peek_off = val; 2533 return 0; 2534 } 2535 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2536 2537 /* 2538 * Set of default routines for initialising struct proto_ops when 2539 * the protocol does not support a particular function. In certain 2540 * cases where it makes no sense for a protocol to have a "do nothing" 2541 * function, some default processing is provided. 
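 *
 * A protocol that simply does not implement an operation points the
 * corresponding proto_ops member at one of these stubs, e.g. (an
 * illustrative fragment, not taken from this file; PF_FOO is made up):
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};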
2542 */ 2543 2544 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2545 { 2546 return -EOPNOTSUPP; 2547 } 2548 EXPORT_SYMBOL(sock_no_bind); 2549 2550 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2551 int len, int flags) 2552 { 2553 return -EOPNOTSUPP; 2554 } 2555 EXPORT_SYMBOL(sock_no_connect); 2556 2557 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2558 { 2559 return -EOPNOTSUPP; 2560 } 2561 EXPORT_SYMBOL(sock_no_socketpair); 2562 2563 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2564 bool kern) 2565 { 2566 return -EOPNOTSUPP; 2567 } 2568 EXPORT_SYMBOL(sock_no_accept); 2569 2570 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2571 int peer) 2572 { 2573 return -EOPNOTSUPP; 2574 } 2575 EXPORT_SYMBOL(sock_no_getname); 2576 2577 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) 2578 { 2579 return 0; 2580 } 2581 EXPORT_SYMBOL(sock_no_poll); 2582 2583 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2584 { 2585 return -EOPNOTSUPP; 2586 } 2587 EXPORT_SYMBOL(sock_no_ioctl); 2588 2589 int sock_no_listen(struct socket *sock, int backlog) 2590 { 2591 return -EOPNOTSUPP; 2592 } 2593 EXPORT_SYMBOL(sock_no_listen); 2594 2595 int sock_no_shutdown(struct socket *sock, int how) 2596 { 2597 return -EOPNOTSUPP; 2598 } 2599 EXPORT_SYMBOL(sock_no_shutdown); 2600 2601 int sock_no_setsockopt(struct socket *sock, int level, int optname, 2602 char __user *optval, unsigned int optlen) 2603 { 2604 return -EOPNOTSUPP; 2605 } 2606 EXPORT_SYMBOL(sock_no_setsockopt); 2607 2608 int sock_no_getsockopt(struct socket *sock, int level, int optname, 2609 char __user *optval, int __user *optlen) 2610 { 2611 return -EOPNOTSUPP; 2612 } 2613 EXPORT_SYMBOL(sock_no_getsockopt); 2614 2615 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2616 { 2617 return -EOPNOTSUPP; 2618 } 2619 EXPORT_SYMBOL(sock_no_sendmsg); 2620 2621 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2622 { 2623 return -EOPNOTSUPP; 2624 } 2625 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2626 2627 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2628 int flags) 2629 { 2630 return -EOPNOTSUPP; 2631 } 2632 EXPORT_SYMBOL(sock_no_recvmsg); 2633 2634 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2635 { 2636 /* Mirror missing mmap method error code */ 2637 return -ENODEV; 2638 } 2639 EXPORT_SYMBOL(sock_no_mmap); 2640 2641 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2642 { 2643 ssize_t res; 2644 struct msghdr msg = {.msg_flags = flags}; 2645 struct kvec iov; 2646 char *kaddr = kmap(page); 2647 iov.iov_base = kaddr + offset; 2648 iov.iov_len = size; 2649 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2650 kunmap(page); 2651 return res; 2652 } 2653 EXPORT_SYMBOL(sock_no_sendpage); 2654 2655 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2656 int offset, size_t size, int flags) 2657 { 2658 ssize_t res; 2659 struct msghdr msg = {.msg_flags = flags}; 2660 struct kvec iov; 2661 char *kaddr = kmap(page); 2662 2663 iov.iov_base = kaddr + offset; 2664 iov.iov_len = size; 2665 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2666 kunmap(page); 2667 return res; 2668 } 2669 EXPORT_SYMBOL(sock_no_sendpage_locked); 2670 2671 /* 2672 * Default Socket Callbacks 2673 */ 2674 2675 static void sock_def_wakeup(struct sock *sk) 2676 { 2677 
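	/* Default sk_state_change callback, installed by sock_init_data():
	 * wake up every thread sleeping on the socket's wait queue, e.g.
	 * after a state transition such as a connect completing.
	 */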
struct socket_wq *wq; 2678 2679 rcu_read_lock(); 2680 wq = rcu_dereference(sk->sk_wq); 2681 if (skwq_has_sleeper(wq)) 2682 wake_up_interruptible_all(&wq->wait); 2683 rcu_read_unlock(); 2684 } 2685 2686 static void sock_def_error_report(struct sock *sk) 2687 { 2688 struct socket_wq *wq; 2689 2690 rcu_read_lock(); 2691 wq = rcu_dereference(sk->sk_wq); 2692 if (skwq_has_sleeper(wq)) 2693 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2694 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2695 rcu_read_unlock(); 2696 } 2697 2698 static void sock_def_readable(struct sock *sk) 2699 { 2700 struct socket_wq *wq; 2701 2702 rcu_read_lock(); 2703 wq = rcu_dereference(sk->sk_wq); 2704 if (skwq_has_sleeper(wq)) 2705 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2706 EPOLLRDNORM | EPOLLRDBAND); 2707 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2708 rcu_read_unlock(); 2709 } 2710 2711 static void sock_def_write_space(struct sock *sk) 2712 { 2713 struct socket_wq *wq; 2714 2715 rcu_read_lock(); 2716 2717 /* Do not wake up a writer until he can make "significant" 2718 * progress. --DaveM 2719 */ 2720 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 2721 wq = rcu_dereference(sk->sk_wq); 2722 if (skwq_has_sleeper(wq)) 2723 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2724 EPOLLWRNORM | EPOLLWRBAND); 2725 2726 /* Should agree with poll, otherwise some programs break */ 2727 if (sock_writeable(sk)) 2728 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2729 } 2730 2731 rcu_read_unlock(); 2732 } 2733 2734 static void sock_def_destruct(struct sock *sk) 2735 { 2736 } 2737 2738 void sk_send_sigurg(struct sock *sk) 2739 { 2740 if (sk->sk_socket && sk->sk_socket->file) 2741 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2742 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2743 } 2744 EXPORT_SYMBOL(sk_send_sigurg); 2745 2746 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2747 unsigned long expires) 2748 { 2749 if (!mod_timer(timer, expires)) 2750 sock_hold(sk); 2751 } 2752 EXPORT_SYMBOL(sk_reset_timer); 2753 2754 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2755 { 2756 if (del_timer(timer)) 2757 __sock_put(sk); 2758 } 2759 EXPORT_SYMBOL(sk_stop_timer); 2760 2761 void sock_init_data(struct socket *sock, struct sock *sk) 2762 { 2763 sk_init_common(sk); 2764 sk->sk_send_head = NULL; 2765 2766 timer_setup(&sk->sk_timer, NULL, 0); 2767 2768 sk->sk_allocation = GFP_KERNEL; 2769 sk->sk_rcvbuf = sysctl_rmem_default; 2770 sk->sk_sndbuf = sysctl_wmem_default; 2771 sk->sk_state = TCP_CLOSE; 2772 sk_set_socket(sk, sock); 2773 2774 sock_set_flag(sk, SOCK_ZAPPED); 2775 2776 if (sock) { 2777 sk->sk_type = sock->type; 2778 sk->sk_wq = sock->wq; 2779 sock->sk = sk; 2780 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2781 } else { 2782 sk->sk_wq = NULL; 2783 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2784 } 2785 2786 rwlock_init(&sk->sk_callback_lock); 2787 if (sk->sk_kern_sock) 2788 lockdep_set_class_and_name( 2789 &sk->sk_callback_lock, 2790 af_kern_callback_keys + sk->sk_family, 2791 af_family_kern_clock_key_strings[sk->sk_family]); 2792 else 2793 lockdep_set_class_and_name( 2794 &sk->sk_callback_lock, 2795 af_callback_keys + sk->sk_family, 2796 af_family_clock_key_strings[sk->sk_family]); 2797 2798 sk->sk_state_change = sock_def_wakeup; 2799 sk->sk_data_ready = sock_def_readable; 2800 sk->sk_write_space = sock_def_write_space; 2801 sk->sk_error_report = sock_def_error_report; 2802 sk->sk_destruct = sock_def_destruct; 2803 2804 sk->sk_frag.page = NULL; 2805 
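	/* sk_frag is the per-socket page_frag handed out by sk_page_frag()
	 * when the per-task one cannot be used; it starts out empty and is
	 * filled on demand by sk_page_frag_refill().
	 */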
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_pacing_shift = 10;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (see Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning: release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Returns true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note: We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);
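/* sock_get_timestamp() and sock_get_timestampns() are the usual backends
 * for the SIOCGSTAMP and SIOCGSTAMPNS ioctls; both lazily turn on
 * SOCK_TIMESTAMP the first time a timestamp is requested.
 */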
int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags which require net
		 * timestamping, but timestamping might already have been
		 * on because of the other one.
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
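/* The sock_common_*() helpers above and below simply forward to the
 * struct proto handlers; an address family typically wires them straight
 * into its proto_ops, e.g. (illustrative fragment):
 *
 *	.setsockopt	= sock_common_setsockopt,
 *	.getsockopt	= sock_common_getsockopt,
 *	.recvmsg	= sock_common_recvmsg,
 */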
/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release() is called, processes have
	 * no access to the socket, but the network still does.
	 * Step one: detach it from networking:
	 *
	 * A. Remove it from the hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also, we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ?
res : 0; 3139 } 3140 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3141 3142 static void sock_inuse_add(struct net *net, int val) 3143 { 3144 this_cpu_add(*net->core.sock_inuse, val); 3145 } 3146 3147 int sock_inuse_get(struct net *net) 3148 { 3149 int cpu, res = 0; 3150 3151 for_each_possible_cpu(cpu) 3152 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3153 3154 return res; 3155 } 3156 3157 EXPORT_SYMBOL_GPL(sock_inuse_get); 3158 3159 static int __net_init sock_inuse_init_net(struct net *net) 3160 { 3161 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3162 if (net->core.prot_inuse == NULL) 3163 return -ENOMEM; 3164 3165 net->core.sock_inuse = alloc_percpu(int); 3166 if (net->core.sock_inuse == NULL) 3167 goto out; 3168 3169 return 0; 3170 3171 out: 3172 free_percpu(net->core.prot_inuse); 3173 return -ENOMEM; 3174 } 3175 3176 static void __net_exit sock_inuse_exit_net(struct net *net) 3177 { 3178 free_percpu(net->core.prot_inuse); 3179 free_percpu(net->core.sock_inuse); 3180 } 3181 3182 static struct pernet_operations net_inuse_ops = { 3183 .init = sock_inuse_init_net, 3184 .exit = sock_inuse_exit_net, 3185 }; 3186 3187 static __init int net_inuse_init(void) 3188 { 3189 if (register_pernet_subsys(&net_inuse_ops)) 3190 panic("Cannot initialize net inuse counters"); 3191 3192 return 0; 3193 } 3194 3195 core_initcall(net_inuse_init); 3196 3197 static void assign_proto_idx(struct proto *prot) 3198 { 3199 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3200 3201 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3202 pr_err("PROTO_INUSE_NR exhausted\n"); 3203 return; 3204 } 3205 3206 set_bit(prot->inuse_idx, proto_inuse_idx); 3207 } 3208 3209 static void release_proto_idx(struct proto *prot) 3210 { 3211 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3212 clear_bit(prot->inuse_idx, proto_inuse_idx); 3213 } 3214 #else 3215 static inline void assign_proto_idx(struct proto *prot) 3216 { 3217 } 3218 3219 static inline void release_proto_idx(struct proto *prot) 3220 { 3221 } 3222 3223 static void sock_inuse_add(struct net *net, int val) 3224 { 3225 } 3226 #endif 3227 3228 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3229 { 3230 if (!rsk_prot) 3231 return; 3232 kfree(rsk_prot->slab_name); 3233 rsk_prot->slab_name = NULL; 3234 kmem_cache_destroy(rsk_prot->slab); 3235 rsk_prot->slab = NULL; 3236 } 3237 3238 static int req_prot_init(const struct proto *prot) 3239 { 3240 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3241 3242 if (!rsk_prot) 3243 return 0; 3244 3245 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3246 prot->name); 3247 if (!rsk_prot->slab_name) 3248 return -ENOMEM; 3249 3250 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3251 rsk_prot->obj_size, 0, 3252 prot->slab_flags, NULL); 3253 3254 if (!rsk_prot->slab) { 3255 pr_crit("%s: Can't create request sock SLAB cache!\n", 3256 prot->name); 3257 return -ENOMEM; 3258 } 3259 return 0; 3260 } 3261 3262 int proto_register(struct proto *prot, int alloc_slab) 3263 { 3264 if (alloc_slab) { 3265 prot->slab = kmem_cache_create_usercopy(prot->name, 3266 prot->obj_size, 0, 3267 SLAB_HWCACHE_ALIGN | prot->slab_flags, 3268 prot->useroffset, prot->usersize, 3269 NULL); 3270 3271 if (prot->slab == NULL) { 3272 pr_crit("%s: Can't create sock SLAB cache!\n", 3273 prot->name); 3274 goto out; 3275 } 3276 3277 if (req_prot_init(prot)) 3278 goto out_free_request_sock_slab; 3279 3280 if (prot->twsk_prot != NULL) { 3281 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 
prot->name); 3282 3283 if (prot->twsk_prot->twsk_slab_name == NULL) 3284 goto out_free_request_sock_slab; 3285 3286 prot->twsk_prot->twsk_slab = 3287 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3288 prot->twsk_prot->twsk_obj_size, 3289 0, 3290 prot->slab_flags, 3291 NULL); 3292 if (prot->twsk_prot->twsk_slab == NULL) 3293 goto out_free_timewait_sock_slab_name; 3294 } 3295 } 3296 3297 mutex_lock(&proto_list_mutex); 3298 list_add(&prot->node, &proto_list); 3299 assign_proto_idx(prot); 3300 mutex_unlock(&proto_list_mutex); 3301 return 0; 3302 3303 out_free_timewait_sock_slab_name: 3304 kfree(prot->twsk_prot->twsk_slab_name); 3305 out_free_request_sock_slab: 3306 req_prot_cleanup(prot->rsk_prot); 3307 3308 kmem_cache_destroy(prot->slab); 3309 prot->slab = NULL; 3310 out: 3311 return -ENOBUFS; 3312 } 3313 EXPORT_SYMBOL(proto_register); 3314 3315 void proto_unregister(struct proto *prot) 3316 { 3317 mutex_lock(&proto_list_mutex); 3318 release_proto_idx(prot); 3319 list_del(&prot->node); 3320 mutex_unlock(&proto_list_mutex); 3321 3322 kmem_cache_destroy(prot->slab); 3323 prot->slab = NULL; 3324 3325 req_prot_cleanup(prot->rsk_prot); 3326 3327 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 3328 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 3329 kfree(prot->twsk_prot->twsk_slab_name); 3330 prot->twsk_prot->twsk_slab = NULL; 3331 } 3332 } 3333 EXPORT_SYMBOL(proto_unregister); 3334 3335 int sock_load_diag_module(int family, int protocol) 3336 { 3337 if (!protocol) { 3338 if (!sock_is_registered(family)) 3339 return -ENOENT; 3340 3341 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3342 NETLINK_SOCK_DIAG, family); 3343 } 3344 3345 #ifdef CONFIG_INET 3346 if (family == AF_INET && 3347 !rcu_access_pointer(inet_protos[protocol])) 3348 return -ENOENT; 3349 #endif 3350 3351 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3352 NETLINK_SOCK_DIAG, family, protocol); 3353 } 3354 EXPORT_SYMBOL(sock_load_diag_module); 3355 3356 #ifdef CONFIG_PROC_FS 3357 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3358 __acquires(proto_list_mutex) 3359 { 3360 mutex_lock(&proto_list_mutex); 3361 return seq_list_start_head(&proto_list, *pos); 3362 } 3363 3364 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3365 { 3366 return seq_list_next(v, &proto_list, pos); 3367 } 3368 3369 static void proto_seq_stop(struct seq_file *seq, void *v) 3370 __releases(proto_list_mutex) 3371 { 3372 mutex_unlock(&proto_list_mutex); 3373 } 3374 3375 static char proto_method_implemented(const void *method) 3376 { 3377 return method == NULL ? 'n' : 'y'; 3378 } 3379 static long sock_prot_memory_allocated(struct proto *proto) 3380 { 3381 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3382 } 3383 3384 static char *sock_prot_memory_pressure(struct proto *proto) 3385 { 3386 return proto->memory_pressure != NULL ? 3387 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3388 } 3389 3390 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3391 { 3392 3393 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3394 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3395 proto->name, 3396 proto->obj_size, 3397 sock_prot_inuse_get(seq_file_net(seq), proto), 3398 sock_prot_memory_allocated(proto), 3399 sock_prot_memory_pressure(proto), 3400 proto->max_header, 3401 proto->slab == NULL ? 
"no" : "yes", 3402 module_name(proto->owner), 3403 proto_method_implemented(proto->close), 3404 proto_method_implemented(proto->connect), 3405 proto_method_implemented(proto->disconnect), 3406 proto_method_implemented(proto->accept), 3407 proto_method_implemented(proto->ioctl), 3408 proto_method_implemented(proto->init), 3409 proto_method_implemented(proto->destroy), 3410 proto_method_implemented(proto->shutdown), 3411 proto_method_implemented(proto->setsockopt), 3412 proto_method_implemented(proto->getsockopt), 3413 proto_method_implemented(proto->sendmsg), 3414 proto_method_implemented(proto->recvmsg), 3415 proto_method_implemented(proto->sendpage), 3416 proto_method_implemented(proto->bind), 3417 proto_method_implemented(proto->backlog_rcv), 3418 proto_method_implemented(proto->hash), 3419 proto_method_implemented(proto->unhash), 3420 proto_method_implemented(proto->get_port), 3421 proto_method_implemented(proto->enter_memory_pressure)); 3422 } 3423 3424 static int proto_seq_show(struct seq_file *seq, void *v) 3425 { 3426 if (v == &proto_list) 3427 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3428 "protocol", 3429 "size", 3430 "sockets", 3431 "memory", 3432 "press", 3433 "maxhdr", 3434 "slab", 3435 "module", 3436 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3437 else 3438 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3439 return 0; 3440 } 3441 3442 static const struct seq_operations proto_seq_ops = { 3443 .start = proto_seq_start, 3444 .next = proto_seq_next, 3445 .stop = proto_seq_stop, 3446 .show = proto_seq_show, 3447 }; 3448 3449 static int proto_seq_open(struct inode *inode, struct file *file) 3450 { 3451 return seq_open_net(inode, file, &proto_seq_ops, 3452 sizeof(struct seq_net_private)); 3453 } 3454 3455 static const struct file_operations proto_seq_fops = { 3456 .open = proto_seq_open, 3457 .read = seq_read, 3458 .llseek = seq_lseek, 3459 .release = seq_release_net, 3460 }; 3461 3462 static __net_init int proto_init_net(struct net *net) 3463 { 3464 if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops)) 3465 return -ENOMEM; 3466 3467 return 0; 3468 } 3469 3470 static __net_exit void proto_exit_net(struct net *net) 3471 { 3472 remove_proc_entry("protocols", net->proc_net); 3473 } 3474 3475 3476 static __net_initdata struct pernet_operations proto_net_ops = { 3477 .init = proto_init_net, 3478 .exit = proto_exit_net, 3479 }; 3480 3481 static int __init proto_init(void) 3482 { 3483 return register_pernet_subsys(&proto_net_ops); 3484 } 3485 3486 subsys_initcall(proto_init); 3487 3488 #endif /* PROC_FS */ 3489 3490 #ifdef CONFIG_NET_RX_BUSY_POLL 3491 bool sk_busy_loop_end(void *p, unsigned long start_time) 3492 { 3493 struct sock *sk = p; 3494 3495 return !skb_queue_empty(&sk->sk_receive_queue) || 3496 sk_busy_loop_timeout(sk, start_time); 3497 } 3498 EXPORT_SYMBOL(sk_busy_loop_end); 3499 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3500