1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Generic socket support routines. Memory allocators, socket lock/release 7 * handler for protocols to use and generic option handler. 8 * 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 * 85 * 86 * This program is free software; you can redistribute it and/or 87 * modify it under the terms of the GNU General Public License 88 * as published by the Free Software Foundation; either version 89 * 2 of the License, or (at your option) any later version. 90 */ 91 92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 93 94 #include <linux/capability.h> 95 #include <linux/errno.h> 96 #include <linux/errqueue.h> 97 #include <linux/types.h> 98 #include <linux/socket.h> 99 #include <linux/in.h> 100 #include <linux/kernel.h> 101 #include <linux/module.h> 102 #include <linux/proc_fs.h> 103 #include <linux/seq_file.h> 104 #include <linux/sched.h> 105 #include <linux/sched/mm.h> 106 #include <linux/timer.h> 107 #include <linux/string.h> 108 #include <linux/sockios.h> 109 #include <linux/net.h> 110 #include <linux/mm.h> 111 #include <linux/slab.h> 112 #include <linux/interrupt.h> 113 #include <linux/poll.h> 114 #include <linux/tcp.h> 115 #include <linux/init.h> 116 #include <linux/highmem.h> 117 #include <linux/user_namespace.h> 118 #include <linux/static_key.h> 119 #include <linux/memcontrol.h> 120 #include <linux/prefetch.h> 121 122 #include <linux/uaccess.h> 123 124 #include <linux/netdevice.h> 125 #include <net/protocol.h> 126 #include <linux/skbuff.h> 127 #include <net/net_namespace.h> 128 #include <net/request_sock.h> 129 #include <net/sock.h> 130 #include <linux/net_tstamp.h> 131 #include <net/xfrm.h> 132 #include <linux/ipsec.h> 133 #include <net/cls_cgroup.h> 134 #include <net/netprio_cgroup.h> 135 #include <linux/sock_diag.h> 136 137 #include <linux/filter.h> 138 #include <net/sock_reuseport.h> 139 140 #include <trace/events/sock.h> 141 142 #include <net/tcp.h> 143 #include <net/busy_poll.h> 144 145 static DEFINE_MUTEX(proto_list_mutex); 146 static LIST_HEAD(proto_list); 147 148 static void sock_inuse_add(struct net *net, int val); 149 150 /** 151 * sk_ns_capable - General socket capability test 152 * @sk: Socket to use a capability on or through 153 * @user_ns: The user namespace of the capability to use 154 * @cap: The capability to use 155 * 156 * Test to see if the opener of the socket had when the socket was 157 * created and the current process has the capability @cap in the user 158 * namespace @user_ns. 159 */ 160 bool sk_ns_capable(const struct sock *sk, 161 struct user_namespace *user_ns, int cap) 162 { 163 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 164 ns_capable(user_ns, cap); 165 } 166 EXPORT_SYMBOL(sk_ns_capable); 167 168 /** 169 * sk_capable - Socket global capability test 170 * @sk: Socket to use a capability on or through 171 * @cap: The global capability to use 172 * 173 * Test to see if the opener of the socket had when the socket was 174 * created and the current process has the capability @cap in all user 175 * namespaces. 176 */ 177 bool sk_capable(const struct sock *sk, int cap) 178 { 179 return sk_ns_capable(sk, &init_user_ns, cap); 180 } 181 EXPORT_SYMBOL(sk_capable); 182 183 /** 184 * sk_net_capable - Network namespace socket capability test 185 * @sk: Socket to use a capability on or through 186 * @cap: The capability to use 187 * 188 * Test to see if the opener of the socket had when the socket was created 189 * and the current process has the capability @cap over the network namespace 190 * the socket is a member of. 191 */ 192 bool sk_net_capable(const struct sock *sk, int cap) 193 { 194 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 195 } 196 EXPORT_SYMBOL(sk_net_capable); 197 198 /* 199 * Each address family might have different locking rules, so we have 200 * one slock key per address family and separate keys for internal and 201 * userspace sockets. 202 */ 203 static struct lock_class_key af_family_keys[AF_MAX]; 204 static struct lock_class_key af_family_kern_keys[AF_MAX]; 205 static struct lock_class_key af_family_slock_keys[AF_MAX]; 206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 207 208 /* 209 * Make lock validator output more readable. (we pre-construct these 210 * strings build-time, so that runtime initialization of socket 211 * locks is fast): 212 */ 213 214 #define _sock_locks(x) \ 215 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 216 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 217 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 218 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 219 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 220 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 221 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 222 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 223 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 224 x "27" , x "28" , x "AF_CAN" , \ 225 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 226 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 227 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 228 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 229 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 230 x "AF_MAX" 231 232 static const char *const af_family_key_strings[AF_MAX+1] = { 233 _sock_locks("sk_lock-") 234 }; 235 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 236 _sock_locks("slock-") 237 }; 238 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 239 _sock_locks("clock-") 240 }; 241 242 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 243 _sock_locks("k-sk_lock-") 244 }; 245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 246 _sock_locks("k-slock-") 247 }; 248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 249 _sock_locks("k-clock-") 250 }; 251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 252 "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , 253 "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", 254 "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , 255 "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , 256 "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , 257 "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , 258 "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , 259 "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , 260 "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , 261 "rlock-27" , "rlock-28" , "rlock-AF_CAN" , 262 "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , 263 "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , 264 "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , 265 "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , 266 "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , 267 "rlock-AF_MAX" 268 }; 269 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 270 "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , 271 "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", 272 "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , 273 "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , 274 "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , 275 "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , 276 "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , 277 "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , 278 "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , 279 "wlock-27" , "wlock-28" , "wlock-AF_CAN" , 280 "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , 281 "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , 282 "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , 283 "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , 284 "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , 285 "wlock-AF_MAX" 286 }; 287 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 288 "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , 289 "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", 290 "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , 291 "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , 292 "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , 293 "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , 294 "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , 295 "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , 296 "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , 297 "elock-27" , "elock-28" , "elock-AF_CAN" , 298 "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , 299 "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , 300 "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , 301 "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , 302 "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , 303 "elock-AF_MAX" 304 }; 305 306 /* 307 * sk_callback_lock and sk queues locking rules are per-address-family, 308 * so split the lock classes by using a per-AF key: 309 */ 310 static struct lock_class_key af_callback_keys[AF_MAX]; 311 static struct lock_class_key af_rlock_keys[AF_MAX]; 312 static struct lock_class_key af_wlock_keys[AF_MAX]; 313 static struct lock_class_key af_elock_keys[AF_MAX]; 314 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 315 316 /* Run time adjustable parameters. */ 317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 318 EXPORT_SYMBOL(sysctl_wmem_max); 319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 320 EXPORT_SYMBOL(sysctl_rmem_max); 321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 323 324 /* Maximal space eaten by iovec or ancillary data plus some space */ 325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 326 EXPORT_SYMBOL(sysctl_optmem_max); 327 328 int sysctl_tstamp_allow_data __read_mostly = 1; 329 330 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 331 EXPORT_SYMBOL_GPL(memalloc_socks_key); 332 333 /** 334 * sk_set_memalloc - sets %SOCK_MEMALLOC 335 * @sk: socket to set it on 336 * 337 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 338 * It's the responsibility of the admin to adjust min_free_kbytes 339 * to meet the requirements 340 */ 341 void sk_set_memalloc(struct sock *sk) 342 { 343 sock_set_flag(sk, SOCK_MEMALLOC); 344 sk->sk_allocation |= __GFP_MEMALLOC; 345 static_branch_inc(&memalloc_socks_key); 346 } 347 EXPORT_SYMBOL_GPL(sk_set_memalloc); 348 349 void sk_clear_memalloc(struct sock *sk) 350 { 351 sock_reset_flag(sk, SOCK_MEMALLOC); 352 sk->sk_allocation &= ~__GFP_MEMALLOC; 353 static_branch_dec(&memalloc_socks_key); 354 355 /* 356 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 357 * progress of swapping. SOCK_MEMALLOC may be cleared while 358 * it has rmem allocations due to the last swapfile being deactivated 359 * but there is a risk that the socket is unusable due to exceeding 360 * the rmem limits. Reclaim the reserves and obey rmem limits again. 361 */ 362 sk_mem_reclaim(sk); 363 } 364 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 365 366 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 367 { 368 int ret; 369 unsigned int noreclaim_flag; 370 371 /* these should have been dropped before queueing */ 372 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 373 374 noreclaim_flag = memalloc_noreclaim_save(); 375 ret = sk->sk_backlog_rcv(sk, skb); 376 memalloc_noreclaim_restore(noreclaim_flag); 377 378 return ret; 379 } 380 EXPORT_SYMBOL(__sk_backlog_rcv); 381 382 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 383 { 384 struct timeval tv; 385 386 if (optlen < sizeof(tv)) 387 return -EINVAL; 388 if (copy_from_user(&tv, optval, sizeof(tv))) 389 return -EFAULT; 390 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 391 return -EDOM; 392 393 if (tv.tv_sec < 0) { 394 static int warned __read_mostly; 395 396 *timeo_p = 0; 397 if (warned < 10 && net_ratelimit()) { 398 warned++; 399 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 400 __func__, current->comm, task_pid_nr(current)); 401 } 402 return 0; 403 } 404 *timeo_p = MAX_SCHEDULE_TIMEOUT; 405 if (tv.tv_sec == 0 && tv.tv_usec == 0) 406 return 0; 407 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) 408 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); 409 return 0; 410 } 411 412 static void sock_warn_obsolete_bsdism(const char *name) 413 { 414 static int warned; 415 static char warncomm[TASK_COMM_LEN]; 416 if (strcmp(warncomm, current->comm) && warned < 5) { 417 strcpy(warncomm, current->comm); 418 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", 419 warncomm, name); 420 warned++; 421 } 422 } 423 424 static bool sock_needs_netstamp(const struct sock *sk) 425 { 426 switch (sk->sk_family) { 427 case AF_UNSPEC: 428 case AF_UNIX: 429 return false; 430 default: 431 return true; 432 } 433 } 434 435 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 436 { 437 if (sk->sk_flags & flags) { 438 sk->sk_flags &= ~flags; 439 if (sock_needs_netstamp(sk) && 440 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 441 net_disable_timestamp(); 442 } 443 } 444 445 446 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 447 { 448 unsigned long flags; 449 struct sk_buff_head *list = &sk->sk_receive_queue; 450 451 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 452 atomic_inc(&sk->sk_drops); 453 trace_sock_rcvqueue_full(sk, skb); 454 return -ENOMEM; 455 } 456 457 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 458 atomic_inc(&sk->sk_drops); 459 return -ENOBUFS; 460 } 461 462 skb->dev = NULL; 463 skb_set_owner_r(skb, sk); 464 465 /* we escape from rcu protected region, make sure we dont leak 466 * a norefcounted dst 467 */ 468 skb_dst_force(skb); 469 470 spin_lock_irqsave(&list->lock, flags); 471 sock_skb_set_dropcount(sk, skb); 472 __skb_queue_tail(list, skb); 473 spin_unlock_irqrestore(&list->lock, flags); 474 475 if (!sock_flag(sk, SOCK_DEAD)) 476 sk->sk_data_ready(sk); 477 return 0; 478 } 479 EXPORT_SYMBOL(__sock_queue_rcv_skb); 480 481 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 482 { 483 int err; 484 485 err = sk_filter(sk, skb); 486 if (err) 487 return err; 488 489 return __sock_queue_rcv_skb(sk, skb); 490 } 491 EXPORT_SYMBOL(sock_queue_rcv_skb); 492 493 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 494 const int nested, unsigned int trim_cap, bool refcounted) 495 { 496 int rc = NET_RX_SUCCESS; 497 498 if (sk_filter_trim_cap(sk, skb, trim_cap)) 499 goto discard_and_relse; 500 501 skb->dev = NULL; 502 503 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 504 atomic_inc(&sk->sk_drops); 505 goto discard_and_relse; 506 } 507 if (nested) 508 bh_lock_sock_nested(sk); 509 else 510 bh_lock_sock(sk); 511 if (!sock_owned_by_user(sk)) { 512 /* 513 * trylock + unlock semantics: 514 */ 515 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 516 517 rc = sk_backlog_rcv(sk, skb); 518 519 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); 520 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { 521 bh_unlock_sock(sk); 522 atomic_inc(&sk->sk_drops); 523 goto discard_and_relse; 524 } 525 526 bh_unlock_sock(sk); 527 out: 528 if (refcounted) 529 sock_put(sk); 530 return rc; 531 discard_and_relse: 532 kfree_skb(skb); 533 goto out; 534 } 535 EXPORT_SYMBOL(__sk_receive_skb); 536 537 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 538 { 539 struct dst_entry *dst = __sk_dst_get(sk); 540 541 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 542 sk_tx_queue_clear(sk); 543 sk->sk_dst_pending_confirm = 0; 544 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 545 dst_release(dst); 546 return NULL; 547 } 548 549 return dst; 550 } 551 EXPORT_SYMBOL(__sk_dst_check); 552 553 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 554 { 555 struct dst_entry *dst = sk_dst_get(sk); 556 557 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 558 sk_dst_reset(sk); 559 dst_release(dst); 560 return NULL; 561 } 562 563 return dst; 564 } 565 EXPORT_SYMBOL(sk_dst_check); 566 567 static int sock_setbindtodevice(struct sock *sk, char __user *optval, 568 int optlen) 569 { 570 int ret = -ENOPROTOOPT; 571 #ifdef CONFIG_NETDEVICES 572 struct net *net = sock_net(sk); 573 char devname[IFNAMSIZ]; 574 int index; 575 576 /* Sorry... */ 577 ret = -EPERM; 578 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 579 goto out; 580 581 ret = -EINVAL; 582 if (optlen < 0) 583 goto out; 584 585 /* Bind this socket to a particular device like "eth0", 586 * as specified in the passed interface name. If the 587 * name is "" or the option length is zero the socket 588 * is not bound. 589 */ 590 if (optlen > IFNAMSIZ - 1) 591 optlen = IFNAMSIZ - 1; 592 memset(devname, 0, sizeof(devname)); 593 594 ret = -EFAULT; 595 if (copy_from_user(devname, optval, optlen)) 596 goto out; 597 598 index = 0; 599 if (devname[0] != '\0') { 600 struct net_device *dev; 601 602 rcu_read_lock(); 603 dev = dev_get_by_name_rcu(net, devname); 604 if (dev) 605 index = dev->ifindex; 606 rcu_read_unlock(); 607 ret = -ENODEV; 608 if (!dev) 609 goto out; 610 } 611 612 lock_sock(sk); 613 sk->sk_bound_dev_if = index; 614 sk_dst_reset(sk); 615 release_sock(sk); 616 617 ret = 0; 618 619 out: 620 #endif 621 622 return ret; 623 } 624 625 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 626 int __user *optlen, int len) 627 { 628 int ret = -ENOPROTOOPT; 629 #ifdef CONFIG_NETDEVICES 630 struct net *net = sock_net(sk); 631 char devname[IFNAMSIZ]; 632 633 if (sk->sk_bound_dev_if == 0) { 634 len = 0; 635 goto zero; 636 } 637 638 ret = -EINVAL; 639 if (len < IFNAMSIZ) 640 goto out; 641 642 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 643 if (ret) 644 goto out; 645 646 len = strlen(devname) + 1; 647 648 ret = -EFAULT; 649 if (copy_to_user(optval, devname, len)) 650 goto out; 651 652 zero: 653 ret = -EFAULT; 654 if (put_user(len, optlen)) 655 goto out; 656 657 ret = 0; 658 659 out: 660 #endif 661 662 return ret; 663 } 664 665 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) 666 { 667 if (valbool) 668 sock_set_flag(sk, bit); 669 else 670 sock_reset_flag(sk, bit); 671 } 672 673 bool sk_mc_loop(struct sock *sk) 674 { 675 if (dev_recursion_level()) 676 return false; 677 if (!sk) 678 return true; 679 switch (sk->sk_family) { 680 case AF_INET: 681 return inet_sk(sk)->mc_loop; 682 #if IS_ENABLED(CONFIG_IPV6) 683 case AF_INET6: 684 return inet6_sk(sk)->mc_loop; 685 #endif 686 } 687 WARN_ON(1); 688 return true; 689 } 690 EXPORT_SYMBOL(sk_mc_loop); 691 692 /* 693 * This is meant for all protocols to use and covers goings on 694 * at the socket level. Everything here is generic. 695 */ 696 697 int sock_setsockopt(struct socket *sock, int level, int optname, 698 char __user *optval, unsigned int optlen) 699 { 700 struct sock *sk = sock->sk; 701 int val; 702 int valbool; 703 struct linger ling; 704 int ret = 0; 705 706 /* 707 * Options without arguments 708 */ 709 710 if (optname == SO_BINDTODEVICE) 711 return sock_setbindtodevice(sk, optval, optlen); 712 713 if (optlen < sizeof(int)) 714 return -EINVAL; 715 716 if (get_user(val, (int __user *)optval)) 717 return -EFAULT; 718 719 valbool = val ? 1 : 0; 720 721 lock_sock(sk); 722 723 switch (optname) { 724 case SO_DEBUG: 725 if (val && !capable(CAP_NET_ADMIN)) 726 ret = -EACCES; 727 else 728 sock_valbool_flag(sk, SOCK_DBG, valbool); 729 break; 730 case SO_REUSEADDR: 731 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 732 break; 733 case SO_REUSEPORT: 734 sk->sk_reuseport = valbool; 735 break; 736 case SO_TYPE: 737 case SO_PROTOCOL: 738 case SO_DOMAIN: 739 case SO_ERROR: 740 ret = -ENOPROTOOPT; 741 break; 742 case SO_DONTROUTE: 743 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 744 break; 745 case SO_BROADCAST: 746 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 747 break; 748 case SO_SNDBUF: 749 /* Don't error on this BSD doesn't and if you think 750 * about it this is right. Otherwise apps have to 751 * play 'guess the biggest size' games. RCVBUF/SNDBUF 752 * are treated in BSD as hints 753 */ 754 val = min_t(u32, val, sysctl_wmem_max); 755 set_sndbuf: 756 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 757 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); 758 /* Wake up sending tasks if we upped the value. */ 759 sk->sk_write_space(sk); 760 break; 761 762 case SO_SNDBUFFORCE: 763 if (!capable(CAP_NET_ADMIN)) { 764 ret = -EPERM; 765 break; 766 } 767 goto set_sndbuf; 768 769 case SO_RCVBUF: 770 /* Don't error on this BSD doesn't and if you think 771 * about it this is right. Otherwise apps have to 772 * play 'guess the biggest size' games. RCVBUF/SNDBUF 773 * are treated in BSD as hints 774 */ 775 val = min_t(u32, val, sysctl_rmem_max); 776 set_rcvbuf: 777 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 778 /* 779 * We double it on the way in to account for 780 * "struct sk_buff" etc. overhead. Applications 781 * assume that the SO_RCVBUF setting they make will 782 * allow that much actual data to be received on that 783 * socket. 784 * 785 * Applications are unaware that "struct sk_buff" and 786 * other overheads allocate from the receive buffer 787 * during socket buffer allocation. 788 * 789 * And after considering the possible alternatives, 790 * returning the value we actually used in getsockopt 791 * is the most desirable behavior. 792 */ 793 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); 794 break; 795 796 case SO_RCVBUFFORCE: 797 if (!capable(CAP_NET_ADMIN)) { 798 ret = -EPERM; 799 break; 800 } 801 goto set_rcvbuf; 802 803 case SO_KEEPALIVE: 804 if (sk->sk_prot->keepalive) 805 sk->sk_prot->keepalive(sk, valbool); 806 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 807 break; 808 809 case SO_OOBINLINE: 810 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 811 break; 812 813 case SO_NO_CHECK: 814 sk->sk_no_check_tx = valbool; 815 break; 816 817 case SO_PRIORITY: 818 if ((val >= 0 && val <= 6) || 819 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 820 sk->sk_priority = val; 821 else 822 ret = -EPERM; 823 break; 824 825 case SO_LINGER: 826 if (optlen < sizeof(ling)) { 827 ret = -EINVAL; /* 1003.1g */ 828 break; 829 } 830 if (copy_from_user(&ling, optval, sizeof(ling))) { 831 ret = -EFAULT; 832 break; 833 } 834 if (!ling.l_onoff) 835 sock_reset_flag(sk, SOCK_LINGER); 836 else { 837 #if (BITS_PER_LONG == 32) 838 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 839 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 840 else 841 #endif 842 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 843 sock_set_flag(sk, SOCK_LINGER); 844 } 845 break; 846 847 case SO_BSDCOMPAT: 848 sock_warn_obsolete_bsdism("setsockopt"); 849 break; 850 851 case SO_PASSCRED: 852 if (valbool) 853 set_bit(SOCK_PASSCRED, &sock->flags); 854 else 855 clear_bit(SOCK_PASSCRED, &sock->flags); 856 break; 857 858 case SO_TIMESTAMP: 859 case SO_TIMESTAMPNS: 860 if (valbool) { 861 if (optname == SO_TIMESTAMP) 862 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 863 else 864 sock_set_flag(sk, SOCK_RCVTSTAMPNS); 865 sock_set_flag(sk, SOCK_RCVTSTAMP); 866 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 867 } else { 868 sock_reset_flag(sk, SOCK_RCVTSTAMP); 869 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 870 } 871 break; 872 873 case SO_TIMESTAMPING: 874 if (val & ~SOF_TIMESTAMPING_MASK) { 875 ret = -EINVAL; 876 break; 877 } 878 879 if (val & SOF_TIMESTAMPING_OPT_ID && 880 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 881 if (sk->sk_protocol == IPPROTO_TCP && 882 sk->sk_type == SOCK_STREAM) { 883 if ((1 << sk->sk_state) & 884 (TCPF_CLOSE | TCPF_LISTEN)) { 885 ret = -EINVAL; 886 break; 887 } 888 sk->sk_tskey = tcp_sk(sk)->snd_una; 889 } else { 890 sk->sk_tskey = 0; 891 } 892 } 893 894 if (val & SOF_TIMESTAMPING_OPT_STATS && 895 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 896 ret = -EINVAL; 897 break; 898 } 899 900 sk->sk_tsflags = val; 901 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 902 sock_enable_timestamp(sk, 903 SOCK_TIMESTAMPING_RX_SOFTWARE); 904 else 905 sock_disable_timestamp(sk, 906 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 907 break; 908 909 case SO_RCVLOWAT: 910 if (val < 0) 911 val = INT_MAX; 912 if (sock->ops->set_rcvlowat) 913 ret = sock->ops->set_rcvlowat(sk, val); 914 else 915 sk->sk_rcvlowat = val ? : 1; 916 break; 917 918 case SO_RCVTIMEO: 919 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); 920 break; 921 922 case SO_SNDTIMEO: 923 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); 924 break; 925 926 case SO_ATTACH_FILTER: 927 ret = -EINVAL; 928 if (optlen == sizeof(struct sock_fprog)) { 929 struct sock_fprog fprog; 930 931 ret = -EFAULT; 932 if (copy_from_user(&fprog, optval, sizeof(fprog))) 933 break; 934 935 ret = sk_attach_filter(&fprog, sk); 936 } 937 break; 938 939 case SO_ATTACH_BPF: 940 ret = -EINVAL; 941 if (optlen == sizeof(u32)) { 942 u32 ufd; 943 944 ret = -EFAULT; 945 if (copy_from_user(&ufd, optval, sizeof(ufd))) 946 break; 947 948 ret = sk_attach_bpf(ufd, sk); 949 } 950 break; 951 952 case SO_ATTACH_REUSEPORT_CBPF: 953 ret = -EINVAL; 954 if (optlen == sizeof(struct sock_fprog)) { 955 struct sock_fprog fprog; 956 957 ret = -EFAULT; 958 if (copy_from_user(&fprog, optval, sizeof(fprog))) 959 break; 960 961 ret = sk_reuseport_attach_filter(&fprog, sk); 962 } 963 break; 964 965 case SO_ATTACH_REUSEPORT_EBPF: 966 ret = -EINVAL; 967 if (optlen == sizeof(u32)) { 968 u32 ufd; 969 970 ret = -EFAULT; 971 if (copy_from_user(&ufd, optval, sizeof(ufd))) 972 break; 973 974 ret = sk_reuseport_attach_bpf(ufd, sk); 975 } 976 break; 977 978 case SO_DETACH_FILTER: 979 ret = sk_detach_filter(sk); 980 break; 981 982 case SO_LOCK_FILTER: 983 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 984 ret = -EPERM; 985 else 986 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 987 break; 988 989 case SO_PASSSEC: 990 if (valbool) 991 set_bit(SOCK_PASSSEC, &sock->flags); 992 else 993 clear_bit(SOCK_PASSSEC, &sock->flags); 994 break; 995 case SO_MARK: 996 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 997 ret = -EPERM; 998 else 999 sk->sk_mark = val; 1000 break; 1001 1002 case SO_RXQ_OVFL: 1003 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1004 break; 1005 1006 case SO_WIFI_STATUS: 1007 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1008 break; 1009 1010 case SO_PEEK_OFF: 1011 if (sock->ops->set_peek_off) 1012 ret = sock->ops->set_peek_off(sk, val); 1013 else 1014 ret = -EOPNOTSUPP; 1015 break; 1016 1017 case SO_NOFCS: 1018 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1019 break; 1020 1021 case SO_SELECT_ERR_QUEUE: 1022 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1023 break; 1024 1025 #ifdef CONFIG_NET_RX_BUSY_POLL 1026 case SO_BUSY_POLL: 1027 /* allow unprivileged users to decrease the value */ 1028 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1029 ret = -EPERM; 1030 else { 1031 if (val < 0) 1032 ret = -EINVAL; 1033 else 1034 sk->sk_ll_usec = val; 1035 } 1036 break; 1037 #endif 1038 1039 case SO_MAX_PACING_RATE: 1040 if (val != ~0U) 1041 cmpxchg(&sk->sk_pacing_status, 1042 SK_PACING_NONE, 1043 SK_PACING_NEEDED); 1044 sk->sk_max_pacing_rate = val; 1045 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 1046 sk->sk_max_pacing_rate); 1047 break; 1048 1049 case SO_INCOMING_CPU: 1050 sk->sk_incoming_cpu = val; 1051 break; 1052 1053 case SO_CNX_ADVICE: 1054 if (val == 1) 1055 dst_negative_advice(sk); 1056 break; 1057 1058 case SO_ZEROCOPY: 1059 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1060 if (sk->sk_protocol != IPPROTO_TCP) 1061 ret = -ENOTSUPP; 1062 } else if (sk->sk_family != PF_RDS) { 1063 ret = -ENOTSUPP; 1064 } 1065 if (!ret) { 1066 if (val < 0 || val > 1) 1067 ret = -EINVAL; 1068 else 1069 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1070 } 1071 break; 1072 1073 default: 1074 ret = -ENOPROTOOPT; 1075 break; 1076 } 1077 release_sock(sk); 1078 return ret; 1079 } 1080 EXPORT_SYMBOL(sock_setsockopt); 1081 1082 1083 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1084 struct ucred *ucred) 1085 { 1086 ucred->pid = pid_vnr(pid); 1087 ucred->uid = ucred->gid = -1; 1088 if (cred) { 1089 struct user_namespace *current_ns = current_user_ns(); 1090 1091 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1092 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1093 } 1094 } 1095 1096 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1097 { 1098 struct user_namespace *user_ns = current_user_ns(); 1099 int i; 1100 1101 for (i = 0; i < src->ngroups; i++) 1102 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1103 return -EFAULT; 1104 1105 return 0; 1106 } 1107 1108 int sock_getsockopt(struct socket *sock, int level, int optname, 1109 char __user *optval, int __user *optlen) 1110 { 1111 struct sock *sk = sock->sk; 1112 1113 union { 1114 int val; 1115 u64 val64; 1116 struct linger ling; 1117 struct timeval tm; 1118 } v; 1119 1120 int lv = sizeof(int); 1121 int len; 1122 1123 if (get_user(len, optlen)) 1124 return -EFAULT; 1125 if (len < 0) 1126 return -EINVAL; 1127 1128 memset(&v, 0, sizeof(v)); 1129 1130 switch (optname) { 1131 case SO_DEBUG: 1132 v.val = sock_flag(sk, SOCK_DBG); 1133 break; 1134 1135 case SO_DONTROUTE: 1136 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1137 break; 1138 1139 case SO_BROADCAST: 1140 v.val = sock_flag(sk, SOCK_BROADCAST); 1141 break; 1142 1143 case SO_SNDBUF: 1144 v.val = sk->sk_sndbuf; 1145 break; 1146 1147 case SO_RCVBUF: 1148 v.val = sk->sk_rcvbuf; 1149 break; 1150 1151 case SO_REUSEADDR: 1152 v.val = sk->sk_reuse; 1153 break; 1154 1155 case SO_REUSEPORT: 1156 v.val = sk->sk_reuseport; 1157 break; 1158 1159 case SO_KEEPALIVE: 1160 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1161 break; 1162 1163 case SO_TYPE: 1164 v.val = sk->sk_type; 1165 break; 1166 1167 case SO_PROTOCOL: 1168 v.val = sk->sk_protocol; 1169 break; 1170 1171 case SO_DOMAIN: 1172 v.val = sk->sk_family; 1173 break; 1174 1175 case SO_ERROR: 1176 v.val = -sock_error(sk); 1177 if (v.val == 0) 1178 v.val = xchg(&sk->sk_err_soft, 0); 1179 break; 1180 1181 case SO_OOBINLINE: 1182 v.val = sock_flag(sk, SOCK_URGINLINE); 1183 break; 1184 1185 case SO_NO_CHECK: 1186 v.val = sk->sk_no_check_tx; 1187 break; 1188 1189 case SO_PRIORITY: 1190 v.val = sk->sk_priority; 1191 break; 1192 1193 case SO_LINGER: 1194 lv = sizeof(v.ling); 1195 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1196 v.ling.l_linger = sk->sk_lingertime / HZ; 1197 break; 1198 1199 case SO_BSDCOMPAT: 1200 sock_warn_obsolete_bsdism("getsockopt"); 1201 break; 1202 1203 case SO_TIMESTAMP: 1204 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1205 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1206 break; 1207 1208 case SO_TIMESTAMPNS: 1209 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); 1210 break; 1211 1212 case SO_TIMESTAMPING: 1213 v.val = sk->sk_tsflags; 1214 break; 1215 1216 case SO_RCVTIMEO: 1217 lv = sizeof(struct timeval); 1218 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { 1219 v.tm.tv_sec = 0; 1220 v.tm.tv_usec = 0; 1221 } else { 1222 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 1223 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; 1224 } 1225 break; 1226 1227 case SO_SNDTIMEO: 1228 lv = sizeof(struct timeval); 1229 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { 1230 v.tm.tv_sec = 0; 1231 v.tm.tv_usec = 0; 1232 } else { 1233 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 1234 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; 1235 } 1236 break; 1237 1238 case SO_RCVLOWAT: 1239 v.val = sk->sk_rcvlowat; 1240 break; 1241 1242 case SO_SNDLOWAT: 1243 v.val = 1; 1244 break; 1245 1246 case SO_PASSCRED: 1247 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1248 break; 1249 1250 case SO_PEERCRED: 1251 { 1252 struct ucred peercred; 1253 if (len > sizeof(peercred)) 1254 len = sizeof(peercred); 1255 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1256 if (copy_to_user(optval, &peercred, len)) 1257 return -EFAULT; 1258 goto lenout; 1259 } 1260 1261 case SO_PEERGROUPS: 1262 { 1263 int ret, n; 1264 1265 if (!sk->sk_peer_cred) 1266 return -ENODATA; 1267 1268 n = sk->sk_peer_cred->group_info->ngroups; 1269 if (len < n * sizeof(gid_t)) { 1270 len = n * sizeof(gid_t); 1271 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1272 } 1273 len = n * sizeof(gid_t); 1274 1275 ret = groups_to_user((gid_t __user *)optval, 1276 sk->sk_peer_cred->group_info); 1277 if (ret) 1278 return ret; 1279 goto lenout; 1280 } 1281 1282 case SO_PEERNAME: 1283 { 1284 char address[128]; 1285 1286 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1287 if (lv < 0) 1288 return -ENOTCONN; 1289 if (lv < len) 1290 return -EINVAL; 1291 if (copy_to_user(optval, address, len)) 1292 return -EFAULT; 1293 goto lenout; 1294 } 1295 1296 /* Dubious BSD thing... Probably nobody even uses it, but 1297 * the UNIX standard wants it for whatever reason... -DaveM 1298 */ 1299 case SO_ACCEPTCONN: 1300 v.val = sk->sk_state == TCP_LISTEN; 1301 break; 1302 1303 case SO_PASSSEC: 1304 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1305 break; 1306 1307 case SO_PEERSEC: 1308 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1309 1310 case SO_MARK: 1311 v.val = sk->sk_mark; 1312 break; 1313 1314 case SO_RXQ_OVFL: 1315 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1316 break; 1317 1318 case SO_WIFI_STATUS: 1319 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1320 break; 1321 1322 case SO_PEEK_OFF: 1323 if (!sock->ops->set_peek_off) 1324 return -EOPNOTSUPP; 1325 1326 v.val = sk->sk_peek_off; 1327 break; 1328 case SO_NOFCS: 1329 v.val = sock_flag(sk, SOCK_NOFCS); 1330 break; 1331 1332 case SO_BINDTODEVICE: 1333 return sock_getbindtodevice(sk, optval, optlen, len); 1334 1335 case SO_GET_FILTER: 1336 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1337 if (len < 0) 1338 return len; 1339 1340 goto lenout; 1341 1342 case SO_LOCK_FILTER: 1343 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1344 break; 1345 1346 case SO_BPF_EXTENSIONS: 1347 v.val = bpf_tell_extensions(); 1348 break; 1349 1350 case SO_SELECT_ERR_QUEUE: 1351 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1352 break; 1353 1354 #ifdef CONFIG_NET_RX_BUSY_POLL 1355 case SO_BUSY_POLL: 1356 v.val = sk->sk_ll_usec; 1357 break; 1358 #endif 1359 1360 case SO_MAX_PACING_RATE: 1361 v.val = sk->sk_max_pacing_rate; 1362 break; 1363 1364 case SO_INCOMING_CPU: 1365 v.val = sk->sk_incoming_cpu; 1366 break; 1367 1368 case SO_MEMINFO: 1369 { 1370 u32 meminfo[SK_MEMINFO_VARS]; 1371 1372 if (get_user(len, optlen)) 1373 return -EFAULT; 1374 1375 sk_get_meminfo(sk, meminfo); 1376 1377 len = min_t(unsigned int, len, sizeof(meminfo)); 1378 if (copy_to_user(optval, &meminfo, len)) 1379 return -EFAULT; 1380 1381 goto lenout; 1382 } 1383 1384 #ifdef CONFIG_NET_RX_BUSY_POLL 1385 case SO_INCOMING_NAPI_ID: 1386 v.val = READ_ONCE(sk->sk_napi_id); 1387 1388 /* aggregate non-NAPI IDs down to 0 */ 1389 if (v.val < MIN_NAPI_ID) 1390 v.val = 0; 1391 1392 break; 1393 #endif 1394 1395 case SO_COOKIE: 1396 lv = sizeof(u64); 1397 if (len < lv) 1398 return -EINVAL; 1399 v.val64 = sock_gen_cookie(sk); 1400 break; 1401 1402 case SO_ZEROCOPY: 1403 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1404 break; 1405 1406 default: 1407 /* We implement the SO_SNDLOWAT etc to not be settable 1408 * (1003.1g 7). 1409 */ 1410 return -ENOPROTOOPT; 1411 } 1412 1413 if (len > lv) 1414 len = lv; 1415 if (copy_to_user(optval, &v, len)) 1416 return -EFAULT; 1417 lenout: 1418 if (put_user(len, optlen)) 1419 return -EFAULT; 1420 return 0; 1421 } 1422 1423 /* 1424 * Initialize an sk_lock. 1425 * 1426 * (We also register the sk_lock with the lock validator.) 1427 */ 1428 static inline void sock_lock_init(struct sock *sk) 1429 { 1430 if (sk->sk_kern_sock) 1431 sock_lock_init_class_and_name( 1432 sk, 1433 af_family_kern_slock_key_strings[sk->sk_family], 1434 af_family_kern_slock_keys + sk->sk_family, 1435 af_family_kern_key_strings[sk->sk_family], 1436 af_family_kern_keys + sk->sk_family); 1437 else 1438 sock_lock_init_class_and_name( 1439 sk, 1440 af_family_slock_key_strings[sk->sk_family], 1441 af_family_slock_keys + sk->sk_family, 1442 af_family_key_strings[sk->sk_family], 1443 af_family_keys + sk->sk_family); 1444 } 1445 1446 /* 1447 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1448 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1449 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1450 */ 1451 static void sock_copy(struct sock *nsk, const struct sock *osk) 1452 { 1453 #ifdef CONFIG_SECURITY_NETWORK 1454 void *sptr = nsk->sk_security; 1455 #endif 1456 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1457 1458 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1459 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1460 1461 #ifdef CONFIG_SECURITY_NETWORK 1462 nsk->sk_security = sptr; 1463 security_sk_clone(osk, nsk); 1464 #endif 1465 } 1466 1467 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1468 int family) 1469 { 1470 struct sock *sk; 1471 struct kmem_cache *slab; 1472 1473 slab = prot->slab; 1474 if (slab != NULL) { 1475 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1476 if (!sk) 1477 return sk; 1478 if (priority & __GFP_ZERO) 1479 sk_prot_clear_nulls(sk, prot->obj_size); 1480 } else 1481 sk = kmalloc(prot->obj_size, priority); 1482 1483 if (sk != NULL) { 1484 if (security_sk_alloc(sk, family, priority)) 1485 goto out_free; 1486 1487 if (!try_module_get(prot->owner)) 1488 goto out_free_sec; 1489 sk_tx_queue_clear(sk); 1490 } 1491 1492 return sk; 1493 1494 out_free_sec: 1495 security_sk_free(sk); 1496 out_free: 1497 if (slab != NULL) 1498 kmem_cache_free(slab, sk); 1499 else 1500 kfree(sk); 1501 return NULL; 1502 } 1503 1504 static void sk_prot_free(struct proto *prot, struct sock *sk) 1505 { 1506 struct kmem_cache *slab; 1507 struct module *owner; 1508 1509 owner = prot->owner; 1510 slab = prot->slab; 1511 1512 cgroup_sk_free(&sk->sk_cgrp_data); 1513 mem_cgroup_sk_free(sk); 1514 security_sk_free(sk); 1515 if (slab != NULL) 1516 kmem_cache_free(slab, sk); 1517 else 1518 kfree(sk); 1519 module_put(owner); 1520 } 1521 1522 /** 1523 * sk_alloc - All socket objects are allocated here 1524 * @net: the applicable net namespace 1525 * @family: protocol family 1526 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1527 * @prot: struct proto associated with this new sock instance 1528 * @kern: is this to be a kernel socket? 1529 */ 1530 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1531 struct proto *prot, int kern) 1532 { 1533 struct sock *sk; 1534 1535 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1536 if (sk) { 1537 sk->sk_family = family; 1538 /* 1539 * See comment in struct sock definition to understand 1540 * why we need sk_prot_creator -acme 1541 */ 1542 sk->sk_prot = sk->sk_prot_creator = prot; 1543 sk->sk_kern_sock = kern; 1544 sock_lock_init(sk); 1545 sk->sk_net_refcnt = kern ? 0 : 1; 1546 if (likely(sk->sk_net_refcnt)) { 1547 get_net(net); 1548 sock_inuse_add(net, 1); 1549 } 1550 1551 sock_net_set(sk, net); 1552 refcount_set(&sk->sk_wmem_alloc, 1); 1553 1554 mem_cgroup_sk_alloc(sk); 1555 cgroup_sk_alloc(&sk->sk_cgrp_data); 1556 sock_update_classid(&sk->sk_cgrp_data); 1557 sock_update_netprioidx(&sk->sk_cgrp_data); 1558 } 1559 1560 return sk; 1561 } 1562 EXPORT_SYMBOL(sk_alloc); 1563 1564 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1565 * grace period. This is the case for UDP sockets and TCP listeners. 1566 */ 1567 static void __sk_destruct(struct rcu_head *head) 1568 { 1569 struct sock *sk = container_of(head, struct sock, sk_rcu); 1570 struct sk_filter *filter; 1571 1572 if (sk->sk_destruct) 1573 sk->sk_destruct(sk); 1574 1575 filter = rcu_dereference_check(sk->sk_filter, 1576 refcount_read(&sk->sk_wmem_alloc) == 0); 1577 if (filter) { 1578 sk_filter_uncharge(sk, filter); 1579 RCU_INIT_POINTER(sk->sk_filter, NULL); 1580 } 1581 if (rcu_access_pointer(sk->sk_reuseport_cb)) 1582 reuseport_detach_sock(sk); 1583 1584 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1585 1586 if (atomic_read(&sk->sk_omem_alloc)) 1587 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1588 __func__, atomic_read(&sk->sk_omem_alloc)); 1589 1590 if (sk->sk_frag.page) { 1591 put_page(sk->sk_frag.page); 1592 sk->sk_frag.page = NULL; 1593 } 1594 1595 if (sk->sk_peer_cred) 1596 put_cred(sk->sk_peer_cred); 1597 put_pid(sk->sk_peer_pid); 1598 if (likely(sk->sk_net_refcnt)) 1599 put_net(sock_net(sk)); 1600 sk_prot_free(sk->sk_prot_creator, sk); 1601 } 1602 1603 void sk_destruct(struct sock *sk) 1604 { 1605 if (sock_flag(sk, SOCK_RCU_FREE)) 1606 call_rcu(&sk->sk_rcu, __sk_destruct); 1607 else 1608 __sk_destruct(&sk->sk_rcu); 1609 } 1610 1611 static void __sk_free(struct sock *sk) 1612 { 1613 if (likely(sk->sk_net_refcnt)) 1614 sock_inuse_add(sock_net(sk), -1); 1615 1616 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1617 sock_diag_broadcast_destroy(sk); 1618 else 1619 sk_destruct(sk); 1620 } 1621 1622 void sk_free(struct sock *sk) 1623 { 1624 /* 1625 * We subtract one from sk_wmem_alloc and can know if 1626 * some packets are still in some tx queue. 1627 * If not null, sock_wfree() will call __sk_free(sk) later 1628 */ 1629 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1630 __sk_free(sk); 1631 } 1632 EXPORT_SYMBOL(sk_free); 1633 1634 static void sk_init_common(struct sock *sk) 1635 { 1636 skb_queue_head_init(&sk->sk_receive_queue); 1637 skb_queue_head_init(&sk->sk_write_queue); 1638 skb_queue_head_init(&sk->sk_error_queue); 1639 1640 rwlock_init(&sk->sk_callback_lock); 1641 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1642 af_rlock_keys + sk->sk_family, 1643 af_family_rlock_key_strings[sk->sk_family]); 1644 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1645 af_wlock_keys + sk->sk_family, 1646 af_family_wlock_key_strings[sk->sk_family]); 1647 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1648 af_elock_keys + sk->sk_family, 1649 af_family_elock_key_strings[sk->sk_family]); 1650 lockdep_set_class_and_name(&sk->sk_callback_lock, 1651 af_callback_keys + sk->sk_family, 1652 af_family_clock_key_strings[sk->sk_family]); 1653 } 1654 1655 /** 1656 * sk_clone_lock - clone a socket, and lock its clone 1657 * @sk: the socket to clone 1658 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1659 * 1660 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1661 */ 1662 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1663 { 1664 struct sock *newsk; 1665 bool is_charged = true; 1666 1667 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); 1668 if (newsk != NULL) { 1669 struct sk_filter *filter; 1670 1671 sock_copy(newsk, sk); 1672 1673 newsk->sk_prot_creator = sk->sk_prot; 1674 1675 /* SANITY */ 1676 if (likely(newsk->sk_net_refcnt)) 1677 get_net(sock_net(newsk)); 1678 sk_node_init(&newsk->sk_node); 1679 sock_lock_init(newsk); 1680 bh_lock_sock(newsk); 1681 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1682 newsk->sk_backlog.len = 0; 1683 1684 atomic_set(&newsk->sk_rmem_alloc, 0); 1685 /* 1686 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1687 */ 1688 refcount_set(&newsk->sk_wmem_alloc, 1); 1689 atomic_set(&newsk->sk_omem_alloc, 0); 1690 sk_init_common(newsk); 1691 1692 newsk->sk_dst_cache = NULL; 1693 newsk->sk_dst_pending_confirm = 0; 1694 newsk->sk_wmem_queued = 0; 1695 newsk->sk_forward_alloc = 0; 1696 atomic_set(&newsk->sk_drops, 0); 1697 newsk->sk_send_head = NULL; 1698 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1699 atomic_set(&newsk->sk_zckey, 0); 1700 1701 sock_reset_flag(newsk, SOCK_DONE); 1702 mem_cgroup_sk_alloc(newsk); 1703 cgroup_sk_alloc(&newsk->sk_cgrp_data); 1704 1705 rcu_read_lock(); 1706 filter = rcu_dereference(sk->sk_filter); 1707 if (filter != NULL) 1708 /* though it's an empty new sock, the charging may fail 1709 * if sysctl_optmem_max was changed between creation of 1710 * original socket and cloning 1711 */ 1712 is_charged = sk_filter_charge(newsk, filter); 1713 RCU_INIT_POINTER(newsk->sk_filter, filter); 1714 rcu_read_unlock(); 1715 1716 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1717 /* We need to make sure that we don't uncharge the new 1718 * socket if we couldn't charge it in the first place 1719 * as otherwise we uncharge the parent's filter. 1720 */ 1721 if (!is_charged) 1722 RCU_INIT_POINTER(newsk->sk_filter, NULL); 1723 sk_free_unlock_clone(newsk); 1724 newsk = NULL; 1725 goto out; 1726 } 1727 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1728 1729 newsk->sk_err = 0; 1730 newsk->sk_err_soft = 0; 1731 newsk->sk_priority = 0; 1732 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1733 atomic64_set(&newsk->sk_cookie, 0); 1734 if (likely(newsk->sk_net_refcnt)) 1735 sock_inuse_add(sock_net(newsk), 1); 1736 1737 /* 1738 * Before updating sk_refcnt, we must commit prior changes to memory 1739 * (Documentation/RCU/rculist_nulls.txt for details) 1740 */ 1741 smp_wmb(); 1742 refcount_set(&newsk->sk_refcnt, 2); 1743 1744 /* 1745 * Increment the counter in the same struct proto as the master 1746 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1747 * is the same as sk->sk_prot->socks, as this field was copied 1748 * with memcpy). 1749 * 1750 * This _changes_ the previous behaviour, where 1751 * tcp_create_openreq_child always was incrementing the 1752 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1753 * to be taken into account in all callers. -acme 1754 */ 1755 sk_refcnt_debug_inc(newsk); 1756 sk_set_socket(newsk, NULL); 1757 newsk->sk_wq = NULL; 1758 1759 if (newsk->sk_prot->sockets_allocated) 1760 sk_sockets_allocated_inc(newsk); 1761 1762 if (sock_needs_netstamp(sk) && 1763 newsk->sk_flags & SK_FLAGS_TIMESTAMP) 1764 net_enable_timestamp(); 1765 } 1766 out: 1767 return newsk; 1768 } 1769 EXPORT_SYMBOL_GPL(sk_clone_lock); 1770 1771 void sk_free_unlock_clone(struct sock *sk) 1772 { 1773 /* It is still raw copy of parent, so invalidate 1774 * destructor and make plain sk_free() */ 1775 sk->sk_destruct = NULL; 1776 bh_unlock_sock(sk); 1777 sk_free(sk); 1778 } 1779 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 1780 1781 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1782 { 1783 u32 max_segs = 1; 1784 1785 sk_dst_set(sk, dst); 1786 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 1787 if (sk->sk_route_caps & NETIF_F_GSO) 1788 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1789 sk->sk_route_caps &= ~sk->sk_route_nocaps; 1790 if (sk_can_gso(sk)) { 1791 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 1792 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1793 } else { 1794 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1795 sk->sk_gso_max_size = dst->dev->gso_max_size; 1796 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 1797 } 1798 } 1799 sk->sk_gso_max_segs = max_segs; 1800 } 1801 EXPORT_SYMBOL_GPL(sk_setup_caps); 1802 1803 /* 1804 * Simple resource managers for sockets. 1805 */ 1806 1807 1808 /* 1809 * Write buffer destructor automatically called from kfree_skb. 1810 */ 1811 void sock_wfree(struct sk_buff *skb) 1812 { 1813 struct sock *sk = skb->sk; 1814 unsigned int len = skb->truesize; 1815 1816 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 1817 /* 1818 * Keep a reference on sk_wmem_alloc, this will be released 1819 * after sk_write_space() call 1820 */ 1821 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 1822 sk->sk_write_space(sk); 1823 len = 1; 1824 } 1825 /* 1826 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 1827 * could not do because of in-flight packets 1828 */ 1829 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 1830 __sk_free(sk); 1831 } 1832 EXPORT_SYMBOL(sock_wfree); 1833 1834 /* This variant of sock_wfree() is used by TCP, 1835 * since it sets SOCK_USE_WRITE_QUEUE. 1836 */ 1837 void __sock_wfree(struct sk_buff *skb) 1838 { 1839 struct sock *sk = skb->sk; 1840 1841 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 1842 __sk_free(sk); 1843 } 1844 1845 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1846 { 1847 skb_orphan(skb); 1848 skb->sk = sk; 1849 #ifdef CONFIG_INET 1850 if (unlikely(!sk_fullsock(sk))) { 1851 skb->destructor = sock_edemux; 1852 sock_hold(sk); 1853 return; 1854 } 1855 #endif 1856 skb->destructor = sock_wfree; 1857 skb_set_hash_from_sk(skb, sk); 1858 /* 1859 * We used to take a refcount on sk, but following operation 1860 * is enough to guarantee sk_free() wont free this sock until 1861 * all in-flight packets are completed 1862 */ 1863 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 1864 } 1865 EXPORT_SYMBOL(skb_set_owner_w); 1866 1867 /* This helper is used by netem, as it can hold packets in its 1868 * delay queue. We want to allow the owner socket to send more 1869 * packets, as if they were already TX completed by a typical driver. 1870 * But we also want to keep skb->sk set because some packet schedulers 1871 * rely on it (sch_fq for example). 1872 */ 1873 void skb_orphan_partial(struct sk_buff *skb) 1874 { 1875 if (skb_is_tcp_pure_ack(skb)) 1876 return; 1877 1878 if (skb->destructor == sock_wfree 1879 #ifdef CONFIG_INET 1880 || skb->destructor == tcp_wfree 1881 #endif 1882 ) { 1883 struct sock *sk = skb->sk; 1884 1885 if (refcount_inc_not_zero(&sk->sk_refcnt)) { 1886 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); 1887 skb->destructor = sock_efree; 1888 } 1889 } else { 1890 skb_orphan(skb); 1891 } 1892 } 1893 EXPORT_SYMBOL(skb_orphan_partial); 1894 1895 /* 1896 * Read buffer destructor automatically called from kfree_skb. 1897 */ 1898 void sock_rfree(struct sk_buff *skb) 1899 { 1900 struct sock *sk = skb->sk; 1901 unsigned int len = skb->truesize; 1902 1903 atomic_sub(len, &sk->sk_rmem_alloc); 1904 sk_mem_uncharge(sk, len); 1905 } 1906 EXPORT_SYMBOL(sock_rfree); 1907 1908 /* 1909 * Buffer destructor for skbs that are not used directly in read or write 1910 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 1911 */ 1912 void sock_efree(struct sk_buff *skb) 1913 { 1914 sock_put(skb->sk); 1915 } 1916 EXPORT_SYMBOL(sock_efree); 1917 1918 kuid_t sock_i_uid(struct sock *sk) 1919 { 1920 kuid_t uid; 1921 1922 read_lock_bh(&sk->sk_callback_lock); 1923 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 1924 read_unlock_bh(&sk->sk_callback_lock); 1925 return uid; 1926 } 1927 EXPORT_SYMBOL(sock_i_uid); 1928 1929 unsigned long sock_i_ino(struct sock *sk) 1930 { 1931 unsigned long ino; 1932 1933 read_lock_bh(&sk->sk_callback_lock); 1934 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 1935 read_unlock_bh(&sk->sk_callback_lock); 1936 return ino; 1937 } 1938 EXPORT_SYMBOL(sock_i_ino); 1939 1940 /* 1941 * Allocate a skb from the socket's send buffer. 1942 */ 1943 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 1944 gfp_t priority) 1945 { 1946 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 1947 struct sk_buff *skb = alloc_skb(size, priority); 1948 if (skb) { 1949 skb_set_owner_w(skb, sk); 1950 return skb; 1951 } 1952 } 1953 return NULL; 1954 } 1955 EXPORT_SYMBOL(sock_wmalloc); 1956 1957 static void sock_ofree(struct sk_buff *skb) 1958 { 1959 struct sock *sk = skb->sk; 1960 1961 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 1962 } 1963 1964 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 1965 gfp_t priority) 1966 { 1967 struct sk_buff *skb; 1968 1969 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 1970 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 1971 sysctl_optmem_max) 1972 return NULL; 1973 1974 skb = alloc_skb(size, priority); 1975 if (!skb) 1976 return NULL; 1977 1978 atomic_add(skb->truesize, &sk->sk_omem_alloc); 1979 skb->sk = sk; 1980 skb->destructor = sock_ofree; 1981 return skb; 1982 } 1983 1984 /* 1985 * Allocate a memory block from the socket's option memory buffer. 1986 */ 1987 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 1988 { 1989 if ((unsigned int)size <= sysctl_optmem_max && 1990 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 1991 void *mem; 1992 /* First do the add, to avoid the race if kmalloc 1993 * might sleep. 1994 */ 1995 atomic_add(size, &sk->sk_omem_alloc); 1996 mem = kmalloc(size, priority); 1997 if (mem) 1998 return mem; 1999 atomic_sub(size, &sk->sk_omem_alloc); 2000 } 2001 return NULL; 2002 } 2003 EXPORT_SYMBOL(sock_kmalloc); 2004 2005 /* Free an option memory block. Note, we actually want the inline 2006 * here as this allows gcc to detect the nullify and fold away the 2007 * condition entirely. 2008 */ 2009 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2010 const bool nullify) 2011 { 2012 if (WARN_ON_ONCE(!mem)) 2013 return; 2014 if (nullify) 2015 kzfree(mem); 2016 else 2017 kfree(mem); 2018 atomic_sub(size, &sk->sk_omem_alloc); 2019 } 2020 2021 void sock_kfree_s(struct sock *sk, void *mem, int size) 2022 { 2023 __sock_kfree_s(sk, mem, size, false); 2024 } 2025 EXPORT_SYMBOL(sock_kfree_s); 2026 2027 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2028 { 2029 __sock_kfree_s(sk, mem, size, true); 2030 } 2031 EXPORT_SYMBOL(sock_kzfree_s); 2032 2033 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2034 I think, these locks should be removed for datagram sockets. 2035 */ 2036 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2037 { 2038 DEFINE_WAIT(wait); 2039 2040 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2041 for (;;) { 2042 if (!timeo) 2043 break; 2044 if (signal_pending(current)) 2045 break; 2046 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2047 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2048 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 2049 break; 2050 if (sk->sk_shutdown & SEND_SHUTDOWN) 2051 break; 2052 if (sk->sk_err) 2053 break; 2054 timeo = schedule_timeout(timeo); 2055 } 2056 finish_wait(sk_sleep(sk), &wait); 2057 return timeo; 2058 } 2059 2060 2061 /* 2062 * Generic send/receive buffer handlers 2063 */ 2064 2065 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2066 unsigned long data_len, int noblock, 2067 int *errcode, int max_page_order) 2068 { 2069 struct sk_buff *skb; 2070 long timeo; 2071 int err; 2072 2073 timeo = sock_sndtimeo(sk, noblock); 2074 for (;;) { 2075 err = sock_error(sk); 2076 if (err != 0) 2077 goto failure; 2078 2079 err = -EPIPE; 2080 if (sk->sk_shutdown & SEND_SHUTDOWN) 2081 goto failure; 2082 2083 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) 2084 break; 2085 2086 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2087 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2088 err = -EAGAIN; 2089 if (!timeo) 2090 goto failure; 2091 if (signal_pending(current)) 2092 goto interrupted; 2093 timeo = sock_wait_for_wmem(sk, timeo); 2094 } 2095 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2096 errcode, sk->sk_allocation); 2097 if (skb) 2098 skb_set_owner_w(skb, sk); 2099 return skb; 2100 2101 interrupted: 2102 err = sock_intr_errno(timeo); 2103 failure: 2104 *errcode = err; 2105 return NULL; 2106 } 2107 EXPORT_SYMBOL(sock_alloc_send_pskb); 2108 2109 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2110 int noblock, int *errcode) 2111 { 2112 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2113 } 2114 EXPORT_SYMBOL(sock_alloc_send_skb); 2115 2116 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2117 struct sockcm_cookie *sockc) 2118 { 2119 u32 tsflags; 2120 2121 switch (cmsg->cmsg_type) { 2122 case SO_MARK: 2123 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2124 return -EPERM; 2125 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2126 return -EINVAL; 2127 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2128 break; 2129 case SO_TIMESTAMPING: 2130 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2131 return -EINVAL; 2132 2133 tsflags = *(u32 *)CMSG_DATA(cmsg); 2134 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2135 return -EINVAL; 2136 2137 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2138 sockc->tsflags |= tsflags; 2139 break; 2140 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2141 case SCM_RIGHTS: 2142 case SCM_CREDENTIALS: 2143 break; 2144 default: 2145 return -EINVAL; 2146 } 2147 return 0; 2148 } 2149 EXPORT_SYMBOL(__sock_cmsg_send); 2150 2151 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2152 struct sockcm_cookie *sockc) 2153 { 2154 struct cmsghdr *cmsg; 2155 int ret; 2156 2157 for_each_cmsghdr(cmsg, msg) { 2158 if (!CMSG_OK(msg, cmsg)) 2159 return -EINVAL; 2160 if (cmsg->cmsg_level != SOL_SOCKET) 2161 continue; 2162 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2163 if (ret) 2164 return ret; 2165 } 2166 return 0; 2167 } 2168 EXPORT_SYMBOL(sock_cmsg_send); 2169 2170 static void sk_enter_memory_pressure(struct sock *sk) 2171 { 2172 if (!sk->sk_prot->enter_memory_pressure) 2173 return; 2174 2175 sk->sk_prot->enter_memory_pressure(sk); 2176 } 2177 2178 static void sk_leave_memory_pressure(struct sock *sk) 2179 { 2180 if (sk->sk_prot->leave_memory_pressure) { 2181 sk->sk_prot->leave_memory_pressure(sk); 2182 } else { 2183 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2184 2185 if (memory_pressure && *memory_pressure) 2186 *memory_pressure = 0; 2187 } 2188 } 2189 2190 /* On 32bit arches, an skb frag is limited to 2^15 */ 2191 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2192 2193 /** 2194 * skb_page_frag_refill - check that a page_frag contains enough room 2195 * @sz: minimum size of the fragment we want to get 2196 * @pfrag: pointer to page_frag 2197 * @gfp: priority for memory allocation 2198 * 2199 * Note: While this allocator tries to use high order pages, there is 2200 * no guarantee that allocations succeed. Therefore, @sz MUST be 2201 * less or equal than PAGE_SIZE. 2202 */ 2203 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2204 { 2205 if (pfrag->page) { 2206 if (page_ref_count(pfrag->page) == 1) { 2207 pfrag->offset = 0; 2208 return true; 2209 } 2210 if (pfrag->offset + sz <= pfrag->size) 2211 return true; 2212 put_page(pfrag->page); 2213 } 2214 2215 pfrag->offset = 0; 2216 if (SKB_FRAG_PAGE_ORDER) { 2217 /* Avoid direct reclaim but allow kswapd to wake */ 2218 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2219 __GFP_COMP | __GFP_NOWARN | 2220 __GFP_NORETRY, 2221 SKB_FRAG_PAGE_ORDER); 2222 if (likely(pfrag->page)) { 2223 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2224 return true; 2225 } 2226 } 2227 pfrag->page = alloc_page(gfp); 2228 if (likely(pfrag->page)) { 2229 pfrag->size = PAGE_SIZE; 2230 return true; 2231 } 2232 return false; 2233 } 2234 EXPORT_SYMBOL(skb_page_frag_refill); 2235 2236 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2237 { 2238 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2239 return true; 2240 2241 sk_enter_memory_pressure(sk); 2242 sk_stream_moderate_sndbuf(sk); 2243 return false; 2244 } 2245 EXPORT_SYMBOL(sk_page_frag_refill); 2246 2247 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, 2248 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, 2249 int first_coalesce) 2250 { 2251 int sg_curr = *sg_curr_index, use = 0, rc = 0; 2252 unsigned int size = *sg_curr_size; 2253 struct page_frag *pfrag; 2254 struct scatterlist *sge; 2255 2256 len -= size; 2257 pfrag = sk_page_frag(sk); 2258 2259 while (len > 0) { 2260 unsigned int orig_offset; 2261 2262 if (!sk_page_frag_refill(sk, pfrag)) { 2263 rc = -ENOMEM; 2264 goto out; 2265 } 2266 2267 use = min_t(int, len, pfrag->size - pfrag->offset); 2268 2269 if (!sk_wmem_schedule(sk, use)) { 2270 rc = -ENOMEM; 2271 goto out; 2272 } 2273 2274 sk_mem_charge(sk, use); 2275 size += use; 2276 orig_offset = pfrag->offset; 2277 pfrag->offset += use; 2278 2279 sge = sg + sg_curr - 1; 2280 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && 2281 sg->offset + sg->length == orig_offset) { 2282 sg->length += use; 2283 } else { 2284 sge = sg + sg_curr; 2285 sg_unmark_end(sge); 2286 sg_set_page(sge, pfrag->page, use, orig_offset); 2287 get_page(pfrag->page); 2288 sg_curr++; 2289 2290 if (sg_curr == MAX_SKB_FRAGS) 2291 sg_curr = 0; 2292 2293 if (sg_curr == sg_start) { 2294 rc = -ENOSPC; 2295 break; 2296 } 2297 } 2298 2299 len -= use; 2300 } 2301 out: 2302 *sg_curr_size = size; 2303 *sg_curr_index = sg_curr; 2304 return rc; 2305 } 2306 EXPORT_SYMBOL(sk_alloc_sg); 2307 2308 static void __lock_sock(struct sock *sk) 2309 __releases(&sk->sk_lock.slock) 2310 __acquires(&sk->sk_lock.slock) 2311 { 2312 DEFINE_WAIT(wait); 2313 2314 for (;;) { 2315 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2316 TASK_UNINTERRUPTIBLE); 2317 spin_unlock_bh(&sk->sk_lock.slock); 2318 schedule(); 2319 spin_lock_bh(&sk->sk_lock.slock); 2320 if (!sock_owned_by_user(sk)) 2321 break; 2322 } 2323 finish_wait(&sk->sk_lock.wq, &wait); 2324 } 2325 2326 static void __release_sock(struct sock *sk) 2327 __releases(&sk->sk_lock.slock) 2328 __acquires(&sk->sk_lock.slock) 2329 { 2330 struct sk_buff *skb, *next; 2331 2332 while ((skb = sk->sk_backlog.head) != NULL) { 2333 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2334 2335 spin_unlock_bh(&sk->sk_lock.slock); 2336 2337 do { 2338 next = skb->next; 2339 prefetch(next); 2340 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2341 skb->next = NULL; 2342 sk_backlog_rcv(sk, skb); 2343 2344 cond_resched(); 2345 2346 skb = next; 2347 } while (skb != NULL); 2348 2349 spin_lock_bh(&sk->sk_lock.slock); 2350 } 2351 2352 /* 2353 * Doing the zeroing here guarantee we can not loop forever 2354 * while a wild producer attempts to flood us. 2355 */ 2356 sk->sk_backlog.len = 0; 2357 } 2358 2359 void __sk_flush_backlog(struct sock *sk) 2360 { 2361 spin_lock_bh(&sk->sk_lock.slock); 2362 __release_sock(sk); 2363 spin_unlock_bh(&sk->sk_lock.slock); 2364 } 2365 2366 /** 2367 * sk_wait_data - wait for data to arrive at sk_receive_queue 2368 * @sk: sock to wait on 2369 * @timeo: for how long 2370 * @skb: last skb seen on sk_receive_queue 2371 * 2372 * Now socket state including sk->sk_err is changed only under lock, 2373 * hence we may omit checks after joining wait queue. 2374 * We check receive queue before schedule() only as optimization; 2375 * it is very likely that release_sock() added new data. 2376 */ 2377 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2378 { 2379 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2380 int rc; 2381 2382 add_wait_queue(sk_sleep(sk), &wait); 2383 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2384 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2385 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2386 remove_wait_queue(sk_sleep(sk), &wait); 2387 return rc; 2388 } 2389 EXPORT_SYMBOL(sk_wait_data); 2390 2391 /** 2392 * __sk_mem_raise_allocated - increase memory_allocated 2393 * @sk: socket 2394 * @size: memory size to allocate 2395 * @amt: pages to allocate 2396 * @kind: allocation type 2397 * 2398 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2399 */ 2400 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2401 { 2402 struct proto *prot = sk->sk_prot; 2403 long allocated = sk_memory_allocated_add(sk, amt); 2404 2405 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2406 !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) 2407 goto suppress_allocation; 2408 2409 /* Under limit. */ 2410 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2411 sk_leave_memory_pressure(sk); 2412 return 1; 2413 } 2414 2415 /* Under pressure. */ 2416 if (allocated > sk_prot_mem_limits(sk, 1)) 2417 sk_enter_memory_pressure(sk); 2418 2419 /* Over hard limit. */ 2420 if (allocated > sk_prot_mem_limits(sk, 2)) 2421 goto suppress_allocation; 2422 2423 /* guarantee minimum buffer size under pressure */ 2424 if (kind == SK_MEM_RECV) { 2425 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2426 return 1; 2427 2428 } else { /* SK_MEM_SEND */ 2429 int wmem0 = sk_get_wmem0(sk, prot); 2430 2431 if (sk->sk_type == SOCK_STREAM) { 2432 if (sk->sk_wmem_queued < wmem0) 2433 return 1; 2434 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2435 return 1; 2436 } 2437 } 2438 2439 if (sk_has_memory_pressure(sk)) { 2440 int alloc; 2441 2442 if (!sk_under_memory_pressure(sk)) 2443 return 1; 2444 alloc = sk_sockets_allocated_read_positive(sk); 2445 if (sk_prot_mem_limits(sk, 2) > alloc * 2446 sk_mem_pages(sk->sk_wmem_queued + 2447 atomic_read(&sk->sk_rmem_alloc) + 2448 sk->sk_forward_alloc)) 2449 return 1; 2450 } 2451 2452 suppress_allocation: 2453 2454 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2455 sk_stream_moderate_sndbuf(sk); 2456 2457 /* Fail only if socket is _under_ its sndbuf. 2458 * In this case we cannot block, so that we have to fail. 2459 */ 2460 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2461 return 1; 2462 } 2463 2464 trace_sock_exceed_buf_limit(sk, prot, allocated); 2465 2466 sk_memory_allocated_sub(sk, amt); 2467 2468 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2469 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2470 2471 return 0; 2472 } 2473 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2474 2475 /** 2476 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2477 * @sk: socket 2478 * @size: memory size to allocate 2479 * @kind: allocation type 2480 * 2481 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2482 * rmem allocation. This function assumes that protocols which have 2483 * memory_pressure use sk_wmem_queued as write buffer accounting. 2484 */ 2485 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2486 { 2487 int ret, amt = sk_mem_pages(size); 2488 2489 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2490 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2491 if (!ret) 2492 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2493 return ret; 2494 } 2495 EXPORT_SYMBOL(__sk_mem_schedule); 2496 2497 /** 2498 * __sk_mem_reduce_allocated - reclaim memory_allocated 2499 * @sk: socket 2500 * @amount: number of quanta 2501 * 2502 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2503 */ 2504 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2505 { 2506 sk_memory_allocated_sub(sk, amount); 2507 2508 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2509 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2510 2511 if (sk_under_memory_pressure(sk) && 2512 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2513 sk_leave_memory_pressure(sk); 2514 } 2515 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2516 2517 /** 2518 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2519 * @sk: socket 2520 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2521 */ 2522 void __sk_mem_reclaim(struct sock *sk, int amount) 2523 { 2524 amount >>= SK_MEM_QUANTUM_SHIFT; 2525 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2526 __sk_mem_reduce_allocated(sk, amount); 2527 } 2528 EXPORT_SYMBOL(__sk_mem_reclaim); 2529 2530 int sk_set_peek_off(struct sock *sk, int val) 2531 { 2532 sk->sk_peek_off = val; 2533 return 0; 2534 } 2535 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2536 2537 /* 2538 * Set of default routines for initialising struct proto_ops when 2539 * the protocol does not support a particular function. In certain 2540 * cases where it makes no sense for a protocol to have a "do nothing" 2541 * function, some default processing is provided. 2542 */ 2543 2544 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2545 { 2546 return -EOPNOTSUPP; 2547 } 2548 EXPORT_SYMBOL(sock_no_bind); 2549 2550 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2551 int len, int flags) 2552 { 2553 return -EOPNOTSUPP; 2554 } 2555 EXPORT_SYMBOL(sock_no_connect); 2556 2557 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2558 { 2559 return -EOPNOTSUPP; 2560 } 2561 EXPORT_SYMBOL(sock_no_socketpair); 2562 2563 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2564 bool kern) 2565 { 2566 return -EOPNOTSUPP; 2567 } 2568 EXPORT_SYMBOL(sock_no_accept); 2569 2570 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2571 int peer) 2572 { 2573 return -EOPNOTSUPP; 2574 } 2575 EXPORT_SYMBOL(sock_no_getname); 2576 2577 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2578 { 2579 return -EOPNOTSUPP; 2580 } 2581 EXPORT_SYMBOL(sock_no_ioctl); 2582 2583 int sock_no_listen(struct socket *sock, int backlog) 2584 { 2585 return -EOPNOTSUPP; 2586 } 2587 EXPORT_SYMBOL(sock_no_listen); 2588 2589 int sock_no_shutdown(struct socket *sock, int how) 2590 { 2591 return -EOPNOTSUPP; 2592 } 2593 EXPORT_SYMBOL(sock_no_shutdown); 2594 2595 int sock_no_setsockopt(struct socket *sock, int level, int optname, 2596 char __user *optval, unsigned int optlen) 2597 { 2598 return -EOPNOTSUPP; 2599 } 2600 EXPORT_SYMBOL(sock_no_setsockopt); 2601 2602 int sock_no_getsockopt(struct socket *sock, int level, int optname, 2603 char __user *optval, int __user *optlen) 2604 { 2605 return -EOPNOTSUPP; 2606 } 2607 EXPORT_SYMBOL(sock_no_getsockopt); 2608 2609 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2610 { 2611 return -EOPNOTSUPP; 2612 } 2613 EXPORT_SYMBOL(sock_no_sendmsg); 2614 2615 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2616 { 2617 return -EOPNOTSUPP; 2618 } 2619 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2620 2621 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2622 int flags) 2623 { 2624 return -EOPNOTSUPP; 2625 } 2626 EXPORT_SYMBOL(sock_no_recvmsg); 2627 2628 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2629 { 2630 /* Mirror missing mmap method error code */ 2631 return -ENODEV; 2632 } 2633 EXPORT_SYMBOL(sock_no_mmap); 2634 2635 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2636 { 2637 ssize_t res; 2638 struct msghdr msg = {.msg_flags = flags}; 2639 struct kvec iov; 2640 char *kaddr = kmap(page); 2641 iov.iov_base = kaddr + offset; 2642 iov.iov_len = size; 2643 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2644 kunmap(page); 2645 return res; 2646 } 2647 EXPORT_SYMBOL(sock_no_sendpage); 2648 2649 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2650 int offset, size_t size, int flags) 2651 { 2652 ssize_t res; 2653 struct msghdr msg = {.msg_flags = flags}; 2654 struct kvec iov; 2655 char *kaddr = kmap(page); 2656 2657 iov.iov_base = kaddr + offset; 2658 iov.iov_len = size; 2659 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2660 kunmap(page); 2661 return res; 2662 } 2663 EXPORT_SYMBOL(sock_no_sendpage_locked); 2664 2665 /* 2666 * Default Socket Callbacks 2667 */ 2668 2669 static void sock_def_wakeup(struct sock *sk) 2670 { 2671 struct socket_wq *wq; 2672 2673 rcu_read_lock(); 2674 wq = rcu_dereference(sk->sk_wq); 2675 if (skwq_has_sleeper(wq)) 2676 wake_up_interruptible_all(&wq->wait); 2677 rcu_read_unlock(); 2678 } 2679 2680 static void sock_def_error_report(struct sock *sk) 2681 { 2682 struct socket_wq *wq; 2683 2684 rcu_read_lock(); 2685 wq = rcu_dereference(sk->sk_wq); 2686 if (skwq_has_sleeper(wq)) 2687 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2688 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2689 rcu_read_unlock(); 2690 } 2691 2692 static void sock_def_readable(struct sock *sk) 2693 { 2694 struct socket_wq *wq; 2695 2696 rcu_read_lock(); 2697 wq = rcu_dereference(sk->sk_wq); 2698 if (skwq_has_sleeper(wq)) 2699 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2700 EPOLLRDNORM | EPOLLRDBAND); 2701 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2702 rcu_read_unlock(); 2703 } 2704 2705 static void sock_def_write_space(struct sock *sk) 2706 { 2707 struct socket_wq *wq; 2708 2709 rcu_read_lock(); 2710 2711 /* Do not wake up a writer until he can make "significant" 2712 * progress. --DaveM 2713 */ 2714 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 2715 wq = rcu_dereference(sk->sk_wq); 2716 if (skwq_has_sleeper(wq)) 2717 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2718 EPOLLWRNORM | EPOLLWRBAND); 2719 2720 /* Should agree with poll, otherwise some programs break */ 2721 if (sock_writeable(sk)) 2722 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2723 } 2724 2725 rcu_read_unlock(); 2726 } 2727 2728 static void sock_def_destruct(struct sock *sk) 2729 { 2730 } 2731 2732 void sk_send_sigurg(struct sock *sk) 2733 { 2734 if (sk->sk_socket && sk->sk_socket->file) 2735 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2736 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2737 } 2738 EXPORT_SYMBOL(sk_send_sigurg); 2739 2740 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2741 unsigned long expires) 2742 { 2743 if (!mod_timer(timer, expires)) 2744 sock_hold(sk); 2745 } 2746 EXPORT_SYMBOL(sk_reset_timer); 2747 2748 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2749 { 2750 if (del_timer(timer)) 2751 __sock_put(sk); 2752 } 2753 EXPORT_SYMBOL(sk_stop_timer); 2754 2755 void sock_init_data(struct socket *sock, struct sock *sk) 2756 { 2757 sk_init_common(sk); 2758 sk->sk_send_head = NULL; 2759 2760 timer_setup(&sk->sk_timer, NULL, 0); 2761 2762 sk->sk_allocation = GFP_KERNEL; 2763 sk->sk_rcvbuf = sysctl_rmem_default; 2764 sk->sk_sndbuf = sysctl_wmem_default; 2765 sk->sk_state = TCP_CLOSE; 2766 sk_set_socket(sk, sock); 2767 2768 sock_set_flag(sk, SOCK_ZAPPED); 2769 2770 if (sock) { 2771 sk->sk_type = sock->type; 2772 sk->sk_wq = sock->wq; 2773 sock->sk = sk; 2774 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2775 } else { 2776 sk->sk_wq = NULL; 2777 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2778 } 2779 2780 rwlock_init(&sk->sk_callback_lock); 2781 if (sk->sk_kern_sock) 2782 lockdep_set_class_and_name( 2783 &sk->sk_callback_lock, 2784 af_kern_callback_keys + sk->sk_family, 2785 af_family_kern_clock_key_strings[sk->sk_family]); 2786 else 2787 lockdep_set_class_and_name( 2788 &sk->sk_callback_lock, 2789 af_callback_keys + sk->sk_family, 2790 af_family_clock_key_strings[sk->sk_family]); 2791 2792 sk->sk_state_change = sock_def_wakeup; 2793 sk->sk_data_ready = sock_def_readable; 2794 sk->sk_write_space = sock_def_write_space; 2795 sk->sk_error_report = sock_def_error_report; 2796 sk->sk_destruct = sock_def_destruct; 2797 2798 sk->sk_frag.page = NULL; 2799 sk->sk_frag.offset = 0; 2800 sk->sk_peek_off = -1; 2801 2802 sk->sk_peer_pid = NULL; 2803 sk->sk_peer_cred = NULL; 2804 sk->sk_write_pending = 0; 2805 sk->sk_rcvlowat = 1; 2806 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 2807 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 2808 2809 sk->sk_stamp = SK_DEFAULT_STAMP; 2810 atomic_set(&sk->sk_zckey, 0); 2811 2812 #ifdef CONFIG_NET_RX_BUSY_POLL 2813 sk->sk_napi_id = 0; 2814 sk->sk_ll_usec = sysctl_net_busy_read; 2815 #endif 2816 2817 sk->sk_max_pacing_rate = ~0U; 2818 sk->sk_pacing_rate = ~0U; 2819 sk->sk_pacing_shift = 10; 2820 sk->sk_incoming_cpu = -1; 2821 /* 2822 * Before updating sk_refcnt, we must commit prior changes to memory 2823 * (Documentation/RCU/rculist_nulls.txt for details) 2824 */ 2825 smp_wmb(); 2826 refcount_set(&sk->sk_refcnt, 1); 2827 atomic_set(&sk->sk_drops, 0); 2828 } 2829 EXPORT_SYMBOL(sock_init_data); 2830 2831 void lock_sock_nested(struct sock *sk, int subclass) 2832 { 2833 might_sleep(); 2834 spin_lock_bh(&sk->sk_lock.slock); 2835 if (sk->sk_lock.owned) 2836 __lock_sock(sk); 2837 sk->sk_lock.owned = 1; 2838 spin_unlock(&sk->sk_lock.slock); 2839 /* 2840 * The sk_lock has mutex_lock() semantics here: 2841 */ 2842 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 2843 local_bh_enable(); 2844 } 2845 EXPORT_SYMBOL(lock_sock_nested); 2846 2847 void release_sock(struct sock *sk) 2848 { 2849 spin_lock_bh(&sk->sk_lock.slock); 2850 if (sk->sk_backlog.tail) 2851 __release_sock(sk); 2852 2853 /* Warning : release_cb() might need to release sk ownership, 2854 * ie call sock_release_ownership(sk) before us. 2855 */ 2856 if (sk->sk_prot->release_cb) 2857 sk->sk_prot->release_cb(sk); 2858 2859 sock_release_ownership(sk); 2860 if (waitqueue_active(&sk->sk_lock.wq)) 2861 wake_up(&sk->sk_lock.wq); 2862 spin_unlock_bh(&sk->sk_lock.slock); 2863 } 2864 EXPORT_SYMBOL(release_sock); 2865 2866 /** 2867 * lock_sock_fast - fast version of lock_sock 2868 * @sk: socket 2869 * 2870 * This version should be used for very small section, where process wont block 2871 * return false if fast path is taken: 2872 * 2873 * sk_lock.slock locked, owned = 0, BH disabled 2874 * 2875 * return true if slow path is taken: 2876 * 2877 * sk_lock.slock unlocked, owned = 1, BH enabled 2878 */ 2879 bool lock_sock_fast(struct sock *sk) 2880 { 2881 might_sleep(); 2882 spin_lock_bh(&sk->sk_lock.slock); 2883 2884 if (!sk->sk_lock.owned) 2885 /* 2886 * Note : We must disable BH 2887 */ 2888 return false; 2889 2890 __lock_sock(sk); 2891 sk->sk_lock.owned = 1; 2892 spin_unlock(&sk->sk_lock.slock); 2893 /* 2894 * The sk_lock has mutex_lock() semantics here: 2895 */ 2896 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 2897 local_bh_enable(); 2898 return true; 2899 } 2900 EXPORT_SYMBOL(lock_sock_fast); 2901 2902 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) 2903 { 2904 struct timeval tv; 2905 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2906 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2907 tv = ktime_to_timeval(sk->sk_stamp); 2908 if (tv.tv_sec == -1) 2909 return -ENOENT; 2910 if (tv.tv_sec == 0) { 2911 sk->sk_stamp = ktime_get_real(); 2912 tv = ktime_to_timeval(sk->sk_stamp); 2913 } 2914 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; 2915 } 2916 EXPORT_SYMBOL(sock_get_timestamp); 2917 2918 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) 2919 { 2920 struct timespec ts; 2921 if (!sock_flag(sk, SOCK_TIMESTAMP)) 2922 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 2923 ts = ktime_to_timespec(sk->sk_stamp); 2924 if (ts.tv_sec == -1) 2925 return -ENOENT; 2926 if (ts.tv_sec == 0) { 2927 sk->sk_stamp = ktime_get_real(); 2928 ts = ktime_to_timespec(sk->sk_stamp); 2929 } 2930 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; 2931 } 2932 EXPORT_SYMBOL(sock_get_timestampns); 2933 2934 void sock_enable_timestamp(struct sock *sk, int flag) 2935 { 2936 if (!sock_flag(sk, flag)) { 2937 unsigned long previous_flags = sk->sk_flags; 2938 2939 sock_set_flag(sk, flag); 2940 /* 2941 * we just set one of the two flags which require net 2942 * time stamping, but time stamping might have been on 2943 * already because of the other one 2944 */ 2945 if (sock_needs_netstamp(sk) && 2946 !(previous_flags & SK_FLAGS_TIMESTAMP)) 2947 net_enable_timestamp(); 2948 } 2949 } 2950 2951 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 2952 int level, int type) 2953 { 2954 struct sock_exterr_skb *serr; 2955 struct sk_buff *skb; 2956 int copied, err; 2957 2958 err = -EAGAIN; 2959 skb = sock_dequeue_err_skb(sk); 2960 if (skb == NULL) 2961 goto out; 2962 2963 copied = skb->len; 2964 if (copied > len) { 2965 msg->msg_flags |= MSG_TRUNC; 2966 copied = len; 2967 } 2968 err = skb_copy_datagram_msg(skb, 0, msg, copied); 2969 if (err) 2970 goto out_free_skb; 2971 2972 sock_recv_timestamp(msg, sk, skb); 2973 2974 serr = SKB_EXT_ERR(skb); 2975 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 2976 2977 msg->msg_flags |= MSG_ERRQUEUE; 2978 err = copied; 2979 2980 out_free_skb: 2981 kfree_skb(skb); 2982 out: 2983 return err; 2984 } 2985 EXPORT_SYMBOL(sock_recv_errqueue); 2986 2987 /* 2988 * Get a socket option on an socket. 2989 * 2990 * FIX: POSIX 1003.1g is very ambiguous here. It states that 2991 * asynchronous errors should be reported by getsockopt. We assume 2992 * this means if you specify SO_ERROR (otherwise whats the point of it). 2993 */ 2994 int sock_common_getsockopt(struct socket *sock, int level, int optname, 2995 char __user *optval, int __user *optlen) 2996 { 2997 struct sock *sk = sock->sk; 2998 2999 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3000 } 3001 EXPORT_SYMBOL(sock_common_getsockopt); 3002 3003 #ifdef CONFIG_COMPAT 3004 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, 3005 char __user *optval, int __user *optlen) 3006 { 3007 struct sock *sk = sock->sk; 3008 3009 if (sk->sk_prot->compat_getsockopt != NULL) 3010 return sk->sk_prot->compat_getsockopt(sk, level, optname, 3011 optval, optlen); 3012 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3013 } 3014 EXPORT_SYMBOL(compat_sock_common_getsockopt); 3015 #endif 3016 3017 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3018 int flags) 3019 { 3020 struct sock *sk = sock->sk; 3021 int addr_len = 0; 3022 int err; 3023 3024 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 3025 flags & ~MSG_DONTWAIT, &addr_len); 3026 if (err >= 0) 3027 msg->msg_namelen = addr_len; 3028 return err; 3029 } 3030 EXPORT_SYMBOL(sock_common_recvmsg); 3031 3032 /* 3033 * Set socket options on an inet socket. 3034 */ 3035 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3036 char __user *optval, unsigned int optlen) 3037 { 3038 struct sock *sk = sock->sk; 3039 3040 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3041 } 3042 EXPORT_SYMBOL(sock_common_setsockopt); 3043 3044 #ifdef CONFIG_COMPAT 3045 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, 3046 char __user *optval, unsigned int optlen) 3047 { 3048 struct sock *sk = sock->sk; 3049 3050 if (sk->sk_prot->compat_setsockopt != NULL) 3051 return sk->sk_prot->compat_setsockopt(sk, level, optname, 3052 optval, optlen); 3053 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3054 } 3055 EXPORT_SYMBOL(compat_sock_common_setsockopt); 3056 #endif 3057 3058 void sk_common_release(struct sock *sk) 3059 { 3060 if (sk->sk_prot->destroy) 3061 sk->sk_prot->destroy(sk); 3062 3063 /* 3064 * Observation: when sock_common_release is called, processes have 3065 * no access to socket. But net still has. 3066 * Step one, detach it from networking: 3067 * 3068 * A. Remove from hash tables. 3069 */ 3070 3071 sk->sk_prot->unhash(sk); 3072 3073 /* 3074 * In this point socket cannot receive new packets, but it is possible 3075 * that some packets are in flight because some CPU runs receiver and 3076 * did hash table lookup before we unhashed socket. They will achieve 3077 * receive queue and will be purged by socket destructor. 3078 * 3079 * Also we still have packets pending on receive queue and probably, 3080 * our own packets waiting in device queues. sock_destroy will drain 3081 * receive queue, but transmitted packets will delay socket destruction 3082 * until the last reference will be released. 3083 */ 3084 3085 sock_orphan(sk); 3086 3087 xfrm_sk_free_policy(sk); 3088 3089 sk_refcnt_debug_release(sk); 3090 3091 sock_put(sk); 3092 } 3093 EXPORT_SYMBOL(sk_common_release); 3094 3095 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3096 { 3097 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3098 3099 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3100 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; 3101 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3102 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; 3103 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3104 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; 3105 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3106 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; 3107 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3108 } 3109 3110 #ifdef CONFIG_PROC_FS 3111 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3112 struct prot_inuse { 3113 int val[PROTO_INUSE_NR]; 3114 }; 3115 3116 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3117 3118 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3119 { 3120 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3121 } 3122 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3123 3124 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3125 { 3126 int cpu, idx = prot->inuse_idx; 3127 int res = 0; 3128 3129 for_each_possible_cpu(cpu) 3130 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3131 3132 return res >= 0 ? res : 0; 3133 } 3134 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3135 3136 static void sock_inuse_add(struct net *net, int val) 3137 { 3138 this_cpu_add(*net->core.sock_inuse, val); 3139 } 3140 3141 int sock_inuse_get(struct net *net) 3142 { 3143 int cpu, res = 0; 3144 3145 for_each_possible_cpu(cpu) 3146 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3147 3148 return res; 3149 } 3150 3151 EXPORT_SYMBOL_GPL(sock_inuse_get); 3152 3153 static int __net_init sock_inuse_init_net(struct net *net) 3154 { 3155 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3156 if (net->core.prot_inuse == NULL) 3157 return -ENOMEM; 3158 3159 net->core.sock_inuse = alloc_percpu(int); 3160 if (net->core.sock_inuse == NULL) 3161 goto out; 3162 3163 return 0; 3164 3165 out: 3166 free_percpu(net->core.prot_inuse); 3167 return -ENOMEM; 3168 } 3169 3170 static void __net_exit sock_inuse_exit_net(struct net *net) 3171 { 3172 free_percpu(net->core.prot_inuse); 3173 free_percpu(net->core.sock_inuse); 3174 } 3175 3176 static struct pernet_operations net_inuse_ops = { 3177 .init = sock_inuse_init_net, 3178 .exit = sock_inuse_exit_net, 3179 }; 3180 3181 static __init int net_inuse_init(void) 3182 { 3183 if (register_pernet_subsys(&net_inuse_ops)) 3184 panic("Cannot initialize net inuse counters"); 3185 3186 return 0; 3187 } 3188 3189 core_initcall(net_inuse_init); 3190 3191 static void assign_proto_idx(struct proto *prot) 3192 { 3193 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3194 3195 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3196 pr_err("PROTO_INUSE_NR exhausted\n"); 3197 return; 3198 } 3199 3200 set_bit(prot->inuse_idx, proto_inuse_idx); 3201 } 3202 3203 static void release_proto_idx(struct proto *prot) 3204 { 3205 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3206 clear_bit(prot->inuse_idx, proto_inuse_idx); 3207 } 3208 #else 3209 static inline void assign_proto_idx(struct proto *prot) 3210 { 3211 } 3212 3213 static inline void release_proto_idx(struct proto *prot) 3214 { 3215 } 3216 3217 static void sock_inuse_add(struct net *net, int val) 3218 { 3219 } 3220 #endif 3221 3222 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3223 { 3224 if (!rsk_prot) 3225 return; 3226 kfree(rsk_prot->slab_name); 3227 rsk_prot->slab_name = NULL; 3228 kmem_cache_destroy(rsk_prot->slab); 3229 rsk_prot->slab = NULL; 3230 } 3231 3232 static int req_prot_init(const struct proto *prot) 3233 { 3234 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3235 3236 if (!rsk_prot) 3237 return 0; 3238 3239 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3240 prot->name); 3241 if (!rsk_prot->slab_name) 3242 return -ENOMEM; 3243 3244 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3245 rsk_prot->obj_size, 0, 3246 prot->slab_flags, NULL); 3247 3248 if (!rsk_prot->slab) { 3249 pr_crit("%s: Can't create request sock SLAB cache!\n", 3250 prot->name); 3251 return -ENOMEM; 3252 } 3253 return 0; 3254 } 3255 3256 int proto_register(struct proto *prot, int alloc_slab) 3257 { 3258 if (alloc_slab) { 3259 prot->slab = kmem_cache_create_usercopy(prot->name, 3260 prot->obj_size, 0, 3261 SLAB_HWCACHE_ALIGN | prot->slab_flags, 3262 prot->useroffset, prot->usersize, 3263 NULL); 3264 3265 if (prot->slab == NULL) { 3266 pr_crit("%s: Can't create sock SLAB cache!\n", 3267 prot->name); 3268 goto out; 3269 } 3270 3271 if (req_prot_init(prot)) 3272 goto out_free_request_sock_slab; 3273 3274 if (prot->twsk_prot != NULL) { 3275 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3276 3277 if (prot->twsk_prot->twsk_slab_name == NULL) 3278 goto out_free_request_sock_slab; 3279 3280 prot->twsk_prot->twsk_slab = 3281 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3282 prot->twsk_prot->twsk_obj_size, 3283 0, 3284 prot->slab_flags, 3285 NULL); 3286 if (prot->twsk_prot->twsk_slab == NULL) 3287 goto out_free_timewait_sock_slab_name; 3288 } 3289 } 3290 3291 mutex_lock(&proto_list_mutex); 3292 list_add(&prot->node, &proto_list); 3293 assign_proto_idx(prot); 3294 mutex_unlock(&proto_list_mutex); 3295 return 0; 3296 3297 out_free_timewait_sock_slab_name: 3298 kfree(prot->twsk_prot->twsk_slab_name); 3299 out_free_request_sock_slab: 3300 req_prot_cleanup(prot->rsk_prot); 3301 3302 kmem_cache_destroy(prot->slab); 3303 prot->slab = NULL; 3304 out: 3305 return -ENOBUFS; 3306 } 3307 EXPORT_SYMBOL(proto_register); 3308 3309 void proto_unregister(struct proto *prot) 3310 { 3311 mutex_lock(&proto_list_mutex); 3312 release_proto_idx(prot); 3313 list_del(&prot->node); 3314 mutex_unlock(&proto_list_mutex); 3315 3316 kmem_cache_destroy(prot->slab); 3317 prot->slab = NULL; 3318 3319 req_prot_cleanup(prot->rsk_prot); 3320 3321 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 3322 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 3323 kfree(prot->twsk_prot->twsk_slab_name); 3324 prot->twsk_prot->twsk_slab = NULL; 3325 } 3326 } 3327 EXPORT_SYMBOL(proto_unregister); 3328 3329 int sock_load_diag_module(int family, int protocol) 3330 { 3331 if (!protocol) { 3332 if (!sock_is_registered(family)) 3333 return -ENOENT; 3334 3335 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3336 NETLINK_SOCK_DIAG, family); 3337 } 3338 3339 #ifdef CONFIG_INET 3340 if (family == AF_INET && 3341 !rcu_access_pointer(inet_protos[protocol])) 3342 return -ENOENT; 3343 #endif 3344 3345 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3346 NETLINK_SOCK_DIAG, family, protocol); 3347 } 3348 EXPORT_SYMBOL(sock_load_diag_module); 3349 3350 #ifdef CONFIG_PROC_FS 3351 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3352 __acquires(proto_list_mutex) 3353 { 3354 mutex_lock(&proto_list_mutex); 3355 return seq_list_start_head(&proto_list, *pos); 3356 } 3357 3358 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3359 { 3360 return seq_list_next(v, &proto_list, pos); 3361 } 3362 3363 static void proto_seq_stop(struct seq_file *seq, void *v) 3364 __releases(proto_list_mutex) 3365 { 3366 mutex_unlock(&proto_list_mutex); 3367 } 3368 3369 static char proto_method_implemented(const void *method) 3370 { 3371 return method == NULL ? 'n' : 'y'; 3372 } 3373 static long sock_prot_memory_allocated(struct proto *proto) 3374 { 3375 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3376 } 3377 3378 static char *sock_prot_memory_pressure(struct proto *proto) 3379 { 3380 return proto->memory_pressure != NULL ? 3381 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3382 } 3383 3384 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3385 { 3386 3387 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3388 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3389 proto->name, 3390 proto->obj_size, 3391 sock_prot_inuse_get(seq_file_net(seq), proto), 3392 sock_prot_memory_allocated(proto), 3393 sock_prot_memory_pressure(proto), 3394 proto->max_header, 3395 proto->slab == NULL ? "no" : "yes", 3396 module_name(proto->owner), 3397 proto_method_implemented(proto->close), 3398 proto_method_implemented(proto->connect), 3399 proto_method_implemented(proto->disconnect), 3400 proto_method_implemented(proto->accept), 3401 proto_method_implemented(proto->ioctl), 3402 proto_method_implemented(proto->init), 3403 proto_method_implemented(proto->destroy), 3404 proto_method_implemented(proto->shutdown), 3405 proto_method_implemented(proto->setsockopt), 3406 proto_method_implemented(proto->getsockopt), 3407 proto_method_implemented(proto->sendmsg), 3408 proto_method_implemented(proto->recvmsg), 3409 proto_method_implemented(proto->sendpage), 3410 proto_method_implemented(proto->bind), 3411 proto_method_implemented(proto->backlog_rcv), 3412 proto_method_implemented(proto->hash), 3413 proto_method_implemented(proto->unhash), 3414 proto_method_implemented(proto->get_port), 3415 proto_method_implemented(proto->enter_memory_pressure)); 3416 } 3417 3418 static int proto_seq_show(struct seq_file *seq, void *v) 3419 { 3420 if (v == &proto_list) 3421 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3422 "protocol", 3423 "size", 3424 "sockets", 3425 "memory", 3426 "press", 3427 "maxhdr", 3428 "slab", 3429 "module", 3430 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3431 else 3432 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3433 return 0; 3434 } 3435 3436 static const struct seq_operations proto_seq_ops = { 3437 .start = proto_seq_start, 3438 .next = proto_seq_next, 3439 .stop = proto_seq_stop, 3440 .show = proto_seq_show, 3441 }; 3442 3443 static __net_init int proto_init_net(struct net *net) 3444 { 3445 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3446 sizeof(struct seq_net_private))) 3447 return -ENOMEM; 3448 3449 return 0; 3450 } 3451 3452 static __net_exit void proto_exit_net(struct net *net) 3453 { 3454 remove_proc_entry("protocols", net->proc_net); 3455 } 3456 3457 3458 static __net_initdata struct pernet_operations proto_net_ops = { 3459 .init = proto_init_net, 3460 .exit = proto_exit_net, 3461 }; 3462 3463 static int __init proto_init(void) 3464 { 3465 return register_pernet_subsys(&proto_net_ops); 3466 } 3467 3468 subsys_initcall(proto_init); 3469 3470 #endif /* PROC_FS */ 3471 3472 #ifdef CONFIG_NET_RX_BUSY_POLL 3473 bool sk_busy_loop_end(void *p, unsigned long start_time) 3474 { 3475 struct sock *sk = p; 3476 3477 return !skb_queue_empty(&sk->sk_receive_queue) || 3478 sk_busy_loop_timeout(sk, start_time); 3479 } 3480 EXPORT_SYMBOL(sk_busy_loop_end); 3481 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3482