/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>.
 *			(datagram_poll() from old udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/uio.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

static int receiver_wake_function(wait_queue_t *wait, unsigned int mode,
				  int sync, void *key)
{
	unsigned long bits = (unsigned long)key;

	/*
	 * Avoid a wakeup if event not interesting for us
	 */
	if (bits && !(bits & (POLLIN | POLLERR)))
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}

/*
 * Wait for the last received packet to be different from skb
 */
static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
				 const struct sk_buff *skb)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (sk->sk_receive_queue.prev != skb)
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk_sleep(sk), &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}
/**
 *	__skb_recv_datagram - Receive a datagram skbuff
 *	@sk: socket
 *	@flags: MSG_ flags
 *	@peeked: returns non-zero if this packet has been seen before
 *	@off: an offset in bytes to peek skb from. Returns an offset
 *	      within an skb where data actually starts
 *	@err: error code returned
 *
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as the IPX, AX.25 and AppleTalk layers. It also finally
 *	fixes the long standing peek and read race for datagram sockets.
 *	If you alter this routine remember it must be re-entrant.
 *
 *	This function does not take the socket lock; it relies only on the
 *	receive queue spinlock. The caller releases a returned skb with
 *	skb_free_datagram().
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	* 8) Great win.)
 *	*			--ANK (980729)
 *
 *	The order of the tests when we find no data waiting are specified
 *	quite explicitly by POSIX 1003.1g, don't change them without having
 *	the standard around please.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
				    int *peeked, int *off, int *err)
{
	struct sk_buff *skb, *last;
	long timeo;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		/* Again only user level code calls this function, so nothing
		 * interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		unsigned long cpu_flags;
		struct sk_buff_head *queue = &sk->sk_receive_queue;
		int _off = *off;

		last = (struct sk_buff *)queue;
		spin_lock_irqsave(&queue->lock, cpu_flags);
		skb_queue_walk(queue, skb) {
			last = skb;
			*peeked = skb->peeked;
			if (flags & MSG_PEEK) {
				if (_off >= skb->len && (skb->len || _off ||
							 skb->peeked)) {
					_off -= skb->len;
					continue;
				}
				skb->peeked = 1;
				atomic_inc(&skb->users);
			} else
				__skb_unlink(skb, queue);

			spin_unlock_irqrestore(&queue->lock, cpu_flags);
			*off = _off;
			return skb;
		}
		spin_unlock_irqrestore(&queue->lock, cpu_flags);

		if (sk_can_busy_loop(sk) &&
		    sk_busy_loop(sk, flags & MSG_DONTWAIT))
			continue;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (!wait_for_more_packets(sk, err, &timeo, last));

	return NULL;

no_packet:
	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);
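/*
 * Example (sketch, not part of this file): how a protocol's recvmsg()
 * typically drives __skb_recv_datagram(). UDP keeps a per-socket peek
 * offset so successive MSG_PEEK reads can walk through a datagram;
 * sk_peek_offset() reflects that usage, the surrounding code is
 * illustrative only.
 *
 *	int peeked, err;
 *	int off = sk_peek_offset(sk, flags);
 *	struct sk_buff *skb;
 *
 *	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 *				  &peeked, &off, &err);
 *	if (!skb)
 *		return err;
 *	(now "off" indexes the first unread byte inside "skb")
 */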
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
				  int noblock, int *err)
{
	int peeked, off = 0;

	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
				   &peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	consume_skb(skb);
	sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);

void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
{
	bool slow;

	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;

	slow = lock_sock_fast(sk);
	skb_orphan(skb);
	sk_mem_reclaim_partial(sk);
	unlock_sock_fast(sk, slow);

	/* skb is now orphaned, can be freed outside of locked section */
	__kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);

/**
 *	skb_kill_datagram - Free a datagram skbuff forcibly
 *	@sk: socket
 *	@skb: datagram skbuff
 *	@flags: MSG_ flags
 *
 *	This function frees a datagram skbuff that was received by
 *	skb_recv_datagram. The flags argument must match the one
 *	used for skb_recv_datagram.
 *
 *	If the MSG_PEEK flag is set, and the packet is still on the
 *	receive queue of the socket, it will be taken off the queue
 *	before it is freed.
 *
 *	This function currently only disables BH when acquiring the
 *	sk_receive_queue lock. Therefore it must not be used in a
 *	context where that lock is acquired in an IRQ context.
 *
 *	It returns 0 if the packet was removed by us.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	int err = 0;

	if (flags & MSG_PEEK) {
		err = -ENOENT;
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (skb == skb_peek(&sk->sk_receive_queue)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			atomic_dec(&skb->users);
			err = 0;
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	kfree_skb(skb);
	atomic_inc(&sk->sk_drops);
	sk_mem_reclaim_partial(sk);

	return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
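/*
 * Example (sketch): the receive/free/kill life cycle as used by UDP-like
 * protocols. On a copy or checksum failure the datagram is dropped with
 * skb_kill_datagram() so that a corrupt packet cannot wedge a MSG_PEEK
 * loop. "my_copy_to_user" is a hypothetical stand-in for whatever copy
 * routine the caller uses.
 *
 *	skb = skb_recv_datagram(sk, flags, noblock, &err);
 *	if (!skb)
 *		return err;
 *	err = my_copy_to_user(skb, msg);
 *	if (err) {
 *		skb_kill_datagram(sk, skb, flags);
 *		return err;
 *	}
 *	skb_free_datagram(sk, skb);
 */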
/**
 *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: iovec iterator to copy to
 *	@len: amount of data to copy from buffer to iovec
 */
int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
			   struct iov_iter *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	trace_skb_copy_datagram_iovec(skb, len);

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (copy_to_iter(skb->data + offset, copy, to) != copy)
			goto short_copy;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (copy_page_to_iter(skb_frag_page(frag),
					      frag->page_offset + offset -
					      start, copy, to) != copy)
				goto short_copy;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_iter(frag_iter, offset - start,
						   to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

	/* This is not really a user copy fault, but rather someone
	 * gave us a bogus length on the skb.  We should probably
	 * print a warning here as it may indicate a kernel bug.
	 */

fault:
	return -EFAULT;

short_copy:
	if (iov_iter_count(to))
		goto fault;

	return 0;
}
EXPORT_SYMBOL(skb_copy_datagram_iter);
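/*
 * Worked example (illustrative numbers only) for the start/end walk in
 * skb_copy_datagram_iter() above, which the sibling copy routines below
 * share: take an skb with a 100-byte linear head and two 500-byte page
 * frags, and a request for len=400 bytes at offset=300. The header pass
 * is skipped (copy = 100 - 300 < 0). For frag 0, start=100 and end=600,
 * so copy = 600 - 300 = 300 bytes are read at frag offset
 * offset - start = 200. For frag 1, start=600 and end=1100, and the
 * remaining 100 bytes come from frag offset 0. "start"/"end" thus track
 * each region's position in the skb's logical byte stream.
 */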
/**
 *	skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying to
 *	@from: the copy source
 *	@len: amount of data to copy to buffer from iovec
 *
 *	Returns 0 or -EFAULT.
 */
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
				struct iov_iter *from,
				int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (copy_from_iter(skb->data + offset, copy, from) != copy)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			size_t copied;

			if (copy > len)
				copy = len;
			copied = copy_page_from_iter(skb_frag_page(frag),
					  frag->page_offset + offset - start,
					  copy, from);
			if (copied != copy)
				goto fault;

			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_from_iter(frag_iter,
							offset - start,
							from, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);

/**
 *	zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 *	@skb: buffer to copy
 *	@from: the source to copy from
 *
 *	The function will first copy up to headlen, and then pin the userspace
 *	pages and build frags through them.
 *
 *	Returns 0, -EFAULT or -EMSGSIZE.
 */
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
	int len = iov_iter_count(from);
	int copy = min_t(int, skb_headlen(skb), len);
	int frag = 0;

	/* copy up to skb headlen */
	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
		return -EFAULT;

	while (iov_iter_count(from)) {
		struct page *pages[MAX_SKB_FRAGS];
		size_t start;
		ssize_t copied;
		unsigned long truesize;
		int n = 0;

		if (frag == MAX_SKB_FRAGS)
			return -EMSGSIZE;

		copied = iov_iter_get_pages(from, pages, ~0U,
					    MAX_SKB_FRAGS - frag, &start);
		if (copied < 0)
			return -EFAULT;

		iov_iter_advance(from, copied);

		truesize = PAGE_ALIGN(copied + start);
		skb->data_len += copied;
		skb->len += copied;
		skb->truesize += truesize;
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (copied) {
			int size = min_t(int, copied, PAGE_SIZE - start);
			skb_fill_page_desc(skb, frag++, pages[n], start, size);
			start = 0;
			copied -= size;
			n++;
		}
	}
	return 0;
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
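/*
 * Example (sketch): a tun/macvtap style zerocopy transmit path. The skb
 * is allocated with only "hlen" linear bytes; the rest of the payload
 * stays in pinned user pages attached as frags by the helper above.
 * Error handling and the ubuf_info completion wiring are omitted, and
 * "hlen"/"len" are assumed to have been derived from the iov_iter by
 * the caller.
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, len - hlen, noblock, &err, 0);
 *	if (!skb)
 *		return ERR_PTR(err);
 *	skb_put(skb, hlen);
 *	if (zerocopy_sg_from_iter(skb, from))
 *		goto drop;
 */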
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      struct iov_iter *to, int len,
				      __wsum *csump)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int pos = 0;
	int n;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
		if (n != copy)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			__wsum csum2 = 0;
			struct page *page = skb_frag_page(frag);
			u8 *vaddr = kmap(page);

			if (copy > len)
				copy = len;
			n = csum_and_copy_to_iter(vaddr + frag->page_offset +
						  offset - start, copy,
						  &csum2, to);
			kunmap(page);
			if (n != copy)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			pos += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			__wsum csum2 = 0;
			if (copy > len)
				copy = len;
			if (skb_copy_and_csum_datagram(frag_iter,
						       offset - start,
						       to, copy,
						       &csum2))
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			pos += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
	__sum16 sum;

	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}
	skb->csum_valid = !sum;
	return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
	__wsum csum;
	__sum16 sum;

	csum = skb_checksum(skb, 0, skb->len, 0);

	/* skb->csum holds pseudo checksum */
	sum = csum_fold(csum_add(skb->csum, csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	/* Save full packet checksum */
	skb->csum = csum;
	skb->ip_summed = CHECKSUM_COMPLETE;
	skb->csum_complete_sw = 1;
	skb->csum_valid = !sum;

	return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
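/*
 * Example (sketch): how an L4 protocol arranges for the helpers above to
 * validate a checksum. On receive, IPv4 UDP-style code seeds skb->csum
 * with the pseudo-header sum; __skb_checksum_complete() then adds the
 * sum over the packet and folds, so a zero result means the checksum
 * verified. The saddr/daddr/len values are assumed to come from the
 * protocol headers.
 *
 *	skb->csum = csum_tcpudp_nofold(saddr, daddr, skb->len,
 *				       IPPROTO_UDP, 0);
 *	if (__skb_checksum_complete(skb))
 *		goto csum_error;
 */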
666 */ 667 int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, 668 int hlen, struct msghdr *msg) 669 { 670 __wsum csum; 671 int chunk = skb->len - hlen; 672 673 if (!chunk) 674 return 0; 675 676 if (iov_iter_count(&msg->msg_iter) < chunk) { 677 if (__skb_checksum_complete(skb)) 678 goto csum_error; 679 if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) 680 goto fault; 681 } else { 682 csum = csum_partial(skb->data, hlen, skb->csum); 683 if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, 684 chunk, &csum)) 685 goto fault; 686 if (csum_fold(csum)) 687 goto csum_error; 688 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) 689 netdev_rx_csum_fault(skb->dev); 690 } 691 return 0; 692 csum_error: 693 return -EINVAL; 694 fault: 695 return -EFAULT; 696 } 697 EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); 698 699 /** 700 * datagram_poll - generic datagram poll 701 * @file: file struct 702 * @sock: socket 703 * @wait: poll table 704 * 705 * Datagram poll: Again totally generic. This also handles 706 * sequenced packet sockets providing the socket receive queue 707 * is only ever holding data ready to receive. 708 * 709 * Note: when you _don't_ use this routine for this protocol, 710 * and you use a different write policy from sock_writeable() 711 * then please supply your own write_space callback. 712 */ 713 unsigned int datagram_poll(struct file *file, struct socket *sock, 714 poll_table *wait) 715 { 716 struct sock *sk = sock->sk; 717 unsigned int mask; 718 719 sock_poll_wait(file, sk_sleep(sk), wait); 720 mask = 0; 721 722 /* exceptional events? */ 723 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 724 mask |= POLLERR | 725 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0); 726 727 if (sk->sk_shutdown & RCV_SHUTDOWN) 728 mask |= POLLRDHUP | POLLIN | POLLRDNORM; 729 if (sk->sk_shutdown == SHUTDOWN_MASK) 730 mask |= POLLHUP; 731 732 /* readable? */ 733 if (!skb_queue_empty(&sk->sk_receive_queue)) 734 mask |= POLLIN | POLLRDNORM; 735 736 /* Connection-based need to check for termination and startup */ 737 if (connection_based(sk)) { 738 if (sk->sk_state == TCP_CLOSE) 739 mask |= POLLHUP; 740 /* connection hasn't started yet? */ 741 if (sk->sk_state == TCP_SYN_SENT) 742 return mask; 743 } 744 745 /* writable? */ 746 if (sock_writeable(sk)) 747 mask |= POLLOUT | POLLWRNORM | POLLWRBAND; 748 else 749 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 750 751 return mask; 752 } 753 EXPORT_SYMBOL(datagram_poll); 754