/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layers all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>.
 *			(datagram_poll() from old udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
				  void *key)
{
	unsigned long bits = (unsigned long)key;

	/*
	 * Avoid a wakeup if event not interesting for us
	 */
	if (bits && !(bits & (POLLIN | POLLERR)))
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}

/*
 *	Wait for a packet..
 */
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (!skb_queue_empty(&sk->sk_receive_queue))
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so, we report the problem.
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk_sleep(sk), &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}
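
/*
 * For illustration only: the key-based filter above pays off when the
 * waker passes the poll event mask as the wake-up key, as the
 * sk_data_ready callbacks do. A hedged sketch of the producer side,
 * modelled on sock_def_readable() (treat the exact mask as an
 * assumption):
 *
 *	wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
 *					POLLRDNORM | POLLRDBAND);
 *
 * A waker that passes a NULL key still wakes us up, because
 * receiver_wake_function() skips the filter when `bits' is zero.
 */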

/**
 * __skb_recv_datagram - Receive a datagram skbuff
 * @sk: socket
 * @flags: MSG_ flags
 * @peeked: returns non-zero if this packet has been seen before
 * @off: an offset in bytes to peek skb from. Returns an offset
 *       within an skb where data actually starts
 * @err: error code returned
 *
 * Get a datagram skbuff, understands the peeking, nonblocking wakeups
 * and possible races. This replaces identical code in packet, raw and
 * udp, as well as the IPX, AX.25 and AppleTalk layers. It also finally
 * fixes the long standing peek and read race for datagram sockets. If
 * you alter this routine remember it must be re-entrant.
 *
 * This function does not lock the socket. The returned skb has either
 * been unlinked from the receive queue or, for MSG_PEEK, had its
 * reference count raised, in both cases under the receive queue
 * spinlock alone. The caller releases the skb with skb_free_datagram().
 *
 * * It does not lock socket since today. This function is
 * * free of race conditions. This measure should/can improve
 * * significantly datagram socket latencies at high loads,
 * * when data copying to user space takes lots of time.
 * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 * * 8) Great win.)
 * *					--ANK (980729)
 *
 * The order of the tests when we find no data waiting is specified
 * quite explicitly by POSIX 1003.1g, don't change them without having
 * the standard around please.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
				    int *peeked, int *off, int *err)
{
	struct sk_buff *skb;
	long timeo;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		/* Again only user level code calls this function, so nothing
		 * at interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		unsigned long cpu_flags;
		struct sk_buff_head *queue = &sk->sk_receive_queue;

		spin_lock_irqsave(&queue->lock, cpu_flags);
		skb_queue_walk(queue, skb) {
			*peeked = skb->peeked;
			if (flags & MSG_PEEK) {
				if (*off >= skb->len && skb->len) {
					*off -= skb->len;
					continue;
				}
				skb->peeked = 1;
				atomic_inc(&skb->users);
			} else
				__skb_unlink(skb, queue);

			spin_unlock_irqrestore(&queue->lock, cpu_flags);
			return skb;
		}
		spin_unlock_irqrestore(&queue->lock, cpu_flags);

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (!wait_for_packet(sk, err, &timeo));

	return NULL;

no_packet:
	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
				  int noblock, int *err)
{
	int peeked, off = 0;

	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
				   &peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
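
/*
 * For illustration only: a minimal recvmsg() built on the two helpers
 * above. This is a hedged sketch of the common consumer pattern, not
 * code from this file; my_recvmsg() is hypothetical:
 *
 *	static int my_recvmsg(struct kiocb *iocb, struct socket *sock,
 *			      struct msghdr *msg, size_t len, int flags)
 *	{
 *		struct sock *sk = sock->sk;
 *		struct sk_buff *skb;
 *		int copied, err;
 *
 *		skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
 *		if (!skb)
 *			return err;
 *
 *		copied = skb->len;
 *		if (copied > len) {
 *			copied = len;
 *			msg->msg_flags |= MSG_TRUNC;
 *		}
 *		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
 *		skb_free_datagram(sk, skb);
 *		return err ? err : copied;
 *	}
 */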

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	consume_skb(skb);
	sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);

void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
{
	bool slow;

	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;

	slow = lock_sock_fast(sk);
	skb_orphan(skb);
	sk_mem_reclaim_partial(sk);
	unlock_sock_fast(sk, slow);

	/* skb is now orphaned, can be freed outside of locked section */
	__kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);

/**
 * skb_kill_datagram - Free a datagram skbuff forcibly
 * @sk: socket
 * @skb: datagram skbuff
 * @flags: MSG_ flags
 *
 * This function frees a datagram skbuff that was received by
 * skb_recv_datagram. The flags argument must match the one
 * used for skb_recv_datagram.
 *
 * If the MSG_PEEK flag is set, and the packet is still on the
 * receive queue of the socket, it will be taken off the queue
 * before it is freed.
 *
 * This function currently only disables BH when acquiring the
 * sk_receive_queue lock. Therefore it must not be used in a
 * context where that lock is acquired in an IRQ context.
 *
 * It returns 0 if the packet was removed by us.
 */
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	int err = 0;

	if (flags & MSG_PEEK) {
		err = -ENOENT;
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (skb == skb_peek(&sk->sk_receive_queue)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			atomic_dec(&skb->users);
			err = 0;
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	kfree_skb(skb);
	atomic_inc(&sk->sk_drops);
	sk_mem_reclaim_partial(sk);

	return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
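
/*
 * For illustration only: the usual caller of skb_kill_datagram() is a
 * recvmsg() error path that must discard a datagram it cannot deliver,
 * e.g. after a copy or checksum failure. A hedged sketch of that
 * pattern (hlen and the surrounding function are assumptions):
 *
 *	err = skb_copy_datagram_iovec(skb, hlen, msg->msg_iov, copied);
 *	if (err) {
 *		skb_kill_datagram(sk, skb, flags);
 *		return err;
 *	}
 *
 * Passing the same flags keeps the MSG_PEEK bookkeeping consistent, as
 * the comment above requires.
 */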

/**
 * skb_copy_datagram_iovec - Copy a datagram to an iovec.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: io vector to copy to
 * @len: amount of data to copy from buffer to iovec
 *
 * Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	trace_skb_copy_datagram_iovec(skb, len);

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_iovec(frag_iter,
						    offset - start,
						    to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_iovec);
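
/*
 * For illustration only: @offset lets the caller skip its own protocol
 * header, which is still part of the skb at this point. A UDP-style
 * recvmsg() (hedged sketch; `copied' is the payload byte count) copies
 * only the payload:
 *
 *	err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
 *				      msg->msg_iov, copied);
 *
 * Because the iovec is advanced in place, it cannot be walked twice;
 * skb_copy_datagram_const_iovec() below exists for that case.
 */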

/**
 * skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: io vector to copy to
 * @to_offset: offset in the io vector to start copying to
 * @len: amount of data to copy from buffer to iovec
 *
 * Returns 0 or -EFAULT.
 * Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
				  const struct iovec *to, int to_offset,
				  int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to_offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
						offset - start, to_offset, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
			to_offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_const_iovec(frag_iter,
							  offset - start,
							  to, to_offset,
							  copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to_offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
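
/*
 * For illustration only: the const variant suits callers that address
 * one iovec several times, e.g. reserving room for a fixed-size header
 * and copying the packet behind it. A hedged sketch (hdr_len is a
 * hypothetical header size):
 *
 *	if (skb_copy_datagram_const_iovec(skb, 0, iov, hdr_len, skb->len))
 *		return -EFAULT;
 *
 * The iovec is left untouched, so the caller remains free to fill
 * bytes 0..hdr_len-1 of the same vector before or after this copy.
 */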

/**
 * skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying to
 * @from: io vector to copy from
 * @from_offset: offset in the io vector to start copying from
 * @len: amount of data to copy to buffer from iovec
 *
 * Returns 0 or -EFAULT.
 * Note: the iovec is not modified during the copy.
 */
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
				 const struct iovec *from, int from_offset,
				 int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
					copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		from_offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_fromiovecend(vaddr + frag->page_offset +
						  offset - start,
						  from, from_offset, copy);
			kunmap(page);
			if (err)
				goto fault;

			if (!(len -= copy))
				return 0;
			offset += copy;
			from_offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_from_iovec(frag_iter,
							 offset - start,
							 from,
							 from_offset,
							 copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from_offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);

static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      u8 __user *to, int len,
				      __wsum *csump)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int pos = 0;

	/* Copy header. */
	if (copy > 0) {
		int err = 0;
		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			int err = 0;
			u8 *vaddr;
			struct page *page = skb_frag_page(frag);

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr +
						      frag->page_offset +
						      offset - start,
						      to, copy, 0, &err);
			kunmap(page);
			if (err)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			__wsum csum2 = 0;
			if (copy > len)
				copy = len;
			if (skb_copy_and_csum_datagram(frag_iter,
						       offset - start,
						       to, copy,
						       &csum2))
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
	__sum16 sum;

	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
	return __skb_checksum_complete_head(skb, skb->len);
}
EXPORT_SYMBOL(__skb_checksum_complete);
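
/*
 * For illustration only: protocols normally hide __skb_checksum_complete()
 * behind an ip_summed test so that packets already verified by hardware
 * skip the software sum. A hedged sketch of that wrapper, modelled on
 * the UDP helpers (my_checksum_complete() is hypothetical):
 *
 *	static inline int my_checksum_complete(struct sk_buff *skb)
 *	{
 *		return !skb_csum_unnecessary(skb) &&
 *			__skb_checksum_complete(skb);
 *	}
 *
 * A non-zero result means the folded sum did not verify; the datagram
 * is then typically dropped via skb_kill_datagram() above.
 */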

/**
 * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
 * @skb: skbuff
 * @hlen: header length; these bytes are folded into the checksum
 *        but not copied
 * @iov: io vector
 *
 * Caller _must_ check that skb will fit to this iovec.
 *
 * Returns: 0       - success.
 *	    -EINVAL - checksum failure.
 *	    -EFAULT - fault during copy. Beware, in this case iovec
 *		      can be modified!
 */
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
				     int hlen, struct iovec *iov)
{
	__wsum csum;
	int chunk = skb->len - hlen;

	if (!chunk)
		return 0;

	/* Skip filled elements.
	 * Pretty silly, look at memcpy_toiovec, though 8)
	 */
	while (!iov->iov_len)
		iov++;

	if (iov->iov_len < chunk) {
		if (__skb_checksum_complete(skb))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
					       chunk, &csum))
			goto fault;
		if (csum_fold(csum))
			goto csum_error;
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
			netdev_rx_csum_fault(skb->dev);
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;
csum_error:
	return -EINVAL;
fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);

/**
 * datagram_poll - generic datagram poll
 * @file: file struct
 * @sock: socket
 * @wait: poll table
 *
 * Datagram poll: Again totally generic. This also handles
 * sequenced packet sockets provided the socket receive queue
 * is only ever holding data ready to receive.
 *
 * Note: when you _don't_ use this routine for this protocol,
 * and you use a different write policy from sock_writeable()
 * then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}
EXPORT_SYMBOL(datagram_poll);
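
/*
 * For illustration only: a protocol opts in by pointing its proto_ops
 * at datagram_poll. A hedged excerpt modelled on the AF_INET datagram
 * ops (neighbouring fields elided with ...):
 *
 *	static const struct proto_ops my_dgram_ops = {
 *		.family		= PF_INET,
 *		...
 *		.poll		= datagram_poll,
 *		...
 *	};
 */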