/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/xfrm.h>

#include <asm/uaccess.h>
#include <asm/system.h>
#include <trace/events/skb.h>

#include "kmap_skb.h"

static struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;

static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	put_page(buf->page);
}

static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
				struct pipe_buffer *buf)
{
	get_page(buf->page);
}

static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}


/* Pipe buffer operations for a socket. */
static const struct pipe_buf_operations sock_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = sock_pipe_buf_release,
	.steal = sock_pipe_buf_steal,
	.get = sock_pipe_buf_get,
};

/*
 *	Keep out-of-line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always
 *	reliable.
 */

/**
 *	skb_over_panic	-	private function
 *	@skb: buffer
 *	@sz: size
 *	@here: address
 *
 *	Out of line support code for skb_put(). Not user callable.
 */
static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
{
	printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
			  "data:%p tail:%#lx end:%#lx dev:%s\n",
	       here, skb->len, sz, skb->head, skb->data,
	       (unsigned long)skb->tail, (unsigned long)skb->end,
	       skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

/**
 *	skb_under_panic	-	private function
 *	@skb: buffer
 *	@sz: size
 *	@here: address
 *
 *	Out of line support code for skb_push(). Not user callable.
 */

static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
{
	printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
			  "data:%p tail:%#lx end:%#lx dev:%s\n",
	       here, skb->len, sz, skb->head, skb->data,
	       (unsigned long)skb->tail, (unsigned long)skb->end,
	       skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

/*	Allocate a new skbuff. We do this ourselves so we can fill in a few
 *	'private' fields and also do memory statistics to find all the
 *	[BEEP] leaks.
 *
 */

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@fclone: allocate from fclone cache instead of head cache
 *		and allocate a cloned (child) skb
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of size bytes. The object has a reference count of one.
 *	The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int fclone, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;

	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	size = SKB_DATA_ALIGN(size);
	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
			gfp_mask, node);
	if (!data)
		goto nodata;
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->truesize = size + sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->mac_header = ~0U;
#endif

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	if (fclone) {
		struct sk_buff *child = skb + 1;
		atomic_t *fclone_ref = (atomic_t *) (child + 1);

		kmemcheck_annotate_bitfield(child, flags1);
		kmemcheck_annotate_bitfield(child, flags2);
		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(fclone_ref, 1);

		child->fclone = SKB_FCLONE_UNAVAILABLE;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@length: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
		unsigned int length, gfp_t gfp_mask)
{
	struct sk_buff *skb;

	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
	if (likely(skb)) {
		skb_reserve(skb, NET_SKB_PAD);
		skb->dev = dev;
	}
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		int size)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
}
EXPORT_SYMBOL(skb_add_rx_frag);
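
/*
 * Illustrative sketch: a hypothetical driver receive path might use the
 * allocation helpers above roughly as follows. The example_rx() name is an
 * assumption made up for this sketch only; error handling is minimal and a
 * real driver would size the copy to its own hardware buffer layout.
 *
 *	static struct sk_buff *example_rx(struct net_device *dev,
 *					  const void *hw_buf, unsigned int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
 *		if (!skb)
 *			return NULL;			- drop, no memory
 *		skb_reserve(skb, NET_IP_ALIGN);		- align the IP header
 *		memcpy(skb_put(skb, len), hw_buf, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		return skb;
 *	}
 *
 * The NET_SKB_PAD headroom added by __netdev_alloc_skb() is on top of
 * whatever the caller reserves here, as the comment above explains.
 */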

/**
 *	dev_alloc_skb - allocate an skbuff for receiving
 *	@length: length to allocate
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory. Although this function
 *	allocates memory it can be called from an interrupt.
 */
struct sk_buff *dev_alloc_skb(unsigned int length)
{
	/*
	 * There is more code here than it seems:
	 * __dev_alloc_skb is an inline
	 */
	return __dev_alloc_skb(length, GFP_ATOMIC);
}
EXPORT_SYMBOL(dev_alloc_skb);

static void skb_drop_list(struct sk_buff **listp)
{
	struct sk_buff *list = *listp;

	*listp = NULL;

	do {
		struct sk_buff *this = list;
		list = list->next;
		kfree_skb(this);
	} while (list);
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_release_data(struct sk_buff *skb)
{
	if (!skb->cloned ||
	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			       &skb_shinfo(skb)->dataref)) {
		if (skb_shinfo(skb)->nr_frags) {
			int i;
			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
				put_page(skb_shinfo(skb)->frags[i].page);
		}

		/*
		 * If skb buf is from userspace, we need to notify the caller
		 * the lower device DMA has done;
		 */
		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
			struct ubuf_info *uarg;

			uarg = skb_shinfo(skb)->destructor_arg;
			if (uarg->callback)
				uarg->callback(uarg);
		}

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);

		kfree(skb->head);
	}
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff *other;
	atomic_t *fclone_ref;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		break;

	case SKB_FCLONE_ORIG:
		fclone_ref = (atomic_t *) (skb + 2);
		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, skb);
		break;

	case SKB_FCLONE_CLONE:
		fclone_ref = (atomic_t *) (skb + 1);
		other = skb - 1;

		/* The clone portion is available for
		 * fast-cloning again.
		 */
		skb->fclone = SKB_FCLONE_UNAVAILABLE;

		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, other);
		break;
	}
}

static void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
#ifdef CONFIG_XFRM
	secpath_put(skb->sp);
#endif
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	nf_conntrack_put(skb->nfct);
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
	nf_conntrack_put_reasm(skb->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(skb->nf_bridge);
#endif
/* XXX: IS this still necessary? - JHS */
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
#endif
#endif
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	skb_release_data(skb);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 *	kfree_skb - free an sk_buff
 *	@skb: buffer to free
 *
 *	Drop a reference to the buffer and free it if the usage count has
 *	hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);

/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero.
 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 *	is being dropped after a failure and notes that.
 */
void consume_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
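
/*
 * Illustrative sketch: the distinction documented above matters mostly for
 * drop monitoring (the two tracepoints differ). A hypothetical transmit
 * completion handler would use consume_skb() for frames that left the wire
 * and kfree_skb() for frames it had to discard; tx_status_ok below is a
 * stand-in for the driver's own status check:
 *
 *	if (tx_status_ok)
 *		consume_skb(skb);	- normal completion, not a drop
 *	else
 *		kfree_skb(skb);		- recorded as a drop by tracing tools
 */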

/**
 *	skb_recycle_check - check if skb can be reused for receive
 *	@skb: buffer
 *	@skb_size: minimum receive buffer size
 *
 *	Checks that the skb passed in is not shared or cloned, and
 *	that it is linear and its head portion at least as large as
 *	skb_size so that it can be recycled as a receive buffer.
 *	If these conditions are met, this function does any necessary
 *	reference count dropping and cleans up the skbuff as if it
 *	just came from __alloc_skb().
 */
bool skb_recycle_check(struct sk_buff *skb, int skb_size)
{
	struct skb_shared_info *shinfo;

	if (irqs_disabled())
		return false;

	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)
		return false;

	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
		return false;

	skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
	if (skb_end_pointer(skb) - skb->head < skb_size)
		return false;

	if (skb_shared(skb) || skb_cloned(skb))
		return false;

	skb_release_head_state(skb);

	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->data = skb->head + NET_SKB_PAD;
	skb_reset_tail_pointer(skb);

	return true;
}
EXPORT_SYMBOL(skb_recycle_check);

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp		= old->tstamp;
	new->dev		= old->dev;
	new->transport_header	= old->transport_header;
	new->network_header	= old->network_header;
	new->mac_header		= old->mac_header;
	skb_dst_copy(new, old);
	new->rxhash		= old->rxhash;
#ifdef CONFIG_XFRM
	new->sp			= secpath_get(old->sp);
#endif
	memcpy(new->cb, old->cb, sizeof(old->cb));
	new->csum		= old->csum;
	new->local_df		= old->local_df;
	new->pkt_type		= old->pkt_type;
	new->ip_summed		= old->ip_summed;
	skb_copy_queue_mapping(new, old);
	new->priority		= old->priority;
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	new->ipvs_property	= old->ipvs_property;
#endif
	new->protocol		= old->protocol;
	new->mark		= old->mark;
	new->skb_iif		= old->skb_iif;
	__nf_copy(new, old);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	new->nf_trace		= old->nf_trace;
#endif
#ifdef CONFIG_NET_SCHED
	new->tc_index		= old->tc_index;
#ifdef CONFIG_NET_CLS_ACT
	new->tc_verd		= old->tc_verd;
#endif
#endif
	new->vlan_tci		= old->vlan_tci;

	skb_copy_secmark(new, old);
}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(data);
	C(truesize);
	atomic_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 *	skb_morph - morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/* skb frags copy userspace buffers to kernel */
static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int i;
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;

	for (i = 0; i < num_frags; i++) {
		u8 *vaddr;
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

		page = alloc_page(GFP_ATOMIC);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)head->private;
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
		memcpy(page_address(page),
		       vaddr + f->page_offset, f->size);
		kunmap_skb_frag(vaddr);
		page->private = (unsigned long)head;
		head = page;
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		put_page(skb_shinfo(skb)->frags[i].page);

	uarg->callback(uarg);

	/* skb frags point to kernel buffers */
	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
		skb_shinfo(skb)->frags[i - 1].page_offset = 0;
		skb_shinfo(skb)->frags[i - 1].page = head;
		head = (struct page *)head->private;
	}
	return 0;
}


/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *	copies share the same packet data but not structure. The new
 *	buffer has a reference count of 1. If the allocation fails the
 *	function returns %NULL otherwise the new buffer is returned.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff *n;

	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		if (skb_copy_ubufs(skb, gfp_mask))
			return NULL;
	}

	n = skb + 1;
	if (skb->fclone == SKB_FCLONE_ORIG &&
	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
		atomic_t *fclone_ref = (atomic_t *) (n + 1);
		n->fclone = SKB_FCLONE_CLONE;
		atomic_inc(fclone_ref);
	} else {
		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;

		kmemcheck_annotate_bitfield(n, flags1);
		kmemcheck_annotate_bitfield(n, flags2);
		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);

static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
#ifndef NET_SKBUFF_DATA_USES_OFFSET
	/*
	 *	Shift between the two data areas in bytes
	 */
	unsigned long offset = new->data - old->data;
#endif

	__copy_skb_header(new, old);

#ifndef NET_SKBUFF_DATA_USES_OFFSET
	/* {transport,network,mac}_header are relative to skb->head */
	new->transport_header += offset;
	new->network_header   += offset;
	if (skb_mac_header_was_set(new))
		new->mac_header	      += offset;
#endif
	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}

/**
 *	skb_copy	-	create private copy of an sk_buff
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data. This is used when the
 *	caller wishes to modify the data and needs a private copy of the
 *	data to alter. Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	As by-product this function converts non-linear &sk_buff to linear
 *	one, so that &sk_buff becomes completely private and caller is allowed
 *	to modify all the data of returned buffer. This means that this
 *	function is not recommended for use in circumstances when only
 *	header is going to be modified. Use pskb_copy() instead.
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
	int headerlen = skb_headroom(skb);
	unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
	struct sk_buff *n = alloc_skb(size, gfp_mask);

	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
		BUG();

	copy_skb_header(n, skb);
	return n;
}
EXPORT_SYMBOL(skb_copy);
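
/*
 * Illustrative sketch of how the three duplication helpers differ, for a
 * hypothetical caller deciding which one to use (the choice below is an
 * assumption for this sketch, not a rule enforced by the code):
 *
 *	skb_clone(skb, GFP_ATOMIC)  - new struct sk_buff only, data shared;
 *	                              the caller must not modify the data.
 *	pskb_copy(skb, GFP_ATOMIC)  - private linear header, paged fragments
 *	                              still shared; only headers writable.
 *	skb_copy(skb, GFP_ATOMIC)   - fully private, linearised copy; all
 *	                              data writable, but the most expensive.
 *
 * For example, a path that only rewrites the IP header could get away with
 * nskb = pskb_copy(skb, GFP_ATOMIC) instead of a full skb_copy().
 */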

/**
 *	pskb_copy	-	create copy of an sk_buff with private head.
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and part of its data, located
 *	in header. Fragmented data remain shared. This is used when
 *	the caller wishes to modify only header of &sk_buff and needs
 *	private copy of the header to alter. Returns %NULL on failure
 *	or the pointer to the buffer on success.
 *	The returned buffer has a reference count of 1.
 */

struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
{
	unsigned int size = skb_end_pointer(skb) - skb->head;
	struct sk_buff *n = alloc_skb(size, gfp_mask);

	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, skb_headroom(skb));
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);

	n->truesize += skb->data_len;
	n->data_len  = skb->data_len;
	n->len	     = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
			if (skb_copy_ubufs(skb, gfp_mask)) {
				kfree(n);
				goto out;
			}
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			get_page(skb_shinfo(n)->frags[i].page);
		}
		skb_shinfo(n)->nr_frags = i;
	}

	if (skb_has_frag_list(skb)) {
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	copy_skb_header(n, skb);
out:
	return n;
}
EXPORT_SYMBOL(pskb_copy);

/**
 *	pskb_expand_head - reallocate header of &sk_buff
 *	@skb: buffer to reallocate
 *	@nhead: room to add at head
 *	@ntail: room to add at tail
 *	@gfp_mask: allocation priority
 *
 *	Expands (or creates identical copy, if &nhead and &ntail are zero)
 *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have
 *	reference count of 1. Returns zero in the case of success, or a
 *	negative error code if expansion failed. In the latter case, the
 *	&sk_buff is not changed.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		     gfp_t gfp_mask)
{
	int i;
	u8 *data;
	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
	long off;
	bool fastpath;

	BUG_ON(nhead < 0);

	if (skb_shared(skb))
		BUG();

	size = SKB_DATA_ALIGN(size);

	/* Check if we can avoid taking references on fragments if we own
	 * the last reference on skb->head. (see skb_release_data())
	 */
	if (!skb->cloned)
		fastpath = true;
	else {
		int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
	}

	if (fastpath &&
	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
		memmove(skb->head + size, skb_shinfo(skb),
			offsetof(struct skb_shared_info,
				 frags[skb_shinfo(skb)->nr_frags]));
		memmove(skb->head + nhead, skb->head,
			skb_tail_pointer(skb) - skb->head);
		off = nhead;
		goto adjust_others;
	}

	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
	if (!data)
		goto nodata;

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void.
	 */
	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

	if (fastpath) {
		kfree(skb->head);
	} else {
		/* copy this zero copy skb frags */
		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
			if (skb_copy_ubufs(skb, gfp_mask))
				goto nofrags;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			get_page(skb_shinfo(skb)->frags[i].page);

		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
	}
	off = (data + nhead) - skb->head;

	skb->head     = data;
adjust_others:
	skb->data    += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end      = size;
	off           = nhead;
#else
	skb->end      = skb->head + size;
#endif
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->tail	      += off;
	skb->transport_header += off;
	skb->network_header   += off;
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += nhead;
	skb->cloned   = 0;
	skb->hdr_len  = 0;
	skb->nohdr    = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make private copy of skb with writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/**
 *	skb_copy_expand	-	copy and expand sk_buff
 *	@skb: buffer to copy
 *	@newheadroom: new free bytes at head
 *	@newtailroom: new free bytes at tail
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data and while doing so
 *	allocate additional space.
 *
 *	This is used when the caller wishes to modify the data and needs a
 *	private copy of the data to alter as well as more space for new fields.
 *	Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	You must pass %GFP_ATOMIC as the allocation priority if this function
 *	is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
				int newheadroom, int newtailroom,
				gfp_t gfp_mask)
{
	/*
	 *	Allocate the copy buffer
	 */
	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
				      gfp_mask);
	int oldheadroom = skb_headroom(skb);
	int head_copy_len, head_copy_off;
	int off;

	if (!n)
		return NULL;

	skb_reserve(n, newheadroom);

	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	head_copy_len = oldheadroom;
	head_copy_off = 0;
	if (newheadroom <= head_copy_len)
		head_copy_len = newheadroom;
	else
		head_copy_off = newheadroom - head_copy_len;

	/* Copy the linear header and data. */
	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
			  skb->len + head_copy_len))
		BUG();

	copy_skb_header(n, skb);

	off                  = newheadroom - oldheadroom;
	if (n->ip_summed == CHECKSUM_PARTIAL)
		n->csum_start += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	n->transport_header += off;
	n->network_header   += off;
	if (skb_mac_header_was_set(skb))
		n->mac_header += off;
#endif

	return n;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *	skb_pad			-	zero pad the tail of an skb
 *	@skb: buffer to pad
 *	@pad: space to pad
 *
 *	Ensure that a buffer is followed by a padding area that is zero
 *	filled. Used by network drivers which may DMA or transfer data
 *	beyond the buffer end onto the wire.
 *
 *	May return error in out of memory cases. The skb is freed on error.
 */

int skb_pad(struct sk_buff *skb, int pad)
{
	int err;
	int ntail;

	/* If the skbuff is non-linear, tailroom is always zero. */
	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
		memset(skb->data+skb->len, 0, pad);
		return 0;
	}

	ntail = skb->data_len + pad - (skb->end - skb->tail);
	if (likely(skb_cloned(skb) || ntail > 0)) {
		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
		if (unlikely(err))
			goto free_skb;
	}

	/* FIXME: The use of this function with non-linear skb's really needs
	 * to be audited.
	 */
	err = skb_linearize(skb);
	if (unlikely(err))
		goto free_skb;

	memset(skb->data + skb->len, 0, pad);
	return 0;

free_skb:
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL(skb_pad);

/**
 *	skb_put - add data to a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	This function extends the used data area of the buffer. If this would
 *	exceed the total buffer size the kernel will panic. A pointer to the
 *	first byte of the extra data is returned.
 */
unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
{
	unsigned char *tmp = skb_tail_pointer(skb);
	SKB_LINEAR_ASSERT(skb);
	skb->tail += len;
	skb->len  += len;
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));
	return tmp;
}
EXPORT_SYMBOL(skb_put);

/**
 *	skb_push - add data to the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	This function extends the used data area of the buffer at the buffer
 *	start. If this would exceed the total buffer headroom the kernel will
 *	panic. A pointer to the first byte of the extra data is returned.
 */
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len  += len;
	if (unlikely(skb->data < skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}
EXPORT_SYMBOL(skb_push);
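
/*
 * Illustrative sketch: building an outgoing packet with the primitives
 * above. The example_build() name and the two-buffer layout are assumptions
 * for this sketch only; a real caller would size the headroom for its own
 * protocol headers.
 *
 *	static struct sk_buff *example_build(const void *payload, unsigned int len,
 *					     const void *hdr, unsigned int hdr_len)
 *	{
 *		struct sk_buff *skb = alloc_skb(hdr_len + len, GFP_ATOMIC);
 *
 *		if (!skb)
 *			return NULL;
 *		skb_reserve(skb, hdr_len);			- leave headroom
 *		memcpy(skb_put(skb, len), payload, len);	- append payload
 *		memcpy(skb_push(skb, hdr_len), hdr, hdr_len);	- prepend header
 *		return skb;
 *	}
 */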

/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	This function removes data from the start of a buffer, returning
 *	the memory to the headroom. A pointer to the next data in the buffer
 *	is returned. Once the data has been pulled future pushes will overwrite
 *	the old data.
 */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
{
	return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 *	skb_trim - remove end from a buffer
 *	@skb: buffer to alter
 *	@len: new length
 *
 *	Cut the length of a buffer down by removing data from the tail. If
 *	the buffer is already under the length specified it is not modified.
 *	The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	if (skb->len > len)
		__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
	struct sk_buff **fragp;
	struct sk_buff *frag;
	int offset = skb_headlen(skb);
	int nfrags = skb_shinfo(skb)->nr_frags;
	int i;
	int err;

	if (skb_cloned(skb) &&
	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
		return err;

	i = 0;
	if (offset >= len)
		goto drop_pages;

	for (; i < nfrags; i++) {
		int end = offset + skb_shinfo(skb)->frags[i].size;

		if (end < len) {
			offset = end;
			continue;
		}

		skb_shinfo(skb)->frags[i++].size = len - offset;

drop_pages:
		skb_shinfo(skb)->nr_frags = i;

		for (; i < nfrags; i++)
			put_page(skb_shinfo(skb)->frags[i].page);

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);
		goto done;
	}

	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
	     fragp = &frag->next) {
		int end = offset + frag->len;

		if (skb_shared(frag)) {
			struct sk_buff *nfrag;

			nfrag = skb_clone(frag, GFP_ATOMIC);
			if (unlikely(!nfrag))
				return -ENOMEM;

			nfrag->next = frag->next;
			kfree_skb(frag);
			frag = nfrag;
			*fragp = frag;
		}

		if (end < len) {
			offset = end;
			continue;
		}

		if (end > len &&
		    unlikely((err = pskb_trim(frag, len - offset))))
			return err;

		if (frag->next)
			skb_drop_list(&frag->next);
		break;
	}

done:
	if (len > skb_headlen(skb)) {
		skb->data_len -= skb->len - len;
		skb->len       = len;
	} else {
		skb->len       = len;
		skb->data_len  = 0;
		skb_set_tail_pointer(skb, len);
	}

	return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/**
 *	__pskb_pull_tail - advance tail of skb header
 *	@skb: buffer to reallocate
 *	@delta: number of bytes to advance tail
 *
 *	The function makes sense only on a fragmented &sk_buff;
 *	it expands the header, moving its tail forward and copying the
 *	necessary data from the fragmented part.
 *
 *	&sk_buff MUST have reference count of 1.
 *
 *	Returns %NULL (and &sk_buff does not change) if pull failed
 *	or value of new tail of skb in the case of success.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
	/* If skb does not have enough free space at the tail, get a new one
	 * plus 128 bytes for future expansions. If we have enough
	 * room at tail, reallocate without expansion only if skb is cloned.
	 */
	int i, k, eat = (skb->tail + delta) - skb->end;

	if (eat > 0 || skb_cloned(skb)) {
		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
				     GFP_ATOMIC))
			return NULL;
	}

	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
		BUG();

	/* Optimization: no fragments, no reason to pre-estimate the
	 * size of pulled pages. Superb.
	 */
	if (!skb_has_frag_list(skb))
		goto pull_pages;

	/* Estimate size of pulled pages. */
	eat = delta;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size >= eat)
			goto pull_pages;
		eat -= skb_shinfo(skb)->frags[i].size;
	}

	/* If we need to update the frag list, we are in trouble.
	 * Certainly, it is possible to add an offset to the skb data,
	 * but, taking into account that pulling is expected to
	 * be a very rare operation, it is worth fighting against
	 * further bloating of the skb head and crucifying ourselves here instead.
	 * Pure masochism, indeed. 8)8)
	 */
	if (eat) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;
		struct sk_buff *clone = NULL;
		struct sk_buff *insp = NULL;

		do {
			BUG_ON(!list);

			if (list->len <= eat) {
				/* Eaten as whole. */
				eat -= list->len;
				list = list->next;
				insp = list;
			} else {
				/* Eaten partially. */

				if (skb_shared(list)) {
					/* Sucks! We need to fork list. :-( */
					clone = skb_clone(list, GFP_ATOMIC);
					if (!clone)
						return NULL;
					insp = list->next;
					list = clone;
				} else {
					/* This may be pulled without
					 * problems. */
					insp = list;
				}
				if (!pskb_pull(list, eat)) {
					kfree_skb(clone);
					return NULL;
				}
				break;
			}
		} while (eat);

		/* Free pulled out fragments. */
		while ((list = skb_shinfo(skb)->frag_list) != insp) {
			skb_shinfo(skb)->frag_list = list->next;
			kfree_skb(list);
		}
		/* And insert new clone at head. */
		if (clone) {
			clone->next = list;
			skb_shinfo(skb)->frag_list = clone;
		}
	}
	/* Success! Now we may commit changes to skb data. */

pull_pages:
	eat = delta;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size <= eat) {
			put_page(skb_shinfo(skb)->frags[i].page);
			eat -= skb_shinfo(skb)->frags[i].size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_shinfo(skb)->frags[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb->tail     += delta;
	skb->data_len -= delta;

	return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/* Copy some data bits from skb to kernel buffer. */

int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
	int start = skb_headlen(skb);
	struct sk_buff *frag_iter;
	int i, copy;

	if (offset > (int)skb->len - len)
		goto fault;

	/* Copy header. */
	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		skb_copy_from_linear_data_offset(skb, offset, to, copy);
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to     += copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
			memcpy(to,
			       vaddr + skb_shinfo(skb)->frags[i].page_offset+
			       offset - start, copy);
			kunmap_skb_frag(vaddr);

			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to     += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_bits(frag_iter, offset - start, to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to     += copy;
		}
		start = end;
	}

	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);

/*
 * Callback from splice_to_pipe(), if we need to release some pages
 * at the end of the spd in case we error'ed out in filling the pipe.
 */
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
	put_page(spd->pages[i]);
}

static inline struct page *linear_to_page(struct page *page, unsigned int *len,
					  unsigned int *offset,
					  struct sk_buff *skb, struct sock *sk)
{
	struct page *p = sk->sk_sndmsg_page;
	unsigned int off;

	if (!p) {
new_page:
		p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
		if (!p)
			return NULL;

		off = sk->sk_sndmsg_off = 0;
		/* hold one ref to this page until it's full */
	} else {
		unsigned int mlen;

		off = sk->sk_sndmsg_off;
		mlen = PAGE_SIZE - off;
		if (mlen < 64 && mlen < *len) {
			put_page(p);
			goto new_page;
		}

		*len = min_t(unsigned int, *len, mlen);
	}

	memcpy(page_address(p) + off, page_address(page) + *offset, *len);
	sk->sk_sndmsg_off += *len;
	*offset = off;
	get_page(p);

	return p;
}

/*
 * Fill page/offset/length into spd, if it can hold more pages.
 */
static inline int spd_fill_page(struct splice_pipe_desc *spd,
				struct pipe_inode_info *pipe, struct page *page,
				unsigned int *len, unsigned int offset,
				struct sk_buff *skb, int linear,
				struct sock *sk)
{
	if (unlikely(spd->nr_pages == pipe->buffers))
		return 1;

	if (linear) {
		page = linear_to_page(page, len, &offset, skb, sk);
		if (!page)
			return 1;
	} else
		get_page(page);

	spd->pages[spd->nr_pages] = page;
	spd->partial[spd->nr_pages].len = *len;
	spd->partial[spd->nr_pages].offset = offset;
	spd->nr_pages++;

	return 0;
}

static inline void __segment_seek(struct page **page, unsigned int *poff,
				  unsigned int *plen, unsigned int off)
{
	unsigned long n;

	*poff += off;
	n = *poff / PAGE_SIZE;
	if (n)
		*page = nth_page(*page, n);

	*poff = *poff % PAGE_SIZE;
	*plen -= off;
}

static inline int __splice_segment(struct page *page, unsigned int poff,
				   unsigned int plen, unsigned int *off,
				   unsigned int *len, struct sk_buff *skb,
				   struct splice_pipe_desc *spd, int linear,
				   struct sock *sk,
				   struct pipe_inode_info *pipe)
{
	if (!*len)
		return 1;

	/* skip this segment if already processed */
	if (*off >= plen) {
		*off -= plen;
		return 0;
	}

	/* ignore any bits we already processed */
	if (*off) {
		__segment_seek(&page, &poff, &plen, *off);
		*off = 0;
	}

	do {
		unsigned int flen = min(*len, plen);

		/* the linear region may spread across several pages  */
		flen = min_t(unsigned int, flen, PAGE_SIZE - poff);

		if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
			return 1;

		__segment_seek(&page, &poff, &plen, flen);
		*len -= flen;

	} while (*len && plen);

	return 0;
}

/*
 * Map linear and fragment data from the skb to spd. It reports failure if the
 * pipe is full or if we already spliced the requested length.
 */
static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
			     unsigned int *offset, unsigned int *len,
			     struct splice_pipe_desc *spd, struct sock *sk)
{
	int seg;

	/*
	 * map the linear part
	 */
	if (__splice_segment(virt_to_page(skb->data),
			     (unsigned long) skb->data & (PAGE_SIZE - 1),
			     skb_headlen(skb),
			     offset, len, skb, spd, 1, sk, pipe))
		return 1;

	/*
	 * then map the fragments
	 */
	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];

		if (__splice_segment(f->page, f->page_offset, f->size,
				     offset, len, skb, spd, 0, sk, pipe))
			return 1;
	}

	return 0;
}

/*
 * Map data from the skb to a pipe. Should handle both the linear part,
 * the fragments, and the frag list. It does NOT handle frag lists within
 * the frag list, if such a thing exists. We'd probably need to recurse to
 * handle that cleanly.
 */
int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
		    struct pipe_inode_info *pipe, unsigned int tlen,
		    unsigned int flags)
{
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct page *pages[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &sock_pipe_buf_ops,
		.spd_release = sock_spd_release,
	};
	struct sk_buff *frag_iter;
	struct sock *sk = skb->sk;
	int ret = 0;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	/*
	 * __skb_splice_bits() only fails if the output has no room left,
	 * so no point in going over the frag_list for the error case.
	 */
	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
		goto done;
	else if (!tlen)
		goto done;

	/*
	 * now see if we have a frag_list to map
	 */
	skb_walk_frags(skb, frag_iter) {
		if (!tlen)
			break;
		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
			break;
	}

done:
	if (spd.nr_pages) {
		/*
		 * Drop the socket lock, otherwise we have reverse
		 * locking dependencies between sk_lock and i_mutex
		 * here as compared to sendfile(). We enter here
		 * with the socket lock held, and splice_to_pipe() will
		 * grab the pipe inode lock. For sendfile() emulation,
		 * we call into ->sendpage() with the i_mutex lock held
		 * and networking will grab the socket lock.
		 */
		release_sock(sk);
		ret = splice_to_pipe(pipe, &spd);
		lock_sock(sk);
	}

	splice_shrink_spd(pipe, &spd);
	return ret;
}

/**
 *	skb_store_bits - store bits from kernel buffer to skb
 *	@skb: destination buffer
 *	@offset: offset in destination
 *	@from: source buffer
 *	@len: number of bytes to copy
 *
 *	Copy the specified number of bytes from the source buffer to the
 *	destination skb. This function handles all the messy bits of
 *	traversing fragment lists and such.
 */

int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
	int start = skb_headlen(skb);
	struct sk_buff *frag_iter;
	int i, copy;

	if (offset > (int)skb->len - len)
		goto fault;

	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		skb_copy_to_linear_data_offset(skb, offset, from, copy);
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		from += copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		int end;

		WARN_ON(start > offset + len);

		end = start + frag->size;
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			vaddr = kmap_skb_frag(frag);
			memcpy(vaddr + frag->page_offset + offset - start,
			       from, copy);
			kunmap_skb_frag(vaddr);

			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_store_bits(frag_iter, offset - start,
					   from, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_store_bits);

/* Checksum skb data. */

__wsum skb_checksum(const struct sk_buff *skb, int offset,
		    int len, __wsum csum)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int pos = 0;

	/* Checksum header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		csum = csum_partial(skb->data + offset, copy, csum);
		if ((len -= copy) == 0)
			return csum;
		offset += copy;
		pos	= copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (copy > len)
				copy = len;
			vaddr = kmap_skb_frag(frag);
			csum2 = csum_partial(vaddr + frag->page_offset +
					     offset - start, copy, 0);
			kunmap_skb_frag(vaddr);
			csum = csum_block_add(csum, csum2, pos);
			if (!(len -= copy))
				return csum;
			offset += copy;
			pos    += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			if (copy > len)
				copy = len;
			csum2 = skb_checksum(frag_iter, offset - start,
					     copy, 0);
			csum = csum_block_add(csum, csum2, pos);
			if ((len -= copy) == 0)
				return csum;
			offset += copy;
			pos    += copy;
		}
		start = end;
	}
	BUG_ON(len);

	return csum;
}
EXPORT_SYMBOL(skb_checksum);

/* Both of above in one bottle. */

__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
				    u8 *to, int len, __wsum csum)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int pos = 0;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		csum = csum_partial_copy_nocheck(skb->data + offset, to,
						 copy, csum);
		if ((len -= copy) == 0)
			return csum;
		offset += copy;
		to     += copy;
		pos	= copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (copy > len)
				copy = len;
			vaddr = kmap_skb_frag(frag);
			csum2 = csum_partial_copy_nocheck(vaddr +
							  frag->page_offset +
							  offset - start, to,
							  copy, 0);
			kunmap_skb_frag(vaddr);
			csum = csum_block_add(csum, csum2, pos);
			if (!(len -= copy))
				return csum;
			offset += copy;
			to     += copy;
			pos    += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		__wsum csum2;
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			csum2 = skb_copy_and_csum_bits(frag_iter,
						       offset - start,
						       to, copy, 0);
			csum = csum_block_add(csum, csum2, pos);
			if ((len -= copy) == 0)
				return csum;
			offset += copy;
			to     += copy;
			pos    += copy;
		}
		start = end;
	}
	BUG_ON(len);
	return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);

void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
	__wsum csum;
	long csstart;

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		csstart = skb_checksum_start_offset(skb);
	else
		csstart = skb_headlen(skb);

	BUG_ON(csstart > skb_headlen(skb));

	skb_copy_from_linear_data(skb, to, csstart);

	csum = 0;
	if (csstart != skb->len)
		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
					      skb->len - csstart, 0);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		long csstuff = csstart + skb->csum_offset;

		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
	}
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);

/**
 *	skb_dequeue - remove from the head of the queue
 *	@list: list to dequeue from
 *
 *	Remove the head of the list. The list lock is taken so the function
 *	may be used safely with other locking list functions. The head item is
 *	returned or %NULL if the list is empty.
 */

struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
	unsigned long flags;
	struct sk_buff *result;

	spin_lock_irqsave(&list->lock, flags);
	result = __skb_dequeue(list);
	spin_unlock_irqrestore(&list->lock, flags);
	return result;
}
EXPORT_SYMBOL(skb_dequeue);

/**
 *	skb_dequeue_tail - remove from the tail of the queue
 *	@list: list to dequeue from
 *
 *	Remove the tail of the list. The list lock is taken so the function
 *	may be used safely with other locking list functions. The tail item is
 *	returned or %NULL if the list is empty.
 */
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
	unsigned long flags;
	struct sk_buff *result;

	spin_lock_irqsave(&list->lock, flags);
	result = __skb_dequeue_tail(list);
	spin_unlock_irqrestore(&list->lock, flags);
	return result;
}
EXPORT_SYMBOL(skb_dequeue_tail);

/**
 *	skb_queue_purge - empty a list
 *	@list: list to empty
 *
 *	Delete all buffers on an &sk_buff list. Each buffer is removed from
 *	the list and one reference dropped. This function takes the list
 *	lock and is atomic with respect to other list locking functions.
 */
void skb_queue_purge(struct sk_buff_head *list)
{
	struct sk_buff *skb;
	while ((skb = skb_dequeue(list)) != NULL)
		kfree_skb(skb);
}
EXPORT_SYMBOL(skb_queue_purge);

/**
 *	skb_queue_head - queue a buffer at the list head
 *	@list: list to use
 *	@newsk: buffer to queue
 *
 *	Queue a buffer at the start of the list. This function takes the
 *	list lock and can be used safely with other locking &sk_buff functions.
 *
 *	A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_queue_head(list, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_queue_head);

/**
 *	skb_queue_tail - queue a buffer at the list tail
 *	@list: list to use
 *	@newsk: buffer to queue
 *
 *	Queue a buffer at the tail of the list. This function takes the
 *	list lock and can be used safely with other locking &sk_buff functions.
 *
 *	A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_queue_tail(list, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_queue_tail);

/**
 *	skb_unlink	-	remove a buffer from a list
 *	@skb: buffer to remove
 *	@list: list to use
 *
 *	Remove a packet from a list. The list locks are taken and this
 *	function is atomic with respect to other list locked calls.
 *
 *	You must know what list the SKB is on.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_unlink(skb, list);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_unlink);
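
/*
 * Illustrative sketch: the queue helpers above pair naturally as a
 * producer/consumer, e.g. an interrupt handler feeding a worker. The names
 * rx_queue and example_process() are made up for this sketch.
 *
 *	producer (any context):
 *		skb_queue_tail(&rx_queue, skb);
 *
 *	consumer:
 *		while ((skb = skb_dequeue(&rx_queue)) != NULL)
 *			example_process(skb);
 *
 * Both helpers take rx_queue's lock with interrupts disabled, so no extra
 * locking is needed around the calls themselves.
 */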
/**
 * skb_append - append a buffer
 * @old: buffer to insert after
 * @newsk: buffer to insert
 * @list: list to use
 *
 * Place a packet after a given packet in a list. The list locks are taken
 * and this function is atomic with respect to other list locked calls.
 *
 * A buffer cannot be placed on two lists at the same time.
 */
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_queue_after(list, old, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_append);

/**
 * skb_insert - insert a buffer
 * @old: buffer to insert before
 * @newsk: buffer to insert
 * @list: list to use
 *
 * Place a packet before a given packet in a list. The list locks are
 * taken and this function is atomic with respect to other list locked
 * calls.
 *
 * A buffer cannot be placed on two lists at the same time.
 */
void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_insert(newsk, old->prev, old, list);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_insert);

static inline void skb_split_inside_header(struct sk_buff *skb,
					   struct sk_buff *skb1,
					   const u32 len, const int pos)
{
	int i;

	skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
					 pos - len);
	/* And move data appendix as is. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
	skb_shinfo(skb)->nr_frags  = 0;
	skb1->data_len = skb->data_len;
	skb1->len += skb1->data_len;
	skb->data_len = 0;
	skb->len = len;
	skb_set_tail_pointer(skb, len);
}

static inline void skb_split_no_header(struct sk_buff *skb,
				       struct sk_buff *skb1,
				       const u32 len, int pos)
{
	int i, k = 0;
	const int nfrags = skb_shinfo(skb)->nr_frags;

	skb_shinfo(skb)->nr_frags = 0;
	skb1->len = skb1->data_len = skb->len - len;
	skb->len = len;
	skb->data_len = len - pos;

	for (i = 0; i < nfrags; i++) {
		int size = skb_shinfo(skb)->frags[i].size;

		if (pos + size > len) {
			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

			if (pos < len) {
				/* Split frag.
				 * We have two variants in this case:
				 * 1. Move all the frag to the second
				 *    part, if it is possible. F.e.
				 *    this approach is mandatory for TUX,
				 *    where splitting is expensive.
				 * 2. Split the frag accurately at the
				 *    boundary, which is what we do here.
				 */
				get_page(skb_shinfo(skb)->frags[i].page);
				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
				skb_shinfo(skb1)->frags[0].size -= len - pos;
				skb_shinfo(skb)->frags[i].size = len - pos;
				skb_shinfo(skb)->nr_frags++;
			}
			k++;
		} else
			skb_shinfo(skb)->nr_frags++;
		pos += size;
	}
	skb_shinfo(skb1)->nr_frags = k;
}

/**
 * skb_split - Split fragmented skb to two parts at length len.
 * @skb: the buffer to split
 * @skb1: the buffer to receive the second part
 * @len: new length for skb
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
	int pos = skb_headlen(skb);

	if (len < pos)	/* Split line is inside header. */
		skb_split_inside_header(skb, skb1, len, pos);
	else		/* Second chunk has no header, nothing to copy. */
		skb_split_no_header(skb, skb1, len, pos);
}
EXPORT_SYMBOL(skb_split);
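
/* Illustrative sketch (not part of the original file): splitting an skb at a
 * byte boundary, modelled loosely on what TCP's tso_fragment() does.  The
 * second buffer must provide tailroom for whatever part of the linear header
 * moves across the split; example_split() and its sizing are assumptions made
 * for the example only.
 */
static struct sk_buff *example_split(struct sk_buff *skb, u32 split_len, gfp_t gfp)
{
	int nsize = skb_headlen(skb) - split_len;
	struct sk_buff *skb1;

	if (nsize < 0)
		nsize = 0;

	/* Room for the moved header bytes plus headroom for lower layers. */
	skb1 = alloc_skb(nsize + MAX_HEADER, gfp);
	if (!skb1)
		return NULL;
	skb_reserve(skb1, MAX_HEADER);

	/* skb keeps the first split_len bytes, skb1 receives the rest. */
	skb_split(skb, skb1, split_len);
	return skb1;
}
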
/* Shifting from/to a cloned skb is a no-go.
 *
 * Caller cannot keep skb_shinfo related pointers past calling here!
 */
static int skb_prepare_for_shift(struct sk_buff *skb)
{
	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}

/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from @skb to @tgt. Returns the number of bytes
 * shifted. It's up to the caller to free @skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	int from, to, merge, todo;
	struct skb_frag_struct *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);
	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */

	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!to ||
	    !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
		merge = -1;
	} else {
		merge = to - 1;

		todo -= fragfrom->size;
		if (todo < 0) {
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale!
			 */
			fragfrom = &skb_shinfo(skb)->frags[from];
			fragto = &skb_shinfo(tgt)->frags[merge];

			fragto->size += shiftlen;
			fragfrom->size -= shiftlen;
			fragfrom->page_offset += shiftlen;

			goto onlymerged;
		}

		from++;
	}

	/* Skip full, not-fitting skb to avoid expensive operations */
	if ((shiftlen == skb->len) &&
	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
		return 0;

	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
		return 0;

	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
		if (to == MAX_SKB_FRAGS)
			return 0;

		fragfrom = &skb_shinfo(skb)->frags[from];
		fragto = &skb_shinfo(tgt)->frags[to];

		if (todo >= fragfrom->size) {
			*fragto = *fragfrom;
			todo -= fragfrom->size;
			from++;
			to++;

		} else {
			get_page(fragfrom->page);
			fragto->page = fragfrom->page;
			fragto->page_offset = fragfrom->page_offset;
			fragto->size = todo;

			fragfrom->page_offset += todo;
			fragfrom->size -= todo;
			todo = 0;

			to++;
			break;
		}
	}

	/* Ready to "commit" this state change to tgt */
	skb_shinfo(tgt)->nr_frags = to;

	if (merge >= 0) {
		fragfrom = &skb_shinfo(skb)->frags[0];
		fragto = &skb_shinfo(tgt)->frags[merge];

		fragto->size += fragfrom->size;
		put_page(fragfrom->page);
	}

	/* Reposition in the original skb */
	to = 0;
	while (from < skb_shinfo(skb)->nr_frags)
		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
	skb_shinfo(skb)->nr_frags = to;

	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
	/* Most likely the tgt won't ever need its checksum anymore, skb on
	 * the other hand might need it if it needs to be resent
	 */
	tgt->ip_summed = CHECKSUM_PARTIAL;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Yak, is it really working this way? Some helper please? */
	skb->len -= shiftlen;
	skb->data_len -= shiftlen;
	skb->truesize -= shiftlen;
	tgt->len += shiftlen;
	tgt->data_len += shiftlen;
	tgt->truesize += shiftlen;

	return shiftlen;
}

/**
 * skb_prepare_seq_read - Prepare a sequential read of skb data
 * @skb: the buffer to read
 * @from: lower offset of data to be read
 * @to: upper offset of data to be read
 * @st: state variable
 *
 * Initializes the specified state variable. Must be called before
 * invoking skb_seq_read() for the first time.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
			  unsigned int to, struct skb_seq_state *st)
{
	st->lower_offset = from;
	st->upper_offset = to;
	st->root_skb = st->cur_skb = skb;
	st->frag_idx = st->stepped_offset = 0;
	st->frag_data = NULL;
}
EXPORT_SYMBOL(skb_prepare_seq_read);

/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at &consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to &data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. &consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *         this limitation is the cost for zerocopy sequential
 *         reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *         at the moment, state->root_skb could be replaced with
 *         a stack for this purpose.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
			  struct skb_seq_state *st)
{
	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
	skb_frag_t *frag;

	if (unlikely(abs_offset >= st->upper_offset))
		return 0;

next_skb:
	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

	if (abs_offset < block_limit && !st->frag_data) {
		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
		return block_limit - abs_offset;
	}

	if (st->frag_idx == 0 && !st->frag_data)
		st->stepped_offset += skb_headlen(st->cur_skb);

	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
		block_limit = frag->size + st->stepped_offset;

		if (abs_offset < block_limit) {
			if (!st->frag_data)
				st->frag_data = kmap_skb_frag(frag);

			*data = (u8 *) st->frag_data + frag->page_offset +
				(abs_offset - st->stepped_offset);

			return block_limit - abs_offset;
		}

		if (st->frag_data) {
			kunmap_skb_frag(st->frag_data);
			st->frag_data = NULL;
		}

		st->frag_idx++;
		st->stepped_offset += frag->size;
	}

	if (st->frag_data) {
		kunmap_skb_frag(st->frag_data);
		st->frag_data = NULL;
	}

	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
		st->frag_idx = 0;
		goto next_skb;
	} else if (st->cur_skb->next) {
		st->cur_skb = st->cur_skb->next;
		st->frag_idx = 0;
		goto next_skb;
	}

	return 0;
}
EXPORT_SYMBOL(skb_seq_read);

/**
 * skb_abort_seq_read - Abort a sequential read of skb data
 * @st: state variable
 *
 * Must be called if skb_seq_read() was not called until it
 * returned 0.
 */
void skb_abort_seq_read(struct skb_seq_state *st)
{
	if (st->frag_data)
		kunmap_skb_frag(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);
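
/* Illustrative sketch (not part of the original file): walking all of an
 * skb's data, linear and paged, with the sequential read interface above.
 * example_scan_skb() and its byte-counting body are assumptions made for
 * the example; real users feed the returned blocks to a parser instead.
 */
static unsigned int example_scan_skb(struct sk_buff *skb)
{
	struct skb_seq_state st;
	const u8 *data;
	unsigned int consumed = 0, len;

	skb_prepare_seq_read(skb, 0, skb->len, &st);

	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
		/* "data" points at "len" contiguous bytes of packet data. */
		consumed += len;
	}

	/* skb_seq_read() returned 0, so no skb_abort_seq_read() is needed;
	 * call it instead if the loop is left early.
	 */
	return consumed;
}
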
#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb))

static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
					  struct ts_config *conf,
					  struct ts_state *state)
{
	return skb_seq_read(offset, text, TS_SKB_CB(state));
}

static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
	skb_abort_seq_read(TS_SKB_CB(state));
}

/**
 * skb_find_text - Find a text pattern in skb data
 * @skb: the buffer to look in
 * @from: search offset
 * @to: search limit
 * @config: textsearch configuration
 * @state: uninitialized textsearch state variable
 *
 * Finds a pattern in the skb data according to the specified
 * textsearch configuration. Use textsearch_next() to retrieve
 * subsequent occurrences of the pattern. Returns the offset
 * to the first occurrence or UINT_MAX if no match was found.
 */
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
			   unsigned int to, struct ts_config *config,
			   struct ts_state *state)
{
	unsigned int ret;

	config->get_next_block = skb_ts_get_next_block;
	config->finish = skb_ts_finish;

	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));

	ret = textsearch_find(config, state);
	return (ret <= to - from ? ret : UINT_MAX);
}
EXPORT_SYMBOL(skb_find_text);

/**
 * skb_append_datato_frags - append the user data to a skb
 * @sk: sock structure
 * @skb: skb structure to be appended with user data
 * @getfrag: call back function to be used for getting the user data
 * @from: pointer to user message iov
 * @length: length of the iov message
 *
 * Description: This procedure appends the user data to the fragment part
 * of the skb. If any page allocation fails, this procedure returns -ENOMEM.
 */
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
			    int (*getfrag)(void *from, char *to, int offset,
					   int len, int odd, struct sk_buff *skb),
			    void *from, int length)
{
	int frg_cnt = 0;
	skb_frag_t *frag = NULL;
	struct page *page = NULL;
	int copy, left;
	int offset = 0;
	int ret;

	do {
		/* Return error if we don't have space for new frag */
		frg_cnt = skb_shinfo(skb)->nr_frags;
		if (frg_cnt >= MAX_SKB_FRAGS)
			return -EFAULT;

		/* allocate a new page for next frag */
		page = alloc_pages(sk->sk_allocation, 0);

		/* If alloc_page fails just return failure and caller will
		 * free previous allocated pages by doing kfree_skb()
		 */
		if (page == NULL)
			return -ENOMEM;

		/* initialize the next frag */
		skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
		skb->truesize += PAGE_SIZE;
		atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);

		/* get the new initialized frag */
		frg_cnt = skb_shinfo(skb)->nr_frags;
		frag = &skb_shinfo(skb)->frags[frg_cnt - 1];

		/* copy the user data to page */
		left = PAGE_SIZE - frag->page_offset;
		copy = (length > left) ? left : length;

		ret = getfrag(from, (page_address(frag->page) +
			      frag->page_offset + frag->size),
			      offset, copy, 0, skb);
		if (ret < 0)
			return -EFAULT;

		/* copy was successful so update the size parameters */
		frag->size += copy;
		skb->len += copy;
		skb->data_len += copy;
		offset += copy;
		length -= copy;

	} while (length > 0);

	return 0;
}
EXPORT_SYMBOL(skb_append_datato_frags);
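
/* Illustrative sketch (not part of the original file): a minimal getfrag()
 * callback for skb_append_datato_frags() that copies from a plain kernel
 * buffer.  Real callers such as the UDP path pass ip_generic_getfrag() with
 * a struct msghdr; example_kernel_getfrag() is a hypothetical simplification.
 */
static int example_kernel_getfrag(void *from, char *to, int offset, int len,
				  int odd, struct sk_buff *skb)
{
	/* "from" is assumed to point at a kernel buffer holding the full
	 * message; "offset" is how much has already been copied out of it.
	 */
	memcpy(to, (char *)from + offset, len);
	return 0;
}
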
/**
 * skb_pull_rcsum - pull skb and update receive checksum
 * @skb: buffer to update
 * @len: length of data pulled
 *
 * This function performs an skb_pull on the packet and updates
 * the CHECKSUM_COMPLETE checksum. It should be used on
 * receive path processing instead of skb_pull unless you know
 * that the checksum difference is zero (e.g., a valid IP header)
 * or you are setting ip_summed to CHECKSUM_NONE.
 */
unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
	BUG_ON(len > skb->len);
	skb->len -= len;
	BUG_ON(skb->len < skb->data_len);
	skb_postpull_rcsum(skb, skb->data, len);
	return skb->data += len;
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);

/**
 * skb_segment - Perform protocol segmentation on skb.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 *
 * This function performs segmentation on the given skb. It returns
 * a pointer to the first in a list of new skbs for the segments.
 * In case of error it returns ERR_PTR(err).
 */
struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
{
	struct sk_buff *segs = NULL;
	struct sk_buff *tail = NULL;
	struct sk_buff *fskb = skb_shinfo(skb)->frag_list;
	unsigned int mss = skb_shinfo(skb)->gso_size;
	unsigned int doffset = skb->data - skb_mac_header(skb);
	unsigned int offset = doffset;
	unsigned int headroom;
	unsigned int len;
	int sg = !!(features & NETIF_F_SG);
	int nfrags = skb_shinfo(skb)->nr_frags;
	int err = -ENOMEM;
	int i = 0;
	int pos;

	__skb_push(skb, doffset);
	headroom = skb_headroom(skb);
	pos = skb_headlen(skb);

	do {
		struct sk_buff *nskb;
		skb_frag_t *frag;
		int hsize;
		int size;

		len = skb->len - offset;
		if (len > mss)
			len = mss;

		hsize = skb_headlen(skb) - offset;
		if (hsize < 0)
			hsize = 0;
		if (hsize > len || !sg)
			hsize = len;

		if (!hsize && i >= nfrags) {
			BUG_ON(fskb->len != len);

			pos += len;
			nskb = skb_clone(fskb, GFP_ATOMIC);
			fskb = fskb->next;

			if (unlikely(!nskb))
				goto err;

			hsize = skb_end_pointer(nskb) - nskb->head;
			if (skb_cow_head(nskb, doffset + headroom)) {
				kfree_skb(nskb);
				goto err;
			}

			nskb->truesize += skb_end_pointer(nskb) - nskb->head -
					  hsize;
			skb_release_head_state(nskb);
			__skb_push(nskb, doffset);
		} else {
			nskb = alloc_skb(hsize + doffset + headroom,
					 GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err;

			skb_reserve(nskb, headroom);
			__skb_put(nskb, doffset);
		}

		if (segs)
			tail->next = nskb;
		else
			segs = nskb;
		tail = nskb;

		__copy_skb_header(nskb, skb);
		nskb->mac_len = skb->mac_len;

		/* nskb and skb might have different headroom */
		if (nskb->ip_summed == CHECKSUM_PARTIAL)
			nskb->csum_start += skb_headroom(nskb) - headroom;

		skb_reset_mac_header(nskb);
		skb_set_network_header(nskb, skb->mac_len);
		nskb->transport_header = (nskb->network_header +
					  skb_network_header_len(skb));
		skb_copy_from_linear_data(skb, nskb->data, doffset);

		if (fskb != skb_shinfo(skb)->frag_list)
			continue;

		if (!sg) {
			nskb->ip_summed = CHECKSUM_NONE;
			nskb->csum = skb_copy_and_csum_bits(skb, offset,
							    skb_put(nskb, len),
							    len, 0);
			continue;
		}

		frag = skb_shinfo(nskb)->frags;

		skb_copy_from_linear_data_offset(skb, offset,
						 skb_put(nskb, hsize), hsize);

		while (pos < offset + len && i < nfrags) {
			*frag = skb_shinfo(skb)->frags[i];
			get_page(frag->page);
			size = frag->size;

			if (pos < offset) {
				frag->page_offset += offset - pos;
				frag->size -= offset - pos;
			}

			skb_shinfo(nskb)->nr_frags++;

			if (pos + size <= offset + len) {
				i++;
				pos += size;
			} else {
				frag->size -= pos + size - (offset + len);
				goto skip_fraglist;
			}

			frag++;
		}

		if (pos < offset + len) {
			struct sk_buff *fskb2 = fskb;

			BUG_ON(pos + fskb->len != offset + len);

			pos += fskb->len;
			fskb = fskb->next;

			if (fskb2->next) {
				fskb2 = skb_clone(fskb2, GFP_ATOMIC);
				if (!fskb2)
					goto err;
			} else
				skb_get(fskb2);

			SKB_FRAG_ASSERT(nskb);
			skb_shinfo(nskb)->frag_list = fskb2;
		}

skip_fraglist:
		nskb->data_len = len - hsize;
		nskb->len += nskb->data_len;
		nskb->truesize += nskb->data_len;
	} while ((offset += len) < skb->len);

	return segs;

err:
	while ((skb = segs)) {
		segs = skb->next;
		kfree_skb(skb);
	}
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);

int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff *p = *head;
	struct sk_buff *nskb;
	struct skb_shared_info *skbinfo = skb_shinfo(skb);
	struct skb_shared_info *pinfo = skb_shinfo(p);
	unsigned int headroom;
	unsigned int len = skb_gro_len(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);

	if (p->len + len >= 65536)
		return -E2BIG;

	if (pinfo->frag_list)
		goto merge;
	else if (headlen <= offset) {
		skb_frag_t *frag;
		skb_frag_t *frag2;
		int i = skbinfo->nr_frags;
		int nr_frags = pinfo->nr_frags + i;

		offset -= headlen;

		if (nr_frags > MAX_SKB_FRAGS)
			return -E2BIG;

		pinfo->nr_frags = nr_frags;
		skbinfo->nr_frags = 0;

		frag = pinfo->frags + nr_frags;
		frag2 = skbinfo->frags + i;
		do {
			*--frag = *--frag2;
		} while (--i);

		frag->page_offset += offset;
		frag->size -= offset;

		skb->truesize -= skb->data_len;
		skb->len -= skb->data_len;
		skb->data_len = 0;

		NAPI_GRO_CB(skb)->free = 1;
		goto done;
	} else if (skb_gro_len(p) != pinfo->gso_size)
		return -E2BIG;

	headroom = skb_headroom(p);
	nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
	if (unlikely(!nskb))
		return -ENOMEM;

	__copy_skb_header(nskb, p);
	nskb->mac_len = p->mac_len;

	skb_reserve(nskb, headroom);
	__skb_put(nskb, skb_gro_offset(p));

	skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
	skb_set_network_header(nskb, skb_network_offset(p));
	skb_set_transport_header(nskb, skb_transport_offset(p));

	__skb_pull(p, skb_gro_offset(p));
	memcpy(skb_mac_header(nskb), skb_mac_header(p),
	       p->data - skb_mac_header(p));

	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
	skb_shinfo(nskb)->frag_list = p;
	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
	pinfo->gso_size = 0;
	skb_header_release(p);
	nskb->prev = p;

	nskb->data_len += p->len;
	nskb->truesize += p->len;
	nskb->len += p->len;

	*head = nskb;
	nskb->next = p->next;
	p->next = NULL;

	p = nskb;

merge:
	if (offset > headlen) {
		unsigned int eat = offset - headlen;

		skbinfo->frags[0].page_offset += eat;
		skbinfo->frags[0].size -= eat;
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}

	__skb_pull(skb, offset);

	p->prev->next = skb;
	p->prev = skb;
	skb_header_release(skb);

done:
	NAPI_GRO_CB(p)->count++;
	p->data_len += len;
	p->truesize += len;
	p->len += len;

	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_gro_receive);

void __init skb_init(void)
{
	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
					      sizeof(struct sk_buff),
					      0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					      NULL);
	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
						(2*sizeof(struct sk_buff)) +
						sizeof(atomic_t),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
}

/**
 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
 * @skb: Socket buffer containing the buffers to be mapped
 * @sg: The scatter-gather list to map into
 * @offset: The offset into the buffer's contents to start mapping
 * @len: Length of buffer space to be mapped
 *
 * Fill the specified scatter-gather list with mappings/pointers into a
 * region of the buffer space attached to a socket buffer.
 */
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int elt = 0;

	if (copy > 0) {
		if (copy > len)
			copy = len;
		sg_set_buf(sg, skb->data + offset, copy);
		elt++;
		if ((len -= copy) == 0)
			return elt;
		offset += copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (copy > len)
				copy = len;
			sg_set_page(&sg[elt], frag->page, copy,
				    frag->page_offset + offset - start);
			elt++;
			if (!(len -= copy))
				return elt;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			elt += __skb_to_sgvec(frag_iter, sg + elt, offset - start,
					      copy);
			if ((len -= copy) == 0)
				return elt;
			offset += copy;
		}
		start = end;
	}
	BUG_ON(len);
	return elt;
}

int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
	int nsg = __skb_to_sgvec(skb, sg, offset, len);

	sg_mark_end(&sg[nsg - 1]);

	return nsg;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec);
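
/* Illustrative sketch (not part of the original file): mapping an skb into a
 * small on-stack scatterlist, roughly as IPsec-style callers do before handing
 * the data to the crypto layer.  EXAMPLE_MAX_SG and example_map_skb() are
 * assumptions for the example; the bound also ignores frag lists, which would
 * need more elements (and a heap-allocated scatterlist in real code).
 */
#define EXAMPLE_MAX_SG	(MAX_SKB_FRAGS + 1)

static int example_map_skb(struct sk_buff *skb)
{
	struct scatterlist sg[EXAMPLE_MAX_SG];
	int elt;

	if (skb_has_frag_list(skb))
		return -EMSGSIZE;

	sg_init_table(sg, EXAMPLE_MAX_SG);
	elt = skb_to_sgvec(skb, sg, 0, skb->len);

	/* sg[0..elt-1] now describe the linear head and every page fragment. */
	return elt;
}
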
/**
 * skb_cow_data - Check that a socket buffer's data buffers are writable
 * @skb: The socket buffer to check.
 * @tailbits: Amount of trailing space to be added
 * @trailer: Returned pointer to the skb where the @tailbits space begins
 *
 * Make sure that the data buffers attached to a socket buffer are
 * writable. If they are not, private copies are made of the data buffers
 * and the socket buffer is set to use these instead.
 *
 * If @tailbits is given, make sure that there is space to write @tailbits
 * bytes of data beyond current end of socket buffer. @trailer will be
 * set to point to the skb in which this space begins.
 *
 * The number of scatterlist elements required to completely map the
 * COW'd and extended socket buffer will be returned.
 */
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
{
	int copyflag;
	int elt;
	struct sk_buff *skb1, **skb_p;

	/* If skb is cloned or its head is paged, reallocate
	 * head pulling out all the pages (pages are considered not writable
	 * at the moment even if they are anonymous).
	 */
	if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
	    __pskb_pull_tail(skb, skb_pagelen(skb) - skb_headlen(skb)) == NULL)
		return -ENOMEM;

	/* Easy case. Most of packets will go this way. */
	if (!skb_has_frag_list(skb)) {
		/* A bit of trouble: not enough space for the trailer.
		 * This should not happen when the stack is tuned to generate
		 * good frames. OK, on miss we reallocate and reserve even more
		 * space, 128 bytes is fair. */

		if (skb_tailroom(skb) < tailbits &&
		    pskb_expand_head(skb, 0, tailbits - skb_tailroom(skb) + 128, GFP_ATOMIC))
			return -ENOMEM;

		/* Voila! */
		*trailer = skb;
		return 1;
	}

	/* Misery. We are in trouble, going to mincer fragments... */

	elt = 1;
	skb_p = &skb_shinfo(skb)->frag_list;
	copyflag = 0;

	while ((skb1 = *skb_p) != NULL) {
		int ntail = 0;

		/* The fragment is partially pulled by someone,
		 * this can happen on input. Copy it and everything
		 * after it. */

		if (skb_shared(skb1))
			copyflag = 1;

		/* If the skb is the last, worry about trailer. */

		if (skb1->next == NULL && tailbits) {
			if (skb_shinfo(skb1)->nr_frags ||
			    skb_has_frag_list(skb1) ||
			    skb_tailroom(skb1) < tailbits)
				ntail = tailbits + 128;
		}

		if (copyflag ||
		    skb_cloned(skb1) ||
		    ntail ||
		    skb_shinfo(skb1)->nr_frags ||
		    skb_has_frag_list(skb1)) {
			struct sk_buff *skb2;

			/* Fuck, we are miserable poor guys... */
			if (ntail == 0)
				skb2 = skb_copy(skb1, GFP_ATOMIC);
			else
				skb2 = skb_copy_expand(skb1,
						       skb_headroom(skb1),
						       ntail,
						       GFP_ATOMIC);
			if (unlikely(skb2 == NULL))
				return -ENOMEM;

			if (skb1->sk)
				skb_set_owner_w(skb2, skb1->sk);

			/* Looking around. Are we still alive?
			 * OK, link new skb, drop old one */

			skb2->next = skb1->next;
			*skb_p = skb2;
			kfree_skb(skb1);
			skb1 = skb2;
		}
		elt++;
		*trailer = skb1;
		skb_p = &skb1->next;
	}

	return elt;
}
EXPORT_SYMBOL_GPL(skb_cow_data);

static void sock_rmem_free(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}

/*
 * Note: We don't mem charge error packets (no sk_forward_alloc changes)
 */
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		return -ENOMEM;

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = sock_rmem_free;
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);

	/* before exiting rcu section, make sure dst is refcounted */
	skb_dst_force(skb);

	skb_queue_tail(&sk->sk_error_queue, skb);
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb->len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);

void skb_tstamp_tx(struct sk_buff *orig_skb,
		   struct skb_shared_hwtstamps *hwtstamps)
{
	struct sock *sk = orig_skb->sk;
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int err;

	if (!sk)
		return;

	skb = skb_clone(orig_skb, GFP_ATOMIC);
	if (!skb)
		return;

	if (hwtstamps) {
		*skb_hwtstamps(skb) = *hwtstamps;
	} else {
		/*
		 * no hardware time stamps available,
		 * so keep the shared tx_flags and only
		 * store software time stamp
		 */
		skb->tstamp = ktime_get_real();
	}

	serr = SKB_EXT_ERR(skb);
	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = ENOMSG;
	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;

	err = sock_queue_err_skb(sk, skb);

	if (err)
		kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);


/**
 * skb_partial_csum_set - set up and verify partial csum values for packet
 * @skb: the skb to set
 * @start: the number of bytes after skb->data to start checksumming.
 * @off: the offset from start to place the checksum.
 *
 * For untrusted partially-checksummed packets, we need to make sure the values
 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
 *
 * This function checks and sets those values and skb->ip_summed: if this
 * returns false you should drop the packet.
 */
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
	if (unlikely(start > skb_headlen(skb)) ||
	    unlikely((int)start + off > skb_headlen(skb) - 2)) {
		if (net_ratelimit())
			printk(KERN_WARNING
			       "bad partial csum: csum=%u/%u len=%u\n",
			       start, off, skb_headlen(skb));
		return false;
	}
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum_start = skb_headroom(skb) + start;
	skb->csum_offset = off;
	return true;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);

void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
	if (net_ratelimit())
		pr_warning("%s: received packets cannot be forwarded"
			   " while LRO is enabled\n", skb->dev->name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);
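
/* Illustrative sketch (not part of the original file): how a virtio-net style
 * driver might validate checksum offsets supplied by an untrusted peer before
 * handing the skb up the stack.  struct example_csum_hdr and example_rx_fixup()
 * are hypothetical; real drivers read the offsets from their own
 * device-specific header.
 */
struct example_csum_hdr {
	u16 csum_start;		/* bytes after skb->data where checksumming starts */
	u16 csum_offset;	/* where to store the csum, relative to csum_start */
};

static bool example_rx_fixup(struct sk_buff *skb,
			     const struct example_csum_hdr *hdr)
{
	/* Returns false if the offsets do not fit inside the linear header,
	 * in which case the caller should drop the packet.
	 */
	return skb_partial_csum_set(skb, hdr->csum_start, hdr->csum_offset);
}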