/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *		Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/xfrm.h>

#include <asm/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>

struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;

static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	put_page(buf->page);
}

static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	get_page(buf->page);
}

static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}


/* Pipe buffer operations for a socket. */
static const struct pipe_buf_operations sock_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = sock_pipe_buf_release,
	.steal = sock_pipe_buf_steal,
	.get = sock_pipe_buf_get,
};

/*
 *	Keep out-of-line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always
 *	reliable.
 */

/**
 *	skb_over_panic	-	private function
 *	@skb: buffer
 *	@sz: size
 *	@here: address
 *
 *	Out of line support code for skb_put(). Not user callable.
 */
static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
{
	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
		 __func__, here, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

/**
 *	skb_under_panic	-	private function
 *	@skb: buffer
 *	@sz: size
 *	@here: address
 *
 *	Out of line support code for skb_push(). Not user callable.
 */

static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
{
	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
		 __func__, here, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

/*	Allocate a new skbuff. We do this ourselves so we can fill in a few
 *	'private' fields and also do memory statistics to find all the
 *	[BEEP] leaks.
 *
 */

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@fclone: allocate from fclone cache instead of head cache
 *		and allocate a cloned (child) skb
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of size bytes. The object has a reference count of one.
 *	The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int fclone, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;

	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_node_track_caller(size, gfp_mask, node);
	if (!data)
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(ksize(data));
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* Account for allocated memory : skb + skb->head */
	skb->truesize = SKB_TRUESIZE(size);
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->mac_header = ~0U;
#endif

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	if (fclone) {
		struct sk_buff *child = skb + 1;
		atomic_t *fclone_ref = (atomic_t *) (child + 1);

		kmemcheck_annotate_bitfield(child, flags1);
		kmemcheck_annotate_bitfield(child, flags2);
		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(fclone_ref, 1);

		child->fclone = SKB_FCLONE_UNAVAILABLE;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 * build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of fragment, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc().
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only the data buffer where NIC puts the
 *  incoming frame.
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)).
 *  After IO, driver calls build_skb() to allocate the sk_buff and populate it
 *  before giving the packet to the stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	unsigned int size = frag_size ? : ksize(data);

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (!skb)
		return NULL;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->truesize = SKB_TRUESIZE(size);
	skb->head_frag = frag_size != 0;
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->mac_header = ~0U;
#endif

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	return skb;
}
EXPORT_SYMBOL(build_skb);

struct netdev_alloc_cache {
	struct page *page;
	unsigned int offset;
};
static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
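 *
 * Editorial usage sketch (an assumed driver RX pattern, not part of the
 * original file; buf_len is hypothetical). The fragment is sized the same
 * way __netdev_alloc_skb() below sizes it, then handed to build_skb():
 *
 *	unsigned int sz = SKB_DATA_ALIGN(buf_len + NET_SKB_PAD) +
 *			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	void *data = netdev_alloc_frag(sz);
 *
 *	if (data) {
 *		struct sk_buff *skb = build_skb(data, sz);
 *
 *		if (unlikely(!skb))
 *			put_page(virt_to_head_page(data));
 *	}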
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
	struct netdev_alloc_cache *nc;
	void *data = NULL;
	unsigned long flags;

	local_irq_save(flags);
	nc = &__get_cpu_var(netdev_alloc_cache);
	if (unlikely(!nc->page)) {
refill:
		nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD);
		nc->offset = 0;
	}
	if (likely(nc->page)) {
		if (nc->offset + fragsz > PAGE_SIZE) {
			put_page(nc->page);
			goto refill;
		}
		data = page_address(nc->page) + nc->offset;
		nc->offset += fragsz;
		get_page(nc->page);
	}
	local_irq_restore(flags);
	return data;
}
EXPORT_SYMBOL(netdev_alloc_frag);

/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@length: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
				   unsigned int length, gfp_t gfp_mask)
{
	struct sk_buff *skb = NULL;
	unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	if (fragsz <= PAGE_SIZE && !(gfp_mask & __GFP_WAIT)) {
		void *data = netdev_alloc_frag(fragsz);

		if (likely(data)) {
			skb = build_skb(data, fragsz);
			if (unlikely(!skb))
				put_page(virt_to_head_page(data));
		}
	} else {
		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
	}
	if (likely(skb)) {
		skb_reserve(skb, NET_SKB_PAD);
		skb->dev = dev;
	}
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	struct sk_buff *list = *listp;

	*listp = NULL;

	do {
		struct sk_buff *this = list;
		list = list->next;
		kfree_skb(this);
	} while (list);
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
	if (skb->head_frag)
		put_page(virt_to_head_page(skb->head));
	else
		kfree(skb->head);
}

static void skb_release_data(struct sk_buff *skb)
{
	if (!skb->cloned ||
	    !atomic_sub_return(skb->nohdr ?
			       (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			       &skb_shinfo(skb)->dataref)) {
		if (skb_shinfo(skb)->nr_frags) {
			int i;
			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
				skb_frag_unref(skb, i);
		}

		/*
		 * If the skb buf is from userspace, we need to notify the
		 * caller that the lower device DMA is done;
		 */
		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
			struct ubuf_info *uarg;

			uarg = skb_shinfo(skb)->destructor_arg;
			if (uarg->callback)
				uarg->callback(uarg);
		}

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);

		skb_free_head(skb);
	}
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff *other;
	atomic_t *fclone_ref;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		break;

	case SKB_FCLONE_ORIG:
		fclone_ref = (atomic_t *) (skb + 2);
		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, skb);
		break;

	case SKB_FCLONE_CLONE:
		fclone_ref = (atomic_t *) (skb + 1);
		other = skb - 1;

		/* The clone portion is available for
		 * fast-cloning again.
		 */
		skb->fclone = SKB_FCLONE_UNAVAILABLE;

		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, other);
		break;
	}
}

static void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
#ifdef CONFIG_XFRM
	secpath_put(skb->sp);
#endif
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb->nfct);
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
	nf_conntrack_put_reasm(skb->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(skb->nf_bridge);
#endif
/* XXX: IS this still necessary? - JHS */
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
#endif
#endif
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	skb_release_data(skb);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb().
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 *	kfree_skb - free an sk_buff
 *	@skb: buffer to free
 *
 *	Drop a reference to the buffer and free it if the usage count has
 *	hit zero.
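 *
 *	Editorial usage sketch (an assumed caller-side pattern, not part of
 *	the original file): code that keeps its own reference pairs
 *	skb_get() with a later kfree_skb(); the buffer is only freed once
 *	the usage count reaches zero:
 *
 *		skb_get(skb);
 *		...
 *		kfree_skb(skb);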
 */
void kfree_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);

/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero.
 *	Functions identically to kfree_skb, but kfree_skb assumes that the
 *	frame is being dropped after a failure and notes that.
 */
void consume_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);

/**
 *	skb_recycle - clean up an skb for reuse
 *	@skb: buffer
 *
 *	Recycles the skb to be reused as a receive buffer. This
 *	function does any necessary reference count dropping, and
 *	cleans up the skbuff as if it just came from __alloc_skb().
 */
void skb_recycle(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;

	skb_release_head_state(skb);

	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->data = skb->head + NET_SKB_PAD;
	skb_reset_tail_pointer(skb);
}
EXPORT_SYMBOL(skb_recycle);

/**
 *	skb_recycle_check - check if skb can be reused for receive
 *	@skb: buffer
 *	@skb_size: minimum receive buffer size
 *
 *	Checks that the skb passed in is not shared or cloned, and
 *	that it is linear and its head portion at least as large as
 *	skb_size so that it can be recycled as a receive buffer.
 *	If these conditions are met, this function does any necessary
 *	reference count dropping and cleans up the skbuff as if it
 *	just came from __alloc_skb().
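 *
 *	Editorial usage sketch (an assumed driver TX-completion pattern; the
 *	refill helper and the rx_buffer_size field are hypothetical, not
 *	part of the original file):
 *
 *		if (skb_recycle_check(skb, priv->rx_buffer_size))
 *			my_driver_refill_rx_ring(priv, skb);
 *		else
 *			dev_kfree_skb_any(skb);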
 */
bool skb_recycle_check(struct sk_buff *skb, int skb_size)
{
	if (!skb_is_recycleable(skb, skb_size))
		return false;

	skb_recycle(skb);

	return true;
}
EXPORT_SYMBOL(skb_recycle_check);

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	new->dev = old->dev;
	new->transport_header = old->transport_header;
	new->network_header = old->network_header;
	new->mac_header = old->mac_header;
	skb_dst_copy(new, old);
	new->rxhash = old->rxhash;
	new->ooo_okay = old->ooo_okay;
	new->l4_rxhash = old->l4_rxhash;
	new->no_fcs = old->no_fcs;
#ifdef CONFIG_XFRM
	new->sp = secpath_get(old->sp);
#endif
	memcpy(new->cb, old->cb, sizeof(old->cb));
	new->csum = old->csum;
	new->local_df = old->local_df;
	new->pkt_type = old->pkt_type;
	new->ip_summed = old->ip_summed;
	skb_copy_queue_mapping(new, old);
	new->priority = old->priority;
#if IS_ENABLED(CONFIG_IP_VS)
	new->ipvs_property = old->ipvs_property;
#endif
	new->protocol = old->protocol;
	new->mark = old->mark;
	new->skb_iif = old->skb_iif;
	__nf_copy(new, old);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	new->nf_trace = old->nf_trace;
#endif
#ifdef CONFIG_NET_SCHED
	new->tc_index = old->tc_index;
#ifdef CONFIG_NET_CLS_ACT
	new->tc_verd = old->tc_verd;
#endif
#endif
	new->vlan_tci = old->vlan_tci;

	skb_copy_secmark(new, old);
}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	atomic_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 *	skb_morph - morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
 *	@skb: the skb to modify
 *	@gfp_mask: allocation priority
 *
 *	This must be called on SKBTX_DEV_ZEROCOPY skb.
 *	It will copy all frags into kernel and drop the reference
 *	to userspace pages.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 *
 *	Returns 0 on success or a negative error code on failure
 *	to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int i;
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;

	for (i = 0; i < num_frags; i++) {
		u8 *vaddr;
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

		page = alloc_page(GFP_ATOMIC);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)head->private;
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		vaddr = kmap_atomic(skb_frag_page(f));
		memcpy(page_address(page),
		       vaddr + f->page_offset, skb_frag_size(f));
		kunmap_atomic(vaddr);
		page->private = (unsigned long)head;
		head = page;
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		skb_frag_unref(skb, i);

	uarg->callback(uarg);

	/* skb frags point to kernel buffers */
	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
		__skb_fill_page_desc(skb, i - 1, head, 0,
				     skb_shinfo(skb)->frags[i - 1].size);
		head = (struct page *)head->private;
	}

	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
	return 0;
}


/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *	copies share the same packet data but not structure. The new
 *	buffer has a reference count of 1. If the allocation fails the
 *	function returns %NULL otherwise the new buffer is returned.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff *n;

	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		if (skb_copy_ubufs(skb, gfp_mask))
			return NULL;
	}

	n = skb + 1;
	if (skb->fclone == SKB_FCLONE_ORIG &&
	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
		atomic_t *fclone_ref = (atomic_t *) (n + 1);
		n->fclone = SKB_FCLONE_CLONE;
		atomic_inc(fclone_ref);
	} else {
		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;

		kmemcheck_annotate_bitfield(n, flags1);
		kmemcheck_annotate_bitfield(n, flags2);
		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);

static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
#ifndef NET_SKBUFF_DATA_USES_OFFSET
	/*
	 *	Shift between the two data areas in bytes
	 */
	unsigned long offset = new->data - old->data;
#endif

	__copy_skb_header(new, old);

#ifndef NET_SKBUFF_DATA_USES_OFFSET
	/* {transport,network,mac}_header are relative to skb->head */
	new->transport_header += offset;
	new->network_header += offset;
	if (skb_mac_header_was_set(new))
		new->mac_header += offset;
#endif
	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}

/**
 *	skb_copy	-	create private copy of an sk_buff
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data. This is used when the
 *	caller wishes to modify the data and needs a private copy of the
 *	data to alter. Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	As by-product this function converts non-linear &sk_buff to linear
 *	one, so that &sk_buff becomes completely private and caller is allowed
 *	to modify all the data of returned buffer. This means that this
 *	function is not recommended for use in circumstances when only
 *	header is going to be modified. Use pskb_copy() instead.
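 *
 *	Editorial usage sketch (an assumed caller pattern, not part of the
 *	original file): taking a private, writable, linear copy before
 *	rewriting packet contents in place:
 *
 *		struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 *
 *		if (nskb) {
 *			... modify nskb->data freely ...
 *			kfree_skb(nskb);
 *		}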
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
	int headerlen = skb_headroom(skb);
	unsigned int size = skb_end_offset(skb) + skb->data_len;
	struct sk_buff *n = alloc_skb(size, gfp_mask);

	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
		BUG();

	copy_skb_header(n, skb);
	return n;
}
EXPORT_SYMBOL(skb_copy);

/**
 *	__pskb_copy	-	create copy of an sk_buff with private head.
 *	@skb: buffer to copy
 *	@headroom: headroom of new skb
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and part of its data, located
 *	in header. Fragmented data remain shared. This is used when
 *	the caller wishes to modify only header of &sk_buff and needs
 *	private copy of the header to alter. Returns %NULL on failure
 *	or the pointer to the buffer on success.
 *	The returned buffer has a reference count of 1.
 */

struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
{
	unsigned int size = skb_headlen(skb) + headroom;
	struct sk_buff *n = alloc_skb(size, gfp_mask);

	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, headroom);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);

	n->truesize += skb->data_len;
	n->data_len = skb->data_len;
	n->len = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
			if (skb_copy_ubufs(skb, gfp_mask)) {
				kfree_skb(n);
				n = NULL;
				goto out;
			}
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			skb_frag_ref(skb, i);
		}
		skb_shinfo(n)->nr_frags = i;
	}

	if (skb_has_frag_list(skb)) {
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	copy_skb_header(n, skb);
out:
	return n;
}
EXPORT_SYMBOL(__pskb_copy);

/**
 *	pskb_expand_head - reallocate header of &sk_buff
 *	@skb: buffer to reallocate
 *	@nhead: room to add at head
 *	@ntail: room to add at tail
 *	@gfp_mask: allocation priority
 *
 *	Expands (or creates identical copy, if @nhead and @ntail are zero)
 *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have
 *	reference count of 1. Returns zero in the case of success, or a
 *	negative error if expansion failed. In the latter case, &sk_buff
 *	is not changed.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
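 *
 *	Editorial usage sketch (an assumed caller pattern; needed, iph and
 *	the drop label are hypothetical, not part of the original file):
 *	make room for an extra header, then re-read any cached pointers
 *	into the header area:
 *
 *		if (skb_headroom(skb) < needed &&
 *		    pskb_expand_head(skb, SKB_DATA_ALIGN(needed), 0,
 *				     GFP_ATOMIC))
 *			goto drop;
 *		iph = ip_hdr(skb);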
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		     gfp_t gfp_mask)
{
	int i;
	u8 *data;
	int size = nhead + skb_end_offset(skb) + ntail;
	long off;

	BUG_ON(nhead < 0);

	if (skb_shared(skb))
		BUG();

	size = SKB_DATA_ALIGN(size);

	data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
		       gfp_mask);
	if (!data)
		goto nodata;
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void.
	 */
	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

	/*
	 * if shinfo is shared we must drop the old head gracefully, but if it
	 * is not we can just drop the old head and let the existing refcount
	 * be since all we did is relocate the values
	 */
	if (skb_cloned(skb)) {
		/* copy this zero copy skb frags */
		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
			if (skb_copy_ubufs(skb, gfp_mask))
				goto nofrags;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);

		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
	} else {
		skb_free_head(skb);
	}
	off = (data + nhead) - skb->head;

	skb->head = data;
	skb->head_frag = 0;
	skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end = size;
	off = nhead;
#else
	skb->end = skb->head + size;
#endif
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->tail += off;
	skb->transport_header += off;
	skb->network_header += off;
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += nhead;
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make private copy of skb with writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/**
 *	skb_copy_expand	-	copy and expand sk_buff
 *	@skb: buffer to copy
 *	@newheadroom: new free bytes at head
 *	@newtailroom: new free bytes at tail
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data and while doing so
 *	allocate additional space.
 *
 *	This is used when the caller wishes to modify the data and needs a
 *	private copy of the data to alter as well as more space for new fields.
 *	Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
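 *
 *	Editorial usage sketch (an assumed encapsulation pattern; hdr and
 *	hdr_len are hypothetical, not part of the original file): copy a
 *	packet while reserving room for an extra header to be pushed later:
 *
 *		struct sk_buff *nskb;
 *
 *		nskb = skb_copy_expand(skb, skb_headroom(skb) + hdr_len,
 *				       skb_tailroom(skb), GFP_ATOMIC);
 *		if (nskb)
 *			memcpy(skb_push(nskb, hdr_len), hdr, hdr_len);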
 *
 *	You must pass %GFP_ATOMIC as the allocation priority if this function
 *	is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
				int newheadroom, int newtailroom,
				gfp_t gfp_mask)
{
	/*
	 *	Allocate the copy buffer
	 */
	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
				      gfp_mask);
	int oldheadroom = skb_headroom(skb);
	int head_copy_len, head_copy_off;
	int off;

	if (!n)
		return NULL;

	skb_reserve(n, newheadroom);

	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	head_copy_len = oldheadroom;
	head_copy_off = 0;
	if (newheadroom <= head_copy_len)
		head_copy_len = newheadroom;
	else
		head_copy_off = newheadroom - head_copy_len;

	/* Copy the linear header and data. */
	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
			  skb->len + head_copy_len))
		BUG();

	copy_skb_header(n, skb);

	off = newheadroom - oldheadroom;
	if (n->ip_summed == CHECKSUM_PARTIAL)
		n->csum_start += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	n->transport_header += off;
	n->network_header += off;
	if (skb_mac_header_was_set(skb))
		n->mac_header += off;
#endif

	return n;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *	skb_pad	-	zero pad the tail of an skb
 *	@skb: buffer to pad
 *	@pad: space to pad
 *
 *	Ensure that a buffer is followed by a padding area that is zero
 *	filled. Used by network drivers which may DMA or transfer data
 *	beyond the buffer end onto the wire.
 *
 *	May return error in out of memory cases. The skb is freed on error.
 */

int skb_pad(struct sk_buff *skb, int pad)
{
	int err;
	int ntail;

	/* If the skbuff is non linear tailroom is always zero.. */
	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
		memset(skb->data + skb->len, 0, pad);
		return 0;
	}

	ntail = skb->data_len + pad - (skb->end - skb->tail);
	if (likely(skb_cloned(skb) || ntail > 0)) {
		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
		if (unlikely(err))
			goto free_skb;
	}

	/* FIXME: The use of this function with non-linear skb's really needs
	 * to be audited.
	 */
	err = skb_linearize(skb);
	if (unlikely(err))
		goto free_skb;

	memset(skb->data + skb->len, 0, pad);
	return 0;

free_skb:
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL(skb_pad);

/**
 *	skb_put - add data to a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	This function extends the used data area of the buffer. If this would
 *	exceed the total buffer size the kernel will panic. A pointer to the
 *	first byte of the extra data is returned.
 */
unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
{
	unsigned char *tmp = skb_tail_pointer(skb);
	SKB_LINEAR_ASSERT(skb);
	skb->tail += len;
	skb->len += len;
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));
	return tmp;
}
EXPORT_SYMBOL(skb_put);

/**
 *	skb_push - add data to the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	This function extends the used data area of the buffer at the buffer
 *	start. If this would exceed the total buffer headroom the kernel will
 *	panic. A pointer to the first byte of the extra data is returned.
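 *
 *	Editorial usage sketch (an assumed header-building pattern; hdr,
 *	hdr_len and the drop label are hypothetical, not part of the
 *	original file): make sure the headroom exists and is writable,
 *	then prepend the header:
 *
 *		if (skb_cow_head(skb, hdr_len))
 *			goto drop;
 *		memcpy(skb_push(skb, hdr_len), hdr, hdr_len);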
 */
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len += len;
	if (unlikely(skb->data < skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	This function removes data from the start of a buffer, returning
 *	the memory to the headroom. A pointer to the next data in the buffer
 *	is returned. Once the data has been pulled future pushes will overwrite
 *	the old data.
 */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
{
	return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 *	skb_trim - remove end from a buffer
 *	@skb: buffer to alter
 *	@len: new length
 *
 *	Cut the length of a buffer down by removing data from the tail. If
 *	the buffer is already under the length specified it is not modified.
 *	The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	if (skb->len > len)
		__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
	struct sk_buff **fragp;
	struct sk_buff *frag;
	int offset = skb_headlen(skb);
	int nfrags = skb_shinfo(skb)->nr_frags;
	int i;
	int err;

	if (skb_cloned(skb) &&
	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
		return err;

	i = 0;
	if (offset >= len)
		goto drop_pages;

	for (; i < nfrags; i++) {
		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (end < len) {
			offset = end;
			continue;
		}

		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
		skb_shinfo(skb)->nr_frags = i;

		for (; i < nfrags; i++)
			skb_frag_unref(skb, i);

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);
		goto done;
	}

	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
	     fragp = &frag->next) {
		int end = offset + frag->len;

		if (skb_shared(frag)) {
			struct sk_buff *nfrag;

			nfrag = skb_clone(frag, GFP_ATOMIC);
			if (unlikely(!nfrag))
				return -ENOMEM;

			nfrag->next = frag->next;
			consume_skb(frag);
			frag = nfrag;
			*fragp = frag;
		}

		if (end < len) {
			offset = end;
			continue;
		}

		if (end > len &&
		    unlikely((err = pskb_trim(frag, len - offset))))
			return err;

		if (frag->next)
			skb_drop_list(&frag->next);
		break;
	}

done:
	if (len > skb_headlen(skb)) {
		skb->data_len -= skb->len - len;
		skb->len = len;
	} else {
		skb->len = len;
		skb->data_len = 0;
		skb_set_tail_pointer(skb, len);
	}

	return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/**
 *	__pskb_pull_tail - advance tail of skb header
 *	@skb: buffer to reallocate
 *	@delta: number of bytes to advance tail
 *
 *	The function makes sense only on a fragmented &sk_buff,
 *	it expands header moving its tail forward and copying necessary
 *	data from fragmented part.
 *
 *	&sk_buff MUST have reference count of 1.
 *
 *	Returns %NULL (and &sk_buff does not change) if pull failed
 *	or value of new tail of skb in the case of success.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
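 *
 *	Editorial usage sketch (an assumed caller pattern; iph and the drop
 *	label are hypothetical, not part of the original file): callers
 *	normally reach this function through pskb_may_pull() before touching
 *	protocol headers in skb->data:
 *
 *		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 *			goto drop;
 *		iph = ip_hdr(skb);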
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
	/* If skb has not enough free space at tail, get new one
	 * plus 128 bytes for future expansions. If we have enough
	 * room at tail, reallocate without expansion only if skb is cloned.
	 */
	int i, k, eat = (skb->tail + delta) - skb->end;

	if (eat > 0 || skb_cloned(skb)) {
		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
				     GFP_ATOMIC))
			return NULL;
	}

	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
		BUG();

	/* Optimization: no fragments, no reasons to preestimate
	 * size of pulled pages. Superb.
	 */
	if (!skb_has_frag_list(skb))
		goto pull_pages;

	/* Estimate size of pulled pages. */
	eat = delta;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (size >= eat)
			goto pull_pages;
		eat -= size;
	}

	/* If we need to update the frag list, we are in trouble.
	 * Certainly, it is possible to add an offset to skb data,
	 * but taking into account that pulling is expected to
	 * be a very rare operation, it is worth fighting against
	 * further bloating skb head and crucify ourselves here instead.
	 * Pure masochism, indeed. 8)8)
	 */
	if (eat) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;
		struct sk_buff *clone = NULL;
		struct sk_buff *insp = NULL;

		do {
			BUG_ON(!list);

			if (list->len <= eat) {
				/* Eaten as whole. */
				eat -= list->len;
				list = list->next;
				insp = list;
			} else {
				/* Eaten partially. */

				if (skb_shared(list)) {
					/* Sucks! We need to fork list. :-( */
					clone = skb_clone(list, GFP_ATOMIC);
					if (!clone)
						return NULL;
					insp = list->next;
					list = clone;
				} else {
					/* This may be pulled without
					 * problems. */
					insp = list;
				}
				if (!pskb_pull(list, eat)) {
					kfree_skb(clone);
					return NULL;
				}
				break;
			}
		} while (eat);

		/* Free pulled out fragments. */
		while ((list = skb_shinfo(skb)->frag_list) != insp) {
			skb_shinfo(skb)->frag_list = list->next;
			kfree_skb(list);
		}
		/* And insert new clone at head. */
		if (clone) {
			clone->next = list;
			skb_shinfo(skb)->frag_list = clone;
		}
	}
	/* Success! Now we may commit changes to skb data.
	 */

pull_pages:
	eat = delta;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (size <= eat) {
			skb_frag_unref(skb, i);
			eat -= size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb->tail += delta;
	skb->data_len -= delta;

	return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/**
 *	skb_copy_bits - copy bits from skb to kernel buffer
 *	@skb: source skb
 *	@offset: offset in source
 *	@to: destination buffer
 *	@len: number of bytes to copy
 *
 *	Copy the specified number of bytes from the source skb to the
 *	destination buffer.
 *
 *	CAUTION ! :
 *		If its prototype is ever changed,
 *		check arch/{*}/net/{*}.S files,
 *		since it is called from BPF assembly code.
 */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
	int start = skb_headlen(skb);
	struct sk_buff *frag_iter;
	int i, copy;

	if (offset > (int)skb->len - len)
		goto fault;

	/* Copy header. */
	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		skb_copy_from_linear_data_offset(skb, offset, to, copy);
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(f);
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			vaddr = kmap_atomic(skb_frag_page(f));
			memcpy(to,
			       vaddr + f->page_offset + offset - start,
			       copy);
			kunmap_atomic(vaddr);

			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_bits(frag_iter, offset - start, to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to += copy;
		}
		start = end;
	}

	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);

/*
 * Callback from splice_to_pipe(), if we need to release some pages
 * at the end of the spd in case we error'ed out in filling the pipe.
 */
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
	put_page(spd->pages[i]);
}

static struct page *linear_to_page(struct page *page, unsigned int *len,
				   unsigned int *offset,
				   struct sk_buff *skb, struct sock *sk)
{
	struct page *p = sk->sk_sndmsg_page;
	unsigned int off;

	if (!p) {
new_page:
		p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
		if (!p)
			return NULL;

		off = sk->sk_sndmsg_off = 0;
		/* hold one ref to this page until it's full */
	} else {
		unsigned int mlen;

		/* If we are the only user of the page, we can reset offset */
		if (page_count(p) == 1)
			sk->sk_sndmsg_off = 0;
		off = sk->sk_sndmsg_off;
		mlen = PAGE_SIZE - off;
		if (mlen < 64 && mlen < *len) {
			put_page(p);
			goto new_page;
		}

		*len = min_t(unsigned int, *len, mlen);
	}

	memcpy(page_address(p) + off, page_address(page) + *offset, *len);
	sk->sk_sndmsg_off += *len;
	*offset = off;

	return p;
}

static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
			     struct page *page,
			     unsigned int offset)
{
	return	spd->nr_pages &&
		spd->pages[spd->nr_pages - 1] == page &&
		(spd->partial[spd->nr_pages - 1].offset +
		 spd->partial[spd->nr_pages - 1].len == offset);
}

/*
 * Fill page/offset/length into spd, if it can hold more pages.
 */
static bool spd_fill_page(struct splice_pipe_desc *spd,
			  struct pipe_inode_info *pipe, struct page *page,
			  unsigned int *len, unsigned int offset,
			  struct sk_buff *skb, bool linear,
			  struct sock *sk)
{
	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
		return true;

	if (linear) {
		page = linear_to_page(page, len, &offset, skb, sk);
		if (!page)
			return true;
	}
	if (spd_can_coalesce(spd, page, offset)) {
		spd->partial[spd->nr_pages - 1].len += *len;
		return false;
	}
	get_page(page);
	spd->pages[spd->nr_pages] = page;
	spd->partial[spd->nr_pages].len = *len;
	spd->partial[spd->nr_pages].offset = offset;
	spd->nr_pages++;

	return false;
}

static inline void __segment_seek(struct page **page, unsigned int *poff,
				  unsigned int *plen, unsigned int off)
{
	unsigned long n;

	*poff += off;
	n = *poff / PAGE_SIZE;
	if (n)
		*page = nth_page(*page, n);

	*poff = *poff % PAGE_SIZE;
	*plen -= off;
}

static bool __splice_segment(struct page *page, unsigned int poff,
			     unsigned int plen, unsigned int *off,
			     unsigned int *len, struct sk_buff *skb,
			     struct splice_pipe_desc *spd, bool linear,
			     struct sock *sk,
			     struct pipe_inode_info *pipe)
{
	if (!*len)
		return true;

	/* skip this segment if already processed */
	if (*off >= plen) {
		*off -= plen;
		return false;
	}

	/* ignore any bits we already processed */
	if (*off) {
		__segment_seek(&page, &poff, &plen, *off);
		*off = 0;
	}

	do {
		unsigned int flen = min(*len, plen);

		/* the linear region may spread across several pages */
		flen = min_t(unsigned int, flen, PAGE_SIZE - poff);

		if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
			return true;

		__segment_seek(&page, &poff, &plen, flen);
		*len -= flen;

	} while (*len && plen);

	return false;
}

/*
 * Map linear and fragment data from the skb to spd. It reports true if the
 * pipe is full or if we already spliced the requested length.
 */
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
			      unsigned int *offset, unsigned int *len,
			      struct splice_pipe_desc *spd, struct sock *sk)
{
	int seg;

	/* map the linear part :
	 * If skb->head_frag is set, this 'linear' part is backed by a
	 * fragment, and if the head is not shared with any clones then
	 * we can avoid a copy since we own the head portion of this page.
	 */
	if (__splice_segment(virt_to_page(skb->data),
			     (unsigned long) skb->data & (PAGE_SIZE - 1),
			     skb_headlen(skb),
			     offset, len, skb, spd,
			     skb_head_is_locked(skb),
			     sk, pipe))
		return true;

	/*
	 * then map the fragments
	 */
	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];

		if (__splice_segment(skb_frag_page(f),
				     f->page_offset, skb_frag_size(f),
				     offset, len, skb, spd, false, sk, pipe))
			return true;
	}

	return false;
}

/*
 * Map data from the skb to a pipe. Should handle both the linear part,
 * the fragments, and the frag list. It does NOT handle frag lists within
 * the frag list, if such a thing exists. We'd probably need to recurse to
 * handle that cleanly.
 */
int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
		    struct pipe_inode_info *pipe, unsigned int tlen,
		    unsigned int flags)
{
	struct partial_page partial[MAX_SKB_FRAGS];
	struct page *pages[MAX_SKB_FRAGS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &sock_pipe_buf_ops,
		.spd_release = sock_spd_release,
	};
	struct sk_buff *frag_iter;
	struct sock *sk = skb->sk;
	int ret = 0;

	/*
	 * __skb_splice_bits() only fails if the output has no room left,
	 * so no point in going over the frag_list for the error case.
	 */
	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
		goto done;
	else if (!tlen)
		goto done;

	/*
	 * now see if we have a frag_list to map
	 */
	skb_walk_frags(skb, frag_iter) {
		if (!tlen)
			break;
		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
			break;
	}

done:
	if (spd.nr_pages) {
		/*
		 * Drop the socket lock, otherwise we have reverse
		 * locking dependencies between sk_lock and i_mutex
		 * here as compared to sendfile(). We enter here
		 * with the socket lock held, and splice_to_pipe() will
		 * grab the pipe inode lock. For sendfile() emulation,
		 * we call into ->sendpage() with the i_mutex lock held
		 * and networking will grab the socket lock.
		 */
		release_sock(sk);
		ret = splice_to_pipe(pipe, &spd);
		lock_sock(sk);
	}

	return ret;
}

/**
 *	skb_store_bits - store bits from kernel buffer to skb
 *	@skb: destination buffer
 *	@offset: offset in destination
 *	@from: source buffer
 *	@len: number of bytes to copy
 *
 *	Copy the specified number of bytes from the source buffer to the
 *	destination skb. This function handles all the messy bits of
 *	traversing fragment lists and such.
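 *
 *	Editorial usage sketch (an assumed caller pattern; field_off,
 *	new_val and the fault label are hypothetical, not part of the
 *	original file): the write counterpart of skb_copy_bits(), e.g.
 *	patching a field that may live in the linear area or in a fragment:
 *
 *		if (skb_store_bits(skb, field_off, &new_val, sizeof(new_val)))
 *			goto fault;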
 */

int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
	int start = skb_headlen(skb);
	struct sk_buff *frag_iter;
	int i, copy;

	if (offset > (int)skb->len - len)
		goto fault;

	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		skb_copy_to_linear_data_offset(skb, offset, from, copy);
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		from += copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			vaddr = kmap_atomic(skb_frag_page(frag));
			memcpy(vaddr + frag->page_offset + offset - start,
			       from, copy);
			kunmap_atomic(vaddr);

			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_store_bits(frag_iter, offset - start,
					   from, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_store_bits);

/* Checksum skb data. */

__wsum skb_checksum(const struct sk_buff *skb, int offset,
		    int len, __wsum csum)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int pos = 0;

	/* Checksum header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		csum = csum_partial(skb->data + offset, copy, csum);
		if ((len -= copy) == 0)
			return csum;
		offset += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			u8 *vaddr;

			if (copy > len)
				copy = len;
			vaddr = kmap_atomic(skb_frag_page(frag));
			csum2 = csum_partial(vaddr + frag->page_offset +
					     offset - start, copy, 0);
			kunmap_atomic(vaddr);
			csum = csum_block_add(csum, csum2, pos);
			if (!(len -= copy))
				return csum;
			offset += copy;
			pos += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			if (copy > len)
				copy = len;
			csum2 = skb_checksum(frag_iter, offset - start,
					     copy, 0);
			csum = csum_block_add(csum, csum2, pos);
			if ((len -= copy) == 0)
				return csum;
			offset += copy;
			pos += copy;
		}
		start = end;
	}
	BUG_ON(len);

	return csum;
}
EXPORT_SYMBOL(skb_checksum);

/* Both of above in one bottle.
 */

__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
			      u8 *to, int len, __wsum csum)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	int pos = 0;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		csum = csum_partial_copy_nocheck(skb->data + offset, to,
						 copy, csum);
		if ((len -= copy) == 0)
			return csum;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (copy > len)
				copy = len;
			vaddr = kmap_atomic(skb_frag_page(frag));
			csum2 = csum_partial_copy_nocheck(vaddr +
							  frag->page_offset +
							  offset - start, to,
							  copy, 0);
			kunmap_atomic(vaddr);
			csum = csum_block_add(csum, csum2, pos);
			if (!(len -= copy))
				return csum;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		__wsum csum2;
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			csum2 = skb_copy_and_csum_bits(frag_iter,
						       offset - start,
						       to, copy, 0);
			csum = csum_block_add(csum, csum2, pos);
			if ((len -= copy) == 0)
				return csum;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}
	BUG_ON(len);
	return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);

void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
	__wsum csum;
	long csstart;

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		csstart = skb_checksum_start_offset(skb);
	else
		csstart = skb_headlen(skb);

	BUG_ON(csstart > skb_headlen(skb));

	skb_copy_from_linear_data(skb, to, csstart);

	csum = 0;
	if (csstart != skb->len)
		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
					      skb->len - csstart, 0);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		long csstuff = csstart + skb->csum_offset;

		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
	}
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);

/**
 *	skb_dequeue - remove from the head of the queue
 *	@list: list to dequeue from
 *
 *	Remove the head of the list. The list lock is taken so the function
 *	may be used safely with other locking list functions. The head item is
 *	returned or %NULL if the list is empty.
 */

struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
	unsigned long flags;
	struct sk_buff *result;

	spin_lock_irqsave(&list->lock, flags);
	result = __skb_dequeue(list);
	spin_unlock_irqrestore(&list->lock, flags);
	return result;
}
EXPORT_SYMBOL(skb_dequeue);

/**
 *	skb_dequeue_tail - remove from the tail of the queue
 *	@list: list to dequeue from
 *
 *	Remove the tail of the list. The list lock is taken so the function
 *	may be used safely with other locking list functions. The tail item is
 *	returned or %NULL if the list is empty.
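 *
 *	Editorial usage sketch (an assumed backlog pattern; q and limit are
 *	hypothetical, not part of the original file): dropping the newest
 *	buffered packet when a queue grows beyond a limit:
 *
 *		if (skb_queue_len(&q) > limit) {
 *			struct sk_buff *newest = skb_dequeue_tail(&q);
 *
 *			kfree_skb(newest);
 *		}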
2095 */ 2096 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) 2097 { 2098 unsigned long flags; 2099 struct sk_buff *result; 2100 2101 spin_lock_irqsave(&list->lock, flags); 2102 result = __skb_dequeue_tail(list); 2103 spin_unlock_irqrestore(&list->lock, flags); 2104 return result; 2105 } 2106 EXPORT_SYMBOL(skb_dequeue_tail); 2107 2108 /** 2109 * skb_queue_purge - empty a list 2110 * @list: list to empty 2111 * 2112 * Delete all buffers on an &sk_buff list. Each buffer is removed from 2113 * the list and one reference dropped. This function takes the list 2114 * lock and is atomic with respect to other list locking functions. 2115 */ 2116 void skb_queue_purge(struct sk_buff_head *list) 2117 { 2118 struct sk_buff *skb; 2119 while ((skb = skb_dequeue(list)) != NULL) 2120 kfree_skb(skb); 2121 } 2122 EXPORT_SYMBOL(skb_queue_purge); 2123 2124 /** 2125 * skb_queue_head - queue a buffer at the list head 2126 * @list: list to use 2127 * @newsk: buffer to queue 2128 * 2129 * Queue a buffer at the start of the list. This function takes the 2130 * list lock and can be used safely with other locking &sk_buff 2131 * functions. 2132 * 2133 * A buffer cannot be placed on two lists at the same time. 2134 */ 2135 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 2136 { 2137 unsigned long flags; 2138 2139 spin_lock_irqsave(&list->lock, flags); 2140 __skb_queue_head(list, newsk); 2141 spin_unlock_irqrestore(&list->lock, flags); 2142 } 2143 EXPORT_SYMBOL(skb_queue_head); 2144 2145 /** 2146 * skb_queue_tail - queue a buffer at the list tail 2147 * @list: list to use 2148 * @newsk: buffer to queue 2149 * 2150 * Queue a buffer at the tail of the list. This function takes the 2151 * list lock and can be used safely with other locking &sk_buff 2152 * functions. 2153 * 2154 * A buffer cannot be placed on two lists at the same time. 2155 */ 2156 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 2157 { 2158 unsigned long flags; 2159 2160 spin_lock_irqsave(&list->lock, flags); 2161 __skb_queue_tail(list, newsk); 2162 spin_unlock_irqrestore(&list->lock, flags); 2163 } 2164 EXPORT_SYMBOL(skb_queue_tail); 2165 2166 /** 2167 * skb_unlink - remove a buffer from a list 2168 * @skb: buffer to remove 2169 * @list: list to use 2170 * 2171 * Remove a packet from a list. The list locks are taken and this 2172 * function is atomic with respect to other list locked calls. 2173 * 2174 * You must know what list the SKB is on. 2175 */ 2176 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 2177 { 2178 unsigned long flags; 2179 2180 spin_lock_irqsave(&list->lock, flags); 2181 __skb_unlink(skb, list); 2182 spin_unlock_irqrestore(&list->lock, flags); 2183 } 2184 EXPORT_SYMBOL(skb_unlink); 2185 2186 /** 2187 * skb_append - append a buffer 2188 * @old: buffer to insert after 2189 * @newsk: buffer to insert 2190 * @list: list to use 2191 * 2192 * Place a packet after a given packet in a list. The list locks are taken 2193 * and this function is atomic with respect to other list locked calls. 2194 * A buffer cannot be placed on two lists at the same time.
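 *
 * A minimal usage sketch (hypothetical reassembly path placing @newsk
 * right behind a buffer that is already on the socket receive queue;
 * "prev" and "sk" are assumed to be known to the caller):
 *
 *	skb_append(prev, newsk, &sk->sk_receive_queue);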
2195 */ 2196 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 2197 { 2198 unsigned long flags; 2199 2200 spin_lock_irqsave(&list->lock, flags); 2201 __skb_queue_after(list, old, newsk); 2202 spin_unlock_irqrestore(&list->lock, flags); 2203 } 2204 EXPORT_SYMBOL(skb_append); 2205 2206 /** 2207 * skb_insert - insert a buffer 2208 * @old: buffer to insert before 2209 * @newsk: buffer to insert 2210 * @list: list to use 2211 * 2212 * Place a packet before a given packet in a list. The list locks are 2213 * taken and this function is atomic with respect to other list locked 2214 * calls. 2215 * 2216 * A buffer cannot be placed on two lists at the same time. 2217 */ 2218 void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 2219 { 2220 unsigned long flags; 2221 2222 spin_lock_irqsave(&list->lock, flags); 2223 __skb_insert(newsk, old->prev, old, list); 2224 spin_unlock_irqrestore(&list->lock, flags); 2225 } 2226 EXPORT_SYMBOL(skb_insert); 2227 2228 static inline void skb_split_inside_header(struct sk_buff *skb, 2229 struct sk_buff* skb1, 2230 const u32 len, const int pos) 2231 { 2232 int i; 2233 2234 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 2235 pos - len); 2236 /* And move data appendix as is. */ 2237 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 2238 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 2239 2240 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 2241 skb_shinfo(skb)->nr_frags = 0; 2242 skb1->data_len = skb->data_len; 2243 skb1->len += skb1->data_len; 2244 skb->data_len = 0; 2245 skb->len = len; 2246 skb_set_tail_pointer(skb, len); 2247 } 2248 2249 static inline void skb_split_no_header(struct sk_buff *skb, 2250 struct sk_buff* skb1, 2251 const u32 len, int pos) 2252 { 2253 int i, k = 0; 2254 const int nfrags = skb_shinfo(skb)->nr_frags; 2255 2256 skb_shinfo(skb)->nr_frags = 0; 2257 skb1->len = skb1->data_len = skb->len - len; 2258 skb->len = len; 2259 skb->data_len = len - pos; 2260 2261 for (i = 0; i < nfrags; i++) { 2262 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2263 2264 if (pos + size > len) { 2265 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 2266 2267 if (pos < len) { 2268 /* Split frag. 2269 * We have two variants in this case: 2270 * 1. Move all the frag to the second 2271 * part, if it is possible. F.e. 2272 * this approach is mandatory for TUX, 2273 * where splitting is expensive. 2274 * 2. Split is accurately. We make this. 2275 */ 2276 skb_frag_ref(skb, i); 2277 skb_shinfo(skb1)->frags[0].page_offset += len - pos; 2278 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 2279 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 2280 skb_shinfo(skb)->nr_frags++; 2281 } 2282 k++; 2283 } else 2284 skb_shinfo(skb)->nr_frags++; 2285 pos += size; 2286 } 2287 skb_shinfo(skb1)->nr_frags = k; 2288 } 2289 2290 /** 2291 * skb_split - Split fragmented skb to two parts at length len. 2292 * @skb: the buffer to split 2293 * @skb1: the buffer to receive the second part 2294 * @len: new length for skb 2295 */ 2296 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 2297 { 2298 int pos = skb_headlen(skb); 2299 2300 if (len < pos) /* Split line is inside header. */ 2301 skb_split_inside_header(skb, skb1, len, pos); 2302 else /* Second chunk has no header, nothing to copy. */ 2303 skb_split_no_header(skb, skb1, len, pos); 2304 } 2305 EXPORT_SYMBOL(skb_split); 2306 2307 /* Shifting from/to a cloned skb is a no-go. 
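 * A clone shares skb_shared_info (and therefore the frag array) with
 * its sibling, so the frags must first be privatised with
 * pskb_expand_head() before they may be modified.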
2308 * 2309 * Caller cannot keep skb_shinfo related pointers past calling here! 2310 */ 2311 static int skb_prepare_for_shift(struct sk_buff *skb) 2312 { 2313 return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2314 } 2315 2316 /** 2317 * skb_shift - Shifts paged data partially from skb to another 2318 * @tgt: buffer into which tail data gets added 2319 * @skb: buffer from which the paged data comes 2320 * @shiftlen: shift up to this many bytes 2321 * 2322 * Attempts to shift up to shiftlen worth of bytes, which may be less than 2323 * the length of the skb, from skb to tgt. Returns the number of bytes shifted. 2324 * It's up to the caller to free @skb if everything was shifted. 2325 * 2326 * If @tgt runs out of frags, the whole operation is aborted. 2327 * 2328 * The skb cannot contain anything other than paged data, while tgt is 2329 * allowed to have non-paged data as well. 2330 * 2331 * TODO: full sized shift could be optimized but that would need 2332 * specialized skb free'er to handle frags without up-to-date nr_frags. 2333 */ 2334 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) 2335 { 2336 int from, to, merge, todo; 2337 struct skb_frag_struct *fragfrom, *fragto; 2338 2339 BUG_ON(shiftlen > skb->len); 2340 BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ 2341 2342 todo = shiftlen; 2343 from = 0; 2344 to = skb_shinfo(tgt)->nr_frags; 2345 fragfrom = &skb_shinfo(skb)->frags[from]; 2346 2347 /* Actual merge is delayed until the point when we know we can 2348 * commit all, so that we don't have to undo partial changes 2349 */ 2350 if (!to || 2351 !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), 2352 fragfrom->page_offset)) { 2353 merge = -1; 2354 } else { 2355 merge = to - 1; 2356 2357 todo -= skb_frag_size(fragfrom); 2358 if (todo < 0) { 2359 if (skb_prepare_for_shift(skb) || 2360 skb_prepare_for_shift(tgt)) 2361 return 0; 2362 2363 /* All previous frag pointers might be stale!
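 * pskb_expand_head() may have reallocated skb_shared_info, so reload
 * the frag pointers before touching them below.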
*/ 2364 fragfrom = &skb_shinfo(skb)->frags[from]; 2365 fragto = &skb_shinfo(tgt)->frags[merge]; 2366 2367 skb_frag_size_add(fragto, shiftlen); 2368 skb_frag_size_sub(fragfrom, shiftlen); 2369 fragfrom->page_offset += shiftlen; 2370 2371 goto onlymerged; 2372 } 2373 2374 from++; 2375 } 2376 2377 /* Skip full, not-fitting skb to avoid expensive operations */ 2378 if ((shiftlen == skb->len) && 2379 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 2380 return 0; 2381 2382 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 2383 return 0; 2384 2385 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 2386 if (to == MAX_SKB_FRAGS) 2387 return 0; 2388 2389 fragfrom = &skb_shinfo(skb)->frags[from]; 2390 fragto = &skb_shinfo(tgt)->frags[to]; 2391 2392 if (todo >= skb_frag_size(fragfrom)) { 2393 *fragto = *fragfrom; 2394 todo -= skb_frag_size(fragfrom); 2395 from++; 2396 to++; 2397 2398 } else { 2399 __skb_frag_ref(fragfrom); 2400 fragto->page = fragfrom->page; 2401 fragto->page_offset = fragfrom->page_offset; 2402 skb_frag_size_set(fragto, todo); 2403 2404 fragfrom->page_offset += todo; 2405 skb_frag_size_sub(fragfrom, todo); 2406 todo = 0; 2407 2408 to++; 2409 break; 2410 } 2411 } 2412 2413 /* Ready to "commit" this state change to tgt */ 2414 skb_shinfo(tgt)->nr_frags = to; 2415 2416 if (merge >= 0) { 2417 fragfrom = &skb_shinfo(skb)->frags[0]; 2418 fragto = &skb_shinfo(tgt)->frags[merge]; 2419 2420 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 2421 __skb_frag_unref(fragfrom); 2422 } 2423 2424 /* Reposition in the original skb */ 2425 to = 0; 2426 while (from < skb_shinfo(skb)->nr_frags) 2427 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 2428 skb_shinfo(skb)->nr_frags = to; 2429 2430 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 2431 2432 onlymerged: 2433 /* Most likely the tgt won't ever need its checksum anymore, skb on 2434 * the other hand might need it if it needs to be resent 2435 */ 2436 tgt->ip_summed = CHECKSUM_PARTIAL; 2437 skb->ip_summed = CHECKSUM_PARTIAL; 2438 2439 /* Yak, is it really working this way? Some helper please? */ 2440 skb->len -= shiftlen; 2441 skb->data_len -= shiftlen; 2442 skb->truesize -= shiftlen; 2443 tgt->len += shiftlen; 2444 tgt->data_len += shiftlen; 2445 tgt->truesize += shiftlen; 2446 2447 return shiftlen; 2448 } 2449 2450 /** 2451 * skb_prepare_seq_read - Prepare a sequential read of skb data 2452 * @skb: the buffer to read 2453 * @from: lower offset of data to be read 2454 * @to: upper offset of data to be read 2455 * @st: state variable 2456 * 2457 * Initializes the specified state variable. Must be called before 2458 * invoking skb_seq_read() for the first time. 2459 */ 2460 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 2461 unsigned int to, struct skb_seq_state *st) 2462 { 2463 st->lower_offset = from; 2464 st->upper_offset = to; 2465 st->root_skb = st->cur_skb = skb; 2466 st->frag_idx = st->stepped_offset = 0; 2467 st->frag_data = NULL; 2468 } 2469 EXPORT_SYMBOL(skb_prepare_seq_read); 2470 2471 /** 2472 * skb_seq_read - Sequentially read skb data 2473 * @consumed: number of bytes consumed by the caller so far 2474 * @data: destination pointer for data to be returned 2475 * @st: state variable 2476 * 2477 * Reads a block of skb data at &consumed relative to the 2478 * lower offset specified to skb_prepare_seq_read(). Assigns 2479 * the head of the data block to &data and returns the length 2480 * of the block or 0 if the end of the skb data or the upper 2481 * offset has been reached. 
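 *
 * A minimal usage sketch (hypothetical caller walking the whole
 * payload, processing each returned block as it goes):
 *
 *	struct skb_seq_state st;
 *	const u8 *data;
 *	unsigned int consumed = 0, len;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0)
 *		consumed += len;
 *
 * skb_abort_seq_read() only needs to be called if the walk is abandoned
 * before skb_seq_read() has returned 0.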
2482 * 2483 * The caller is not required to consume all of the data 2484 * returned, i.e. &consumed is typically set to the number 2485 * of bytes already consumed and the next call to 2486 * skb_seq_read() will return the remaining part of the block. 2487 * 2488 * Note 1: The size of each block of data returned can be arbitrary; 2489 * this limitation is the cost of zerocopy sequential 2490 * reads of potentially non-linear data. 2491 * 2492 * Note 2: Fragment lists within fragments are not implemented 2493 * at the moment; state->root_skb could be replaced with 2494 * a stack for this purpose. 2495 */ 2496 unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 2497 struct skb_seq_state *st) 2498 { 2499 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 2500 skb_frag_t *frag; 2501 2502 if (unlikely(abs_offset >= st->upper_offset)) 2503 return 0; 2504 2505 next_skb: 2506 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 2507 2508 if (abs_offset < block_limit && !st->frag_data) { 2509 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 2510 return block_limit - abs_offset; 2511 } 2512 2513 if (st->frag_idx == 0 && !st->frag_data) 2514 st->stepped_offset += skb_headlen(st->cur_skb); 2515 2516 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 2517 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 2518 block_limit = skb_frag_size(frag) + st->stepped_offset; 2519 2520 if (abs_offset < block_limit) { 2521 if (!st->frag_data) 2522 st->frag_data = kmap_atomic(skb_frag_page(frag)); 2523 2524 *data = (u8 *) st->frag_data + frag->page_offset + 2525 (abs_offset - st->stepped_offset); 2526 2527 return block_limit - abs_offset; 2528 } 2529 2530 if (st->frag_data) { 2531 kunmap_atomic(st->frag_data); 2532 st->frag_data = NULL; 2533 } 2534 2535 st->frag_idx++; 2536 st->stepped_offset += skb_frag_size(frag); 2537 } 2538 2539 if (st->frag_data) { 2540 kunmap_atomic(st->frag_data); 2541 st->frag_data = NULL; 2542 } 2543 2544 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 2545 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 2546 st->frag_idx = 0; 2547 goto next_skb; 2548 } else if (st->cur_skb->next) { 2549 st->cur_skb = st->cur_skb->next; 2550 st->frag_idx = 0; 2551 goto next_skb; 2552 } 2553 2554 return 0; 2555 } 2556 EXPORT_SYMBOL(skb_seq_read); 2557 2558 /** 2559 * skb_abort_seq_read - Abort a sequential read of skb data 2560 * @st: state variable 2561 * 2562 * Must be called if the sequential read was abandoned before 2563 * skb_seq_read() returned 0. 2564 */ 2565 void skb_abort_seq_read(struct skb_seq_state *st) 2566 { 2567 if (st->frag_data) 2568 kunmap_atomic(st->frag_data); 2569 } 2570 EXPORT_SYMBOL(skb_abort_seq_read); 2571 2572 #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 2573 2574 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 2575 struct ts_config *conf, 2576 struct ts_state *state) 2577 { 2578 return skb_seq_read(offset, text, TS_SKB_CB(state)); 2579 } 2580 2581 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 2582 { 2583 skb_abort_seq_read(TS_SKB_CB(state)); 2584 } 2585 2586 /** 2587 * skb_find_text - Find a text pattern in skb data 2588 * @skb: the buffer to look in 2589 * @from: search offset 2590 * @to: search limit 2591 * @config: textsearch configuration 2592 * @state: uninitialized textsearch state variable 2593 * 2594 * Finds a pattern in the skb data according to the specified 2595 * textsearch configuration.
Use textsearch_next() to retrieve 2596 * subsequent occurrences of the pattern. Returns the offset 2597 * to the first occurrence or UINT_MAX if no match was found. 2598 */ 2599 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 2600 unsigned int to, struct ts_config *config, 2601 struct ts_state *state) 2602 { 2603 unsigned int ret; 2604 2605 config->get_next_block = skb_ts_get_next_block; 2606 config->finish = skb_ts_finish; 2607 2608 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); 2609 2610 ret = textsearch_find(config, state); 2611 return (ret <= to - from ? ret : UINT_MAX); 2612 } 2613 EXPORT_SYMBOL(skb_find_text); 2614 2615 /** 2616 * skb_append_datato_frags - append the user data to a skb 2617 * @sk: sock structure 2618 * @skb: skb structure to be appended with user data. 2619 * @getfrag: callback function to be used for getting the user data 2620 * @from: pointer to user message iov 2621 * @length: length of the iov message 2622 * 2623 * Description: This procedure appends the user data in the fragment part 2624 * of the skb. If any page allocation fails, this procedure returns -ENOMEM. 2625 */ 2626 int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, 2627 int (*getfrag)(void *from, char *to, int offset, 2628 int len, int odd, struct sk_buff *skb), 2629 void *from, int length) 2630 { 2631 int frg_cnt = 0; 2632 skb_frag_t *frag = NULL; 2633 struct page *page = NULL; 2634 int copy, left; 2635 int offset = 0; 2636 int ret; 2637 2638 do { 2639 /* Return error if we don't have space for new frag */ 2640 frg_cnt = skb_shinfo(skb)->nr_frags; 2641 if (frg_cnt >= MAX_SKB_FRAGS) 2642 return -EFAULT; 2643 2644 /* allocate a new page for next frag */ 2645 page = alloc_pages(sk->sk_allocation, 0); 2646 2647 /* If alloc_page fails just return failure and caller will 2648 * free previously allocated pages by doing kfree_skb() 2649 */ 2650 if (page == NULL) 2651 return -ENOMEM; 2652 2653 /* initialize the next frag */ 2654 skb_fill_page_desc(skb, frg_cnt, page, 0, 0); 2655 skb->truesize += PAGE_SIZE; 2656 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); 2657 2658 /* get the new initialized frag */ 2659 frg_cnt = skb_shinfo(skb)->nr_frags; 2660 frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; 2661 2662 /* copy the user data to page */ 2663 left = PAGE_SIZE - frag->page_offset; 2664 copy = (length > left)? left : length; 2665 2666 ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), 2667 offset, copy, 0, skb); 2668 if (ret < 0) 2669 return -EFAULT; 2670 2671 /* copy was successful so update the size parameters */ 2672 skb_frag_size_add(frag, copy); 2673 skb->len += copy; 2674 skb->data_len += copy; 2675 offset += copy; 2676 length -= copy; 2677 2678 } while (length > 0); 2679 2680 return 0; 2681 } 2682 EXPORT_SYMBOL(skb_append_datato_frags); 2683 2684 /** 2685 * skb_pull_rcsum - pull skb and update receive checksum 2686 * @skb: buffer to update 2687 * @len: length of data pulled 2688 * 2689 * This function performs an skb_pull on the packet and updates 2690 * the CHECKSUM_COMPLETE checksum. It should be used on 2691 * receive path processing instead of skb_pull unless you know 2692 * that the checksum difference is zero (e.g., a valid IP header) 2693 * or you are setting ip_summed to CHECKSUM_NONE.
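 *
 * A minimal sketch (hypothetical receive handler stripping a 4 byte
 * encapsulation header while keeping CHECKSUM_COMPLETE valid):
 *
 *	if (!pskb_may_pull(skb, 4))
 *		goto drop;
 *	skb_pull_rcsum(skb, 4);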
2694 */ 2695 unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 2696 { 2697 BUG_ON(len > skb->len); 2698 skb->len -= len; 2699 BUG_ON(skb->len < skb->data_len); 2700 skb_postpull_rcsum(skb, skb->data, len); 2701 return skb->data += len; 2702 } 2703 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 2704 2705 /** 2706 * skb_segment - Perform protocol segmentation on skb. 2707 * @skb: buffer to segment 2708 * @features: features for the output path (see dev->features) 2709 * 2710 * This function performs segmentation on the given skb. It returns 2711 * a pointer to the first in a list of new skbs for the segments. 2712 * In case of error it returns ERR_PTR(err). 2713 */ 2714 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) 2715 { 2716 struct sk_buff *segs = NULL; 2717 struct sk_buff *tail = NULL; 2718 struct sk_buff *fskb = skb_shinfo(skb)->frag_list; 2719 unsigned int mss = skb_shinfo(skb)->gso_size; 2720 unsigned int doffset = skb->data - skb_mac_header(skb); 2721 unsigned int offset = doffset; 2722 unsigned int headroom; 2723 unsigned int len; 2724 int sg = !!(features & NETIF_F_SG); 2725 int nfrags = skb_shinfo(skb)->nr_frags; 2726 int err = -ENOMEM; 2727 int i = 0; 2728 int pos; 2729 2730 __skb_push(skb, doffset); 2731 headroom = skb_headroom(skb); 2732 pos = skb_headlen(skb); 2733 2734 do { 2735 struct sk_buff *nskb; 2736 skb_frag_t *frag; 2737 int hsize; 2738 int size; 2739 2740 len = skb->len - offset; 2741 if (len > mss) 2742 len = mss; 2743 2744 hsize = skb_headlen(skb) - offset; 2745 if (hsize < 0) 2746 hsize = 0; 2747 if (hsize > len || !sg) 2748 hsize = len; 2749 2750 if (!hsize && i >= nfrags) { 2751 BUG_ON(fskb->len != len); 2752 2753 pos += len; 2754 nskb = skb_clone(fskb, GFP_ATOMIC); 2755 fskb = fskb->next; 2756 2757 if (unlikely(!nskb)) 2758 goto err; 2759 2760 hsize = skb_end_offset(nskb); 2761 if (skb_cow_head(nskb, doffset + headroom)) { 2762 kfree_skb(nskb); 2763 goto err; 2764 } 2765 2766 nskb->truesize += skb_end_offset(nskb) - hsize; 2767 skb_release_head_state(nskb); 2768 __skb_push(nskb, doffset); 2769 } else { 2770 nskb = alloc_skb(hsize + doffset + headroom, 2771 GFP_ATOMIC); 2772 2773 if (unlikely(!nskb)) 2774 goto err; 2775 2776 skb_reserve(nskb, headroom); 2777 __skb_put(nskb, doffset); 2778 } 2779 2780 if (segs) 2781 tail->next = nskb; 2782 else 2783 segs = nskb; 2784 tail = nskb; 2785 2786 __copy_skb_header(nskb, skb); 2787 nskb->mac_len = skb->mac_len; 2788 2789 /* nskb and skb might have different headroom */ 2790 if (nskb->ip_summed == CHECKSUM_PARTIAL) 2791 nskb->csum_start += skb_headroom(nskb) - headroom; 2792 2793 skb_reset_mac_header(nskb); 2794 skb_set_network_header(nskb, skb->mac_len); 2795 nskb->transport_header = (nskb->network_header + 2796 skb_network_header_len(skb)); 2797 skb_copy_from_linear_data(skb, nskb->data, doffset); 2798 2799 if (fskb != skb_shinfo(skb)->frag_list) 2800 continue; 2801 2802 if (!sg) { 2803 nskb->ip_summed = CHECKSUM_NONE; 2804 nskb->csum = skb_copy_and_csum_bits(skb, offset, 2805 skb_put(nskb, len), 2806 len, 0); 2807 continue; 2808 } 2809 2810 frag = skb_shinfo(nskb)->frags; 2811 2812 skb_copy_from_linear_data_offset(skb, offset, 2813 skb_put(nskb, hsize), hsize); 2814 2815 while (pos < offset + len && i < nfrags) { 2816 *frag = skb_shinfo(skb)->frags[i]; 2817 __skb_frag_ref(frag); 2818 size = skb_frag_size(frag); 2819 2820 if (pos < offset) { 2821 frag->page_offset += offset - pos; 2822 skb_frag_size_sub(frag, offset - pos); 2823 } 2824 2825 skb_shinfo(nskb)->nr_frags++; 2826 2827 if 
(pos + size <= offset + len) { 2828 i++; 2829 pos += size; 2830 } else { 2831 skb_frag_size_sub(frag, pos + size - (offset + len)); 2832 goto skip_fraglist; 2833 } 2834 2835 frag++; 2836 } 2837 2838 if (pos < offset + len) { 2839 struct sk_buff *fskb2 = fskb; 2840 2841 BUG_ON(pos + fskb->len != offset + len); 2842 2843 pos += fskb->len; 2844 fskb = fskb->next; 2845 2846 if (fskb2->next) { 2847 fskb2 = skb_clone(fskb2, GFP_ATOMIC); 2848 if (!fskb2) 2849 goto err; 2850 } else 2851 skb_get(fskb2); 2852 2853 SKB_FRAG_ASSERT(nskb); 2854 skb_shinfo(nskb)->frag_list = fskb2; 2855 } 2856 2857 skip_fraglist: 2858 nskb->data_len = len - hsize; 2859 nskb->len += nskb->data_len; 2860 nskb->truesize += nskb->data_len; 2861 } while ((offset += len) < skb->len); 2862 2863 return segs; 2864 2865 err: 2866 while ((skb = segs)) { 2867 segs = skb->next; 2868 kfree_skb(skb); 2869 } 2870 return ERR_PTR(err); 2871 } 2872 EXPORT_SYMBOL_GPL(skb_segment); 2873 2874 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2875 { 2876 struct sk_buff *p = *head; 2877 struct sk_buff *nskb; 2878 struct skb_shared_info *skbinfo = skb_shinfo(skb); 2879 struct skb_shared_info *pinfo = skb_shinfo(p); 2880 unsigned int headroom; 2881 unsigned int len = skb_gro_len(skb); 2882 unsigned int offset = skb_gro_offset(skb); 2883 unsigned int headlen = skb_headlen(skb); 2884 unsigned int delta_truesize; 2885 2886 if (p->len + len >= 65536) 2887 return -E2BIG; 2888 2889 if (pinfo->frag_list) 2890 goto merge; 2891 else if (headlen <= offset) { 2892 skb_frag_t *frag; 2893 skb_frag_t *frag2; 2894 int i = skbinfo->nr_frags; 2895 int nr_frags = pinfo->nr_frags + i; 2896 2897 offset -= headlen; 2898 2899 if (nr_frags > MAX_SKB_FRAGS) 2900 return -E2BIG; 2901 2902 pinfo->nr_frags = nr_frags; 2903 skbinfo->nr_frags = 0; 2904 2905 frag = pinfo->frags + nr_frags; 2906 frag2 = skbinfo->frags + i; 2907 do { 2908 *--frag = *--frag2; 2909 } while (--i); 2910 2911 frag->page_offset += offset; 2912 skb_frag_size_sub(frag, offset); 2913 2914 /* all fragments truesize : remove (head size + sk_buff) */ 2915 delta_truesize = skb->truesize - 2916 SKB_TRUESIZE(skb_end_offset(skb)); 2917 2918 skb->truesize -= skb->data_len; 2919 skb->len -= skb->data_len; 2920 skb->data_len = 0; 2921 2922 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; 2923 goto done; 2924 } else if (skb->head_frag) { 2925 int nr_frags = pinfo->nr_frags; 2926 skb_frag_t *frag = pinfo->frags + nr_frags; 2927 struct page *page = virt_to_head_page(skb->head); 2928 unsigned int first_size = headlen - offset; 2929 unsigned int first_offset; 2930 2931 if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) 2932 return -E2BIG; 2933 2934 first_offset = skb->data - 2935 (unsigned char *)page_address(page) + 2936 offset; 2937 2938 pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; 2939 2940 frag->page.p = page; 2941 frag->page_offset = first_offset; 2942 skb_frag_size_set(frag, first_size); 2943 2944 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); 2945 /* We dont need to clear skbinfo->nr_frags here */ 2946 2947 delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 2948 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; 2949 goto done; 2950 } else if (skb_gro_len(p) != pinfo->gso_size) 2951 return -E2BIG; 2952 2953 headroom = skb_headroom(p); 2954 nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); 2955 if (unlikely(!nskb)) 2956 return -ENOMEM; 2957 2958 __copy_skb_header(nskb, p); 2959 nskb->mac_len = p->mac_len; 2960 2961 skb_reserve(nskb, headroom); 
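	/* Lay nskb out like p: same headroom, plus linear room for the
	 * headers up to the GRO offset, which are copied over from p below.
	 */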
2962 __skb_put(nskb, skb_gro_offset(p)); 2963 2964 skb_set_mac_header(nskb, skb_mac_header(p) - p->data); 2965 skb_set_network_header(nskb, skb_network_offset(p)); 2966 skb_set_transport_header(nskb, skb_transport_offset(p)); 2967 2968 __skb_pull(p, skb_gro_offset(p)); 2969 memcpy(skb_mac_header(nskb), skb_mac_header(p), 2970 p->data - skb_mac_header(p)); 2971 2972 *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); 2973 skb_shinfo(nskb)->frag_list = p; 2974 skb_shinfo(nskb)->gso_size = pinfo->gso_size; 2975 pinfo->gso_size = 0; 2976 skb_header_release(p); 2977 nskb->prev = p; 2978 2979 nskb->data_len += p->len; 2980 nskb->truesize += p->truesize; 2981 nskb->len += p->len; 2982 2983 *head = nskb; 2984 nskb->next = p->next; 2985 p->next = NULL; 2986 2987 p = nskb; 2988 2989 merge: 2990 delta_truesize = skb->truesize; 2991 if (offset > headlen) { 2992 unsigned int eat = offset - headlen; 2993 2994 skbinfo->frags[0].page_offset += eat; 2995 skb_frag_size_sub(&skbinfo->frags[0], eat); 2996 skb->data_len -= eat; 2997 skb->len -= eat; 2998 offset = headlen; 2999 } 3000 3001 __skb_pull(skb, offset); 3002 3003 p->prev->next = skb; 3004 p->prev = skb; 3005 skb_header_release(skb); 3006 3007 done: 3008 NAPI_GRO_CB(p)->count++; 3009 p->data_len += len; 3010 p->truesize += delta_truesize; 3011 p->len += len; 3012 3013 NAPI_GRO_CB(skb)->same_flow = 1; 3014 return 0; 3015 } 3016 EXPORT_SYMBOL_GPL(skb_gro_receive); 3017 3018 void __init skb_init(void) 3019 { 3020 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 3021 sizeof(struct sk_buff), 3022 0, 3023 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 3024 NULL); 3025 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 3026 (2*sizeof(struct sk_buff)) + 3027 sizeof(atomic_t), 3028 0, 3029 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 3030 NULL); 3031 } 3032 3033 /** 3034 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 3035 * @skb: Socket buffer containing the buffers to be mapped 3036 * @sg: The scatter-gather list to map into 3037 * @offset: The offset into the buffer's contents to start mapping 3038 * @len: Length of buffer space to be mapped 3039 * 3040 * Fill the specified scatter-gather list with mappings/pointers into a 3041 * region of the buffer space attached to a socket buffer. 
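 *
 * A minimal usage sketch (hypothetical caller mapping the whole skb,
 * assuming it carries no frag list so MAX_SKB_FRAGS + 1 entries are
 * enough):
 *
 *	struct scatterlist sg[MAX_SKB_FRAGS + 1];
 *	int nsg;
 *
 *	sg_init_table(sg, ARRAY_SIZE(sg));
 *	nsg = skb_to_sgvec(skb, sg, 0, skb->len);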
3042 */ 3043 static int 3044 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 3045 { 3046 int start = skb_headlen(skb); 3047 int i, copy = start - offset; 3048 struct sk_buff *frag_iter; 3049 int elt = 0; 3050 3051 if (copy > 0) { 3052 if (copy > len) 3053 copy = len; 3054 sg_set_buf(sg, skb->data + offset, copy); 3055 elt++; 3056 if ((len -= copy) == 0) 3057 return elt; 3058 offset += copy; 3059 } 3060 3061 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3062 int end; 3063 3064 WARN_ON(start > offset + len); 3065 3066 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 3067 if ((copy = end - offset) > 0) { 3068 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3069 3070 if (copy > len) 3071 copy = len; 3072 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 3073 frag->page_offset+offset-start); 3074 elt++; 3075 if (!(len -= copy)) 3076 return elt; 3077 offset += copy; 3078 } 3079 start = end; 3080 } 3081 3082 skb_walk_frags(skb, frag_iter) { 3083 int end; 3084 3085 WARN_ON(start > offset + len); 3086 3087 end = start + frag_iter->len; 3088 if ((copy = end - offset) > 0) { 3089 if (copy > len) 3090 copy = len; 3091 elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, 3092 copy); 3093 if ((len -= copy) == 0) 3094 return elt; 3095 offset += copy; 3096 } 3097 start = end; 3098 } 3099 BUG_ON(len); 3100 return elt; 3101 } 3102 3103 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 3104 { 3105 int nsg = __skb_to_sgvec(skb, sg, offset, len); 3106 3107 sg_mark_end(&sg[nsg - 1]); 3108 3109 return nsg; 3110 } 3111 EXPORT_SYMBOL_GPL(skb_to_sgvec); 3112 3113 /** 3114 * skb_cow_data - Check that a socket buffer's data buffers are writable 3115 * @skb: The socket buffer to check. 3116 * @tailbits: Amount of trailing space to be added 3117 * @trailer: Returned pointer to the skb where the @tailbits space begins 3118 * 3119 * Make sure that the data buffers attached to a socket buffer are 3120 * writable. If they are not, private copies are made of the data buffers 3121 * and the socket buffer is set to use these instead. 3122 * 3123 * If @tailbits is given, make sure that there is space to write @tailbits 3124 * bytes of data beyond current end of socket buffer. @trailer will be 3125 * set to point to the skb in which this space begins. 3126 * 3127 * The number of scatterlist elements required to completely map the 3128 * COW'd and extended socket buffer will be returned. 3129 */ 3130 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 3131 { 3132 int copyflag; 3133 int elt; 3134 struct sk_buff *skb1, **skb_p; 3135 3136 /* If skb is cloned or its head is paged, reallocate 3137 * head pulling out all the pages (pages are considered not writable 3138 * at the moment even if they are anonymous). 3139 */ 3140 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 3141 __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) 3142 return -ENOMEM; 3143 3144 /* Easy case. Most of packets will go this way. */ 3145 if (!skb_has_frag_list(skb)) { 3146 /* A little of trouble, not enough of space for trailer. 3147 * This should not happen, when stack is tuned to generate 3148 * good frames. OK, on miss we reallocate and reserve even more 3149 * space, 128 bytes is fair. */ 3150 3151 if (skb_tailroom(skb) < tailbits && 3152 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 3153 return -ENOMEM; 3154 3155 /* Voila! */ 3156 *trailer = skb; 3157 return 1; 3158 } 3159 3160 /* Misery. 
We are in troubles, going to mincer fragments... */ 3161 3162 elt = 1; 3163 skb_p = &skb_shinfo(skb)->frag_list; 3164 copyflag = 0; 3165 3166 while ((skb1 = *skb_p) != NULL) { 3167 int ntail = 0; 3168 3169 /* The fragment is partially pulled by someone, 3170 * this can happen on input. Copy it and everything 3171 * after it. */ 3172 3173 if (skb_shared(skb1)) 3174 copyflag = 1; 3175 3176 /* If the skb is the last, worry about trailer. */ 3177 3178 if (skb1->next == NULL && tailbits) { 3179 if (skb_shinfo(skb1)->nr_frags || 3180 skb_has_frag_list(skb1) || 3181 skb_tailroom(skb1) < tailbits) 3182 ntail = tailbits + 128; 3183 } 3184 3185 if (copyflag || 3186 skb_cloned(skb1) || 3187 ntail || 3188 skb_shinfo(skb1)->nr_frags || 3189 skb_has_frag_list(skb1)) { 3190 struct sk_buff *skb2; 3191 3192 /* Fuck, we are miserable poor guys... */ 3193 if (ntail == 0) 3194 skb2 = skb_copy(skb1, GFP_ATOMIC); 3195 else 3196 skb2 = skb_copy_expand(skb1, 3197 skb_headroom(skb1), 3198 ntail, 3199 GFP_ATOMIC); 3200 if (unlikely(skb2 == NULL)) 3201 return -ENOMEM; 3202 3203 if (skb1->sk) 3204 skb_set_owner_w(skb2, skb1->sk); 3205 3206 /* Looking around. Are we still alive? 3207 * OK, link new skb, drop old one */ 3208 3209 skb2->next = skb1->next; 3210 *skb_p = skb2; 3211 kfree_skb(skb1); 3212 skb1 = skb2; 3213 } 3214 elt++; 3215 *trailer = skb1; 3216 skb_p = &skb1->next; 3217 } 3218 3219 return elt; 3220 } 3221 EXPORT_SYMBOL_GPL(skb_cow_data); 3222 3223 static void sock_rmem_free(struct sk_buff *skb) 3224 { 3225 struct sock *sk = skb->sk; 3226 3227 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 3228 } 3229 3230 /* 3231 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 3232 */ 3233 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 3234 { 3235 int len = skb->len; 3236 3237 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 3238 (unsigned int)sk->sk_rcvbuf) 3239 return -ENOMEM; 3240 3241 skb_orphan(skb); 3242 skb->sk = sk; 3243 skb->destructor = sock_rmem_free; 3244 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 3245 3246 /* before exiting rcu section, make sure dst is refcounted */ 3247 skb_dst_force(skb); 3248 3249 skb_queue_tail(&sk->sk_error_queue, skb); 3250 if (!sock_flag(sk, SOCK_DEAD)) 3251 sk->sk_data_ready(sk, len); 3252 return 0; 3253 } 3254 EXPORT_SYMBOL(sock_queue_err_skb); 3255 3256 void skb_tstamp_tx(struct sk_buff *orig_skb, 3257 struct skb_shared_hwtstamps *hwtstamps) 3258 { 3259 struct sock *sk = orig_skb->sk; 3260 struct sock_exterr_skb *serr; 3261 struct sk_buff *skb; 3262 int err; 3263 3264 if (!sk) 3265 return; 3266 3267 skb = skb_clone(orig_skb, GFP_ATOMIC); 3268 if (!skb) 3269 return; 3270 3271 if (hwtstamps) { 3272 *skb_hwtstamps(skb) = 3273 *hwtstamps; 3274 } else { 3275 /* 3276 * no hardware time stamps available, 3277 * so keep the shared tx_flags and only 3278 * store software time stamp 3279 */ 3280 skb->tstamp = ktime_get_real(); 3281 } 3282 3283 serr = SKB_EXT_ERR(skb); 3284 memset(serr, 0, sizeof(*serr)); 3285 serr->ee.ee_errno = ENOMSG; 3286 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 3287 3288 err = sock_queue_err_skb(sk, skb); 3289 3290 if (err) 3291 kfree_skb(skb); 3292 } 3293 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 3294 3295 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 3296 { 3297 struct sock *sk = skb->sk; 3298 struct sock_exterr_skb *serr; 3299 int err; 3300 3301 skb->wifi_acked_valid = 1; 3302 skb->wifi_acked = acked; 3303 3304 serr = SKB_EXT_ERR(skb); 3305 memset(serr, 0, sizeof(*serr)); 3306 serr->ee.ee_errno = ENOMSG; 3307 
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 3308 3309 err = sock_queue_err_skb(sk, skb); 3310 if (err) 3311 kfree_skb(skb); 3312 } 3313 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 3314 3315 3316 /** 3317 * skb_partial_csum_set - set up and verify partial csum values for packet 3318 * @skb: the skb to set 3319 * @start: the number of bytes after skb->data to start checksumming. 3320 * @off: the offset from start to place the checksum. 3321 * 3322 * For untrusted partially-checksummed packets, we need to make sure the values 3323 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 3324 * 3325 * This function checks and sets those values and skb->ip_summed: if this 3326 * returns false you should drop the packet. 3327 */ 3328 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 3329 { 3330 if (unlikely(start > skb_headlen(skb)) || 3331 unlikely((int)start + off > skb_headlen(skb) - 2)) { 3332 net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", 3333 start, off, skb_headlen(skb)); 3334 return false; 3335 } 3336 skb->ip_summed = CHECKSUM_PARTIAL; 3337 skb->csum_start = skb_headroom(skb) + start; 3338 skb->csum_offset = off; 3339 return true; 3340 } 3341 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 3342 3343 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 3344 { 3345 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 3346 skb->dev->name); 3347 } 3348 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 3349 3350 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 3351 { 3352 if (head_stolen) 3353 kmem_cache_free(skbuff_head_cache, skb); 3354 else 3355 __kfree_skb(skb); 3356 } 3357 EXPORT_SYMBOL(kfree_skb_partial); 3358 3359 /** 3360 * skb_try_coalesce - try to merge skb to prior one 3361 * @to: prior buffer 3362 * @from: buffer to add 3363 * @fragstolen: pointer to boolean 3364 * @delta_truesize: written with the amount by which @to's truesize grows 3365 */ 3366 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 3367 bool *fragstolen, int *delta_truesize) 3368 { 3369 int i, delta, len = from->len; 3370 3371 *fragstolen = false; 3372 3373 if (skb_cloned(to)) 3374 return false; 3375 3376 if (len <= skb_tailroom(to)) { 3377 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 3378 *delta_truesize = 0; 3379 return true; 3380 } 3381 3382 if (skb_has_frag_list(to) || skb_has_frag_list(from)) 3383 return false; 3384 3385 if (skb_headlen(from) != 0) { 3386 struct page *page; 3387 unsigned int offset; 3388 3389 if (skb_shinfo(to)->nr_frags + 3390 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) 3391 return false; 3392 3393 if (skb_head_is_locked(from)) 3394 return false; 3395 3396 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 3397 3398 page = virt_to_head_page(from->head); 3399 offset = from->data - (unsigned char *)page_address(page); 3400 3401 skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, 3402 page, offset, skb_headlen(from)); 3403 *fragstolen = true; 3404 } else { 3405 if (skb_shinfo(to)->nr_frags + 3406 skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) 3407 return false; 3408 3409 delta = from->truesize - 3410 SKB_TRUESIZE(skb_end_pointer(from) - from->head); 3411 } 3412 3413 WARN_ON_ONCE(delta < len); 3414 3415 memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, 3416 skb_shinfo(from)->frags, 3417 skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); 3418 skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; 3419 3420 if (!skb_cloned(from)) 3421 skb_shinfo(from)->nr_frags = 0; 3422 3423 /* if the skb is not cloned this does nothing since we set nr_frags to 0 */
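	/* When @from is cloned it keeps its frags (they are shared with the
	 * clone), so grab an extra page reference for each frag that @to
	 * now also points at.
	 */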
3424 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) 3425 skb_frag_ref(from, i); 3426 3427 to->truesize += delta; 3428 to->len += len; 3429 to->data_len += len; 3430 3431 *delta_truesize = delta; 3432 return true; 3433 } 3434 EXPORT_SYMBOL(skb_try_coalesce); 3435