// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "datagram.h"

struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ?
			   skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
	__kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)

static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
			       unsigned long ip, bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;
	bool pfmemalloc;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	if (!data)
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(ksize(data));
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* Account for allocated memory : skb + skb->head */
	skb->truesize = SKB_TRUESIZE(size);
	skb->pfmemalloc = pfmemalloc;
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);

		fclones->skb2.fclone = SKB_FCLONE_CLONE;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);

/* Caller must provide SKB that is memset cleared */
static struct sk_buff *__build_skb_around(struct sk_buff *skb,
					  void *data, unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	unsigned int size = frag_size ? : ksize(data);

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	return skb;
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc().
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
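 *
 * A minimal usage sketch of the build_skb() wrapper described above
 * (hypothetical driver RX completion; data, truesize and frame_len are
 * illustrative placeholders, not real driver fields):
 *
 *	skb = build_skb(data, truesize);
 *	if (unlikely(!skb))
 *		goto drop;
 *	skb_reserve(skb, NET_SKB_PAD);
 *	skb_put(skb, frame_len);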
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));

	return __build_skb_around(skb, data, frag_size);
}

/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head_frag and skb->pfmemalloc.
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	skb = __build_skb_around(skb, data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

#define NAPI_SKB_CACHE_SIZE	64

struct napi_alloc_cache {
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}

void *napi_alloc_frag(unsigned int fragsz)
{
	fragsz = SKB_DATA_ALIGN(fragsz);

	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
}
EXPORT_SYMBOL(napi_alloc_frag);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
	struct page_frag_cache *nc;
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_irq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
	} else {
		local_bh_disable();
		data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(netdev_alloc_frag);

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
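 *
 * A minimal usage sketch (hypothetical driver code; rx_buf and frame_len
 * are illustrative placeholders):
 *
 *	skb = __netdev_alloc_skb(dev, frame_len, GFP_ATOMIC);
 *	if (unlikely(!skb))
 *		return NULL;
 *	skb_put_data(skb, rx_buf, frame_len);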
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_irq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
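 *
 * A minimal usage sketch (hypothetical NAPI poll loop; frame_len and rx_buf
 * are illustrative placeholders):
 *
 *	skb = napi_alloc_skb(napi, frame_len);
 *	if (unlikely(!skb))
 *		break;
 *	skb_put_data(skb, rx_buf, frame_len);
 *	napi_gro_receive(napi, skb);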
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;
	void *data;

	len += NET_SKB_PAD + NET_IP_ALIGN;

	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (nc->page.pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag)
		skb_free_frag(head);
	else
		kfree(head);
}

static void skb_release_data(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		return;

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i]);

	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

	skb_zcopy_clear(skb, true);
	skb_free_head(skb);
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb.
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */
void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 * kfree_skb - free an sk_buff
 * @skb: buffer to free
 *
 * Drop a reference to the buffer and free it if the usage count has
 * hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);

void kfree_skb_list(struct sk_buff *segs)
{
	while (segs) {
		struct sk_buff *next = segs->next;

		kfree_skb(segs);
		segs = next;
	}
}
EXPORT_SYMBOL(kfree_skb_list);

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise.
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
	static atomic_t can_dump_full = ATOMIC_INIT(5);
	struct skb_shared_info *sh = skb_shinfo(skb);
	struct net_device *dev = skb->dev;
	struct sock *sk = skb->sk;
	struct sk_buff *list_skb;
	bool has_mac, has_trans;
	int headroom, tailroom;
	int i, len, seg_len;

	if (full_pkt)
		full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0;

	if (full_pkt)
		len = skb->len;
	else
		len = min_t(int, skb->len, MAX_HEADER + 128);

	headroom = skb_headroom(skb);
	tailroom = skb_tailroom(skb);

	has_mac = skb_mac_header_was_set(skb);
	has_trans = skb_transport_header_was_set(skb);

	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
	       level, skb->len, headroom, skb_headlen(skb), tailroom,
	       has_mac ? skb->mac_header : -1,
	       has_mac ? skb_mac_header_len(skb) : -1,
	       skb->network_header,
	       has_trans ? skb_network_header_len(skb) : -1,
	       has_trans ?
			   skb->transport_header : -1,
	       sh->tx_flags, sh->nr_frags,
	       sh->gso_size, sh->gso_type, sh->gso_segs,
	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
	       skb->csum_valid, skb->csum_level,
	       skb->hash, skb->sw_hash, skb->l4_hash,
	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);

	if (dev)
		printk("%sdev name=%s feat=0x%pNF\n",
		       level, dev->name, &dev->features);
	if (sk)
		printk("%ssk family=%hu type=%u proto=%u\n",
		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

	if (full_pkt && headroom)
		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->head, headroom, false);

	seg_len = min_t(int, skb_headlen(skb), len);
	if (seg_len)
		print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->data, seg_len, false);
	len -= seg_len;

	if (full_pkt && tailroom)
		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb_tail_pointer(skb), tailroom, false);

	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		skb_frag_foreach_page(frag, skb_frag_off(frag),
				      skb_frag_size(frag), p, p_off, p_len,
				      copied) {
			seg_len = min_t(int, p_len, len);
			vaddr = kmap_atomic(p);
			print_hex_dump(level, "skb frag: ",
				       DUMP_PREFIX_OFFSET,
				       16, 1, vaddr + p_off, seg_len, false);
			kunmap_atomic(vaddr);
			len -= seg_len;
			if (!len)
				break;
		}
	}

	if (full_pkt && skb_has_frag_list(skb)) {
		printk("skb fraglist:\n");
		skb_walk_frags(skb, list_skb)
			skb_dump(level, list_skb, true);
	}
}
EXPORT_SYMBOL(skb_dump);

/**
 * skb_tx_error - report an sk_buff xmit error
 * @skb: buffer that triggered an error
 *
 * Report xmit error if a device callback is tracking this skb.
 * skb must be freed afterwards.
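 *
 * A minimal sketch of the intended calling pattern (hypothetical driver
 * transmit path; tx_dev and dma are illustrative placeholders):
 *
 *	if (dma_mapping_error(tx_dev, dma)) {
 *		skb_tx_error(skb);
 *		dev_kfree_skb_any(skb);
 *		return NETDEV_TX_OK;
 *	}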
 */
void skb_tx_error(struct sk_buff *skb)
{
	skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 * consume_skb - free an skbuff
 * @skb: buffer to free
 *
 * Drop a ref to the buffer and free it if the usage count has hit zero
 * Functions identically to kfree_skb, but kfree_skb assumes that the frame
 * is being dropped after a failure and notes that
 */
void consume_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 * __consume_stateless_skb - free an skbuff, assuming it is stateless
 * @skb: buffer to free
 *
 * Like consume_skb(), but this variant assumes that this is the last
 * skb reference and all the head states have been already dropped
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb);
	skb_release_data(skb);
	kfree_skbmem(skb);
}

void __kfree_skb_flush(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	/* flush skb_cache if containing objects */
	if (nc->skb_count) {
		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
				     nc->skb_cache);
		nc->skb_count = 0;
	}
}

static inline void _kfree_skb_defer(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	/* drop skb->head and call any destructors for packet */
	skb_release_all(skb);

	/* record skb to CPU local list */
	nc->skb_cache[nc->skb_count++] = skb;

#ifdef CONFIG_SLUB
	/* SLUB writes into objects when freeing */
	prefetchw(skb);
#endif

	/* flush skb_cache if it is filled */
	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
				     nc->skb_cache);
		nc->skb_count = 0;
	}
}
void __kfree_skb_defer(struct sk_buff *skb)
{
	_kfree_skb_defer(skb);
}

void napi_consume_skb(struct sk_buff *skb, int budget)
{
	/* Zero budget indicates a non-NAPI context called us, like netpoll */
	if (unlikely(!budget)) {
		dev_consume_skb_any(skb);
		return;
	}

	if (!skb_unref(skb))
		return;

	/* if reaching here SKB is ready to free */
	trace_consume_skb(skb);

	/* if SKB is a clone, don't handle this case */
	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
		__kfree_skb(skb);
		return;
	}

	_kfree_skb_defer(skb);
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
		     offsetof(struct sk_buff, headers_start));	\
	BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
		     offsetof(struct sk_buff, headers_end));	\

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	/* We do not copy old->sk */
	new->dev = old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
	__skb_ext_copy(new, old);
	__nf_copy(new, old, false);

	/* Note : this field could be in headers_start/headers_end section
	 * It is not yet because we do not want to have a 16 bit hole
	 */
	new->queue_mapping = old->queue_mapping;

	memcpy(&new->headers_start, &old->headers_start,
	       offsetof(struct
			sk_buff, headers_end) -
	       offsetof(struct sk_buff, headers_start));
	CHECK_SKB_FIELD(protocol);
	CHECK_SKB_FIELD(csum);
	CHECK_SKB_FIELD(hash);
	CHECK_SKB_FIELD(priority);
	CHECK_SKB_FIELD(skb_iif);
	CHECK_SKB_FIELD(vlan_proto);
	CHECK_SKB_FIELD(vlan_tci);
	CHECK_SKB_FIELD(transport_header);
	CHECK_SKB_FIELD(network_header);
	CHECK_SKB_FIELD(mac_header);
	CHECK_SKB_FIELD(inner_protocol);
	CHECK_SKB_FIELD(inner_transport_header);
	CHECK_SKB_FIELD(inner_network_header);
	CHECK_SKB_FIELD(inner_mac_header);
	CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
	CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	CHECK_SKB_FIELD(napi_id);
#endif
#ifdef CONFIG_XPS
	CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->peeked = 0;
	C(pfmemalloc);
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	refcount_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
	struct sk_buff *n;

	n = alloc_skb(0, GFP_ATOMIC);
	if (!n)
		return NULL;

	n->len = first->len;
	n->data_len = first->len;
	n->truesize = first->truesize;

	skb_shinfo(n)->frag_list = first;

	__copy_skb_header(n, first);
	n->destructor = NULL;

	return n;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 * skb_morph - morph one skb into another
 * @dst: the skb to receive the contents
 * @src: the skb to supply the contents
 *
 * This is identical to skb_clone except that the target skb is
 * supplied by the user.
 *
 * The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
	unsigned long max_pg, num_pg, new_pg, old_pg;
	struct user_struct *user;

	if (capable(CAP_IPC_LOCK) || !size)
		return 0;

	num_pg = (size >> PAGE_SHIFT) + 2;	/* worst case */
	max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	user = mmp->user ?
			 : current_user();

	do {
		old_pg = atomic_long_read(&user->locked_vm);
		new_pg = old_pg + num_pg;
		if (new_pg > max_pg)
			return -ENOBUFS;
	} while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
		 old_pg);

	if (!mmp->user) {
		mmp->user = get_uid(user);
		mmp->num_pg = num_pg;
	} else {
		mmp->num_pg += num_pg;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
	if (mmp->user) {
		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
		free_uid(mmp->user);
	}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{
	struct ubuf_info *uarg;
	struct sk_buff *skb;

	WARN_ON_ONCE(!in_task());

	skb = sock_omalloc(sk, 0, GFP_KERNEL);
	if (!skb)
		return NULL;

	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
	uarg = (void *)skb->cb;
	uarg->mmp.user = NULL;

	if (mm_account_pinned_pages(&uarg->mmp, size)) {
		kfree_skb(skb);
		return NULL;
	}

	uarg->callback = sock_zerocopy_callback;
	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
	uarg->len = 1;
	uarg->bytelen = size;
	uarg->zerocopy = 1;
	refcount_set(&uarg->refcnt, 1);
	sock_hold(sk);

	return uarg;
}
EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);

static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
{
	return container_of((void *)uarg, struct sk_buff, cb);
}

struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
					struct ubuf_info *uarg)
{
	if (uarg) {
		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
		u32 bytelen, next;

		/* realloc only when socket is locked (TCP, UDP cork),
		 * so uarg->len and sk_zckey access is serialized
		 */
		if (!sock_owned_by_user(sk)) {
			WARN_ON_ONCE(1);
			return NULL;
		}

		bytelen = uarg->bytelen + size;
		if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
			/* TCP can create new skb to attach new uarg */
			if (sk->sk_type == SOCK_STREAM)
				goto new_alloc;
			return NULL;
		}

		next = (u32)atomic_read(&sk->sk_zckey);
		if ((u32)(uarg->id + uarg->len) == next) {
			if (mm_account_pinned_pages(&uarg->mmp, size))
				return NULL;
			uarg->len++;
			uarg->bytelen = bytelen;
			atomic_set(&sk->sk_zckey, ++next);

			/* no extra ref when appending to datagram (MSG_MORE) */
			if (sk->sk_type == SOCK_STREAM)
				sock_zerocopy_get(uarg);

			return uarg;
		}
	}

new_alloc:
	return sock_zerocopy_alloc(sk, size);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);

static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
	u32 old_lo, old_hi;
	u64 sum_len;

	old_lo = serr->ee.ee_info;
	old_hi = serr->ee.ee_data;
	sum_len = old_hi - old_lo + 1ULL + len;

	if (sum_len >= (1ULL << 32))
		return false;

	if (lo != old_hi + 1)
		return false;

	serr->ee.ee_data += len;
	return true;
}

void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
{
	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
	struct sock_exterr_skb *serr;
	struct sock *sk = skb->sk;
	struct
		sk_buff_head *q;
	unsigned long flags;
	u32 lo, hi;
	u16 len;

	mm_unaccount_pinned_pages(&uarg->mmp);

	/* if !len, there was only 1 call, and it was aborted
	 * so do not queue a completion notification
	 */
	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
		goto release;

	len = uarg->len;
	lo = uarg->id;
	hi = uarg->id + len - 1;

	serr = SKB_EXT_ERR(skb);
	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = 0;
	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
	serr->ee.ee_data = hi;
	serr->ee.ee_info = lo;
	if (!success)
		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

	q = &sk->sk_error_queue;
	spin_lock_irqsave(&q->lock, flags);
	tail = skb_peek_tail(q);
	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
	    !skb_zerocopy_notify_extend(tail, lo, len)) {
		__skb_queue_tail(q, skb);
		skb = NULL;
	}
	spin_unlock_irqrestore(&q->lock, flags);

	sk->sk_error_report(sk);

release:
	consume_skb(skb);
	sock_put(sk);
}
EXPORT_SYMBOL_GPL(sock_zerocopy_callback);

void sock_zerocopy_put(struct ubuf_info *uarg)
{
	if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
		if (uarg->callback)
			uarg->callback(uarg, uarg->zerocopy);
		else
			consume_skb(skb_from_uarg(uarg));
	}
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put);

void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
	if (uarg) {
		struct sock *sk = skb_from_uarg(uarg)->sk;

		atomic_dec(&sk->sk_zckey);
		uarg->len--;

		if (have_uref)
			sock_zerocopy_put(uarg);
	}
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);

int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg)
{
	struct ubuf_info *orig_uarg = skb_zcopy(skb);
	struct iov_iter orig_iter = msg->msg_iter;
	int err, orig_len = skb->len;

	/* An skb can only point to one uarg. This edge case happens when
	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
	 */
	if (orig_uarg && uarg != orig_uarg)
		return -EEXIST;

	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
		struct sock *save_sk = skb->sk;

		/* Streams do not free skb on error. Reset to prev state.
		 */
		msg->msg_iter = orig_iter;
		skb->sk = sk;
		___pskb_trim(skb, orig_len);
		skb->sk = save_sk;
		return err;
	}

	skb_zcopy_set(skb, uarg, NULL);
	return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			      gfp_t gfp_mask)
{
	if (skb_zcopy(orig)) {
		if (skb_zcopy(nskb)) {
			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
			if (!gfp_mask) {
				WARN_ON_ONCE(1);
				return -ENOMEM;
			}
			if (skb_uarg(nskb) == skb_uarg(orig))
				return 0;
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
				return -EIO;
		}
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
	}
	return 0;
}

/**
 * skb_copy_ubufs - copy userspace skb frags buffers to kernel
 * @skb: the skb to modify
 * @gfp_mask: allocation priority
 *
 * This must be called on SKBTX_DEV_ZEROCOPY skb.
 * It will copy all frags into kernel and drop the reference
 * to userspace pages.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 *
 * Returns 0 on success or a negative error code on failure
 * to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	int i, new_frags;
	u32 d_off;

	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
		return -EINVAL;

	if (!num_frags)
		goto release;

	new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < new_frags; i++) {
		page = alloc_page(gfp_mask);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)page_private(head);
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		set_page_private(page, (unsigned long)head);
		head = page;
	}

	page = head;
	d_off = 0;
	for (i = 0; i < num_frags; i++) {
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
				      p, p_off, p_len, copied) {
			u32 copy, done = 0;
			vaddr = kmap_atomic(p);

			while (done < p_len) {
				if (d_off == PAGE_SIZE) {
					d_off = 0;
					page = (struct page *)page_private(page);
				}
				copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
				memcpy(page_address(page) + d_off,
				       vaddr + p_off + done, copy);
				done += copy;
				d_off += copy;
			}
			kunmap_atomic(vaddr);
		}
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < num_frags; i++)
		skb_frag_unref(skb, i);

	/* skb frags point to kernel buffers */
	for (i = 0; i < new_frags - 1; i++) {
		__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
		head = (struct page *)page_private(head);
	}
	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
	skb_shinfo(skb)->nr_frags = new_frags;

release:
	skb_zcopy_clear(skb, false);
	return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 * skb_clone - duplicate an sk_buff
 * @skb: buffer to clone
 * @gfp_mask: allocation priority
 *
 * Duplicate an &sk_buff. The new one is not owned by a socket. Both
 * copies share the same packet data but not structure.
 * The new buffer has a reference count of 1. If the allocation fails the
 * function returns %NULL otherwise the new buffer is returned.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff_fclones *fclones = container_of(skb,
						       struct sk_buff_fclones,
						       skb1);
	struct sk_buff *n;

	if (skb_orphan_frags(skb, gfp_mask))
		return NULL;

	if (skb->fclone == SKB_FCLONE_ORIG &&
	    refcount_read(&fclones->fclone_ref) == 1) {
		n = &fclones->skb2;
		refcount_set(&fclones->fclone_ref, 2);
	} else {
		if (skb_pfmemalloc(skb))
			gfp_mask |= __GFP_MEMALLOC;

		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;

		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);

void skb_headers_offset_update(struct sk_buff *skb, int off)
{
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += off;
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->transport_header += off;
	skb->network_header += off;
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	skb->inner_transport_header += off;
	skb->inner_network_header += off;
	skb->inner_mac_header += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);

void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
	__copy_skb_header(new, old);

	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
EXPORT_SYMBOL(skb_copy_header);

static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
	if (skb_pfmemalloc(skb))
		return SKB_ALLOC_RX;
	return 0;
}

/**
 * skb_copy - create private copy of an sk_buff
 * @skb: buffer to copy
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data. This is used when the
 * caller wishes to modify the data and needs a private copy of the
 * data to alter. Returns %NULL on failure or the pointer to the buffer
 * on success. The returned buffer has a reference count of 1.
 *
 * As by-product this function converts non-linear &sk_buff to linear
 * one, so that &sk_buff becomes completely private and caller is allowed
 * to modify all the data of returned buffer. This means that this
 * function is not recommended for use in circumstances when only
 * header is going to be modified. Use pskb_copy() instead.
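 *
 * A minimal usage sketch (illustrative only):
 *
 *	nskb = skb_copy(skb, GFP_ATOMIC);
 *	if (!nskb)
 *		return -ENOMEM;
 *
 * The copy is linear and private, so all of its data may be modified.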
 */
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
	int headerlen = skb_headroom(skb);
	unsigned int size = skb_end_offset(skb) + skb->data_len;
	struct sk_buff *n = __alloc_skb(size, gfp_mask,
					skb_alloc_rx_flag(skb), NUMA_NO_NODE);

	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));

	skb_copy_header(n, skb);
	return n;
}
EXPORT_SYMBOL(skb_copy);

/**
 * __pskb_copy_fclone - create copy of an sk_buff with private head.
 * @skb: buffer to copy
 * @headroom: headroom of new skb
 * @gfp_mask: allocation priority
 * @fclone: if true allocate the copy of the skb from the fclone
 *	cache instead of the head cache; it is recommended to set this
 *	to true for the cases where the copy will likely be cloned
 *
 * Make a copy of both an &sk_buff and part of its data, located
 * in header. Fragmented data remain shared. This is used when
 * the caller wishes to modify only header of &sk_buff and needs
 * private copy of the header to alter. Returns %NULL on failure
 * or the pointer to the buffer on success.
 * The returned buffer has a reference count of 1.
 */
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
				   gfp_t gfp_mask, bool fclone)
{
	unsigned int size = skb_headlen(skb) + headroom;
	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);

	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, headroom);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);

	n->truesize += skb->data_len;
	n->data_len = skb->data_len;
	n->len = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		if (skb_orphan_frags(skb, gfp_mask) ||
		    skb_zerocopy_clone(n, skb, gfp_mask)) {
			kfree_skb(n);
			n = NULL;
			goto out;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			skb_frag_ref(skb, i);
		}
		skb_shinfo(n)->nr_frags = i;
	}

	if (skb_has_frag_list(skb)) {
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	skb_copy_header(n, skb);
out:
	return n;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 * pskb_expand_head - reallocate header of &sk_buff
 * @skb: buffer to reallocate
 * @nhead: room to add at head
 * @ntail: room to add at tail
 * @gfp_mask: allocation priority
 *
 * Expands (or creates identical copy, if @nhead and @ntail are zero)
 * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 * reference count of 1. Returns zero in the case of success or a
 * negative error code if expansion failed. In the latter case, &sk_buff
 * is not changed.
 *
 * All the pointers pointing into skb header may change and must be
 * reloaded after call to this function.
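 *
 * A minimal usage sketch (illustrative; needed_headroom is a hypothetical
 * variable):
 *
 *	if (skb_headroom(skb) < needed_headroom &&
 *	    pskb_expand_head(skb, needed_headroom - skb_headroom(skb), 0,
 *			     GFP_ATOMIC))
 *		goto drop;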
 */
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		     gfp_t gfp_mask)
{
	int i, osize = skb_end_offset(skb);
	int size = osize + nhead + ntail;
	long off;
	u8 *data;

	BUG_ON(nhead < 0);

	BUG_ON(skb_shared(skb));

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		goto nodata;
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void.
	 */
	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

	/*
	 * if shinfo is shared we must drop the old head gracefully, but if it
	 * is not we can just drop the old head and let the existing refcount
	 * be since all we did is relocate the values
	 */
	if (skb_cloned(skb)) {
		if (skb_orphan_frags(skb, gfp_mask))
			goto nofrags;
		if (skb_zcopy(skb))
			refcount_inc(&skb_uarg(skb)->refcnt);
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);

		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
	} else {
		skb_free_head(skb);
	}
	off = (data + nhead) - skb->head;

	skb->head = data;
	skb->head_frag = 0;
	skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end = size;
	off = nhead;
#else
	skb->end = skb->head + size;
#endif
	skb->tail += off;
	skb_headers_offset_update(skb, nhead);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);

	skb_metadata_clear(skb);

	/* It is not generally safe to change skb->truesize.
	 * For the moment, we really care of rx path, or
	 * when skb is orphaned (not attached to a socket).
	 */
	if (!skb->sk || skb->destructor == sock_edemux)
		skb->truesize += size - osize;

	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make private copy of skb with writable head and some headroom */
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/**
 * skb_copy_expand - copy and expand sk_buff
 * @skb: buffer to copy
 * @newheadroom: new free bytes at head
 * @newtailroom: new free bytes at tail
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data and while doing so
 * allocate additional space.
 *
 * This is used when the caller wishes to modify the data and needs a
 * private copy of the data to alter as well as more space for new fields.
 * Returns %NULL on failure or the pointer to the buffer
 * on success. The returned buffer has a reference count of 1.
 *
 * You must pass %GFP_ATOMIC as the allocation priority if this function
 * is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
				int newheadroom, int newtailroom,
				gfp_t gfp_mask)
{
	/*
	 * Allocate the copy buffer
	 */
	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
					gfp_mask, skb_alloc_rx_flag(skb),
					NUMA_NO_NODE);
	int oldheadroom = skb_headroom(skb);
	int head_copy_len, head_copy_off;

	if (!n)
		return NULL;

	skb_reserve(n, newheadroom);

	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	head_copy_len = oldheadroom;
	head_copy_off = 0;
	if (newheadroom <= head_copy_len)
		head_copy_len = newheadroom;
	else
		head_copy_off = newheadroom - head_copy_len;

	/* Copy the linear header and data. */
	BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
			     skb->len + head_copy_len));

	skb_copy_header(n, skb);

	skb_headers_offset_update(n, newheadroom - oldheadroom);

	return n;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 * __skb_pad - zero pad the tail of an skb
 * @skb: buffer to pad
 * @pad: space to pad
 * @free_on_error: free buffer on error
 *
 * Ensure that a buffer is followed by a padding area that is zero
 * filled. Used by network drivers which may DMA or transfer data
 * beyond the buffer end onto the wire.
 *
 * May return error in out of memory cases. The skb is freed on error
 * if @free_on_error is true.
 */
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
	int err;
	int ntail;

	/* If the skbuff is non linear tailroom is always zero.. */
	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
		memset(skb->data + skb->len, 0, pad);
		return 0;
	}

	ntail = skb->data_len + pad - (skb->end - skb->tail);
	if (likely(skb_cloned(skb) || ntail > 0)) {
		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
		if (unlikely(err))
			goto free_skb;
	}

	/* FIXME: The use of this function with non-linear skb's really needs
	 * to be audited.
	 */
	err = skb_linearize(skb);
	if (unlikely(err))
		goto free_skb;

	memset(skb->data + skb->len, 0, pad);
	return 0;

free_skb:
	if (free_on_error)
		kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL(__skb_pad);

/**
 * pskb_put - add data to the tail of a potentially fragmented buffer
 * @skb: start of the buffer to use
 * @tail: tail fragment of the buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the potentially
 * fragmented buffer. @tail must be the last fragment of @skb -- or
 * @skb itself. If this would exceed the total buffer size the kernel
 * will panic. A pointer to the first byte of the extra data is
 * returned.
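 *
 * A minimal usage sketch (illustrative; @tail is the last fragment previously
 * chained to @skb and trailer_len is a hypothetical length):
 *
 *	p = pskb_put(skb, tail, trailer_len);
 *	memset(p, 0, trailer_len);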
 */
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
	if (tail != skb) {
		skb->data_len += len;
		skb->len += len;
	}
	return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 * skb_put - add data to a buffer
 * @skb: buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the buffer. If this would
 * exceed the total buffer size the kernel will panic. A pointer to the
 * first byte of the extra data is returned.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
	void *tmp = skb_tail_pointer(skb);
	SKB_LINEAR_ASSERT(skb);
	skb->tail += len;
	skb->len += len;
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));
	return tmp;
}
EXPORT_SYMBOL(skb_put);

/**
 * skb_push - add data to the start of a buffer
 * @skb: buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the buffer at the buffer
 * start. If this would exceed the total buffer headroom the kernel will
 * panic. A pointer to the first byte of the extra data is returned.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len += len;
	if (unlikely(skb->data < skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 * skb_pull - remove data from the start of a buffer
 * @skb: buffer to use
 * @len: amount of data to remove
 *
 * This function removes data from the start of a buffer, returning
 * the memory to the headroom. A pointer to the next data in the buffer
 * is returned. Once the data has been pulled future pushes will overwrite
 * the old data.
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
	return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 * skb_trim - remove end from a buffer
 * @skb: buffer to alter
 * @len: new length
 *
 * Cut the length of a buffer down by removing data from the tail. If
 * the buffer is already under the length specified it is not modified.
 * The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	if (skb->len > len)
		__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
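 *
 * Most callers should not use this helper directly but go through
 * pskb_trim() or pskb_trim_rcsum(), which fall back to it only when the
 * skb is non-linear, e.g. (illustrative):
 *
 *	if (pskb_trim_rcsum(skb, new_len))
 *		goto drop;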
 */
int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
	struct sk_buff **fragp;
	struct sk_buff *frag;
	int offset = skb_headlen(skb);
	int nfrags = skb_shinfo(skb)->nr_frags;
	int i;
	int err;

	if (skb_cloned(skb) &&
	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
		return err;

	i = 0;
	if (offset >= len)
		goto drop_pages;

	for (; i < nfrags; i++) {
		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (end < len) {
			offset = end;
			continue;
		}

		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
		skb_shinfo(skb)->nr_frags = i;

		for (; i < nfrags; i++)
			skb_frag_unref(skb, i);

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);
		goto done;
	}

	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
	     fragp = &frag->next) {
		int end = offset + frag->len;

		if (skb_shared(frag)) {
			struct sk_buff *nfrag;

			nfrag = skb_clone(frag, GFP_ATOMIC);
			if (unlikely(!nfrag))
				return -ENOMEM;

			nfrag->next = frag->next;
			consume_skb(frag);
			frag = nfrag;
			*fragp = frag;
		}

		if (end < len) {
			offset = end;
			continue;
		}

		if (end > len &&
		    unlikely((err = pskb_trim(frag, len - offset))))
			return err;

		if (frag->next)
			skb_drop_list(&frag->next);
		break;
	}

done:
	if (len > skb_headlen(skb)) {
		skb->data_len -= skb->len - len;
		skb->len = len;
	} else {
		skb->len = len;
		skb->data_len = 0;
		skb_set_tail_pointer(skb, len);
	}

	if (!skb->sk || skb->destructor == sock_edemux)
		skb_condense(skb);
	return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/* Note : use pskb_trim_rcsum() instead of calling this directly
 */
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		int delta = skb->len - len;

		skb->csum = csum_block_sub(skb->csum,
					   skb_checksum(skb, len, delta, 0),
					   len);
	}
	return __pskb_trim(skb, len);
}
EXPORT_SYMBOL(pskb_trim_rcsum_slow);

/**
 * __pskb_pull_tail - advance tail of skb header
 * @skb: buffer to reallocate
 * @delta: number of bytes to advance tail
 *
 * The function makes sense only on a fragmented &sk_buff,
 * it expands header moving its tail forward and copying necessary
 * data from fragmented part.
 *
 * &sk_buff MUST have reference count of 1.
 *
 * Returns %NULL (and &sk_buff does not change) if pull failed
 * or value of new tail of skb in the case of success.
 *
 * All the pointers pointing into skb header may change and must be
 * reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
	/* If skb has not enough free space at tail, get new one
	 * plus 128 bytes for future expansions. If we have enough
	 * room at tail, reallocate without expansion only if skb is cloned.
2053 */ 2054 int i, k, eat = (skb->tail + delta) - skb->end; 2055 2056 if (eat > 0 || skb_cloned(skb)) { 2057 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 2058 GFP_ATOMIC)) 2059 return NULL; 2060 } 2061 2062 BUG_ON(skb_copy_bits(skb, skb_headlen(skb), 2063 skb_tail_pointer(skb), delta)); 2064 2065 /* Optimization: no fragments, no reasons to preestimate 2066 * size of pulled pages. Superb. 2067 */ 2068 if (!skb_has_frag_list(skb)) 2069 goto pull_pages; 2070 2071 /* Estimate size of pulled pages. */ 2072 eat = delta; 2073 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2074 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2075 2076 if (size >= eat) 2077 goto pull_pages; 2078 eat -= size; 2079 } 2080 2081 /* If we need update frag list, we are in troubles. 2082 * Certainly, it is possible to add an offset to skb data, 2083 * but taking into account that pulling is expected to 2084 * be very rare operation, it is worth to fight against 2085 * further bloating skb head and crucify ourselves here instead. 2086 * Pure masohism, indeed. 8)8) 2087 */ 2088 if (eat) { 2089 struct sk_buff *list = skb_shinfo(skb)->frag_list; 2090 struct sk_buff *clone = NULL; 2091 struct sk_buff *insp = NULL; 2092 2093 do { 2094 if (list->len <= eat) { 2095 /* Eaten as whole. */ 2096 eat -= list->len; 2097 list = list->next; 2098 insp = list; 2099 } else { 2100 /* Eaten partially. */ 2101 2102 if (skb_shared(list)) { 2103 /* Sucks! We need to fork list. :-( */ 2104 clone = skb_clone(list, GFP_ATOMIC); 2105 if (!clone) 2106 return NULL; 2107 insp = list->next; 2108 list = clone; 2109 } else { 2110 /* This may be pulled without 2111 * problems. */ 2112 insp = list; 2113 } 2114 if (!pskb_pull(list, eat)) { 2115 kfree_skb(clone); 2116 return NULL; 2117 } 2118 break; 2119 } 2120 } while (eat); 2121 2122 /* Free pulled out fragments. */ 2123 while ((list = skb_shinfo(skb)->frag_list) != insp) { 2124 skb_shinfo(skb)->frag_list = list->next; 2125 kfree_skb(list); 2126 } 2127 /* And insert new clone at head. */ 2128 if (clone) { 2129 clone->next = list; 2130 skb_shinfo(skb)->frag_list = clone; 2131 } 2132 } 2133 /* Success! Now we may commit changes to skb data. */ 2134 2135 pull_pages: 2136 eat = delta; 2137 k = 0; 2138 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2139 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2140 2141 if (size <= eat) { 2142 skb_frag_unref(skb, i); 2143 eat -= size; 2144 } else { 2145 skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; 2146 2147 *frag = skb_shinfo(skb)->frags[i]; 2148 if (eat) { 2149 skb_frag_off_add(frag, eat); 2150 skb_frag_size_sub(frag, eat); 2151 if (!i) 2152 goto end; 2153 eat = 0; 2154 } 2155 k++; 2156 } 2157 } 2158 skb_shinfo(skb)->nr_frags = k; 2159 2160 end: 2161 skb->tail += delta; 2162 skb->data_len -= delta; 2163 2164 if (!skb->data_len) 2165 skb_zcopy_clear(skb, false); 2166 2167 return skb_tail_pointer(skb); 2168 } 2169 EXPORT_SYMBOL(__pskb_pull_tail); 2170 2171 /** 2172 * skb_copy_bits - copy bits from skb to kernel buffer 2173 * @skb: source skb 2174 * @offset: offset in source 2175 * @to: destination buffer 2176 * @len: number of bytes to copy 2177 * 2178 * Copy the specified number of bytes from the source skb to the 2179 * destination buffer. 2180 * 2181 * CAUTION ! : 2182 * If its prototype is ever changed, 2183 * check arch/{*}/net/{*}.S files, 2184 * since it is called from BPF assembly code. 
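 *
 * Typical use (an illustrative sketch, not taken from a specific caller):
 * copy a header that may span fragments into a local buffer before
 * parsing it:
 *
 *	struct udphdr uh;
 *
 *	if (skb_copy_bits(skb, offset, &uh, sizeof(uh)) < 0)
 *		goto drop;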
2185 */ 2186 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 2187 { 2188 int start = skb_headlen(skb); 2189 struct sk_buff *frag_iter; 2190 int i, copy; 2191 2192 if (offset > (int)skb->len - len) 2193 goto fault; 2194 2195 /* Copy header. */ 2196 if ((copy = start - offset) > 0) { 2197 if (copy > len) 2198 copy = len; 2199 skb_copy_from_linear_data_offset(skb, offset, to, copy); 2200 if ((len -= copy) == 0) 2201 return 0; 2202 offset += copy; 2203 to += copy; 2204 } 2205 2206 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2207 int end; 2208 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 2209 2210 WARN_ON(start > offset + len); 2211 2212 end = start + skb_frag_size(f); 2213 if ((copy = end - offset) > 0) { 2214 u32 p_off, p_len, copied; 2215 struct page *p; 2216 u8 *vaddr; 2217 2218 if (copy > len) 2219 copy = len; 2220 2221 skb_frag_foreach_page(f, 2222 skb_frag_off(f) + offset - start, 2223 copy, p, p_off, p_len, copied) { 2224 vaddr = kmap_atomic(p); 2225 memcpy(to + copied, vaddr + p_off, p_len); 2226 kunmap_atomic(vaddr); 2227 } 2228 2229 if ((len -= copy) == 0) 2230 return 0; 2231 offset += copy; 2232 to += copy; 2233 } 2234 start = end; 2235 } 2236 2237 skb_walk_frags(skb, frag_iter) { 2238 int end; 2239 2240 WARN_ON(start > offset + len); 2241 2242 end = start + frag_iter->len; 2243 if ((copy = end - offset) > 0) { 2244 if (copy > len) 2245 copy = len; 2246 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 2247 goto fault; 2248 if ((len -= copy) == 0) 2249 return 0; 2250 offset += copy; 2251 to += copy; 2252 } 2253 start = end; 2254 } 2255 2256 if (!len) 2257 return 0; 2258 2259 fault: 2260 return -EFAULT; 2261 } 2262 EXPORT_SYMBOL(skb_copy_bits); 2263 2264 /* 2265 * Callback from splice_to_pipe(), if we need to release some pages 2266 * at the end of the spd in case we error'ed out in filling the pipe. 2267 */ 2268 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 2269 { 2270 put_page(spd->pages[i]); 2271 } 2272 2273 static struct page *linear_to_page(struct page *page, unsigned int *len, 2274 unsigned int *offset, 2275 struct sock *sk) 2276 { 2277 struct page_frag *pfrag = sk_page_frag(sk); 2278 2279 if (!sk_page_frag_refill(sk, pfrag)) 2280 return NULL; 2281 2282 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); 2283 2284 memcpy(page_address(pfrag->page) + pfrag->offset, 2285 page_address(page) + *offset, *len); 2286 *offset = pfrag->offset; 2287 pfrag->offset += *len; 2288 2289 return pfrag->page; 2290 } 2291 2292 static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 2293 struct page *page, 2294 unsigned int offset) 2295 { 2296 return spd->nr_pages && 2297 spd->pages[spd->nr_pages - 1] == page && 2298 (spd->partial[spd->nr_pages - 1].offset + 2299 spd->partial[spd->nr_pages - 1].len == offset); 2300 } 2301 2302 /* 2303 * Fill page/offset/length into spd, if it can hold more pages. 
2304 */ 2305 static bool spd_fill_page(struct splice_pipe_desc *spd, 2306 struct pipe_inode_info *pipe, struct page *page, 2307 unsigned int *len, unsigned int offset, 2308 bool linear, 2309 struct sock *sk) 2310 { 2311 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 2312 return true; 2313 2314 if (linear) { 2315 page = linear_to_page(page, len, &offset, sk); 2316 if (!page) 2317 return true; 2318 } 2319 if (spd_can_coalesce(spd, page, offset)) { 2320 spd->partial[spd->nr_pages - 1].len += *len; 2321 return false; 2322 } 2323 get_page(page); 2324 spd->pages[spd->nr_pages] = page; 2325 spd->partial[spd->nr_pages].len = *len; 2326 spd->partial[spd->nr_pages].offset = offset; 2327 spd->nr_pages++; 2328 2329 return false; 2330 } 2331 2332 static bool __splice_segment(struct page *page, unsigned int poff, 2333 unsigned int plen, unsigned int *off, 2334 unsigned int *len, 2335 struct splice_pipe_desc *spd, bool linear, 2336 struct sock *sk, 2337 struct pipe_inode_info *pipe) 2338 { 2339 if (!*len) 2340 return true; 2341 2342 /* skip this segment if already processed */ 2343 if (*off >= plen) { 2344 *off -= plen; 2345 return false; 2346 } 2347 2348 /* ignore any bits we already processed */ 2349 poff += *off; 2350 plen -= *off; 2351 *off = 0; 2352 2353 do { 2354 unsigned int flen = min(*len, plen); 2355 2356 if (spd_fill_page(spd, pipe, page, &flen, poff, 2357 linear, sk)) 2358 return true; 2359 poff += flen; 2360 plen -= flen; 2361 *len -= flen; 2362 } while (*len && plen); 2363 2364 return false; 2365 } 2366 2367 /* 2368 * Map linear and fragment data from the skb to spd. It reports true if the 2369 * pipe is full or if we already spliced the requested length. 2370 */ 2371 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 2372 unsigned int *offset, unsigned int *len, 2373 struct splice_pipe_desc *spd, struct sock *sk) 2374 { 2375 int seg; 2376 struct sk_buff *iter; 2377 2378 /* map the linear part : 2379 * If skb->head_frag is set, this 'linear' part is backed by a 2380 * fragment, and if the head is not shared with any clones then 2381 * we can avoid a copy since we own the head portion of this page. 2382 */ 2383 if (__splice_segment(virt_to_page(skb->data), 2384 (unsigned long) skb->data & (PAGE_SIZE - 1), 2385 skb_headlen(skb), 2386 offset, len, spd, 2387 skb_head_is_locked(skb), 2388 sk, pipe)) 2389 return true; 2390 2391 /* 2392 * then map the fragments 2393 */ 2394 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 2395 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 2396 2397 if (__splice_segment(skb_frag_page(f), 2398 skb_frag_off(f), skb_frag_size(f), 2399 offset, len, spd, false, sk, pipe)) 2400 return true; 2401 } 2402 2403 skb_walk_frags(skb, iter) { 2404 if (*offset >= iter->len) { 2405 *offset -= iter->len; 2406 continue; 2407 } 2408 /* __skb_splice_bits() only fails if the output has no room 2409 * left, so no point in going over the frag_list for the error 2410 * case. 2411 */ 2412 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) 2413 return true; 2414 } 2415 2416 return false; 2417 } 2418 2419 /* 2420 * Map data from the skb to a pipe. Should handle both the linear part, 2421 * the fragments, and the frag list. 
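 *
 * Caller sketch (illustrative; the surrounding read-actor plumbing is
 * assumed and not shown here): a protocol's splice path typically loops
 * along these lines:
 *
 *	copied = skb_splice_bits(skb, sk, offset, pipe, len, flags);
 *	if (copied > 0) {
 *		offset += copied;
 *		len -= copied;
 *	}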
2422 */ 2423 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 2424 struct pipe_inode_info *pipe, unsigned int tlen, 2425 unsigned int flags) 2426 { 2427 struct partial_page partial[MAX_SKB_FRAGS]; 2428 struct page *pages[MAX_SKB_FRAGS]; 2429 struct splice_pipe_desc spd = { 2430 .pages = pages, 2431 .partial = partial, 2432 .nr_pages_max = MAX_SKB_FRAGS, 2433 .ops = &nosteal_pipe_buf_ops, 2434 .spd_release = sock_spd_release, 2435 }; 2436 int ret = 0; 2437 2438 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 2439 2440 if (spd.nr_pages) 2441 ret = splice_to_pipe(pipe, &spd); 2442 2443 return ret; 2444 } 2445 EXPORT_SYMBOL_GPL(skb_splice_bits); 2446 2447 /* Send skb data on a socket. Socket must be locked. */ 2448 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 2449 int len) 2450 { 2451 unsigned int orig_len = len; 2452 struct sk_buff *head = skb; 2453 unsigned short fragidx; 2454 int slen, ret; 2455 2456 do_frag_list: 2457 2458 /* Deal with head data */ 2459 while (offset < skb_headlen(skb) && len) { 2460 struct kvec kv; 2461 struct msghdr msg; 2462 2463 slen = min_t(int, len, skb_headlen(skb) - offset); 2464 kv.iov_base = skb->data + offset; 2465 kv.iov_len = slen; 2466 memset(&msg, 0, sizeof(msg)); 2467 msg.msg_flags = MSG_DONTWAIT; 2468 2469 ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); 2470 if (ret <= 0) 2471 goto error; 2472 2473 offset += ret; 2474 len -= ret; 2475 } 2476 2477 /* All the data was skb head? */ 2478 if (!len) 2479 goto out; 2480 2481 /* Make offset relative to start of frags */ 2482 offset -= skb_headlen(skb); 2483 2484 /* Find where we are in frag list */ 2485 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2486 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2487 2488 if (offset < skb_frag_size(frag)) 2489 break; 2490 2491 offset -= skb_frag_size(frag); 2492 } 2493 2494 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2495 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2496 2497 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 2498 2499 while (slen) { 2500 ret = kernel_sendpage_locked(sk, skb_frag_page(frag), 2501 skb_frag_off(frag) + offset, 2502 slen, MSG_DONTWAIT); 2503 if (ret <= 0) 2504 goto error; 2505 2506 len -= ret; 2507 offset += ret; 2508 slen -= ret; 2509 } 2510 2511 offset = 0; 2512 } 2513 2514 if (len) { 2515 /* Process any frag lists */ 2516 2517 if (skb == head) { 2518 if (skb_has_frag_list(skb)) { 2519 skb = skb_shinfo(skb)->frag_list; 2520 goto do_frag_list; 2521 } 2522 } else if (skb->next) { 2523 skb = skb->next; 2524 goto do_frag_list; 2525 } 2526 } 2527 2528 out: 2529 return orig_len - len; 2530 2531 error: 2532 return orig_len == len ? ret : orig_len - len; 2533 } 2534 EXPORT_SYMBOL_GPL(skb_send_sock_locked); 2535 2536 /** 2537 * skb_store_bits - store bits from kernel buffer to skb 2538 * @skb: destination buffer 2539 * @offset: offset in destination 2540 * @from: source buffer 2541 * @len: number of bytes to copy 2542 * 2543 * Copy the specified number of bytes from the source buffer to the 2544 * destination skb. This function handles all the messy bits of 2545 * traversing fragment lists and such. 
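 *
 * Typical use (illustrative sketch): overwrite a field at a known offset
 * even when it lives in paged data:
 *
 *	__be16 newval = htons(val);
 *
 *	if (skb_store_bits(skb, offset, &newval, sizeof(newval)))
 *		goto fail;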
2546 */ 2547 2548 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 2549 { 2550 int start = skb_headlen(skb); 2551 struct sk_buff *frag_iter; 2552 int i, copy; 2553 2554 if (offset > (int)skb->len - len) 2555 goto fault; 2556 2557 if ((copy = start - offset) > 0) { 2558 if (copy > len) 2559 copy = len; 2560 skb_copy_to_linear_data_offset(skb, offset, from, copy); 2561 if ((len -= copy) == 0) 2562 return 0; 2563 offset += copy; 2564 from += copy; 2565 } 2566 2567 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2568 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2569 int end; 2570 2571 WARN_ON(start > offset + len); 2572 2573 end = start + skb_frag_size(frag); 2574 if ((copy = end - offset) > 0) { 2575 u32 p_off, p_len, copied; 2576 struct page *p; 2577 u8 *vaddr; 2578 2579 if (copy > len) 2580 copy = len; 2581 2582 skb_frag_foreach_page(frag, 2583 skb_frag_off(frag) + offset - start, 2584 copy, p, p_off, p_len, copied) { 2585 vaddr = kmap_atomic(p); 2586 memcpy(vaddr + p_off, from + copied, p_len); 2587 kunmap_atomic(vaddr); 2588 } 2589 2590 if ((len -= copy) == 0) 2591 return 0; 2592 offset += copy; 2593 from += copy; 2594 } 2595 start = end; 2596 } 2597 2598 skb_walk_frags(skb, frag_iter) { 2599 int end; 2600 2601 WARN_ON(start > offset + len); 2602 2603 end = start + frag_iter->len; 2604 if ((copy = end - offset) > 0) { 2605 if (copy > len) 2606 copy = len; 2607 if (skb_store_bits(frag_iter, offset - start, 2608 from, copy)) 2609 goto fault; 2610 if ((len -= copy) == 0) 2611 return 0; 2612 offset += copy; 2613 from += copy; 2614 } 2615 start = end; 2616 } 2617 if (!len) 2618 return 0; 2619 2620 fault: 2621 return -EFAULT; 2622 } 2623 EXPORT_SYMBOL(skb_store_bits); 2624 2625 /* Checksum skb data. */ 2626 __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, 2627 __wsum csum, const struct skb_checksum_ops *ops) 2628 { 2629 int start = skb_headlen(skb); 2630 int i, copy = start - offset; 2631 struct sk_buff *frag_iter; 2632 int pos = 0; 2633 2634 /* Checksum header. 
*/ 2635 if (copy > 0) { 2636 if (copy > len) 2637 copy = len; 2638 csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, 2639 skb->data + offset, copy, csum); 2640 if ((len -= copy) == 0) 2641 return csum; 2642 offset += copy; 2643 pos = copy; 2644 } 2645 2646 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2647 int end; 2648 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2649 2650 WARN_ON(start > offset + len); 2651 2652 end = start + skb_frag_size(frag); 2653 if ((copy = end - offset) > 0) { 2654 u32 p_off, p_len, copied; 2655 struct page *p; 2656 __wsum csum2; 2657 u8 *vaddr; 2658 2659 if (copy > len) 2660 copy = len; 2661 2662 skb_frag_foreach_page(frag, 2663 skb_frag_off(frag) + offset - start, 2664 copy, p, p_off, p_len, copied) { 2665 vaddr = kmap_atomic(p); 2666 csum2 = INDIRECT_CALL_1(ops->update, 2667 csum_partial_ext, 2668 vaddr + p_off, p_len, 0); 2669 kunmap_atomic(vaddr); 2670 csum = INDIRECT_CALL_1(ops->combine, 2671 csum_block_add_ext, csum, 2672 csum2, pos, p_len); 2673 pos += p_len; 2674 } 2675 2676 if (!(len -= copy)) 2677 return csum; 2678 offset += copy; 2679 } 2680 start = end; 2681 } 2682 2683 skb_walk_frags(skb, frag_iter) { 2684 int end; 2685 2686 WARN_ON(start > offset + len); 2687 2688 end = start + frag_iter->len; 2689 if ((copy = end - offset) > 0) { 2690 __wsum csum2; 2691 if (copy > len) 2692 copy = len; 2693 csum2 = __skb_checksum(frag_iter, offset - start, 2694 copy, 0, ops); 2695 csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, 2696 csum, csum2, pos, copy); 2697 if ((len -= copy) == 0) 2698 return csum; 2699 offset += copy; 2700 pos += copy; 2701 } 2702 start = end; 2703 } 2704 BUG_ON(len); 2705 2706 return csum; 2707 } 2708 EXPORT_SYMBOL(__skb_checksum); 2709 2710 __wsum skb_checksum(const struct sk_buff *skb, int offset, 2711 int len, __wsum csum) 2712 { 2713 const struct skb_checksum_ops ops = { 2714 .update = csum_partial_ext, 2715 .combine = csum_block_add_ext, 2716 }; 2717 2718 return __skb_checksum(skb, offset, len, csum, &ops); 2719 } 2720 EXPORT_SYMBOL(skb_checksum); 2721 2722 /* Both of above in one bottle. */ 2723 2724 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 2725 u8 *to, int len, __wsum csum) 2726 { 2727 int start = skb_headlen(skb); 2728 int i, copy = start - offset; 2729 struct sk_buff *frag_iter; 2730 int pos = 0; 2731 2732 /* Copy header. 
*/ 2733 if (copy > 0) { 2734 if (copy > len) 2735 copy = len; 2736 csum = csum_partial_copy_nocheck(skb->data + offset, to, 2737 copy, csum); 2738 if ((len -= copy) == 0) 2739 return csum; 2740 offset += copy; 2741 to += copy; 2742 pos = copy; 2743 } 2744 2745 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2746 int end; 2747 2748 WARN_ON(start > offset + len); 2749 2750 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2751 if ((copy = end - offset) > 0) { 2752 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2753 u32 p_off, p_len, copied; 2754 struct page *p; 2755 __wsum csum2; 2756 u8 *vaddr; 2757 2758 if (copy > len) 2759 copy = len; 2760 2761 skb_frag_foreach_page(frag, 2762 skb_frag_off(frag) + offset - start, 2763 copy, p, p_off, p_len, copied) { 2764 vaddr = kmap_atomic(p); 2765 csum2 = csum_partial_copy_nocheck(vaddr + p_off, 2766 to + copied, 2767 p_len, 0); 2768 kunmap_atomic(vaddr); 2769 csum = csum_block_add(csum, csum2, pos); 2770 pos += p_len; 2771 } 2772 2773 if (!(len -= copy)) 2774 return csum; 2775 offset += copy; 2776 to += copy; 2777 } 2778 start = end; 2779 } 2780 2781 skb_walk_frags(skb, frag_iter) { 2782 __wsum csum2; 2783 int end; 2784 2785 WARN_ON(start > offset + len); 2786 2787 end = start + frag_iter->len; 2788 if ((copy = end - offset) > 0) { 2789 if (copy > len) 2790 copy = len; 2791 csum2 = skb_copy_and_csum_bits(frag_iter, 2792 offset - start, 2793 to, copy, 0); 2794 csum = csum_block_add(csum, csum2, pos); 2795 if ((len -= copy) == 0) 2796 return csum; 2797 offset += copy; 2798 to += copy; 2799 pos += copy; 2800 } 2801 start = end; 2802 } 2803 BUG_ON(len); 2804 return csum; 2805 } 2806 EXPORT_SYMBOL(skb_copy_and_csum_bits); 2807 2808 __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) 2809 { 2810 __sum16 sum; 2811 2812 sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); 2813 /* See comments in __skb_checksum_complete(). */ 2814 if (likely(!sum)) { 2815 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 2816 !skb->csum_complete_sw) 2817 netdev_rx_csum_fault(skb->dev, skb); 2818 } 2819 if (!skb_shared(skb)) 2820 skb->csum_valid = !sum; 2821 return sum; 2822 } 2823 EXPORT_SYMBOL(__skb_checksum_complete_head); 2824 2825 /* This function assumes skb->csum already holds pseudo header's checksum, 2826 * which has been changed from the hardware checksum, for example, by 2827 * __skb_checksum_validate_complete(). And, the original skb->csum must 2828 * have been validated unsuccessfully for CHECKSUM_COMPLETE case. 2829 * 2830 * It returns non-zero if the recomputed checksum is still invalid, otherwise 2831 * zero. The new checksum is stored back into skb->csum unless the skb is 2832 * shared. 2833 */ 2834 __sum16 __skb_checksum_complete(struct sk_buff *skb) 2835 { 2836 __wsum csum; 2837 __sum16 sum; 2838 2839 csum = skb_checksum(skb, 0, skb->len, 0); 2840 2841 sum = csum_fold(csum_add(skb->csum, csum)); 2842 /* This check is inverted, because we already knew the hardware 2843 * checksum is invalid before calling this function. So, if the 2844 * re-computed checksum is valid instead, then we have a mismatch 2845 * between the original skb->csum and skb_checksum(). This means either 2846 * the original hardware checksum is incorrect or we screw up skb->csum 2847 * when moving skb->data around. 
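 *
 * Usage note (illustrative, about callers rather than this function):
 * receive paths normally reach this code through the
 * skb_checksum_complete() wrapper instead of calling it directly, e.g.:
 *
 *	if (skb_checksum_complete(skb))
 *		goto csum_error;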
2848 */ 2849 if (likely(!sum)) { 2850 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 2851 !skb->csum_complete_sw) 2852 netdev_rx_csum_fault(skb->dev, skb); 2853 } 2854 2855 if (!skb_shared(skb)) { 2856 /* Save full packet checksum */ 2857 skb->csum = csum; 2858 skb->ip_summed = CHECKSUM_COMPLETE; 2859 skb->csum_complete_sw = 1; 2860 skb->csum_valid = !sum; 2861 } 2862 2863 return sum; 2864 } 2865 EXPORT_SYMBOL(__skb_checksum_complete); 2866 2867 static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) 2868 { 2869 net_warn_ratelimited( 2870 "%s: attempt to compute crc32c without libcrc32c.ko\n", 2871 __func__); 2872 return 0; 2873 } 2874 2875 static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, 2876 int offset, int len) 2877 { 2878 net_warn_ratelimited( 2879 "%s: attempt to compute crc32c without libcrc32c.ko\n", 2880 __func__); 2881 return 0; 2882 } 2883 2884 static const struct skb_checksum_ops default_crc32c_ops = { 2885 .update = warn_crc32c_csum_update, 2886 .combine = warn_crc32c_csum_combine, 2887 }; 2888 2889 const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = 2890 &default_crc32c_ops; 2891 EXPORT_SYMBOL(crc32c_csum_stub); 2892 2893 /** 2894 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() 2895 * @from: source buffer 2896 * 2897 * Calculates the amount of linear headroom needed in the 'to' skb passed 2898 * into skb_zerocopy(). 2899 */ 2900 unsigned int 2901 skb_zerocopy_headlen(const struct sk_buff *from) 2902 { 2903 unsigned int hlen = 0; 2904 2905 if (!from->head_frag || 2906 skb_headlen(from) < L1_CACHE_BYTES || 2907 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) 2908 hlen = skb_headlen(from); 2909 2910 if (skb_has_frag_list(from)) 2911 hlen = from->len; 2912 2913 return hlen; 2914 } 2915 EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); 2916 2917 /** 2918 * skb_zerocopy - Zero copy skb to skb 2919 * @to: destination buffer 2920 * @from: source buffer 2921 * @len: number of bytes to copy from source buffer 2922 * @hlen: size of linear headroom in destination buffer 2923 * 2924 * Copies up to `len` bytes from `from` to `to` by creating references 2925 * to the frags in the source buffer. 2926 * 2927 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the 2928 * headroom in the `to` buffer. 
2929 * 2930 * Return value: 2931 * 0: everything is OK 2932 * -ENOMEM: couldn't orphan frags of @from due to lack of memory 2933 * -EFAULT: skb_copy_bits() found some problem with skb geometry 2934 */ 2935 int 2936 skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) 2937 { 2938 int i, j = 0; 2939 int plen = 0; /* length of skb->head fragment */ 2940 int ret; 2941 struct page *page; 2942 unsigned int offset; 2943 2944 BUG_ON(!from->head_frag && !hlen); 2945 2946 /* dont bother with small payloads */ 2947 if (len <= skb_tailroom(to)) 2948 return skb_copy_bits(from, 0, skb_put(to, len), len); 2949 2950 if (hlen) { 2951 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); 2952 if (unlikely(ret)) 2953 return ret; 2954 len -= hlen; 2955 } else { 2956 plen = min_t(int, skb_headlen(from), len); 2957 if (plen) { 2958 page = virt_to_head_page(from->head); 2959 offset = from->data - (unsigned char *)page_address(page); 2960 __skb_fill_page_desc(to, 0, page, offset, plen); 2961 get_page(page); 2962 j = 1; 2963 len -= plen; 2964 } 2965 } 2966 2967 to->truesize += len + plen; 2968 to->len += len + plen; 2969 to->data_len += len + plen; 2970 2971 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { 2972 skb_tx_error(from); 2973 return -ENOMEM; 2974 } 2975 skb_zerocopy_clone(to, from, GFP_ATOMIC); 2976 2977 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { 2978 int size; 2979 2980 if (!len) 2981 break; 2982 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; 2983 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), 2984 len); 2985 skb_frag_size_set(&skb_shinfo(to)->frags[j], size); 2986 len -= size; 2987 skb_frag_ref(to, j); 2988 j++; 2989 } 2990 skb_shinfo(to)->nr_frags = j; 2991 2992 return 0; 2993 } 2994 EXPORT_SYMBOL_GPL(skb_zerocopy); 2995 2996 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 2997 { 2998 __wsum csum; 2999 long csstart; 3000 3001 if (skb->ip_summed == CHECKSUM_PARTIAL) 3002 csstart = skb_checksum_start_offset(skb); 3003 else 3004 csstart = skb_headlen(skb); 3005 3006 BUG_ON(csstart > skb_headlen(skb)); 3007 3008 skb_copy_from_linear_data(skb, to, csstart); 3009 3010 csum = 0; 3011 if (csstart != skb->len) 3012 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 3013 skb->len - csstart, 0); 3014 3015 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3016 long csstuff = csstart + skb->csum_offset; 3017 3018 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 3019 } 3020 } 3021 EXPORT_SYMBOL(skb_copy_and_csum_dev); 3022 3023 /** 3024 * skb_dequeue - remove from the head of the queue 3025 * @list: list to dequeue from 3026 * 3027 * Remove the head of the list. The list lock is taken so the function 3028 * may be used safely with other locking list functions. The head item is 3029 * returned or %NULL if the list is empty. 3030 */ 3031 3032 struct sk_buff *skb_dequeue(struct sk_buff_head *list) 3033 { 3034 unsigned long flags; 3035 struct sk_buff *result; 3036 3037 spin_lock_irqsave(&list->lock, flags); 3038 result = __skb_dequeue(list); 3039 spin_unlock_irqrestore(&list->lock, flags); 3040 return result; 3041 } 3042 EXPORT_SYMBOL(skb_dequeue); 3043 3044 /** 3045 * skb_dequeue_tail - remove from the tail of the queue 3046 * @list: list to dequeue from 3047 * 3048 * Remove the tail of the list. The list lock is taken so the function 3049 * may be used safely with other locking list functions. The tail item is 3050 * returned or %NULL if the list is empty. 
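 *
 * A short sketch (illustrative only) of the locked queue helpers around
 * this one:
 *
 *	struct sk_buff_head q;
 *
 *	skb_queue_head_init(&q);
 *	skb_queue_tail(&q, skb);
 *	skb = skb_dequeue(&q);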
3051 */ 3052 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) 3053 { 3054 unsigned long flags; 3055 struct sk_buff *result; 3056 3057 spin_lock_irqsave(&list->lock, flags); 3058 result = __skb_dequeue_tail(list); 3059 spin_unlock_irqrestore(&list->lock, flags); 3060 return result; 3061 } 3062 EXPORT_SYMBOL(skb_dequeue_tail); 3063 3064 /** 3065 * skb_queue_purge - empty a list 3066 * @list: list to empty 3067 * 3068 * Delete all buffers on an &sk_buff list. Each buffer is removed from 3069 * the list and one reference dropped. This function takes the list 3070 * lock and is atomic with respect to other list locking functions. 3071 */ 3072 void skb_queue_purge(struct sk_buff_head *list) 3073 { 3074 struct sk_buff *skb; 3075 while ((skb = skb_dequeue(list)) != NULL) 3076 kfree_skb(skb); 3077 } 3078 EXPORT_SYMBOL(skb_queue_purge); 3079 3080 /** 3081 * skb_rbtree_purge - empty a skb rbtree 3082 * @root: root of the rbtree to empty 3083 * Return value: the sum of truesizes of all purged skbs. 3084 * 3085 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from 3086 * the list and one reference dropped. This function does not take 3087 * any lock. Synchronization should be handled by the caller (e.g., TCP 3088 * out-of-order queue is protected by the socket lock). 3089 */ 3090 unsigned int skb_rbtree_purge(struct rb_root *root) 3091 { 3092 struct rb_node *p = rb_first(root); 3093 unsigned int sum = 0; 3094 3095 while (p) { 3096 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 3097 3098 p = rb_next(p); 3099 rb_erase(&skb->rbnode, root); 3100 sum += skb->truesize; 3101 kfree_skb(skb); 3102 } 3103 return sum; 3104 } 3105 3106 /** 3107 * skb_queue_head - queue a buffer at the list head 3108 * @list: list to use 3109 * @newsk: buffer to queue 3110 * 3111 * Queue a buffer at the start of the list. This function takes the 3112 * list lock and can be used safely with other locking &sk_buff functions 3113 * safely. 3114 * 3115 * A buffer cannot be placed on two lists at the same time. 3116 */ 3117 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 3118 { 3119 unsigned long flags; 3120 3121 spin_lock_irqsave(&list->lock, flags); 3122 __skb_queue_head(list, newsk); 3123 spin_unlock_irqrestore(&list->lock, flags); 3124 } 3125 EXPORT_SYMBOL(skb_queue_head); 3126 3127 /** 3128 * skb_queue_tail - queue a buffer at the list tail 3129 * @list: list to use 3130 * @newsk: buffer to queue 3131 * 3132 * Queue a buffer at the tail of the list. This function takes the 3133 * list lock and can be used safely with other locking &sk_buff functions 3134 * safely. 3135 * 3136 * A buffer cannot be placed on two lists at the same time. 3137 */ 3138 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 3139 { 3140 unsigned long flags; 3141 3142 spin_lock_irqsave(&list->lock, flags); 3143 __skb_queue_tail(list, newsk); 3144 spin_unlock_irqrestore(&list->lock, flags); 3145 } 3146 EXPORT_SYMBOL(skb_queue_tail); 3147 3148 /** 3149 * skb_unlink - remove a buffer from a list 3150 * @skb: buffer to remove 3151 * @list: list to use 3152 * 3153 * Remove a packet from a list. The list locks are taken and this 3154 * function is atomic with respect to other list locked calls 3155 * 3156 * You must know what list the SKB is on. 
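 *
 * Example (illustrative): dropping a buffer that is known to sit on a
 * socket's receive queue:
 *
 *	skb_unlink(skb, &sk->sk_receive_queue);
 *	kfree_skb(skb);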
3157 */ 3158 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 3159 { 3160 unsigned long flags; 3161 3162 spin_lock_irqsave(&list->lock, flags); 3163 __skb_unlink(skb, list); 3164 spin_unlock_irqrestore(&list->lock, flags); 3165 } 3166 EXPORT_SYMBOL(skb_unlink); 3167 3168 /** 3169 * skb_append - append a buffer 3170 * @old: buffer to insert after 3171 * @newsk: buffer to insert 3172 * @list: list to use 3173 * 3174 * Place a packet after a given packet in a list. The list locks are taken 3175 * and this function is atomic with respect to other list locked calls. 3176 * A buffer cannot be placed on two lists at the same time. 3177 */ 3178 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 3179 { 3180 unsigned long flags; 3181 3182 spin_lock_irqsave(&list->lock, flags); 3183 __skb_queue_after(list, old, newsk); 3184 spin_unlock_irqrestore(&list->lock, flags); 3185 } 3186 EXPORT_SYMBOL(skb_append); 3187 3188 static inline void skb_split_inside_header(struct sk_buff *skb, 3189 struct sk_buff* skb1, 3190 const u32 len, const int pos) 3191 { 3192 int i; 3193 3194 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 3195 pos - len); 3196 /* And move data appendix as is. */ 3197 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 3198 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 3199 3200 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 3201 skb_shinfo(skb)->nr_frags = 0; 3202 skb1->data_len = skb->data_len; 3203 skb1->len += skb1->data_len; 3204 skb->data_len = 0; 3205 skb->len = len; 3206 skb_set_tail_pointer(skb, len); 3207 } 3208 3209 static inline void skb_split_no_header(struct sk_buff *skb, 3210 struct sk_buff* skb1, 3211 const u32 len, int pos) 3212 { 3213 int i, k = 0; 3214 const int nfrags = skb_shinfo(skb)->nr_frags; 3215 3216 skb_shinfo(skb)->nr_frags = 0; 3217 skb1->len = skb1->data_len = skb->len - len; 3218 skb->len = len; 3219 skb->data_len = len - pos; 3220 3221 for (i = 0; i < nfrags; i++) { 3222 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 3223 3224 if (pos + size > len) { 3225 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 3226 3227 if (pos < len) { 3228 /* Split frag. 3229 * We have two variants in this case: 3230 * 1. Move all the frag to the second 3231 * part, if it is possible. F.e. 3232 * this approach is mandatory for TUX, 3233 * where splitting is expensive. 3234 * 2. Split is accurately. We make this. 3235 */ 3236 skb_frag_ref(skb, i); 3237 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); 3238 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 3239 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 3240 skb_shinfo(skb)->nr_frags++; 3241 } 3242 k++; 3243 } else 3244 skb_shinfo(skb)->nr_frags++; 3245 pos += size; 3246 } 3247 skb_shinfo(skb1)->nr_frags = k; 3248 } 3249 3250 /** 3251 * skb_split - Split fragmented skb to two parts at length len. 3252 * @skb: the buffer to split 3253 * @skb1: the buffer to receive the second part 3254 * @len: new length for skb 3255 */ 3256 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 3257 { 3258 int pos = skb_headlen(skb); 3259 3260 skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & 3261 SKBTX_SHARED_FRAG; 3262 skb_zerocopy_clone(skb1, skb, 0); 3263 if (len < pos) /* Split line is inside header. */ 3264 skb_split_inside_header(skb, skb1, len, pos); 3265 else /* Second chunk has no header, nothing to copy. 
*/
3266 		skb_split_no_header(skb, skb1, len, pos);
3267 }
3268 EXPORT_SYMBOL(skb_split);
3269 
3270 /* Shifting from/to a cloned skb is a no-go.
3271  *
3272  * Caller cannot keep skb_shinfo related pointers past calling here!
3273  */
3274 static int skb_prepare_for_shift(struct sk_buff *skb)
3275 {
3276 	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
3277 }
3278 
3279 /**
3280  * skb_shift - Shifts paged data partially from skb to another
3281  * @tgt: buffer into which tail data gets added
3282  * @skb: buffer from which the paged data comes
3283  * @shiftlen: shift up to this many bytes
3284  *
3285  * Attempts to shift up to shiftlen worth of bytes, which may be less than
3286  * the length of the skb, from skb to tgt. Returns the number of bytes shifted.
3287  * It's up to the caller to free @skb if everything was shifted.
3288  *
3289  * If @tgt runs out of frags, the whole operation is aborted.
3290  *
3291  * @skb cannot include anything but paged data, while @tgt is allowed
3292  * to have non-paged data as well.
3293  *
3294  * TODO: a full-sized shift could be optimized, but that would need a
3295  * specialized skb free routine to handle frags without an up-to-date nr_frags.
3296  */
3297 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
3298 {
3299 	int from, to, merge, todo;
3300 	skb_frag_t *fragfrom, *fragto;
3301 
3302 	BUG_ON(shiftlen > skb->len);
3303 
3304 	if (skb_headlen(skb))
3305 		return 0;
3306 	if (skb_zcopy(tgt) || skb_zcopy(skb))
3307 		return 0;
3308 
3309 	todo = shiftlen;
3310 	from = 0;
3311 	to = skb_shinfo(tgt)->nr_frags;
3312 	fragfrom = &skb_shinfo(skb)->frags[from];
3313 
3314 	/* Actual merge is delayed until the point when we know we can
3315 	 * commit all, so that we don't have to undo partial changes.
3316 	 */
3317 	if (!to ||
3318 	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
3319 			      skb_frag_off(fragfrom))) {
3320 		merge = -1;
3321 	} else {
3322 		merge = to - 1;
3323 
3324 		todo -= skb_frag_size(fragfrom);
3325 		if (todo < 0) {
3326 			if (skb_prepare_for_shift(skb) ||
3327 			    skb_prepare_for_shift(tgt))
3328 				return 0;
3329 
3330 			/* All previous frag pointers might be stale!
*/ 3331 fragfrom = &skb_shinfo(skb)->frags[from]; 3332 fragto = &skb_shinfo(tgt)->frags[merge]; 3333 3334 skb_frag_size_add(fragto, shiftlen); 3335 skb_frag_size_sub(fragfrom, shiftlen); 3336 skb_frag_off_add(fragfrom, shiftlen); 3337 3338 goto onlymerged; 3339 } 3340 3341 from++; 3342 } 3343 3344 /* Skip full, not-fitting skb to avoid expensive operations */ 3345 if ((shiftlen == skb->len) && 3346 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 3347 return 0; 3348 3349 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 3350 return 0; 3351 3352 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 3353 if (to == MAX_SKB_FRAGS) 3354 return 0; 3355 3356 fragfrom = &skb_shinfo(skb)->frags[from]; 3357 fragto = &skb_shinfo(tgt)->frags[to]; 3358 3359 if (todo >= skb_frag_size(fragfrom)) { 3360 *fragto = *fragfrom; 3361 todo -= skb_frag_size(fragfrom); 3362 from++; 3363 to++; 3364 3365 } else { 3366 __skb_frag_ref(fragfrom); 3367 skb_frag_page_copy(fragto, fragfrom); 3368 skb_frag_off_copy(fragto, fragfrom); 3369 skb_frag_size_set(fragto, todo); 3370 3371 skb_frag_off_add(fragfrom, todo); 3372 skb_frag_size_sub(fragfrom, todo); 3373 todo = 0; 3374 3375 to++; 3376 break; 3377 } 3378 } 3379 3380 /* Ready to "commit" this state change to tgt */ 3381 skb_shinfo(tgt)->nr_frags = to; 3382 3383 if (merge >= 0) { 3384 fragfrom = &skb_shinfo(skb)->frags[0]; 3385 fragto = &skb_shinfo(tgt)->frags[merge]; 3386 3387 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 3388 __skb_frag_unref(fragfrom); 3389 } 3390 3391 /* Reposition in the original skb */ 3392 to = 0; 3393 while (from < skb_shinfo(skb)->nr_frags) 3394 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 3395 skb_shinfo(skb)->nr_frags = to; 3396 3397 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 3398 3399 onlymerged: 3400 /* Most likely the tgt won't ever need its checksum anymore, skb on 3401 * the other hand might need it if it needs to be resent 3402 */ 3403 tgt->ip_summed = CHECKSUM_PARTIAL; 3404 skb->ip_summed = CHECKSUM_PARTIAL; 3405 3406 /* Yak, is it really working this way? Some helper please? */ 3407 skb->len -= shiftlen; 3408 skb->data_len -= shiftlen; 3409 skb->truesize -= shiftlen; 3410 tgt->len += shiftlen; 3411 tgt->data_len += shiftlen; 3412 tgt->truesize += shiftlen; 3413 3414 return shiftlen; 3415 } 3416 3417 /** 3418 * skb_prepare_seq_read - Prepare a sequential read of skb data 3419 * @skb: the buffer to read 3420 * @from: lower offset of data to be read 3421 * @to: upper offset of data to be read 3422 * @st: state variable 3423 * 3424 * Initializes the specified state variable. Must be called before 3425 * invoking skb_seq_read() for the first time. 3426 */ 3427 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 3428 unsigned int to, struct skb_seq_state *st) 3429 { 3430 st->lower_offset = from; 3431 st->upper_offset = to; 3432 st->root_skb = st->cur_skb = skb; 3433 st->frag_idx = st->stepped_offset = 0; 3434 st->frag_data = NULL; 3435 } 3436 EXPORT_SYMBOL(skb_prepare_seq_read); 3437 3438 /** 3439 * skb_seq_read - Sequentially read skb data 3440 * @consumed: number of bytes consumed by the caller so far 3441 * @data: destination pointer for data to be returned 3442 * @st: state variable 3443 * 3444 * Reads a block of skb data at @consumed relative to the 3445 * lower offset specified to skb_prepare_seq_read(). 
Assigns 3446 * the head of the data block to @data and returns the length 3447 * of the block or 0 if the end of the skb data or the upper 3448 * offset has been reached. 3449 * 3450 * The caller is not required to consume all of the data 3451 * returned, i.e. @consumed is typically set to the number 3452 * of bytes already consumed and the next call to 3453 * skb_seq_read() will return the remaining part of the block. 3454 * 3455 * Note 1: The size of each block of data returned can be arbitrary, 3456 * this limitation is the cost for zerocopy sequential 3457 * reads of potentially non linear data. 3458 * 3459 * Note 2: Fragment lists within fragments are not implemented 3460 * at the moment, state->root_skb could be replaced with 3461 * a stack for this purpose. 3462 */ 3463 unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 3464 struct skb_seq_state *st) 3465 { 3466 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 3467 skb_frag_t *frag; 3468 3469 if (unlikely(abs_offset >= st->upper_offset)) { 3470 if (st->frag_data) { 3471 kunmap_atomic(st->frag_data); 3472 st->frag_data = NULL; 3473 } 3474 return 0; 3475 } 3476 3477 next_skb: 3478 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 3479 3480 if (abs_offset < block_limit && !st->frag_data) { 3481 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 3482 return block_limit - abs_offset; 3483 } 3484 3485 if (st->frag_idx == 0 && !st->frag_data) 3486 st->stepped_offset += skb_headlen(st->cur_skb); 3487 3488 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 3489 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 3490 block_limit = skb_frag_size(frag) + st->stepped_offset; 3491 3492 if (abs_offset < block_limit) { 3493 if (!st->frag_data) 3494 st->frag_data = kmap_atomic(skb_frag_page(frag)); 3495 3496 *data = (u8 *) st->frag_data + skb_frag_off(frag) + 3497 (abs_offset - st->stepped_offset); 3498 3499 return block_limit - abs_offset; 3500 } 3501 3502 if (st->frag_data) { 3503 kunmap_atomic(st->frag_data); 3504 st->frag_data = NULL; 3505 } 3506 3507 st->frag_idx++; 3508 st->stepped_offset += skb_frag_size(frag); 3509 } 3510 3511 if (st->frag_data) { 3512 kunmap_atomic(st->frag_data); 3513 st->frag_data = NULL; 3514 } 3515 3516 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 3517 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 3518 st->frag_idx = 0; 3519 goto next_skb; 3520 } else if (st->cur_skb->next) { 3521 st->cur_skb = st->cur_skb->next; 3522 st->frag_idx = 0; 3523 goto next_skb; 3524 } 3525 3526 return 0; 3527 } 3528 EXPORT_SYMBOL(skb_seq_read); 3529 3530 /** 3531 * skb_abort_seq_read - Abort a sequential read of skb data 3532 * @st: state variable 3533 * 3534 * Must be called if skb_seq_read() was not called until it 3535 * returned 0. 
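 *
 * A full sequential-read sketch (illustrative; process() is a placeholder
 * for whatever the caller does with each block):
 *
 *	struct skb_seq_state st;
 *	const u8 *data;
 *	unsigned int len, consumed = 0;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		if (process(data, len) < 0) {
 *			skb_abort_seq_read(&st);
 *			break;
 *		}
 *		consumed += len;
 *	}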
3536 */ 3537 void skb_abort_seq_read(struct skb_seq_state *st) 3538 { 3539 if (st->frag_data) 3540 kunmap_atomic(st->frag_data); 3541 } 3542 EXPORT_SYMBOL(skb_abort_seq_read); 3543 3544 #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 3545 3546 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 3547 struct ts_config *conf, 3548 struct ts_state *state) 3549 { 3550 return skb_seq_read(offset, text, TS_SKB_CB(state)); 3551 } 3552 3553 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 3554 { 3555 skb_abort_seq_read(TS_SKB_CB(state)); 3556 } 3557 3558 /** 3559 * skb_find_text - Find a text pattern in skb data 3560 * @skb: the buffer to look in 3561 * @from: search offset 3562 * @to: search limit 3563 * @config: textsearch configuration 3564 * 3565 * Finds a pattern in the skb data according to the specified 3566 * textsearch configuration. Use textsearch_next() to retrieve 3567 * subsequent occurrences of the pattern. Returns the offset 3568 * to the first occurrence or UINT_MAX if no match was found. 3569 */ 3570 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 3571 unsigned int to, struct ts_config *config) 3572 { 3573 struct ts_state state; 3574 unsigned int ret; 3575 3576 config->get_next_block = skb_ts_get_next_block; 3577 config->finish = skb_ts_finish; 3578 3579 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); 3580 3581 ret = textsearch_find(config, &state); 3582 return (ret <= to - from ? ret : UINT_MAX); 3583 } 3584 EXPORT_SYMBOL(skb_find_text); 3585 3586 int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 3587 int offset, size_t size) 3588 { 3589 int i = skb_shinfo(skb)->nr_frags; 3590 3591 if (skb_can_coalesce(skb, i, page, offset)) { 3592 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 3593 } else if (i < MAX_SKB_FRAGS) { 3594 get_page(page); 3595 skb_fill_page_desc(skb, i, page, offset, size); 3596 } else { 3597 return -EMSGSIZE; 3598 } 3599 3600 return 0; 3601 } 3602 EXPORT_SYMBOL_GPL(skb_append_pagefrags); 3603 3604 /** 3605 * skb_pull_rcsum - pull skb and update receive checksum 3606 * @skb: buffer to update 3607 * @len: length of data pulled 3608 * 3609 * This function performs an skb_pull on the packet and updates 3610 * the CHECKSUM_COMPLETE checksum. It should be used on 3611 * receive path processing instead of skb_pull unless you know 3612 * that the checksum difference is zero (e.g., a valid IP header) 3613 * or you are setting ip_summed to CHECKSUM_NONE. 
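 *
 * Example (illustrative): stripping an encapsulation header on receive
 * while keeping a CHECKSUM_COMPLETE value correct:
 *
 *	if (!pskb_may_pull(skb, VLAN_HLEN))
 *		goto drop;
 *	skb_pull_rcsum(skb, VLAN_HLEN);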
3614 */ 3615 void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 3616 { 3617 unsigned char *data = skb->data; 3618 3619 BUG_ON(len > skb->len); 3620 __skb_pull(skb, len); 3621 skb_postpull_rcsum(skb, data, len); 3622 return skb->data; 3623 } 3624 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 3625 3626 static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) 3627 { 3628 skb_frag_t head_frag; 3629 struct page *page; 3630 3631 page = virt_to_head_page(frag_skb->head); 3632 __skb_frag_set_page(&head_frag, page); 3633 skb_frag_off_set(&head_frag, frag_skb->data - 3634 (unsigned char *)page_address(page)); 3635 skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); 3636 return head_frag; 3637 } 3638 3639 struct sk_buff *skb_segment_list(struct sk_buff *skb, 3640 netdev_features_t features, 3641 unsigned int offset) 3642 { 3643 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; 3644 unsigned int tnl_hlen = skb_tnl_header_len(skb); 3645 unsigned int delta_truesize = 0; 3646 unsigned int delta_len = 0; 3647 struct sk_buff *tail = NULL; 3648 struct sk_buff *nskb; 3649 3650 skb_push(skb, -skb_network_offset(skb) + offset); 3651 3652 skb_shinfo(skb)->frag_list = NULL; 3653 3654 do { 3655 nskb = list_skb; 3656 list_skb = list_skb->next; 3657 3658 if (!tail) 3659 skb->next = nskb; 3660 else 3661 tail->next = nskb; 3662 3663 tail = nskb; 3664 3665 delta_len += nskb->len; 3666 delta_truesize += nskb->truesize; 3667 3668 skb_push(nskb, -skb_network_offset(nskb) + offset); 3669 3670 skb_release_head_state(nskb); 3671 __copy_skb_header(nskb, skb); 3672 3673 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); 3674 skb_copy_from_linear_data_offset(skb, -tnl_hlen, 3675 nskb->data - tnl_hlen, 3676 offset + tnl_hlen); 3677 3678 if (skb_needs_linearize(nskb, features) && 3679 __skb_linearize(nskb)) 3680 goto err_linearize; 3681 3682 } while (list_skb); 3683 3684 skb->truesize = skb->truesize - delta_truesize; 3685 skb->data_len = skb->data_len - delta_len; 3686 skb->len = skb->len - delta_len; 3687 3688 skb_gso_reset(skb); 3689 3690 skb->prev = tail; 3691 3692 if (skb_needs_linearize(skb, features) && 3693 __skb_linearize(skb)) 3694 goto err_linearize; 3695 3696 skb_get(skb); 3697 3698 return skb; 3699 3700 err_linearize: 3701 kfree_skb_list(skb->next); 3702 skb->next = NULL; 3703 return ERR_PTR(-ENOMEM); 3704 } 3705 EXPORT_SYMBOL_GPL(skb_segment_list); 3706 3707 int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) 3708 { 3709 if (unlikely(p->len + skb->len >= 65536)) 3710 return -E2BIG; 3711 3712 if (NAPI_GRO_CB(p)->last == p) 3713 skb_shinfo(p)->frag_list = skb; 3714 else 3715 NAPI_GRO_CB(p)->last->next = skb; 3716 3717 skb_pull(skb, skb_gro_offset(skb)); 3718 3719 NAPI_GRO_CB(p)->last = skb; 3720 NAPI_GRO_CB(p)->count++; 3721 p->data_len += skb->len; 3722 p->truesize += skb->truesize; 3723 p->len += skb->len; 3724 3725 NAPI_GRO_CB(skb)->same_flow = 1; 3726 3727 return 0; 3728 } 3729 3730 /** 3731 * skb_segment - Perform protocol segmentation on skb. 3732 * @head_skb: buffer to segment 3733 * @features: features for the output path (see dev->features) 3734 * 3735 * This function performs segmentation on the given skb. It returns 3736 * a pointer to the first in a list of new skbs for the segments. 3737 * In case of error it returns ERR_PTR(err). 
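 *
 * Caller sketch (illustrative): a ->gso_segment() implementation
 * typically does little more than
 *
 *	segs = skb_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return segs;
 *
 * and then fixes up the protocol headers of each segment.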
3738 */ 3739 struct sk_buff *skb_segment(struct sk_buff *head_skb, 3740 netdev_features_t features) 3741 { 3742 struct sk_buff *segs = NULL; 3743 struct sk_buff *tail = NULL; 3744 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; 3745 skb_frag_t *frag = skb_shinfo(head_skb)->frags; 3746 unsigned int mss = skb_shinfo(head_skb)->gso_size; 3747 unsigned int doffset = head_skb->data - skb_mac_header(head_skb); 3748 struct sk_buff *frag_skb = head_skb; 3749 unsigned int offset = doffset; 3750 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 3751 unsigned int partial_segs = 0; 3752 unsigned int headroom; 3753 unsigned int len = head_skb->len; 3754 __be16 proto; 3755 bool csum, sg; 3756 int nfrags = skb_shinfo(head_skb)->nr_frags; 3757 int err = -ENOMEM; 3758 int i = 0; 3759 int pos; 3760 3761 if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && 3762 (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { 3763 /* gso_size is untrusted, and we have a frag_list with a linear 3764 * non head_frag head. 3765 * 3766 * (we assume checking the first list_skb member suffices; 3767 * i.e if either of the list_skb members have non head_frag 3768 * head, then the first one has too). 3769 * 3770 * If head_skb's headlen does not fit requested gso_size, it 3771 * means that the frag_list members do NOT terminate on exact 3772 * gso_size boundaries. Hence we cannot perform skb_frag_t page 3773 * sharing. Therefore we must fallback to copying the frag_list 3774 * skbs; we do so by disabling SG. 3775 */ 3776 if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) 3777 features &= ~NETIF_F_SG; 3778 } 3779 3780 __skb_push(head_skb, doffset); 3781 proto = skb_network_protocol(head_skb, NULL); 3782 if (unlikely(!proto)) 3783 return ERR_PTR(-EINVAL); 3784 3785 sg = !!(features & NETIF_F_SG); 3786 csum = !!can_checksum_protocol(features, proto); 3787 3788 if (sg && csum && (mss != GSO_BY_FRAGS)) { 3789 if (!(features & NETIF_F_GSO_PARTIAL)) { 3790 struct sk_buff *iter; 3791 unsigned int frag_len; 3792 3793 if (!list_skb || 3794 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 3795 goto normal; 3796 3797 /* If we get here then all the required 3798 * GSO features except frag_list are supported. 3799 * Try to split the SKB to multiple GSO SKBs 3800 * with no frag_list. 3801 * Currently we can do that only when the buffers don't 3802 * have a linear part and all the buffers except 3803 * the last are of the same length. 3804 */ 3805 frag_len = list_skb->len; 3806 skb_walk_frags(head_skb, iter) { 3807 if (frag_len != iter->len && iter->next) 3808 goto normal; 3809 if (skb_headlen(iter) && !iter->head_frag) 3810 goto normal; 3811 3812 len -= iter->len; 3813 } 3814 3815 if (len != frag_len) 3816 goto normal; 3817 } 3818 3819 /* GSO partial only requires that we trim off any excess that 3820 * doesn't fit into an MSS sized block, so take care of that 3821 * now. 
3822 */ 3823 partial_segs = len / mss; 3824 if (partial_segs > 1) 3825 mss *= partial_segs; 3826 else 3827 partial_segs = 0; 3828 } 3829 3830 normal: 3831 headroom = skb_headroom(head_skb); 3832 pos = skb_headlen(head_skb); 3833 3834 do { 3835 struct sk_buff *nskb; 3836 skb_frag_t *nskb_frag; 3837 int hsize; 3838 int size; 3839 3840 if (unlikely(mss == GSO_BY_FRAGS)) { 3841 len = list_skb->len; 3842 } else { 3843 len = head_skb->len - offset; 3844 if (len > mss) 3845 len = mss; 3846 } 3847 3848 hsize = skb_headlen(head_skb) - offset; 3849 if (hsize < 0) 3850 hsize = 0; 3851 if (hsize > len || !sg) 3852 hsize = len; 3853 3854 if (!hsize && i >= nfrags && skb_headlen(list_skb) && 3855 (skb_headlen(list_skb) == len || sg)) { 3856 BUG_ON(skb_headlen(list_skb) > len); 3857 3858 i = 0; 3859 nfrags = skb_shinfo(list_skb)->nr_frags; 3860 frag = skb_shinfo(list_skb)->frags; 3861 frag_skb = list_skb; 3862 pos += skb_headlen(list_skb); 3863 3864 while (pos < offset + len) { 3865 BUG_ON(i >= nfrags); 3866 3867 size = skb_frag_size(frag); 3868 if (pos + size > offset + len) 3869 break; 3870 3871 i++; 3872 pos += size; 3873 frag++; 3874 } 3875 3876 nskb = skb_clone(list_skb, GFP_ATOMIC); 3877 list_skb = list_skb->next; 3878 3879 if (unlikely(!nskb)) 3880 goto err; 3881 3882 if (unlikely(pskb_trim(nskb, len))) { 3883 kfree_skb(nskb); 3884 goto err; 3885 } 3886 3887 hsize = skb_end_offset(nskb); 3888 if (skb_cow_head(nskb, doffset + headroom)) { 3889 kfree_skb(nskb); 3890 goto err; 3891 } 3892 3893 nskb->truesize += skb_end_offset(nskb) - hsize; 3894 skb_release_head_state(nskb); 3895 __skb_push(nskb, doffset); 3896 } else { 3897 nskb = __alloc_skb(hsize + doffset + headroom, 3898 GFP_ATOMIC, skb_alloc_rx_flag(head_skb), 3899 NUMA_NO_NODE); 3900 3901 if (unlikely(!nskb)) 3902 goto err; 3903 3904 skb_reserve(nskb, headroom); 3905 __skb_put(nskb, doffset); 3906 } 3907 3908 if (segs) 3909 tail->next = nskb; 3910 else 3911 segs = nskb; 3912 tail = nskb; 3913 3914 __copy_skb_header(nskb, head_skb); 3915 3916 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); 3917 skb_reset_mac_len(nskb); 3918 3919 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, 3920 nskb->data - tnl_hlen, 3921 doffset + tnl_hlen); 3922 3923 if (nskb->len == len + doffset) 3924 goto perform_csum_check; 3925 3926 if (!sg) { 3927 if (!csum) { 3928 if (!nskb->remcsum_offload) 3929 nskb->ip_summed = CHECKSUM_NONE; 3930 SKB_GSO_CB(nskb)->csum = 3931 skb_copy_and_csum_bits(head_skb, offset, 3932 skb_put(nskb, 3933 len), 3934 len, 0); 3935 SKB_GSO_CB(nskb)->csum_start = 3936 skb_headroom(nskb) + doffset; 3937 } else { 3938 skb_copy_bits(head_skb, offset, 3939 skb_put(nskb, len), 3940 len); 3941 } 3942 continue; 3943 } 3944 3945 nskb_frag = skb_shinfo(nskb)->frags; 3946 3947 skb_copy_from_linear_data_offset(head_skb, offset, 3948 skb_put(nskb, hsize), hsize); 3949 3950 skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & 3951 SKBTX_SHARED_FRAG; 3952 3953 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 3954 skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) 3955 goto err; 3956 3957 while (pos < offset + len) { 3958 if (i >= nfrags) { 3959 i = 0; 3960 nfrags = skb_shinfo(list_skb)->nr_frags; 3961 frag = skb_shinfo(list_skb)->frags; 3962 frag_skb = list_skb; 3963 if (!skb_headlen(list_skb)) { 3964 BUG_ON(!nfrags); 3965 } else { 3966 BUG_ON(!list_skb->head_frag); 3967 3968 /* to make room for head_frag. 
*/ 3969 i--; 3970 frag--; 3971 } 3972 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 3973 skb_zerocopy_clone(nskb, frag_skb, 3974 GFP_ATOMIC)) 3975 goto err; 3976 3977 list_skb = list_skb->next; 3978 } 3979 3980 if (unlikely(skb_shinfo(nskb)->nr_frags >= 3981 MAX_SKB_FRAGS)) { 3982 net_warn_ratelimited( 3983 "skb_segment: too many frags: %u %u\n", 3984 pos, mss); 3985 err = -EINVAL; 3986 goto err; 3987 } 3988 3989 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; 3990 __skb_frag_ref(nskb_frag); 3991 size = skb_frag_size(nskb_frag); 3992 3993 if (pos < offset) { 3994 skb_frag_off_add(nskb_frag, offset - pos); 3995 skb_frag_size_sub(nskb_frag, offset - pos); 3996 } 3997 3998 skb_shinfo(nskb)->nr_frags++; 3999 4000 if (pos + size <= offset + len) { 4001 i++; 4002 frag++; 4003 pos += size; 4004 } else { 4005 skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); 4006 goto skip_fraglist; 4007 } 4008 4009 nskb_frag++; 4010 } 4011 4012 skip_fraglist: 4013 nskb->data_len = len - hsize; 4014 nskb->len += nskb->data_len; 4015 nskb->truesize += nskb->data_len; 4016 4017 perform_csum_check: 4018 if (!csum) { 4019 if (skb_has_shared_frag(nskb) && 4020 __skb_linearize(nskb)) 4021 goto err; 4022 4023 if (!nskb->remcsum_offload) 4024 nskb->ip_summed = CHECKSUM_NONE; 4025 SKB_GSO_CB(nskb)->csum = 4026 skb_checksum(nskb, doffset, 4027 nskb->len - doffset, 0); 4028 SKB_GSO_CB(nskb)->csum_start = 4029 skb_headroom(nskb) + doffset; 4030 } 4031 } while ((offset += len) < head_skb->len); 4032 4033 /* Some callers want to get the end of the list. 4034 * Put it in segs->prev to avoid walking the list. 4035 * (see validate_xmit_skb_list() for example) 4036 */ 4037 segs->prev = tail; 4038 4039 if (partial_segs) { 4040 struct sk_buff *iter; 4041 int type = skb_shinfo(head_skb)->gso_type; 4042 unsigned short gso_size = skb_shinfo(head_skb)->gso_size; 4043 4044 /* Update type to add partial and then remove dodgy if set */ 4045 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; 4046 type &= ~SKB_GSO_DODGY; 4047 4048 /* Update GSO info and prepare to start updating headers on 4049 * our way back down the stack of protocols. 4050 */ 4051 for (iter = segs; iter; iter = iter->next) { 4052 skb_shinfo(iter)->gso_size = gso_size; 4053 skb_shinfo(iter)->gso_segs = partial_segs; 4054 skb_shinfo(iter)->gso_type = type; 4055 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; 4056 } 4057 4058 if (tail->len - doffset <= gso_size) 4059 skb_shinfo(tail)->gso_size = 0; 4060 else if (tail != segs) 4061 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); 4062 } 4063 4064 /* Following permits correct backpressure, for protocols 4065 * using skb_set_owner_w(). 4066 * Idea is to tranfert ownership from head_skb to last segment. 
4067 */ 4068 if (head_skb->destructor == sock_wfree) { 4069 swap(tail->truesize, head_skb->truesize); 4070 swap(tail->destructor, head_skb->destructor); 4071 swap(tail->sk, head_skb->sk); 4072 } 4073 return segs; 4074 4075 err: 4076 kfree_skb_list(segs); 4077 return ERR_PTR(err); 4078 } 4079 EXPORT_SYMBOL_GPL(skb_segment); 4080 4081 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) 4082 { 4083 struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); 4084 unsigned int offset = skb_gro_offset(skb); 4085 unsigned int headlen = skb_headlen(skb); 4086 unsigned int len = skb_gro_len(skb); 4087 unsigned int delta_truesize; 4088 struct sk_buff *lp; 4089 4090 if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) 4091 return -E2BIG; 4092 4093 lp = NAPI_GRO_CB(p)->last; 4094 pinfo = skb_shinfo(lp); 4095 4096 if (headlen <= offset) { 4097 skb_frag_t *frag; 4098 skb_frag_t *frag2; 4099 int i = skbinfo->nr_frags; 4100 int nr_frags = pinfo->nr_frags + i; 4101 4102 if (nr_frags > MAX_SKB_FRAGS) 4103 goto merge; 4104 4105 offset -= headlen; 4106 pinfo->nr_frags = nr_frags; 4107 skbinfo->nr_frags = 0; 4108 4109 frag = pinfo->frags + nr_frags; 4110 frag2 = skbinfo->frags + i; 4111 do { 4112 *--frag = *--frag2; 4113 } while (--i); 4114 4115 skb_frag_off_add(frag, offset); 4116 skb_frag_size_sub(frag, offset); 4117 4118 /* all fragments truesize : remove (head size + sk_buff) */ 4119 delta_truesize = skb->truesize - 4120 SKB_TRUESIZE(skb_end_offset(skb)); 4121 4122 skb->truesize -= skb->data_len; 4123 skb->len -= skb->data_len; 4124 skb->data_len = 0; 4125 4126 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; 4127 goto done; 4128 } else if (skb->head_frag) { 4129 int nr_frags = pinfo->nr_frags; 4130 skb_frag_t *frag = pinfo->frags + nr_frags; 4131 struct page *page = virt_to_head_page(skb->head); 4132 unsigned int first_size = headlen - offset; 4133 unsigned int first_offset; 4134 4135 if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) 4136 goto merge; 4137 4138 first_offset = skb->data - 4139 (unsigned char *)page_address(page) + 4140 offset; 4141 4142 pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; 4143 4144 __skb_frag_set_page(frag, page); 4145 skb_frag_off_set(frag, first_offset); 4146 skb_frag_size_set(frag, first_size); 4147 4148 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); 4149 /* We dont need to clear skbinfo->nr_frags here */ 4150 4151 delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 4152 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; 4153 goto done; 4154 } 4155 4156 merge: 4157 delta_truesize = skb->truesize; 4158 if (offset > headlen) { 4159 unsigned int eat = offset - headlen; 4160 4161 skb_frag_off_add(&skbinfo->frags[0], eat); 4162 skb_frag_size_sub(&skbinfo->frags[0], eat); 4163 skb->data_len -= eat; 4164 skb->len -= eat; 4165 offset = headlen; 4166 } 4167 4168 __skb_pull(skb, offset); 4169 4170 if (NAPI_GRO_CB(p)->last == p) 4171 skb_shinfo(p)->frag_list = skb; 4172 else 4173 NAPI_GRO_CB(p)->last->next = skb; 4174 NAPI_GRO_CB(p)->last = skb; 4175 __skb_header_release(skb); 4176 lp = p; 4177 4178 done: 4179 NAPI_GRO_CB(p)->count++; 4180 p->data_len += len; 4181 p->truesize += delta_truesize; 4182 p->len += len; 4183 if (lp != p) { 4184 lp->data_len += len; 4185 lp->truesize += delta_truesize; 4186 lp->len += len; 4187 } 4188 NAPI_GRO_CB(skb)->same_flow = 1; 4189 return 0; 4190 } 4191 4192 #ifdef CONFIG_SKB_EXTENSIONS 4193 #define SKB_EXT_ALIGN_VALUE 8 4194 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), 
SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) 4195 4196 static const u8 skb_ext_type_len[] = { 4197 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4198 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), 4199 #endif 4200 #ifdef CONFIG_XFRM 4201 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), 4202 #endif 4203 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4204 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), 4205 #endif 4206 #if IS_ENABLED(CONFIG_MPTCP) 4207 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), 4208 #endif 4209 }; 4210 4211 static __always_inline unsigned int skb_ext_total_length(void) 4212 { 4213 return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + 4214 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4215 skb_ext_type_len[SKB_EXT_BRIDGE_NF] + 4216 #endif 4217 #ifdef CONFIG_XFRM 4218 skb_ext_type_len[SKB_EXT_SEC_PATH] + 4219 #endif 4220 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4221 skb_ext_type_len[TC_SKB_EXT] + 4222 #endif 4223 #if IS_ENABLED(CONFIG_MPTCP) 4224 skb_ext_type_len[SKB_EXT_MPTCP] + 4225 #endif 4226 0; 4227 } 4228 4229 static void skb_extensions_init(void) 4230 { 4231 BUILD_BUG_ON(SKB_EXT_NUM >= 8); 4232 BUILD_BUG_ON(skb_ext_total_length() > 255); 4233 4234 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", 4235 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 4236 0, 4237 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4238 NULL); 4239 } 4240 #else 4241 static void skb_extensions_init(void) {} 4242 #endif 4243 4244 void __init skb_init(void) 4245 { 4246 skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", 4247 sizeof(struct sk_buff), 4248 0, 4249 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4250 offsetof(struct sk_buff, cb), 4251 sizeof_field(struct sk_buff, cb), 4252 NULL); 4253 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 4254 sizeof(struct sk_buff_fclones), 4255 0, 4256 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4257 NULL); 4258 skb_extensions_init(); 4259 } 4260 4261 static int 4262 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, 4263 unsigned int recursion_level) 4264 { 4265 int start = skb_headlen(skb); 4266 int i, copy = start - offset; 4267 struct sk_buff *frag_iter; 4268 int elt = 0; 4269 4270 if (unlikely(recursion_level >= 24)) 4271 return -EMSGSIZE; 4272 4273 if (copy > 0) { 4274 if (copy > len) 4275 copy = len; 4276 sg_set_buf(sg, skb->data + offset, copy); 4277 elt++; 4278 if ((len -= copy) == 0) 4279 return elt; 4280 offset += copy; 4281 } 4282 4283 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 4284 int end; 4285 4286 WARN_ON(start > offset + len); 4287 4288 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 4289 if ((copy = end - offset) > 0) { 4290 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 4291 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4292 return -EMSGSIZE; 4293 4294 if (copy > len) 4295 copy = len; 4296 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 4297 skb_frag_off(frag) + offset - start); 4298 elt++; 4299 if (!(len -= copy)) 4300 return elt; 4301 offset += copy; 4302 } 4303 start = end; 4304 } 4305 4306 skb_walk_frags(skb, frag_iter) { 4307 int end, ret; 4308 4309 WARN_ON(start > offset + len); 4310 4311 end = start + frag_iter->len; 4312 if ((copy = end - offset) > 0) { 4313 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4314 return -EMSGSIZE; 4315 4316 if (copy > len) 4317 copy = len; 4318 ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, 4319 copy, recursion_level + 1); 4320 if (unlikely(ret < 0)) 4321 return ret; 4322 elt += ret; 4323 if ((len -= copy) == 0) 4324 return elt; 4325 
offset += copy;
4326 }
4327 start = end;
4328 }
4329 BUG_ON(len);
4330 return elt;
4331 }
4332
4333 /**
4334 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
4335 * @skb: Socket buffer containing the buffers to be mapped
4336 * @sg: The scatter-gather list to map into
4337 * @offset: The offset into the buffer's contents to start mapping
4338 * @len: Length of buffer space to be mapped
4339 *
4340 * Fill the specified scatter-gather list with mappings/pointers into a
4341 * region of the buffer space attached to a socket buffer. Returns either
4342 * the number of scatterlist items used, or -EMSGSIZE if the contents
4343 * could not fit.
4344 */
4345 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
4346 {
4347 int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
4348
4349 if (nsg <= 0)
4350 return nsg;
4351
4352 sg_mark_end(&sg[nsg - 1]);
4353
4354 return nsg;
4355 }
4356 EXPORT_SYMBOL_GPL(skb_to_sgvec);
4357
4358 /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
4359 * given sglist without marking the sg entry holding the last skb data as the end.
4360 * So the caller can manipulate the sg list at will when padding new data after
4361 * the first call, without calling sg_unmark_end to extend the sg list.
4362 *
4363 * Scenario to use skb_to_sgvec_nomark:
4364 * 1. sg_init_table
4365 * 2. skb_to_sgvec_nomark(payload1)
4366 * 3. skb_to_sgvec_nomark(payload2)
4367 *
4368 * This is equivalent to:
4369 * 1. sg_init_table
4370 * 2. skb_to_sgvec(payload1)
4371 * 3. sg_unmark_end
4372 * 4. skb_to_sgvec(payload2)
4373 *
4374 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
4375 * is preferable.
4376 */
4377 int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
4378 int offset, int len)
4379 {
4380 return __skb_to_sgvec(skb, sg, offset, len, 0);
4381 }
4382 EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
4383
4384
4385
4386 /**
4387 * skb_cow_data - Check that a socket buffer's data buffers are writable
4388 * @skb: The socket buffer to check.
4389 * @tailbits: Amount of trailing space to be added
4390 * @trailer: Returned pointer to the skb where the @tailbits space begins
4391 *
4392 * Make sure that the data buffers attached to a socket buffer are
4393 * writable. If they are not, private copies are made of the data buffers
4394 * and the socket buffer is set to use these instead.
4395 *
4396 * If @tailbits is given, make sure that there is space to write @tailbits
4397 * bytes of data beyond the current end of the socket buffer. @trailer will
4398 * be set to point to the skb in which this space begins.
4399 *
4400 * The number of scatterlist elements required to completely map the
4401 * COW'd and extended socket buffer will be returned.
4402 */
4403 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
4404 {
4405 int copyflag;
4406 int elt;
4407 struct sk_buff *skb1, **skb_p;
4408
4409 /* If skb is cloned or its head is paged, reallocate
4410 * the head, pulling out all the pages (pages are considered not writable
4411 * at the moment even if they are anonymous).
4412 */
4413 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
4414 !__pskb_pull_tail(skb, __skb_pagelen(skb)))
4415 return -ENOMEM;
4416
4417 /* Easy case. Most packets will go this way. */
4418 if (!skb_has_frag_list(skb)) {
4419 /* A bit of trouble: not enough space for the trailer.
4420 * This should not happen when the stack is tuned to generate
4421 * good frames. On a miss we reallocate and reserve even more
4422 * space; 128 bytes is fair. */
4423
4424 if (skb_tailroom(skb) < tailbits &&
4425 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
4426 return -ENOMEM;
4427
4428 /* Voila! */
4429 *trailer = skb;
4430 return 1;
4431 }
4432
4433 /* Misery. We are in trouble; time to mince the fragments... */
4434
4435 elt = 1;
4436 skb_p = &skb_shinfo(skb)->frag_list;
4437 copyflag = 0;
4438
4439 while ((skb1 = *skb_p) != NULL) {
4440 int ntail = 0;
4441
4442 /* The fragment is partially pulled by someone;
4443 * this can happen on input. Copy it and everything
4444 * after it. */
4445
4446 if (skb_shared(skb1))
4447 copyflag = 1;
4448
4449 /* If the skb is the last one, worry about the trailer. */
4450
4451 if (skb1->next == NULL && tailbits) {
4452 if (skb_shinfo(skb1)->nr_frags ||
4453 skb_has_frag_list(skb1) ||
4454 skb_tailroom(skb1) < tailbits)
4455 ntail = tailbits + 128;
4456 }
4457
4458 if (copyflag ||
4459 skb_cloned(skb1) ||
4460 ntail ||
4461 skb_shinfo(skb1)->nr_frags ||
4462 skb_has_frag_list(skb1)) {
4463 struct sk_buff *skb2;
4464
4465 /* Slow path: this fragment has to be copied. */
4466 if (ntail == 0)
4467 skb2 = skb_copy(skb1, GFP_ATOMIC);
4468 else
4469 skb2 = skb_copy_expand(skb1,
4470 skb_headroom(skb1),
4471 ntail,
4472 GFP_ATOMIC);
4473 if (unlikely(skb2 == NULL))
4474 return -ENOMEM;
4475
4476 if (skb1->sk)
4477 skb_set_owner_w(skb2, skb1->sk);
4478
4479 /* Still alive: link the new skb and
4480 * drop the old one. */
4481
4482 skb2->next = skb1->next;
4483 *skb_p = skb2;
4484 kfree_skb(skb1);
4485 skb1 = skb2;
4486 }
4487 elt++;
4488 *trailer = skb1;
4489 skb_p = &skb1->next;
4490 }
4491
4492 return elt;
4493 }
4494 EXPORT_SYMBOL_GPL(skb_cow_data);
4495
4496 static void sock_rmem_free(struct sk_buff *skb)
4497 {
4498 struct sock *sk = skb->sk;
4499
4500 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
4501 }
4502
4503 static void skb_set_err_queue(struct sk_buff *skb)
4504 {
4505 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
4506 * So, it is safe to (mis)use it to mark skbs on the error queue.
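 * A reader of the error queue can therefore tell such skbs apart from
 * regular ones with a check along these lines (illustrative sketch only;
 * on_err_queue is a hypothetical local):
 *
 *	bool on_err_queue = (skb->pkt_type == PACKET_OUTGOING);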
4507 */ 4508 skb->pkt_type = PACKET_OUTGOING; 4509 BUILD_BUG_ON(PACKET_OUTGOING == 0); 4510 } 4511 4512 /* 4513 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 4514 */ 4515 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 4516 { 4517 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 4518 (unsigned int)READ_ONCE(sk->sk_rcvbuf)) 4519 return -ENOMEM; 4520 4521 skb_orphan(skb); 4522 skb->sk = sk; 4523 skb->destructor = sock_rmem_free; 4524 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 4525 skb_set_err_queue(skb); 4526 4527 /* before exiting rcu section, make sure dst is refcounted */ 4528 skb_dst_force(skb); 4529 4530 skb_queue_tail(&sk->sk_error_queue, skb); 4531 if (!sock_flag(sk, SOCK_DEAD)) 4532 sk->sk_error_report(sk); 4533 return 0; 4534 } 4535 EXPORT_SYMBOL(sock_queue_err_skb); 4536 4537 static bool is_icmp_err_skb(const struct sk_buff *skb) 4538 { 4539 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || 4540 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); 4541 } 4542 4543 struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 4544 { 4545 struct sk_buff_head *q = &sk->sk_error_queue; 4546 struct sk_buff *skb, *skb_next = NULL; 4547 bool icmp_next = false; 4548 unsigned long flags; 4549 4550 spin_lock_irqsave(&q->lock, flags); 4551 skb = __skb_dequeue(q); 4552 if (skb && (skb_next = skb_peek(q))) { 4553 icmp_next = is_icmp_err_skb(skb_next); 4554 if (icmp_next) 4555 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; 4556 } 4557 spin_unlock_irqrestore(&q->lock, flags); 4558 4559 if (is_icmp_err_skb(skb) && !icmp_next) 4560 sk->sk_err = 0; 4561 4562 if (skb_next) 4563 sk->sk_error_report(sk); 4564 4565 return skb; 4566 } 4567 EXPORT_SYMBOL(sock_dequeue_err_skb); 4568 4569 /** 4570 * skb_clone_sk - create clone of skb, and take reference to socket 4571 * @skb: the skb to clone 4572 * 4573 * This function creates a clone of a buffer that holds a reference on 4574 * sk_refcnt. Buffers created via this function are meant to be 4575 * returned using sock_queue_err_skb, or free via kfree_skb. 4576 * 4577 * When passing buffers allocated with this function to sock_queue_err_skb 4578 * it is necessary to wrap the call with sock_hold/sock_put in order to 4579 * prevent the socket from being released prior to being enqueued on 4580 * the sk_error_queue. 4581 */ 4582 struct sk_buff *skb_clone_sk(struct sk_buff *skb) 4583 { 4584 struct sock *sk = skb->sk; 4585 struct sk_buff *clone; 4586 4587 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) 4588 return NULL; 4589 4590 clone = skb_clone(skb, GFP_ATOMIC); 4591 if (!clone) { 4592 sock_put(sk); 4593 return NULL; 4594 } 4595 4596 clone->sk = sk; 4597 clone->destructor = sock_efree; 4598 4599 return clone; 4600 } 4601 EXPORT_SYMBOL(skb_clone_sk); 4602 4603 static void __skb_complete_tx_timestamp(struct sk_buff *skb, 4604 struct sock *sk, 4605 int tstype, 4606 bool opt_stats) 4607 { 4608 struct sock_exterr_skb *serr; 4609 int err; 4610 4611 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); 4612 4613 serr = SKB_EXT_ERR(skb); 4614 memset(serr, 0, sizeof(*serr)); 4615 serr->ee.ee_errno = ENOMSG; 4616 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 4617 serr->ee.ee_info = tstype; 4618 serr->opt_stats = opt_stats; 4619 serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; 4620 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { 4621 serr->ee.ee_data = skb_shinfo(skb)->tskey; 4622 if (sk->sk_protocol == IPPROTO_TCP && 4623 sk->sk_type == SOCK_STREAM) 4624 serr->ee.ee_data -= sk->sk_tskey; 4625 } 4626 4627 err = sock_queue_err_skb(sk, skb); 4628 4629 if (err) 4630 kfree_skb(skb); 4631 } 4632 4633 static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) 4634 { 4635 bool ret; 4636 4637 if (likely(sysctl_tstamp_allow_data || tsonly)) 4638 return true; 4639 4640 read_lock_bh(&sk->sk_callback_lock); 4641 ret = sk->sk_socket && sk->sk_socket->file && 4642 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); 4643 read_unlock_bh(&sk->sk_callback_lock); 4644 return ret; 4645 } 4646 4647 void skb_complete_tx_timestamp(struct sk_buff *skb, 4648 struct skb_shared_hwtstamps *hwtstamps) 4649 { 4650 struct sock *sk = skb->sk; 4651 4652 if (!skb_may_tx_timestamp(sk, false)) 4653 goto err; 4654 4655 /* Take a reference to prevent skb_orphan() from freeing the socket, 4656 * but only if the socket refcount is not zero. 4657 */ 4658 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4659 *skb_hwtstamps(skb) = *hwtstamps; 4660 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); 4661 sock_put(sk); 4662 return; 4663 } 4664 4665 err: 4666 kfree_skb(skb); 4667 } 4668 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 4669 4670 void __skb_tstamp_tx(struct sk_buff *orig_skb, 4671 struct skb_shared_hwtstamps *hwtstamps, 4672 struct sock *sk, int tstype) 4673 { 4674 struct sk_buff *skb; 4675 bool tsonly, opt_stats = false; 4676 4677 if (!sk) 4678 return; 4679 4680 if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && 4681 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) 4682 return; 4683 4684 tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; 4685 if (!skb_may_tx_timestamp(sk, tsonly)) 4686 return; 4687 4688 if (tsonly) { 4689 #ifdef CONFIG_INET 4690 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && 4691 sk->sk_protocol == IPPROTO_TCP && 4692 sk->sk_type == SOCK_STREAM) { 4693 skb = tcp_get_timestamping_opt_stats(sk, orig_skb); 4694 opt_stats = true; 4695 } else 4696 #endif 4697 skb = alloc_skb(0, GFP_ATOMIC); 4698 } else { 4699 skb = skb_clone(orig_skb, GFP_ATOMIC); 4700 } 4701 if (!skb) 4702 return; 4703 4704 if (tsonly) { 4705 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & 4706 SKBTX_ANY_TSTAMP; 4707 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; 4708 } 4709 4710 if (hwtstamps) 4711 *skb_hwtstamps(skb) = *hwtstamps; 4712 else 4713 skb->tstamp = ktime_get_real(); 4714 4715 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); 4716 } 4717 EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 4718 4719 void skb_tstamp_tx(struct sk_buff *orig_skb, 4720 struct skb_shared_hwtstamps *hwtstamps) 4721 { 4722 return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, 4723 SCM_TSTAMP_SND); 4724 } 4725 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 4726 4727 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 4728 { 4729 struct sock *sk = skb->sk; 4730 struct sock_exterr_skb *serr; 4731 int err = 1; 4732 4733 skb->wifi_acked_valid = 1; 4734 skb->wifi_acked = acked; 4735 4736 serr = SKB_EXT_ERR(skb); 4737 memset(serr, 0, sizeof(*serr)); 4738 serr->ee.ee_errno = ENOMSG; 4739 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 4740 4741 /* Take a reference to prevent skb_orphan() from freeing the socket, 4742 * but only if the socket refcount is not zero. 
4743 */ 4744 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4745 err = sock_queue_err_skb(sk, skb); 4746 sock_put(sk); 4747 } 4748 if (err) 4749 kfree_skb(skb); 4750 } 4751 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 4752 4753 /** 4754 * skb_partial_csum_set - set up and verify partial csum values for packet 4755 * @skb: the skb to set 4756 * @start: the number of bytes after skb->data to start checksumming. 4757 * @off: the offset from start to place the checksum. 4758 * 4759 * For untrusted partially-checksummed packets, we need to make sure the values 4760 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 4761 * 4762 * This function checks and sets those values and skb->ip_summed: if this 4763 * returns false you should drop the packet. 4764 */ 4765 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 4766 { 4767 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); 4768 u32 csum_start = skb_headroom(skb) + (u32)start; 4769 4770 if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { 4771 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", 4772 start, off, skb_headroom(skb), skb_headlen(skb)); 4773 return false; 4774 } 4775 skb->ip_summed = CHECKSUM_PARTIAL; 4776 skb->csum_start = csum_start; 4777 skb->csum_offset = off; 4778 skb_set_transport_header(skb, start); 4779 return true; 4780 } 4781 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 4782 4783 static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, 4784 unsigned int max) 4785 { 4786 if (skb_headlen(skb) >= len) 4787 return 0; 4788 4789 /* If we need to pullup then pullup to the max, so we 4790 * won't need to do it again. 4791 */ 4792 if (max > skb->len) 4793 max = skb->len; 4794 4795 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) 4796 return -ENOMEM; 4797 4798 if (skb_headlen(skb) < len) 4799 return -EPROTO; 4800 4801 return 0; 4802 } 4803 4804 #define MAX_TCP_HDR_LEN (15 * 4) 4805 4806 static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, 4807 typeof(IPPROTO_IP) proto, 4808 unsigned int off) 4809 { 4810 int err; 4811 4812 switch (proto) { 4813 case IPPROTO_TCP: 4814 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), 4815 off + MAX_TCP_HDR_LEN); 4816 if (!err && !skb_partial_csum_set(skb, off, 4817 offsetof(struct tcphdr, 4818 check))) 4819 err = -EPROTO; 4820 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; 4821 4822 case IPPROTO_UDP: 4823 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), 4824 off + sizeof(struct udphdr)); 4825 if (!err && !skb_partial_csum_set(skb, off, 4826 offsetof(struct udphdr, 4827 check))) 4828 err = -EPROTO; 4829 return err ? ERR_PTR(err) : &udp_hdr(skb)->check; 4830 } 4831 4832 return ERR_PTR(-EPROTO); 4833 } 4834 4835 /* This value should be large enough to cover a tagged ethernet header plus 4836 * maximally sized IP and TCP or UDP headers. 
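 * It only bounds how much skb_checksum_setup_ipv4() below pulls into the
 * linear area up front; the transport header itself is pulled separately
 * by skb_checksum_setup_ip() with its own limit.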
4837 */ 4838 #define MAX_IP_HDR_LEN 128 4839 4840 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) 4841 { 4842 unsigned int off; 4843 bool fragment; 4844 __sum16 *csum; 4845 int err; 4846 4847 fragment = false; 4848 4849 err = skb_maybe_pull_tail(skb, 4850 sizeof(struct iphdr), 4851 MAX_IP_HDR_LEN); 4852 if (err < 0) 4853 goto out; 4854 4855 if (ip_is_fragment(ip_hdr(skb))) 4856 fragment = true; 4857 4858 off = ip_hdrlen(skb); 4859 4860 err = -EPROTO; 4861 4862 if (fragment) 4863 goto out; 4864 4865 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); 4866 if (IS_ERR(csum)) 4867 return PTR_ERR(csum); 4868 4869 if (recalculate) 4870 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, 4871 ip_hdr(skb)->daddr, 4872 skb->len - off, 4873 ip_hdr(skb)->protocol, 0); 4874 err = 0; 4875 4876 out: 4877 return err; 4878 } 4879 4880 /* This value should be large enough to cover a tagged ethernet header plus 4881 * an IPv6 header, all options, and a maximal TCP or UDP header. 4882 */ 4883 #define MAX_IPV6_HDR_LEN 256 4884 4885 #define OPT_HDR(type, skb, off) \ 4886 (type *)(skb_network_header(skb) + (off)) 4887 4888 static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) 4889 { 4890 int err; 4891 u8 nexthdr; 4892 unsigned int off; 4893 unsigned int len; 4894 bool fragment; 4895 bool done; 4896 __sum16 *csum; 4897 4898 fragment = false; 4899 done = false; 4900 4901 off = sizeof(struct ipv6hdr); 4902 4903 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); 4904 if (err < 0) 4905 goto out; 4906 4907 nexthdr = ipv6_hdr(skb)->nexthdr; 4908 4909 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); 4910 while (off <= len && !done) { 4911 switch (nexthdr) { 4912 case IPPROTO_DSTOPTS: 4913 case IPPROTO_HOPOPTS: 4914 case IPPROTO_ROUTING: { 4915 struct ipv6_opt_hdr *hp; 4916 4917 err = skb_maybe_pull_tail(skb, 4918 off + 4919 sizeof(struct ipv6_opt_hdr), 4920 MAX_IPV6_HDR_LEN); 4921 if (err < 0) 4922 goto out; 4923 4924 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); 4925 nexthdr = hp->nexthdr; 4926 off += ipv6_optlen(hp); 4927 break; 4928 } 4929 case IPPROTO_AH: { 4930 struct ip_auth_hdr *hp; 4931 4932 err = skb_maybe_pull_tail(skb, 4933 off + 4934 sizeof(struct ip_auth_hdr), 4935 MAX_IPV6_HDR_LEN); 4936 if (err < 0) 4937 goto out; 4938 4939 hp = OPT_HDR(struct ip_auth_hdr, skb, off); 4940 nexthdr = hp->nexthdr; 4941 off += ipv6_authlen(hp); 4942 break; 4943 } 4944 case IPPROTO_FRAGMENT: { 4945 struct frag_hdr *hp; 4946 4947 err = skb_maybe_pull_tail(skb, 4948 off + 4949 sizeof(struct frag_hdr), 4950 MAX_IPV6_HDR_LEN); 4951 if (err < 0) 4952 goto out; 4953 4954 hp = OPT_HDR(struct frag_hdr, skb, off); 4955 4956 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) 4957 fragment = true; 4958 4959 nexthdr = hp->nexthdr; 4960 off += sizeof(struct frag_hdr); 4961 break; 4962 } 4963 default: 4964 done = true; 4965 break; 4966 } 4967 } 4968 4969 err = -EPROTO; 4970 4971 if (!done || fragment) 4972 goto out; 4973 4974 csum = skb_checksum_setup_ip(skb, nexthdr, off); 4975 if (IS_ERR(csum)) 4976 return PTR_ERR(csum); 4977 4978 if (recalculate) 4979 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 4980 &ipv6_hdr(skb)->daddr, 4981 skb->len - off, nexthdr, 0); 4982 err = 0; 4983 4984 out: 4985 return err; 4986 } 4987 4988 /** 4989 * skb_checksum_setup - set up partial checksum offset 4990 * @skb: the skb to set up 4991 * @recalculate: if true the pseudo-header checksum will be recalculated 4992 */ 4993 int skb_checksum_setup(struct sk_buff *skb, bool recalculate) 4994 { 
4995 int err; 4996 4997 switch (skb->protocol) { 4998 case htons(ETH_P_IP): 4999 err = skb_checksum_setup_ipv4(skb, recalculate); 5000 break; 5001 5002 case htons(ETH_P_IPV6): 5003 err = skb_checksum_setup_ipv6(skb, recalculate); 5004 break; 5005 5006 default: 5007 err = -EPROTO; 5008 break; 5009 } 5010 5011 return err; 5012 } 5013 EXPORT_SYMBOL(skb_checksum_setup); 5014 5015 /** 5016 * skb_checksum_maybe_trim - maybe trims the given skb 5017 * @skb: the skb to check 5018 * @transport_len: the data length beyond the network header 5019 * 5020 * Checks whether the given skb has data beyond the given transport length. 5021 * If so, returns a cloned skb trimmed to this transport length. 5022 * Otherwise returns the provided skb. Returns NULL in error cases 5023 * (e.g. transport_len exceeds skb length or out-of-memory). 5024 * 5025 * Caller needs to set the skb transport header and free any returned skb if it 5026 * differs from the provided skb. 5027 */ 5028 static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, 5029 unsigned int transport_len) 5030 { 5031 struct sk_buff *skb_chk; 5032 unsigned int len = skb_transport_offset(skb) + transport_len; 5033 int ret; 5034 5035 if (skb->len < len) 5036 return NULL; 5037 else if (skb->len == len) 5038 return skb; 5039 5040 skb_chk = skb_clone(skb, GFP_ATOMIC); 5041 if (!skb_chk) 5042 return NULL; 5043 5044 ret = pskb_trim_rcsum(skb_chk, len); 5045 if (ret) { 5046 kfree_skb(skb_chk); 5047 return NULL; 5048 } 5049 5050 return skb_chk; 5051 } 5052 5053 /** 5054 * skb_checksum_trimmed - validate checksum of an skb 5055 * @skb: the skb to check 5056 * @transport_len: the data length beyond the network header 5057 * @skb_chkf: checksum function to use 5058 * 5059 * Applies the given checksum function skb_chkf to the provided skb. 5060 * Returns a checked and maybe trimmed skb. Returns NULL on error. 5061 * 5062 * If the skb has data beyond the given transport length, then a 5063 * trimmed & cloned skb is checked and returned. 5064 * 5065 * Caller needs to set the skb transport header and free any returned skb if it 5066 * differs from the provided skb. 
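 *
 * Typical use looks roughly like this (illustrative sketch; ip_chk_fn is a
 * hypothetical callback with the __sum16 (*)(struct sk_buff *) signature
 * expected for @skb_chkf):
 *
 *	skb_chk = skb_checksum_trimmed(skb, transport_len, ip_chk_fn);
 *	if (!skb_chk)
 *		goto drop;
 *	...
 *	if (skb_chk != skb)
 *		kfree_skb(skb_chk);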
5067 */ 5068 struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, 5069 unsigned int transport_len, 5070 __sum16(*skb_chkf)(struct sk_buff *skb)) 5071 { 5072 struct sk_buff *skb_chk; 5073 unsigned int offset = skb_transport_offset(skb); 5074 __sum16 ret; 5075 5076 skb_chk = skb_checksum_maybe_trim(skb, transport_len); 5077 if (!skb_chk) 5078 goto err; 5079 5080 if (!pskb_may_pull(skb_chk, offset)) 5081 goto err; 5082 5083 skb_pull_rcsum(skb_chk, offset); 5084 ret = skb_chkf(skb_chk); 5085 skb_push_rcsum(skb_chk, offset); 5086 5087 if (ret) 5088 goto err; 5089 5090 return skb_chk; 5091 5092 err: 5093 if (skb_chk && skb_chk != skb) 5094 kfree_skb(skb_chk); 5095 5096 return NULL; 5097 5098 } 5099 EXPORT_SYMBOL(skb_checksum_trimmed); 5100 5101 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 5102 { 5103 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 5104 skb->dev->name); 5105 } 5106 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 5107 5108 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 5109 { 5110 if (head_stolen) { 5111 skb_release_head_state(skb); 5112 kmem_cache_free(skbuff_head_cache, skb); 5113 } else { 5114 __kfree_skb(skb); 5115 } 5116 } 5117 EXPORT_SYMBOL(kfree_skb_partial); 5118 5119 /** 5120 * skb_try_coalesce - try to merge skb to prior one 5121 * @to: prior buffer 5122 * @from: buffer to add 5123 * @fragstolen: pointer to boolean 5124 * @delta_truesize: how much more was allocated than was requested 5125 */ 5126 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 5127 bool *fragstolen, int *delta_truesize) 5128 { 5129 struct skb_shared_info *to_shinfo, *from_shinfo; 5130 int i, delta, len = from->len; 5131 5132 *fragstolen = false; 5133 5134 if (skb_cloned(to)) 5135 return false; 5136 5137 if (len <= skb_tailroom(to)) { 5138 if (len) 5139 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 5140 *delta_truesize = 0; 5141 return true; 5142 } 5143 5144 to_shinfo = skb_shinfo(to); 5145 from_shinfo = skb_shinfo(from); 5146 if (to_shinfo->frag_list || from_shinfo->frag_list) 5147 return false; 5148 if (skb_zcopy(to) || skb_zcopy(from)) 5149 return false; 5150 5151 if (skb_headlen(from) != 0) { 5152 struct page *page; 5153 unsigned int offset; 5154 5155 if (to_shinfo->nr_frags + 5156 from_shinfo->nr_frags >= MAX_SKB_FRAGS) 5157 return false; 5158 5159 if (skb_head_is_locked(from)) 5160 return false; 5161 5162 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 5163 5164 page = virt_to_head_page(from->head); 5165 offset = from->data - (unsigned char *)page_address(page); 5166 5167 skb_fill_page_desc(to, to_shinfo->nr_frags, 5168 page, offset, skb_headlen(from)); 5169 *fragstolen = true; 5170 } else { 5171 if (to_shinfo->nr_frags + 5172 from_shinfo->nr_frags > MAX_SKB_FRAGS) 5173 return false; 5174 5175 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); 5176 } 5177 5178 WARN_ON_ONCE(delta < len); 5179 5180 memcpy(to_shinfo->frags + to_shinfo->nr_frags, 5181 from_shinfo->frags, 5182 from_shinfo->nr_frags * sizeof(skb_frag_t)); 5183 to_shinfo->nr_frags += from_shinfo->nr_frags; 5184 5185 if (!skb_cloned(from)) 5186 from_shinfo->nr_frags = 0; 5187 5188 /* if the skb is not cloned this does nothing 5189 * since we set nr_frags to 0. 
5190 */ 5191 for (i = 0; i < from_shinfo->nr_frags; i++) 5192 __skb_frag_ref(&from_shinfo->frags[i]); 5193 5194 to->truesize += delta; 5195 to->len += len; 5196 to->data_len += len; 5197 5198 *delta_truesize = delta; 5199 return true; 5200 } 5201 EXPORT_SYMBOL(skb_try_coalesce); 5202 5203 /** 5204 * skb_scrub_packet - scrub an skb 5205 * 5206 * @skb: buffer to clean 5207 * @xnet: packet is crossing netns 5208 * 5209 * skb_scrub_packet can be used after encapsulating or decapsulting a packet 5210 * into/from a tunnel. Some information have to be cleared during these 5211 * operations. 5212 * skb_scrub_packet can also be used to clean a skb before injecting it in 5213 * another namespace (@xnet == true). We have to clear all information in the 5214 * skb that could impact namespace isolation. 5215 */ 5216 void skb_scrub_packet(struct sk_buff *skb, bool xnet) 5217 { 5218 skb->pkt_type = PACKET_HOST; 5219 skb->skb_iif = 0; 5220 skb->ignore_df = 0; 5221 skb_dst_drop(skb); 5222 skb_ext_reset(skb); 5223 nf_reset_ct(skb); 5224 nf_reset_trace(skb); 5225 5226 #ifdef CONFIG_NET_SWITCHDEV 5227 skb->offload_fwd_mark = 0; 5228 skb->offload_l3_fwd_mark = 0; 5229 #endif 5230 5231 if (!xnet) 5232 return; 5233 5234 ipvs_reset(skb); 5235 skb->mark = 0; 5236 skb->tstamp = 0; 5237 } 5238 EXPORT_SYMBOL_GPL(skb_scrub_packet); 5239 5240 /** 5241 * skb_gso_transport_seglen - Return length of individual segments of a gso packet 5242 * 5243 * @skb: GSO skb 5244 * 5245 * skb_gso_transport_seglen is used to determine the real size of the 5246 * individual segments, including Layer4 headers (TCP/UDP). 5247 * 5248 * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 5249 */ 5250 static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) 5251 { 5252 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5253 unsigned int thlen = 0; 5254 5255 if (skb->encapsulation) { 5256 thlen = skb_inner_transport_header(skb) - 5257 skb_transport_header(skb); 5258 5259 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 5260 thlen += inner_tcp_hdrlen(skb); 5261 } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { 5262 thlen = tcp_hdrlen(skb); 5263 } else if (unlikely(skb_is_gso_sctp(skb))) { 5264 thlen = sizeof(struct sctphdr); 5265 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { 5266 thlen = sizeof(struct udphdr); 5267 } 5268 /* UFO sets gso_size to the size of the fragmentation 5269 * payload, i.e. the size of the L4 (UDP) header is already 5270 * accounted for. 5271 */ 5272 return thlen + shinfo->gso_size; 5273 } 5274 5275 /** 5276 * skb_gso_network_seglen - Return length of individual segments of a gso packet 5277 * 5278 * @skb: GSO skb 5279 * 5280 * skb_gso_network_seglen is used to determine the real size of the 5281 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). 5282 * 5283 * The MAC/L2 header is not accounted for. 5284 */ 5285 static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) 5286 { 5287 unsigned int hdr_len = skb_transport_header(skb) - 5288 skb_network_header(skb); 5289 5290 return hdr_len + skb_gso_transport_seglen(skb); 5291 } 5292 5293 /** 5294 * skb_gso_mac_seglen - Return length of individual segments of a gso packet 5295 * 5296 * @skb: GSO skb 5297 * 5298 * skb_gso_mac_seglen is used to determine the real size of the 5299 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 5300 * headers (TCP/UDP). 
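 *
 * For example (illustrative numbers only): a TCP/IPv4 GSO skb with a
 * 14-byte Ethernet header, a 20-byte IPv4 header, a 32-byte TCP header
 * and a gso_size of 1448 yields 14 + 20 + 32 + 1448 = 1514 bytes per
 * resulting segment.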
5301 */ 5302 static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) 5303 { 5304 unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 5305 5306 return hdr_len + skb_gso_transport_seglen(skb); 5307 } 5308 5309 /** 5310 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS 5311 * 5312 * There are a couple of instances where we have a GSO skb, and we 5313 * want to determine what size it would be after it is segmented. 5314 * 5315 * We might want to check: 5316 * - L3+L4+payload size (e.g. IP forwarding) 5317 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) 5318 * 5319 * This is a helper to do that correctly considering GSO_BY_FRAGS. 5320 * 5321 * @skb: GSO skb 5322 * 5323 * @seg_len: The segmented length (from skb_gso_*_seglen). In the 5324 * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. 5325 * 5326 * @max_len: The maximum permissible length. 5327 * 5328 * Returns true if the segmented length <= max length. 5329 */ 5330 static inline bool skb_gso_size_check(const struct sk_buff *skb, 5331 unsigned int seg_len, 5332 unsigned int max_len) { 5333 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5334 const struct sk_buff *iter; 5335 5336 if (shinfo->gso_size != GSO_BY_FRAGS) 5337 return seg_len <= max_len; 5338 5339 /* Undo this so we can re-use header sizes */ 5340 seg_len -= GSO_BY_FRAGS; 5341 5342 skb_walk_frags(skb, iter) { 5343 if (seg_len + skb_headlen(iter) > max_len) 5344 return false; 5345 } 5346 5347 return true; 5348 } 5349 5350 /** 5351 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? 5352 * 5353 * @skb: GSO skb 5354 * @mtu: MTU to validate against 5355 * 5356 * skb_gso_validate_network_len validates if a given skb will fit a 5357 * wanted MTU once split. It considers L3 headers, L4 headers, and the 5358 * payload. 5359 */ 5360 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) 5361 { 5362 return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); 5363 } 5364 EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); 5365 5366 /** 5367 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? 5368 * 5369 * @skb: GSO skb 5370 * @len: length to validate against 5371 * 5372 * skb_gso_validate_mac_len validates if a given skb will fit a wanted 5373 * length once split, including L2, L3 and L4 headers and the payload. 
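 *
 * Illustrative use (max_frame_len is a hypothetical per-device limit):
 *
 *	if (skb_is_gso(skb) &&
 *	    !skb_gso_validate_mac_len(skb, max_frame_len))
 *		goto drop;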
5374 */ 5375 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) 5376 { 5377 return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); 5378 } 5379 EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); 5380 5381 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) 5382 { 5383 int mac_len, meta_len; 5384 void *meta; 5385 5386 if (skb_cow(skb, skb_headroom(skb)) < 0) { 5387 kfree_skb(skb); 5388 return NULL; 5389 } 5390 5391 mac_len = skb->data - skb_mac_header(skb); 5392 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { 5393 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), 5394 mac_len - VLAN_HLEN - ETH_TLEN); 5395 } 5396 5397 meta_len = skb_metadata_len(skb); 5398 if (meta_len) { 5399 meta = skb_metadata_end(skb) - meta_len; 5400 memmove(meta + VLAN_HLEN, meta, meta_len); 5401 } 5402 5403 skb->mac_header += VLAN_HLEN; 5404 return skb; 5405 } 5406 5407 struct sk_buff *skb_vlan_untag(struct sk_buff *skb) 5408 { 5409 struct vlan_hdr *vhdr; 5410 u16 vlan_tci; 5411 5412 if (unlikely(skb_vlan_tag_present(skb))) { 5413 /* vlan_tci is already set-up so leave this for another time */ 5414 return skb; 5415 } 5416 5417 skb = skb_share_check(skb, GFP_ATOMIC); 5418 if (unlikely(!skb)) 5419 goto err_free; 5420 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ 5421 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) 5422 goto err_free; 5423 5424 vhdr = (struct vlan_hdr *)skb->data; 5425 vlan_tci = ntohs(vhdr->h_vlan_TCI); 5426 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); 5427 5428 skb_pull_rcsum(skb, VLAN_HLEN); 5429 vlan_set_encap_proto(skb, vhdr); 5430 5431 skb = skb_reorder_vlan_header(skb); 5432 if (unlikely(!skb)) 5433 goto err_free; 5434 5435 skb_reset_network_header(skb); 5436 skb_reset_transport_header(skb); 5437 skb_reset_mac_len(skb); 5438 5439 return skb; 5440 5441 err_free: 5442 kfree_skb(skb); 5443 return NULL; 5444 } 5445 EXPORT_SYMBOL(skb_vlan_untag); 5446 5447 int skb_ensure_writable(struct sk_buff *skb, int write_len) 5448 { 5449 if (!pskb_may_pull(skb, write_len)) 5450 return -ENOMEM; 5451 5452 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 5453 return 0; 5454 5455 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 5456 } 5457 EXPORT_SYMBOL(skb_ensure_writable); 5458 5459 /* remove VLAN header from packet and update csum accordingly. 5460 * expects a non skb_vlan_tag_present skb with a vlan tag payload 5461 */ 5462 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 5463 { 5464 struct vlan_hdr *vhdr; 5465 int offset = skb->data - skb_mac_header(skb); 5466 int err; 5467 5468 if (WARN_ONCE(offset, 5469 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", 5470 offset)) { 5471 return -EINVAL; 5472 } 5473 5474 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 5475 if (unlikely(err)) 5476 return err; 5477 5478 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5479 5480 vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); 5481 *vlan_tci = ntohs(vhdr->h_vlan_TCI); 5482 5483 memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); 5484 __skb_pull(skb, VLAN_HLEN); 5485 5486 vlan_set_encap_proto(skb, vhdr); 5487 skb->mac_header += VLAN_HLEN; 5488 5489 if (skb_network_offset(skb) < ETH_HLEN) 5490 skb_set_network_header(skb, ETH_HLEN); 5491 5492 skb_reset_mac_len(skb); 5493 5494 return err; 5495 } 5496 EXPORT_SYMBOL(__skb_vlan_pop); 5497 5498 /* Pop a vlan tag either from hwaccel or from payload. 5499 * Expects skb->data at mac header. 
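 *
 * Illustrative use, e.g. rewriting the outermost tag (new_proto and
 * new_tci are hypothetical values):
 *
 *	err = skb_vlan_pop(skb);
 *	if (!err)
 *		err = skb_vlan_push(skb, new_proto, new_tci);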
5500 */ 5501 int skb_vlan_pop(struct sk_buff *skb) 5502 { 5503 u16 vlan_tci; 5504 __be16 vlan_proto; 5505 int err; 5506 5507 if (likely(skb_vlan_tag_present(skb))) { 5508 __vlan_hwaccel_clear_tag(skb); 5509 } else { 5510 if (unlikely(!eth_type_vlan(skb->protocol))) 5511 return 0; 5512 5513 err = __skb_vlan_pop(skb, &vlan_tci); 5514 if (err) 5515 return err; 5516 } 5517 /* move next vlan tag to hw accel tag */ 5518 if (likely(!eth_type_vlan(skb->protocol))) 5519 return 0; 5520 5521 vlan_proto = skb->protocol; 5522 err = __skb_vlan_pop(skb, &vlan_tci); 5523 if (unlikely(err)) 5524 return err; 5525 5526 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5527 return 0; 5528 } 5529 EXPORT_SYMBOL(skb_vlan_pop); 5530 5531 /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). 5532 * Expects skb->data at mac header. 5533 */ 5534 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 5535 { 5536 if (skb_vlan_tag_present(skb)) { 5537 int offset = skb->data - skb_mac_header(skb); 5538 int err; 5539 5540 if (WARN_ONCE(offset, 5541 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", 5542 offset)) { 5543 return -EINVAL; 5544 } 5545 5546 err = __vlan_insert_tag(skb, skb->vlan_proto, 5547 skb_vlan_tag_get(skb)); 5548 if (err) 5549 return err; 5550 5551 skb->protocol = skb->vlan_proto; 5552 skb->mac_len += VLAN_HLEN; 5553 5554 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5555 } 5556 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5557 return 0; 5558 } 5559 EXPORT_SYMBOL(skb_vlan_push); 5560 5561 /* Update the ethertype of hdr and the skb csum value if required. */ 5562 static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, 5563 __be16 ethertype) 5564 { 5565 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5566 __be16 diff[] = { ~hdr->h_proto, ethertype }; 5567 5568 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5569 } 5570 5571 hdr->h_proto = ethertype; 5572 } 5573 5574 /** 5575 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of 5576 * the packet 5577 * 5578 * @skb: buffer 5579 * @mpls_lse: MPLS label stack entry to push 5580 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) 5581 * @mac_len: length of the MAC header 5582 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is 5583 * ethernet 5584 * 5585 * Expects skb->data at mac header. 5586 * 5587 * Returns 0 on success, -errno otherwise. 5588 */ 5589 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, 5590 int mac_len, bool ethernet) 5591 { 5592 struct mpls_shim_hdr *lse; 5593 int err; 5594 5595 if (unlikely(!eth_p_mpls(mpls_proto))) 5596 return -EINVAL; 5597 5598 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. 
*/ 5599 if (skb->encapsulation) 5600 return -EINVAL; 5601 5602 err = skb_cow_head(skb, MPLS_HLEN); 5603 if (unlikely(err)) 5604 return err; 5605 5606 if (!skb->inner_protocol) { 5607 skb_set_inner_network_header(skb, skb_network_offset(skb)); 5608 skb_set_inner_protocol(skb, skb->protocol); 5609 } 5610 5611 skb_push(skb, MPLS_HLEN); 5612 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 5613 mac_len); 5614 skb_reset_mac_header(skb); 5615 skb_set_network_header(skb, mac_len); 5616 skb_reset_mac_len(skb); 5617 5618 lse = mpls_hdr(skb); 5619 lse->label_stack_entry = mpls_lse; 5620 skb_postpush_rcsum(skb, lse, MPLS_HLEN); 5621 5622 if (ethernet) 5623 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); 5624 skb->protocol = mpls_proto; 5625 5626 return 0; 5627 } 5628 EXPORT_SYMBOL_GPL(skb_mpls_push); 5629 5630 /** 5631 * skb_mpls_pop() - pop the outermost MPLS header 5632 * 5633 * @skb: buffer 5634 * @next_proto: ethertype of header after popped MPLS header 5635 * @mac_len: length of the MAC header 5636 * @ethernet: flag to indicate if the packet is ethernet 5637 * 5638 * Expects skb->data at mac header. 5639 * 5640 * Returns 0 on success, -errno otherwise. 5641 */ 5642 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, 5643 bool ethernet) 5644 { 5645 int err; 5646 5647 if (unlikely(!eth_p_mpls(skb->protocol))) 5648 return 0; 5649 5650 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); 5651 if (unlikely(err)) 5652 return err; 5653 5654 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); 5655 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 5656 mac_len); 5657 5658 __skb_pull(skb, MPLS_HLEN); 5659 skb_reset_mac_header(skb); 5660 skb_set_network_header(skb, mac_len); 5661 5662 if (ethernet) { 5663 struct ethhdr *hdr; 5664 5665 /* use mpls_hdr() to get ethertype to account for VLANs. */ 5666 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 5667 skb_mod_eth_type(skb, hdr, next_proto); 5668 } 5669 skb->protocol = next_proto; 5670 5671 return 0; 5672 } 5673 EXPORT_SYMBOL_GPL(skb_mpls_pop); 5674 5675 /** 5676 * skb_mpls_update_lse() - modify outermost MPLS header and update csum 5677 * 5678 * @skb: buffer 5679 * @mpls_lse: new MPLS label stack entry to update to 5680 * 5681 * Expects skb->data at mac header. 5682 * 5683 * Returns 0 on success, -errno otherwise. 5684 */ 5685 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) 5686 { 5687 int err; 5688 5689 if (unlikely(!eth_p_mpls(skb->protocol))) 5690 return -EINVAL; 5691 5692 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 5693 if (unlikely(err)) 5694 return err; 5695 5696 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5697 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; 5698 5699 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5700 } 5701 5702 mpls_hdr(skb)->label_stack_entry = mpls_lse; 5703 5704 return 0; 5705 } 5706 EXPORT_SYMBOL_GPL(skb_mpls_update_lse); 5707 5708 /** 5709 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header 5710 * 5711 * @skb: buffer 5712 * 5713 * Expects skb->data at mac header. 5714 * 5715 * Returns 0 on success, -errno otherwise. 
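 *
 * The TTL sits in the low 8 bits of the label stack entry
 * (label:20 TC:3 S:1 TTL:8). A forwarding path would typically do
 * something like this sketch:
 *
 *	if (skb_mpls_dec_ttl(skb))
 *		goto drop;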
5716 */ 5717 int skb_mpls_dec_ttl(struct sk_buff *skb) 5718 { 5719 u32 lse; 5720 u8 ttl; 5721 5722 if (unlikely(!eth_p_mpls(skb->protocol))) 5723 return -EINVAL; 5724 5725 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); 5726 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; 5727 if (!--ttl) 5728 return -EINVAL; 5729 5730 lse &= ~MPLS_LS_TTL_MASK; 5731 lse |= ttl << MPLS_LS_TTL_SHIFT; 5732 5733 return skb_mpls_update_lse(skb, cpu_to_be32(lse)); 5734 } 5735 EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); 5736 5737 /** 5738 * alloc_skb_with_frags - allocate skb with page frags 5739 * 5740 * @header_len: size of linear part 5741 * @data_len: needed length in frags 5742 * @max_page_order: max page order desired. 5743 * @errcode: pointer to error code if any 5744 * @gfp_mask: allocation mask 5745 * 5746 * This can be used to allocate a paged skb, given a maximal order for frags. 5747 */ 5748 struct sk_buff *alloc_skb_with_frags(unsigned long header_len, 5749 unsigned long data_len, 5750 int max_page_order, 5751 int *errcode, 5752 gfp_t gfp_mask) 5753 { 5754 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; 5755 unsigned long chunk; 5756 struct sk_buff *skb; 5757 struct page *page; 5758 int i; 5759 5760 *errcode = -EMSGSIZE; 5761 /* Note this test could be relaxed, if we succeed to allocate 5762 * high order pages... 5763 */ 5764 if (npages > MAX_SKB_FRAGS) 5765 return NULL; 5766 5767 *errcode = -ENOBUFS; 5768 skb = alloc_skb(header_len, gfp_mask); 5769 if (!skb) 5770 return NULL; 5771 5772 skb->truesize += npages << PAGE_SHIFT; 5773 5774 for (i = 0; npages > 0; i++) { 5775 int order = max_page_order; 5776 5777 while (order) { 5778 if (npages >= 1 << order) { 5779 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 5780 __GFP_COMP | 5781 __GFP_NOWARN, 5782 order); 5783 if (page) 5784 goto fill_page; 5785 /* Do not retry other high order allocations */ 5786 order = 1; 5787 max_page_order = 0; 5788 } 5789 order--; 5790 } 5791 page = alloc_page(gfp_mask); 5792 if (!page) 5793 goto failure; 5794 fill_page: 5795 chunk = min_t(unsigned long, data_len, 5796 PAGE_SIZE << order); 5797 skb_fill_page_desc(skb, i, page, 0, chunk); 5798 data_len -= chunk; 5799 npages -= 1 << order; 5800 } 5801 return skb; 5802 5803 failure: 5804 kfree_skb(skb); 5805 return NULL; 5806 } 5807 EXPORT_SYMBOL(alloc_skb_with_frags); 5808 5809 /* carve out the first off bytes from skb when off < headlen */ 5810 static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, 5811 const int headlen, gfp_t gfp_mask) 5812 { 5813 int i; 5814 int size = skb_end_offset(skb); 5815 int new_hlen = headlen - off; 5816 u8 *data; 5817 5818 size = SKB_DATA_ALIGN(size); 5819 5820 if (skb_pfmemalloc(skb)) 5821 gfp_mask |= __GFP_MEMALLOC; 5822 data = kmalloc_reserve(size + 5823 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 5824 gfp_mask, NUMA_NO_NODE, NULL); 5825 if (!data) 5826 return -ENOMEM; 5827 5828 size = SKB_WITH_OVERHEAD(ksize(data)); 5829 5830 /* Copy real data, and all frags */ 5831 skb_copy_from_linear_data_offset(skb, off, data, new_hlen); 5832 skb->len -= off; 5833 5834 memcpy((struct skb_shared_info *)(data + size), 5835 skb_shinfo(skb), 5836 offsetof(struct skb_shared_info, 5837 frags[skb_shinfo(skb)->nr_frags])); 5838 if (skb_cloned(skb)) { 5839 /* drop the old head gracefully */ 5840 if (skb_orphan_frags(skb, gfp_mask)) { 5841 kfree(data); 5842 return -ENOMEM; 5843 } 5844 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 5845 skb_frag_ref(skb, i); 5846 if (skb_has_frag_list(skb)) 5847 skb_clone_fraglist(skb); 5848 
skb_release_data(skb); 5849 } else { 5850 /* we can reuse existing recount- all we did was 5851 * relocate values 5852 */ 5853 skb_free_head(skb); 5854 } 5855 5856 skb->head = data; 5857 skb->data = data; 5858 skb->head_frag = 0; 5859 #ifdef NET_SKBUFF_DATA_USES_OFFSET 5860 skb->end = size; 5861 #else 5862 skb->end = skb->head + size; 5863 #endif 5864 skb_set_tail_pointer(skb, skb_headlen(skb)); 5865 skb_headers_offset_update(skb, 0); 5866 skb->cloned = 0; 5867 skb->hdr_len = 0; 5868 skb->nohdr = 0; 5869 atomic_set(&skb_shinfo(skb)->dataref, 1); 5870 5871 return 0; 5872 } 5873 5874 static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); 5875 5876 /* carve out the first eat bytes from skb's frag_list. May recurse into 5877 * pskb_carve() 5878 */ 5879 static int pskb_carve_frag_list(struct sk_buff *skb, 5880 struct skb_shared_info *shinfo, int eat, 5881 gfp_t gfp_mask) 5882 { 5883 struct sk_buff *list = shinfo->frag_list; 5884 struct sk_buff *clone = NULL; 5885 struct sk_buff *insp = NULL; 5886 5887 do { 5888 if (!list) { 5889 pr_err("Not enough bytes to eat. Want %d\n", eat); 5890 return -EFAULT; 5891 } 5892 if (list->len <= eat) { 5893 /* Eaten as whole. */ 5894 eat -= list->len; 5895 list = list->next; 5896 insp = list; 5897 } else { 5898 /* Eaten partially. */ 5899 if (skb_shared(list)) { 5900 clone = skb_clone(list, gfp_mask); 5901 if (!clone) 5902 return -ENOMEM; 5903 insp = list->next; 5904 list = clone; 5905 } else { 5906 /* This may be pulled without problems. */ 5907 insp = list; 5908 } 5909 if (pskb_carve(list, eat, gfp_mask) < 0) { 5910 kfree_skb(clone); 5911 return -ENOMEM; 5912 } 5913 break; 5914 } 5915 } while (eat); 5916 5917 /* Free pulled out fragments. */ 5918 while ((list = shinfo->frag_list) != insp) { 5919 shinfo->frag_list = list->next; 5920 kfree_skb(list); 5921 } 5922 /* And insert new clone at head. */ 5923 if (clone) { 5924 clone->next = list; 5925 shinfo->frag_list = clone; 5926 } 5927 return 0; 5928 } 5929 5930 /* carve off first len bytes from skb. Split line (off) is in the 5931 * non-linear part of skb 5932 */ 5933 static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, 5934 int pos, gfp_t gfp_mask) 5935 { 5936 int i, k = 0; 5937 int size = skb_end_offset(skb); 5938 u8 *data; 5939 const int nfrags = skb_shinfo(skb)->nr_frags; 5940 struct skb_shared_info *shinfo; 5941 5942 size = SKB_DATA_ALIGN(size); 5943 5944 if (skb_pfmemalloc(skb)) 5945 gfp_mask |= __GFP_MEMALLOC; 5946 data = kmalloc_reserve(size + 5947 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 5948 gfp_mask, NUMA_NO_NODE, NULL); 5949 if (!data) 5950 return -ENOMEM; 5951 5952 size = SKB_WITH_OVERHEAD(ksize(data)); 5953 5954 memcpy((struct skb_shared_info *)(data + size), 5955 skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); 5956 if (skb_orphan_frags(skb, gfp_mask)) { 5957 kfree(data); 5958 return -ENOMEM; 5959 } 5960 shinfo = (struct skb_shared_info *)(data + size); 5961 for (i = 0; i < nfrags; i++) { 5962 int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); 5963 5964 if (pos + fsize > off) { 5965 shinfo->frags[k] = skb_shinfo(skb)->frags[i]; 5966 5967 if (pos < off) { 5968 /* Split frag. 5969 * We have two variants in this case: 5970 * 1. Move all the frag to the second 5971 * part, if it is possible. F.e. 5972 * this approach is mandatory for TUX, 5973 * where splitting is expensive. 5974 * 2. Split is accurately. We make this. 
5975 */ 5976 skb_frag_off_add(&shinfo->frags[0], off - pos); 5977 skb_frag_size_sub(&shinfo->frags[0], off - pos); 5978 } 5979 skb_frag_ref(skb, i); 5980 k++; 5981 } 5982 pos += fsize; 5983 } 5984 shinfo->nr_frags = k; 5985 if (skb_has_frag_list(skb)) 5986 skb_clone_fraglist(skb); 5987 5988 /* split line is in frag list */ 5989 if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { 5990 /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ 5991 if (skb_has_frag_list(skb)) 5992 kfree_skb_list(skb_shinfo(skb)->frag_list); 5993 kfree(data); 5994 return -ENOMEM; 5995 } 5996 skb_release_data(skb); 5997 5998 skb->head = data; 5999 skb->head_frag = 0; 6000 skb->data = data; 6001 #ifdef NET_SKBUFF_DATA_USES_OFFSET 6002 skb->end = size; 6003 #else 6004 skb->end = skb->head + size; 6005 #endif 6006 skb_reset_tail_pointer(skb); 6007 skb_headers_offset_update(skb, 0); 6008 skb->cloned = 0; 6009 skb->hdr_len = 0; 6010 skb->nohdr = 0; 6011 skb->len -= off; 6012 skb->data_len = skb->len; 6013 atomic_set(&skb_shinfo(skb)->dataref, 1); 6014 return 0; 6015 } 6016 6017 /* remove len bytes from the beginning of the skb */ 6018 static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) 6019 { 6020 int headlen = skb_headlen(skb); 6021 6022 if (len < headlen) 6023 return pskb_carve_inside_header(skb, len, headlen, gfp); 6024 else 6025 return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); 6026 } 6027 6028 /* Extract to_copy bytes starting at off from skb, and return this in 6029 * a new skb 6030 */ 6031 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, 6032 int to_copy, gfp_t gfp) 6033 { 6034 struct sk_buff *clone = skb_clone(skb, gfp); 6035 6036 if (!clone) 6037 return NULL; 6038 6039 if (pskb_carve(clone, off, gfp) < 0 || 6040 pskb_trim(clone, to_copy)) { 6041 kfree_skb(clone); 6042 return NULL; 6043 } 6044 return clone; 6045 } 6046 EXPORT_SYMBOL(pskb_extract); 6047 6048 /** 6049 * skb_condense - try to get rid of fragments/frag_list if possible 6050 * @skb: buffer 6051 * 6052 * Can be used to save memory before skb is added to a busy queue. 6053 * If packet has bytes in frags and enough tail room in skb->head, 6054 * pull all of them, so that we can free the frags right now and adjust 6055 * truesize. 6056 * Notes: 6057 * We do not reallocate skb->head thus can not fail. 6058 * Caller must re-evaluate skb->truesize if needed. 6059 */ 6060 void skb_condense(struct sk_buff *skb) 6061 { 6062 if (skb->data_len) { 6063 if (skb->data_len > skb->end - skb->tail || 6064 skb_cloned(skb)) 6065 return; 6066 6067 /* Nice, we can free page frag(s) right now */ 6068 __pskb_pull_tail(skb, skb->data_len); 6069 } 6070 /* At this point, skb->truesize might be over estimated, 6071 * because skb had a fragment, and fragments do not tell 6072 * their truesize. 6073 * When we pulled its content into skb->head, fragment 6074 * was freed, but __pskb_pull_tail() could not possibly 6075 * adjust skb->truesize, not knowing the frag truesize. 6076 */ 6077 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 6078 } 6079 6080 #ifdef CONFIG_SKB_EXTENSIONS 6081 static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) 6082 { 6083 return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); 6084 } 6085 6086 /** 6087 * __skb_ext_alloc - allocate a new skb extensions storage 6088 * 6089 * @flags: See kmalloc(). 6090 * 6091 * Returns the newly allocated pointer. The pointer can later attached to a 6092 * skb via __skb_ext_set(). 
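 * A minimal pairing with __skb_ext_set() could look like this sketch
 * (SKB_EXT_SEC_PATH assumes CONFIG_XFRM is enabled):
 *
 *	ext = __skb_ext_alloc(GFP_ATOMIC);
 *	if (ext)
 *		sp = __skb_ext_set(skb, SKB_EXT_SEC_PATH, ext);
 *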
6093 * Note: caller must handle the skb_ext as an opaque data. 6094 */ 6095 struct skb_ext *__skb_ext_alloc(gfp_t flags) 6096 { 6097 struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); 6098 6099 if (new) { 6100 memset(new->offset, 0, sizeof(new->offset)); 6101 refcount_set(&new->refcnt, 1); 6102 } 6103 6104 return new; 6105 } 6106 6107 static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, 6108 unsigned int old_active) 6109 { 6110 struct skb_ext *new; 6111 6112 if (refcount_read(&old->refcnt) == 1) 6113 return old; 6114 6115 new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); 6116 if (!new) 6117 return NULL; 6118 6119 memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); 6120 refcount_set(&new->refcnt, 1); 6121 6122 #ifdef CONFIG_XFRM 6123 if (old_active & (1 << SKB_EXT_SEC_PATH)) { 6124 struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); 6125 unsigned int i; 6126 6127 for (i = 0; i < sp->len; i++) 6128 xfrm_state_hold(sp->xvec[i]); 6129 } 6130 #endif 6131 __skb_ext_put(old); 6132 return new; 6133 } 6134 6135 /** 6136 * __skb_ext_set - attach the specified extension storage to this skb 6137 * @skb: buffer 6138 * @id: extension id 6139 * @ext: extension storage previously allocated via __skb_ext_alloc() 6140 * 6141 * Existing extensions, if any, are cleared. 6142 * 6143 * Returns the pointer to the extension. 6144 */ 6145 void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, 6146 struct skb_ext *ext) 6147 { 6148 unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); 6149 6150 skb_ext_put(skb); 6151 newlen = newoff + skb_ext_type_len[id]; 6152 ext->chunks = newlen; 6153 ext->offset[id] = newoff; 6154 skb->extensions = ext; 6155 skb->active_extensions = 1 << id; 6156 return skb_ext_get_ptr(ext, id); 6157 } 6158 6159 /** 6160 * skb_ext_add - allocate space for given extension, COW if needed 6161 * @skb: buffer 6162 * @id: extension to allocate space for 6163 * 6164 * Allocates enough space for the given extension. 6165 * If the extension is already present, a pointer to that extension 6166 * is returned. 6167 * 6168 * If the skb was cloned, COW applies and the returned memory can be 6169 * modified without changing the extension space of clones buffers. 6170 * 6171 * Returns pointer to the extension or NULL on allocation failure. 
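 *
 * Illustrative use (TC_SKB_EXT requires CONFIG_NET_TC_SKB_EXT; filling the
 * chain field is a hypothetical example of initialising the extension):
 *
 *	struct tc_skb_ext *tc_ext = skb_ext_add(skb, TC_SKB_EXT);
 *
 *	if (!tc_ext)
 *		return -ENOMEM;
 *	tc_ext->chain = chain_index;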
6172 */ 6173 void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) 6174 { 6175 struct skb_ext *new, *old = NULL; 6176 unsigned int newlen, newoff; 6177 6178 if (skb->active_extensions) { 6179 old = skb->extensions; 6180 6181 new = skb_ext_maybe_cow(old, skb->active_extensions); 6182 if (!new) 6183 return NULL; 6184 6185 if (__skb_ext_exist(new, id)) 6186 goto set_active; 6187 6188 newoff = new->chunks; 6189 } else { 6190 newoff = SKB_EXT_CHUNKSIZEOF(*new); 6191 6192 new = __skb_ext_alloc(GFP_ATOMIC); 6193 if (!new) 6194 return NULL; 6195 } 6196 6197 newlen = newoff + skb_ext_type_len[id]; 6198 new->chunks = newlen; 6199 new->offset[id] = newoff; 6200 set_active: 6201 skb->extensions = new; 6202 skb->active_extensions |= 1 << id; 6203 return skb_ext_get_ptr(new, id); 6204 } 6205 EXPORT_SYMBOL(skb_ext_add); 6206 6207 #ifdef CONFIG_XFRM 6208 static void skb_ext_put_sp(struct sec_path *sp) 6209 { 6210 unsigned int i; 6211 6212 for (i = 0; i < sp->len; i++) 6213 xfrm_state_put(sp->xvec[i]); 6214 } 6215 #endif 6216 6217 void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) 6218 { 6219 struct skb_ext *ext = skb->extensions; 6220 6221 skb->active_extensions &= ~(1 << id); 6222 if (skb->active_extensions == 0) { 6223 skb->extensions = NULL; 6224 __skb_ext_put(ext); 6225 #ifdef CONFIG_XFRM 6226 } else if (id == SKB_EXT_SEC_PATH && 6227 refcount_read(&ext->refcnt) == 1) { 6228 struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); 6229 6230 skb_ext_put_sp(sp); 6231 sp->len = 0; 6232 #endif 6233 } 6234 } 6235 EXPORT_SYMBOL(__skb_ext_del); 6236 6237 void __skb_ext_put(struct skb_ext *ext) 6238 { 6239 /* If this is last clone, nothing can increment 6240 * it after check passes. Avoids one atomic op. 6241 */ 6242 if (refcount_read(&ext->refcnt) == 1) 6243 goto free_now; 6244 6245 if (!refcount_dec_and_test(&ext->refcnt)) 6246 return; 6247 free_now: 6248 #ifdef CONFIG_XFRM 6249 if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) 6250 skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); 6251 #endif 6252 6253 kmem_cache_free(skbuff_ext_cache, ext); 6254 } 6255 EXPORT_SYMBOL(__skb_ext_put); 6256 #endif /* CONFIG_SKB_EXTENSIONS */ 6257