1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Routines having to do with the 'struct sk_buff' memory handlers. 4 * 5 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> 6 * Florian La Roche <rzsfl@rz.uni-sb.de> 7 * 8 * Fixes: 9 * Alan Cox : Fixed the worst of the load 10 * balancer bugs. 11 * Dave Platt : Interrupt stacking fix. 12 * Richard Kooijman : Timestamp fixes. 13 * Alan Cox : Changed buffer format. 14 * Alan Cox : destructor hook for AF_UNIX etc. 15 * Linus Torvalds : Better skb_clone. 16 * Alan Cox : Added skb_copy. 17 * Alan Cox : Added all the changed routines Linus 18 * only put in the headers 19 * Ray VanTassle : Fixed --skb->lock in free 20 * Alan Cox : skb_copy copy arp field 21 * Andi Kleen : slabified it. 22 * Robert Olsson : Removed skb_head_pool 23 * 24 * NOTE: 25 * The __skb_ routines should be called with interrupts 26 * disabled, or you better be *real* sure that the operation is atomic 27 * with respect to whatever list is being frobbed (e.g. via lock_sock() 28 * or via disabling bottom half handlers, etc). 29 */ 30 31 /* 32 * The functions in this file will not compile correctly with gcc 2.4.x 33 */ 34 35 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 36 37 #include <linux/module.h> 38 #include <linux/types.h> 39 #include <linux/kernel.h> 40 #include <linux/mm.h> 41 #include <linux/interrupt.h> 42 #include <linux/in.h> 43 #include <linux/inet.h> 44 #include <linux/slab.h> 45 #include <linux/tcp.h> 46 #include <linux/udp.h> 47 #include <linux/sctp.h> 48 #include <linux/netdevice.h> 49 #ifdef CONFIG_NET_CLS_ACT 50 #include <net/pkt_sched.h> 51 #endif 52 #include <linux/string.h> 53 #include <linux/skbuff.h> 54 #include <linux/splice.h> 55 #include <linux/cache.h> 56 #include <linux/rtnetlink.h> 57 #include <linux/init.h> 58 #include <linux/scatterlist.h> 59 #include <linux/errqueue.h> 60 #include <linux/prefetch.h> 61 #include <linux/if_vlan.h> 62 #include <linux/mpls.h> 63 64 #include <net/protocol.h> 65 #include <net/dst.h> 66 #include <net/sock.h> 67 #include <net/checksum.h> 68 #include <net/ip6_checksum.h> 69 #include <net/xfrm.h> 70 #include <net/mpls.h> 71 #include <net/mptcp.h> 72 73 #include <linux/uaccess.h> 74 #include <trace/events/skb.h> 75 #include <linux/highmem.h> 76 #include <linux/capability.h> 77 #include <linux/user_namespace.h> 78 #include <linux/indirect_call_wrapper.h> 79 80 #include "datagram.h" 81 82 struct kmem_cache *skbuff_head_cache __ro_after_init; 83 static struct kmem_cache *skbuff_fclone_cache __ro_after_init; 84 #ifdef CONFIG_SKB_EXTENSIONS 85 static struct kmem_cache *skbuff_ext_cache __ro_after_init; 86 #endif 87 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; 88 EXPORT_SYMBOL(sysctl_max_skb_frags); 89 90 /** 91 * skb_panic - private function for out-of-line support 92 * @skb: buffer 93 * @sz: size 94 * @addr: address 95 * @msg: skb_over_panic or skb_under_panic 96 * 97 * Out-of-line support for skb_put() and skb_push(). 98 * Called via the wrapper skb_over_panic() or skb_under_panic(). 99 * Keep out of line to prevent kernel bloat. 100 * __builtin_return_address is not used because it is not always reliable. 101 */ 102 static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, 103 const char msg[]) 104 { 105 pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", 106 msg, addr, skb->len, sz, skb->head, skb->data, 107 (unsigned long)skb->tail, (unsigned long)skb->end, 108 skb->dev ? 
skb->dev->name : "<NULL>"); 109 BUG(); 110 } 111 112 static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) 113 { 114 skb_panic(skb, sz, addr, __func__); 115 } 116 117 static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) 118 { 119 skb_panic(skb, sz, addr, __func__); 120 } 121 122 /* 123 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells 124 * the caller if emergency pfmemalloc reserves are being used. If it is and 125 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves 126 * may be used. Otherwise, the packet data may be discarded until enough 127 * memory is free 128 */ 129 #define kmalloc_reserve(size, gfp, node, pfmemalloc) \ 130 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) 131 132 static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, 133 unsigned long ip, bool *pfmemalloc) 134 { 135 void *obj; 136 bool ret_pfmemalloc = false; 137 138 /* 139 * Try a regular allocation, when that fails and we're not entitled 140 * to the reserves, fail. 141 */ 142 obj = kmalloc_node_track_caller(size, 143 flags | __GFP_NOMEMALLOC | __GFP_NOWARN, 144 node); 145 if (obj || !(gfp_pfmemalloc_allowed(flags))) 146 goto out; 147 148 /* Try again but now we are using pfmemalloc reserves */ 149 ret_pfmemalloc = true; 150 obj = kmalloc_node_track_caller(size, flags, node); 151 152 out: 153 if (pfmemalloc) 154 *pfmemalloc = ret_pfmemalloc; 155 156 return obj; 157 } 158 159 /* Allocate a new skbuff. We do this ourselves so we can fill in a few 160 * 'private' fields and also do memory statistics to find all the 161 * [BEEP] leaks. 162 * 163 */ 164 165 /** 166 * __alloc_skb - allocate a network buffer 167 * @size: size to allocate 168 * @gfp_mask: allocation mask 169 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache 170 * instead of head cache and allocate a cloned (child) skb. 171 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for 172 * allocations in case the data is required for writeback 173 * @node: numa node to allocate memory on 174 * 175 * Allocate a new &sk_buff. The returned buffer has no headroom and a 176 * tail room of at least size bytes. The object has a reference count 177 * of one. The return is the buffer. On a failure the return is %NULL. 178 * 179 * Buffers may only be allocated from interrupts using a @gfp_mask of 180 * %GFP_ATOMIC. 181 */ 182 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, 183 int flags, int node) 184 { 185 struct kmem_cache *cache; 186 struct skb_shared_info *shinfo; 187 struct sk_buff *skb; 188 u8 *data; 189 bool pfmemalloc; 190 191 cache = (flags & SKB_ALLOC_FCLONE) 192 ? skbuff_fclone_cache : skbuff_head_cache; 193 194 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) 195 gfp_mask |= __GFP_MEMALLOC; 196 197 /* Get the HEAD */ 198 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); 199 if (!skb) 200 goto out; 201 prefetchw(skb); 202 203 /* We do our best to align skb_shared_info on a separate cache 204 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives 205 * aligned memory blocks, unless SLUB/SLAB debug is enabled. 206 * Both skb->head and skb_shared_info are cache line aligned. 207 */ 208 size = SKB_DATA_ALIGN(size); 209 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 210 data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); 211 if (!data) 212 goto nodata; 213 /* kmalloc(size) might give us more room than requested. 
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(ksize(data));
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* Account for allocated memory : skb + skb->head */
	skb->truesize = SKB_TRUESIZE(size);
	skb->pfmemalloc = pfmemalloc;
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);

		fclones->skb2.fclone = SKB_FCLONE_CLONE;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);

/* Caller must provide SKB that is memset cleared */
static struct sk_buff *__build_skb_around(struct sk_buff *skb,
					  void *data, unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	unsigned int size = frag_size ? : ksize(data);

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	return skb;
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc()
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contains data buffers, not full skbs.
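 *
 * Illustrative sketch (not part of the original file): a hypothetical
 * driver RX completion built on the contract above might look like this;
 * rx_buf, frame_len, netdev and napi are made-up names:
 *
 *	void *va = rx_buf->va;			// NET_SKB_PAD + frame + tailroom
 *	struct sk_buff *skb;
 *
 *	skb = build_skb(va, rx_buf->truesize);
 *	if (unlikely(!skb))
 *		return;				// driver recycles the buffer
 *	skb_reserve(skb, NET_SKB_PAD);		// skip headroom added before DMA
 *	skb_put(skb, frame_len);		// bytes actually written by the NIC
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(napi, skb);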
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));

	return __build_skb_around(skb, data, frag_size);
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provide by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	skb = __build_skb_around(skb, data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

#define NAPI_SKB_CACHE_SIZE	64

struct napi_alloc_cache {
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}

void *napi_alloc_frag(unsigned int fragsz)
{
	fragsz = SKB_DATA_ALIGN(fragsz);

	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
}
EXPORT_SYMBOL(napi_alloc_frag);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
	struct page_frag_cache *nc;
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_irq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
	} else {
		local_bh_disable();
		data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(netdev_alloc_frag);

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
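 *
 * Illustrative sketch (not part of the original file): a driver that wants
 * extra private headroom on top of the built-in NET_SKB_PAD only asks for
 * the payload; MYDRV_PAD, rx_data and pkt_len are hypothetical:
 *
 *	skb = netdev_alloc_skb(netdev, MYDRV_PAD + pkt_len);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, MYDRV_PAD);		// NET_SKB_PAD is already implicit
 *	skb_put_data(skb, rx_data, pkt_len);	// copy frame into the linear area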
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_irq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
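 *
 * Illustrative sketch (not part of the original file): small frames are
 * often copied into a NAPI-allocated skb so the original ring buffer can
 * be recycled; priv, rx_buf_va and pkt_len are hypothetical names:
 *
 *	skb = napi_alloc_skb(&priv->napi, pkt_len);
 *	if (unlikely(!skb))
 *		break;				// out of memory, drop and count
 *	skb_put_data(skb, rx_buf_va, pkt_len);
 *	skb->protocol = eth_type_trans(skb, priv->netdev);
 *	napi_gro_receive(&priv->napi, skb);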
495 */ 496 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, 497 gfp_t gfp_mask) 498 { 499 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); 500 struct sk_buff *skb; 501 void *data; 502 503 len += NET_SKB_PAD + NET_IP_ALIGN; 504 505 if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || 506 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { 507 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); 508 if (!skb) 509 goto skb_fail; 510 goto skb_success; 511 } 512 513 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 514 len = SKB_DATA_ALIGN(len); 515 516 if (sk_memalloc_socks()) 517 gfp_mask |= __GFP_MEMALLOC; 518 519 data = page_frag_alloc(&nc->page, len, gfp_mask); 520 if (unlikely(!data)) 521 return NULL; 522 523 skb = __build_skb(data, len); 524 if (unlikely(!skb)) { 525 skb_free_frag(data); 526 return NULL; 527 } 528 529 if (nc->page.pfmemalloc) 530 skb->pfmemalloc = 1; 531 skb->head_frag = 1; 532 533 skb_success: 534 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); 535 skb->dev = napi->dev; 536 537 skb_fail: 538 return skb; 539 } 540 EXPORT_SYMBOL(__napi_alloc_skb); 541 542 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 543 int size, unsigned int truesize) 544 { 545 skb_fill_page_desc(skb, i, page, off, size); 546 skb->len += size; 547 skb->data_len += size; 548 skb->truesize += truesize; 549 } 550 EXPORT_SYMBOL(skb_add_rx_frag); 551 552 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, 553 unsigned int truesize) 554 { 555 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 556 557 skb_frag_size_add(frag, size); 558 skb->len += size; 559 skb->data_len += size; 560 skb->truesize += truesize; 561 } 562 EXPORT_SYMBOL(skb_coalesce_rx_frag); 563 564 static void skb_drop_list(struct sk_buff **listp) 565 { 566 kfree_skb_list(*listp); 567 *listp = NULL; 568 } 569 570 static inline void skb_drop_fraglist(struct sk_buff *skb) 571 { 572 skb_drop_list(&skb_shinfo(skb)->frag_list); 573 } 574 575 static void skb_clone_fraglist(struct sk_buff *skb) 576 { 577 struct sk_buff *list; 578 579 skb_walk_frags(skb, list) 580 skb_get(list); 581 } 582 583 static void skb_free_head(struct sk_buff *skb) 584 { 585 unsigned char *head = skb->head; 586 587 if (skb->head_frag) 588 skb_free_frag(head); 589 else 590 kfree(head); 591 } 592 593 static void skb_release_data(struct sk_buff *skb) 594 { 595 struct skb_shared_info *shinfo = skb_shinfo(skb); 596 int i; 597 598 if (skb->cloned && 599 atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, 600 &shinfo->dataref)) 601 return; 602 603 for (i = 0; i < shinfo->nr_frags; i++) 604 __skb_frag_unref(&shinfo->frags[i]); 605 606 if (shinfo->frag_list) 607 kfree_skb_list(shinfo->frag_list); 608 609 skb_zcopy_clear(skb, true); 610 skb_free_head(skb); 611 } 612 613 /* 614 * Free an skbuff by memory without cleaning the state. 615 */ 616 static void kfree_skbmem(struct sk_buff *skb) 617 { 618 struct sk_buff_fclones *fclones; 619 620 switch (skb->fclone) { 621 case SKB_FCLONE_UNAVAILABLE: 622 kmem_cache_free(skbuff_head_cache, skb); 623 return; 624 625 case SKB_FCLONE_ORIG: 626 fclones = container_of(skb, struct sk_buff_fclones, skb1); 627 628 /* We usually free the clone (TX completion) before original skb 629 * This test would have no chance to be true for the clone, 630 * while here, branch prediction will be good. 
631 */ 632 if (refcount_read(&fclones->fclone_ref) == 1) 633 goto fastpath; 634 break; 635 636 default: /* SKB_FCLONE_CLONE */ 637 fclones = container_of(skb, struct sk_buff_fclones, skb2); 638 break; 639 } 640 if (!refcount_dec_and_test(&fclones->fclone_ref)) 641 return; 642 fastpath: 643 kmem_cache_free(skbuff_fclone_cache, fclones); 644 } 645 646 void skb_release_head_state(struct sk_buff *skb) 647 { 648 skb_dst_drop(skb); 649 if (skb->destructor) { 650 WARN_ON(in_irq()); 651 skb->destructor(skb); 652 } 653 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 654 nf_conntrack_put(skb_nfct(skb)); 655 #endif 656 skb_ext_put(skb); 657 } 658 659 /* Free everything but the sk_buff shell. */ 660 static void skb_release_all(struct sk_buff *skb) 661 { 662 skb_release_head_state(skb); 663 if (likely(skb->head)) 664 skb_release_data(skb); 665 } 666 667 /** 668 * __kfree_skb - private function 669 * @skb: buffer 670 * 671 * Free an sk_buff. Release anything attached to the buffer. 672 * Clean the state. This is an internal helper function. Users should 673 * always call kfree_skb 674 */ 675 676 void __kfree_skb(struct sk_buff *skb) 677 { 678 skb_release_all(skb); 679 kfree_skbmem(skb); 680 } 681 EXPORT_SYMBOL(__kfree_skb); 682 683 /** 684 * kfree_skb - free an sk_buff 685 * @skb: buffer to free 686 * 687 * Drop a reference to the buffer and free it if the usage count has 688 * hit zero. 689 */ 690 void kfree_skb(struct sk_buff *skb) 691 { 692 if (!skb_unref(skb)) 693 return; 694 695 trace_kfree_skb(skb, __builtin_return_address(0)); 696 __kfree_skb(skb); 697 } 698 EXPORT_SYMBOL(kfree_skb); 699 700 void kfree_skb_list(struct sk_buff *segs) 701 { 702 while (segs) { 703 struct sk_buff *next = segs->next; 704 705 kfree_skb(segs); 706 segs = next; 707 } 708 } 709 EXPORT_SYMBOL(kfree_skb_list); 710 711 /* Dump skb information and contents. 712 * 713 * Must only be called from net_ratelimit()-ed paths. 714 * 715 * Dumps whole packets if full_pkt, only headers otherwise. 716 */ 717 void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) 718 { 719 struct skb_shared_info *sh = skb_shinfo(skb); 720 struct net_device *dev = skb->dev; 721 struct sock *sk = skb->sk; 722 struct sk_buff *list_skb; 723 bool has_mac, has_trans; 724 int headroom, tailroom; 725 int i, len, seg_len; 726 727 if (full_pkt) 728 len = skb->len; 729 else 730 len = min_t(int, skb->len, MAX_HEADER + 128); 731 732 headroom = skb_headroom(skb); 733 tailroom = skb_tailroom(skb); 734 735 has_mac = skb_mac_header_was_set(skb); 736 has_trans = skb_transport_header_was_set(skb); 737 738 printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" 739 "mac=(%d,%d) net=(%d,%d) trans=%d\n" 740 "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" 741 "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" 742 "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", 743 level, skb->len, headroom, skb_headlen(skb), tailroom, 744 has_mac ? skb->mac_header : -1, 745 has_mac ? skb_mac_header_len(skb) : -1, 746 skb->network_header, 747 has_trans ? skb_network_header_len(skb) : -1, 748 has_trans ? 
	       skb->transport_header : -1,
	       sh->tx_flags, sh->nr_frags,
	       sh->gso_size, sh->gso_type, sh->gso_segs,
	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
	       skb->csum_valid, skb->csum_level,
	       skb->hash, skb->sw_hash, skb->l4_hash,
	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);

	if (dev)
		printk("%sdev name=%s feat=0x%pNF\n",
		       level, dev->name, &dev->features);
	if (sk)
		printk("%ssk family=%hu type=%u proto=%u\n",
		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

	if (full_pkt && headroom)
		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->head, headroom, false);

	seg_len = min_t(int, skb_headlen(skb), len);
	if (seg_len)
		print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->data, seg_len, false);
	len -= seg_len;

	if (full_pkt && tailroom)
		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb_tail_pointer(skb), tailroom, false);

	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		skb_frag_foreach_page(frag, skb_frag_off(frag),
				      skb_frag_size(frag), p, p_off, p_len,
				      copied) {
			seg_len = min_t(int, p_len, len);
			vaddr = kmap_atomic(p);
			print_hex_dump(level, "skb frag: ",
				       DUMP_PREFIX_OFFSET,
				       16, 1, vaddr + p_off, seg_len, false);
			kunmap_atomic(vaddr);
			len -= seg_len;
			if (!len)
				break;
		}
	}

	if (full_pkt && skb_has_frag_list(skb)) {
		printk("skb fraglist:\n");
		skb_walk_frags(skb, list_skb)
			skb_dump(level, list_skb, true);
	}
}
EXPORT_SYMBOL(skb_dump);

/**
 * skb_tx_error - report an sk_buff xmit error
 * @skb: buffer that triggered an error
 *
 * Report xmit error if a device callback is tracking this skb.
 * skb must be freed afterwards.
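 *
 * Illustrative sketch (not part of the original file): a hypothetical
 * ndo_start_xmit() error path that cannot hand the buffer to hardware;
 * dma_dev is the made-up name of the device's struct device:
 *
 *	dma = dma_map_single(dma_dev, skb->data, skb_headlen(skb),
 *			     DMA_TO_DEVICE);
 *	if (dma_mapping_error(dma_dev, dma)) {
 *		skb_tx_error(skb);		// let zerocopy users see the loss
 *		dev_kfree_skb_any(skb);		// the skb must still be freed
 *		return NETDEV_TX_OK;
 *	}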
812 */ 813 void skb_tx_error(struct sk_buff *skb) 814 { 815 skb_zcopy_clear(skb, true); 816 } 817 EXPORT_SYMBOL(skb_tx_error); 818 819 #ifdef CONFIG_TRACEPOINTS 820 /** 821 * consume_skb - free an skbuff 822 * @skb: buffer to free 823 * 824 * Drop a ref to the buffer and free it if the usage count has hit zero 825 * Functions identically to kfree_skb, but kfree_skb assumes that the frame 826 * is being dropped after a failure and notes that 827 */ 828 void consume_skb(struct sk_buff *skb) 829 { 830 if (!skb_unref(skb)) 831 return; 832 833 trace_consume_skb(skb); 834 __kfree_skb(skb); 835 } 836 EXPORT_SYMBOL(consume_skb); 837 #endif 838 839 /** 840 * consume_stateless_skb - free an skbuff, assuming it is stateless 841 * @skb: buffer to free 842 * 843 * Alike consume_skb(), but this variant assumes that this is the last 844 * skb reference and all the head states have been already dropped 845 */ 846 void __consume_stateless_skb(struct sk_buff *skb) 847 { 848 trace_consume_skb(skb); 849 skb_release_data(skb); 850 kfree_skbmem(skb); 851 } 852 853 void __kfree_skb_flush(void) 854 { 855 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); 856 857 /* flush skb_cache if containing objects */ 858 if (nc->skb_count) { 859 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, 860 nc->skb_cache); 861 nc->skb_count = 0; 862 } 863 } 864 865 static inline void _kfree_skb_defer(struct sk_buff *skb) 866 { 867 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); 868 869 /* drop skb->head and call any destructors for packet */ 870 skb_release_all(skb); 871 872 /* record skb to CPU local list */ 873 nc->skb_cache[nc->skb_count++] = skb; 874 875 #ifdef CONFIG_SLUB 876 /* SLUB writes into objects when freeing */ 877 prefetchw(skb); 878 #endif 879 880 /* flush skb_cache if it is filled */ 881 if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { 882 kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, 883 nc->skb_cache); 884 nc->skb_count = 0; 885 } 886 } 887 void __kfree_skb_defer(struct sk_buff *skb) 888 { 889 _kfree_skb_defer(skb); 890 } 891 892 void napi_consume_skb(struct sk_buff *skb, int budget) 893 { 894 /* Zero budget indicate non-NAPI context called us, like netpoll */ 895 if (unlikely(!budget)) { 896 dev_consume_skb_any(skb); 897 return; 898 } 899 900 if (!skb_unref(skb)) 901 return; 902 903 /* if reaching here SKB is ready to free */ 904 trace_consume_skb(skb); 905 906 /* if SKB is a clone, don't handle this case */ 907 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { 908 __kfree_skb(skb); 909 return; 910 } 911 912 _kfree_skb_defer(skb); 913 } 914 EXPORT_SYMBOL(napi_consume_skb); 915 916 /* Make sure a field is enclosed inside headers_start/headers_end section */ 917 #define CHECK_SKB_FIELD(field) \ 918 BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ 919 offsetof(struct sk_buff, headers_start)); \ 920 BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ 921 offsetof(struct sk_buff, headers_end)); \ 922 923 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 924 { 925 new->tstamp = old->tstamp; 926 /* We do not copy old->sk */ 927 new->dev = old->dev; 928 memcpy(new->cb, old->cb, sizeof(old->cb)); 929 skb_dst_copy(new, old); 930 __skb_ext_copy(new, old); 931 __nf_copy(new, old, false); 932 933 /* Note : this field could be in headers_start/headers_end section 934 * It is not yet because we do not want to have a 16 bit hole 935 */ 936 new->queue_mapping = old->queue_mapping; 937 938 memcpy(&new->headers_start, &old->headers_start, 939 offsetof(struct 
sk_buff, headers_end) - 940 offsetof(struct sk_buff, headers_start)); 941 CHECK_SKB_FIELD(protocol); 942 CHECK_SKB_FIELD(csum); 943 CHECK_SKB_FIELD(hash); 944 CHECK_SKB_FIELD(priority); 945 CHECK_SKB_FIELD(skb_iif); 946 CHECK_SKB_FIELD(vlan_proto); 947 CHECK_SKB_FIELD(vlan_tci); 948 CHECK_SKB_FIELD(transport_header); 949 CHECK_SKB_FIELD(network_header); 950 CHECK_SKB_FIELD(mac_header); 951 CHECK_SKB_FIELD(inner_protocol); 952 CHECK_SKB_FIELD(inner_transport_header); 953 CHECK_SKB_FIELD(inner_network_header); 954 CHECK_SKB_FIELD(inner_mac_header); 955 CHECK_SKB_FIELD(mark); 956 #ifdef CONFIG_NETWORK_SECMARK 957 CHECK_SKB_FIELD(secmark); 958 #endif 959 #ifdef CONFIG_NET_RX_BUSY_POLL 960 CHECK_SKB_FIELD(napi_id); 961 #endif 962 #ifdef CONFIG_XPS 963 CHECK_SKB_FIELD(sender_cpu); 964 #endif 965 #ifdef CONFIG_NET_SCHED 966 CHECK_SKB_FIELD(tc_index); 967 #endif 968 969 } 970 971 /* 972 * You should not add any new code to this function. Add it to 973 * __copy_skb_header above instead. 974 */ 975 static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) 976 { 977 #define C(x) n->x = skb->x 978 979 n->next = n->prev = NULL; 980 n->sk = NULL; 981 __copy_skb_header(n, skb); 982 983 C(len); 984 C(data_len); 985 C(mac_len); 986 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; 987 n->cloned = 1; 988 n->nohdr = 0; 989 n->peeked = 0; 990 C(pfmemalloc); 991 n->destructor = NULL; 992 C(tail); 993 C(end); 994 C(head); 995 C(head_frag); 996 C(data); 997 C(truesize); 998 refcount_set(&n->users, 1); 999 1000 atomic_inc(&(skb_shinfo(skb)->dataref)); 1001 skb->cloned = 1; 1002 1003 return n; 1004 #undef C 1005 } 1006 1007 /** 1008 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg 1009 * @first: first sk_buff of the msg 1010 */ 1011 struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) 1012 { 1013 struct sk_buff *n; 1014 1015 n = alloc_skb(0, GFP_ATOMIC); 1016 if (!n) 1017 return NULL; 1018 1019 n->len = first->len; 1020 n->data_len = first->len; 1021 n->truesize = first->truesize; 1022 1023 skb_shinfo(n)->frag_list = first; 1024 1025 __copy_skb_header(n, first); 1026 n->destructor = NULL; 1027 1028 return n; 1029 } 1030 EXPORT_SYMBOL_GPL(alloc_skb_for_msg); 1031 1032 /** 1033 * skb_morph - morph one skb into another 1034 * @dst: the skb to receive the contents 1035 * @src: the skb to supply the contents 1036 * 1037 * This is identical to skb_clone except that the target skb is 1038 * supplied by the user. 1039 * 1040 * The target skb is returned upon exit. 1041 */ 1042 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) 1043 { 1044 skb_release_all(dst); 1045 return __skb_clone(dst, src); 1046 } 1047 EXPORT_SYMBOL_GPL(skb_morph); 1048 1049 int mm_account_pinned_pages(struct mmpin *mmp, size_t size) 1050 { 1051 unsigned long max_pg, num_pg, new_pg, old_pg; 1052 struct user_struct *user; 1053 1054 if (capable(CAP_IPC_LOCK) || !size) 1055 return 0; 1056 1057 num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ 1058 max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 1059 user = mmp->user ? 
: current_user(); 1060 1061 do { 1062 old_pg = atomic_long_read(&user->locked_vm); 1063 new_pg = old_pg + num_pg; 1064 if (new_pg > max_pg) 1065 return -ENOBUFS; 1066 } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != 1067 old_pg); 1068 1069 if (!mmp->user) { 1070 mmp->user = get_uid(user); 1071 mmp->num_pg = num_pg; 1072 } else { 1073 mmp->num_pg += num_pg; 1074 } 1075 1076 return 0; 1077 } 1078 EXPORT_SYMBOL_GPL(mm_account_pinned_pages); 1079 1080 void mm_unaccount_pinned_pages(struct mmpin *mmp) 1081 { 1082 if (mmp->user) { 1083 atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); 1084 free_uid(mmp->user); 1085 } 1086 } 1087 EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); 1088 1089 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) 1090 { 1091 struct ubuf_info *uarg; 1092 struct sk_buff *skb; 1093 1094 WARN_ON_ONCE(!in_task()); 1095 1096 skb = sock_omalloc(sk, 0, GFP_KERNEL); 1097 if (!skb) 1098 return NULL; 1099 1100 BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); 1101 uarg = (void *)skb->cb; 1102 uarg->mmp.user = NULL; 1103 1104 if (mm_account_pinned_pages(&uarg->mmp, size)) { 1105 kfree_skb(skb); 1106 return NULL; 1107 } 1108 1109 uarg->callback = sock_zerocopy_callback; 1110 uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; 1111 uarg->len = 1; 1112 uarg->bytelen = size; 1113 uarg->zerocopy = 1; 1114 refcount_set(&uarg->refcnt, 1); 1115 sock_hold(sk); 1116 1117 return uarg; 1118 } 1119 EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); 1120 1121 static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) 1122 { 1123 return container_of((void *)uarg, struct sk_buff, cb); 1124 } 1125 1126 struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, 1127 struct ubuf_info *uarg) 1128 { 1129 if (uarg) { 1130 const u32 byte_limit = 1 << 19; /* limit to a few TSO */ 1131 u32 bytelen, next; 1132 1133 /* realloc only when socket is locked (TCP, UDP cork), 1134 * so uarg->len and sk_zckey access is serialized 1135 */ 1136 if (!sock_owned_by_user(sk)) { 1137 WARN_ON_ONCE(1); 1138 return NULL; 1139 } 1140 1141 bytelen = uarg->bytelen + size; 1142 if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { 1143 /* TCP can create new skb to attach new uarg */ 1144 if (sk->sk_type == SOCK_STREAM) 1145 goto new_alloc; 1146 return NULL; 1147 } 1148 1149 next = (u32)atomic_read(&sk->sk_zckey); 1150 if ((u32)(uarg->id + uarg->len) == next) { 1151 if (mm_account_pinned_pages(&uarg->mmp, size)) 1152 return NULL; 1153 uarg->len++; 1154 uarg->bytelen = bytelen; 1155 atomic_set(&sk->sk_zckey, ++next); 1156 1157 /* no extra ref when appending to datagram (MSG_MORE) */ 1158 if (sk->sk_type == SOCK_STREAM) 1159 sock_zerocopy_get(uarg); 1160 1161 return uarg; 1162 } 1163 } 1164 1165 new_alloc: 1166 return sock_zerocopy_alloc(sk, size); 1167 } 1168 EXPORT_SYMBOL_GPL(sock_zerocopy_realloc); 1169 1170 static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) 1171 { 1172 struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); 1173 u32 old_lo, old_hi; 1174 u64 sum_len; 1175 1176 old_lo = serr->ee.ee_info; 1177 old_hi = serr->ee.ee_data; 1178 sum_len = old_hi - old_lo + 1ULL + len; 1179 1180 if (sum_len >= (1ULL << 32)) 1181 return false; 1182 1183 if (lo != old_hi + 1) 1184 return false; 1185 1186 serr->ee.ee_data += len; 1187 return true; 1188 } 1189 1190 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) 1191 { 1192 struct sk_buff *tail, *skb = skb_from_uarg(uarg); 1193 struct sock_exterr_skb *serr; 1194 struct sock *sk = skb->sk; 1195 struct 
sk_buff_head *q; 1196 unsigned long flags; 1197 u32 lo, hi; 1198 u16 len; 1199 1200 mm_unaccount_pinned_pages(&uarg->mmp); 1201 1202 /* if !len, there was only 1 call, and it was aborted 1203 * so do not queue a completion notification 1204 */ 1205 if (!uarg->len || sock_flag(sk, SOCK_DEAD)) 1206 goto release; 1207 1208 len = uarg->len; 1209 lo = uarg->id; 1210 hi = uarg->id + len - 1; 1211 1212 serr = SKB_EXT_ERR(skb); 1213 memset(serr, 0, sizeof(*serr)); 1214 serr->ee.ee_errno = 0; 1215 serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; 1216 serr->ee.ee_data = hi; 1217 serr->ee.ee_info = lo; 1218 if (!success) 1219 serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; 1220 1221 q = &sk->sk_error_queue; 1222 spin_lock_irqsave(&q->lock, flags); 1223 tail = skb_peek_tail(q); 1224 if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || 1225 !skb_zerocopy_notify_extend(tail, lo, len)) { 1226 __skb_queue_tail(q, skb); 1227 skb = NULL; 1228 } 1229 spin_unlock_irqrestore(&q->lock, flags); 1230 1231 sk->sk_error_report(sk); 1232 1233 release: 1234 consume_skb(skb); 1235 sock_put(sk); 1236 } 1237 EXPORT_SYMBOL_GPL(sock_zerocopy_callback); 1238 1239 void sock_zerocopy_put(struct ubuf_info *uarg) 1240 { 1241 if (uarg && refcount_dec_and_test(&uarg->refcnt)) { 1242 if (uarg->callback) 1243 uarg->callback(uarg, uarg->zerocopy); 1244 else 1245 consume_skb(skb_from_uarg(uarg)); 1246 } 1247 } 1248 EXPORT_SYMBOL_GPL(sock_zerocopy_put); 1249 1250 void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) 1251 { 1252 if (uarg) { 1253 struct sock *sk = skb_from_uarg(uarg)->sk; 1254 1255 atomic_dec(&sk->sk_zckey); 1256 uarg->len--; 1257 1258 if (have_uref) 1259 sock_zerocopy_put(uarg); 1260 } 1261 } 1262 EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); 1263 1264 int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) 1265 { 1266 return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); 1267 } 1268 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); 1269 1270 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, 1271 struct msghdr *msg, int len, 1272 struct ubuf_info *uarg) 1273 { 1274 struct ubuf_info *orig_uarg = skb_zcopy(skb); 1275 struct iov_iter orig_iter = msg->msg_iter; 1276 int err, orig_len = skb->len; 1277 1278 /* An skb can only point to one uarg. This edge case happens when 1279 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. 1280 */ 1281 if (orig_uarg && uarg != orig_uarg) 1282 return -EEXIST; 1283 1284 err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); 1285 if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { 1286 struct sock *save_sk = skb->sk; 1287 1288 /* Streams do not free skb on error. Reset to prev state. 
*/ 1289 msg->msg_iter = orig_iter; 1290 skb->sk = sk; 1291 ___pskb_trim(skb, orig_len); 1292 skb->sk = save_sk; 1293 return err; 1294 } 1295 1296 skb_zcopy_set(skb, uarg, NULL); 1297 return skb->len - orig_len; 1298 } 1299 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); 1300 1301 static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, 1302 gfp_t gfp_mask) 1303 { 1304 if (skb_zcopy(orig)) { 1305 if (skb_zcopy(nskb)) { 1306 /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ 1307 if (!gfp_mask) { 1308 WARN_ON_ONCE(1); 1309 return -ENOMEM; 1310 } 1311 if (skb_uarg(nskb) == skb_uarg(orig)) 1312 return 0; 1313 if (skb_copy_ubufs(nskb, GFP_ATOMIC)) 1314 return -EIO; 1315 } 1316 skb_zcopy_set(nskb, skb_uarg(orig), NULL); 1317 } 1318 return 0; 1319 } 1320 1321 /** 1322 * skb_copy_ubufs - copy userspace skb frags buffers to kernel 1323 * @skb: the skb to modify 1324 * @gfp_mask: allocation priority 1325 * 1326 * This must be called on SKBTX_DEV_ZEROCOPY skb. 1327 * It will copy all frags into kernel and drop the reference 1328 * to userspace pages. 1329 * 1330 * If this function is called from an interrupt gfp_mask() must be 1331 * %GFP_ATOMIC. 1332 * 1333 * Returns 0 on success or a negative error code on failure 1334 * to allocate kernel memory to copy to. 1335 */ 1336 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) 1337 { 1338 int num_frags = skb_shinfo(skb)->nr_frags; 1339 struct page *page, *head = NULL; 1340 int i, new_frags; 1341 u32 d_off; 1342 1343 if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) 1344 return -EINVAL; 1345 1346 if (!num_frags) 1347 goto release; 1348 1349 new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1350 for (i = 0; i < new_frags; i++) { 1351 page = alloc_page(gfp_mask); 1352 if (!page) { 1353 while (head) { 1354 struct page *next = (struct page *)page_private(head); 1355 put_page(head); 1356 head = next; 1357 } 1358 return -ENOMEM; 1359 } 1360 set_page_private(page, (unsigned long)head); 1361 head = page; 1362 } 1363 1364 page = head; 1365 d_off = 0; 1366 for (i = 0; i < num_frags; i++) { 1367 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 1368 u32 p_off, p_len, copied; 1369 struct page *p; 1370 u8 *vaddr; 1371 1372 skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), 1373 p, p_off, p_len, copied) { 1374 u32 copy, done = 0; 1375 vaddr = kmap_atomic(p); 1376 1377 while (done < p_len) { 1378 if (d_off == PAGE_SIZE) { 1379 d_off = 0; 1380 page = (struct page *)page_private(page); 1381 } 1382 copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); 1383 memcpy(page_address(page) + d_off, 1384 vaddr + p_off + done, copy); 1385 done += copy; 1386 d_off += copy; 1387 } 1388 kunmap_atomic(vaddr); 1389 } 1390 } 1391 1392 /* skb frags release userspace buffers */ 1393 for (i = 0; i < num_frags; i++) 1394 skb_frag_unref(skb, i); 1395 1396 /* skb frags point to kernel buffers */ 1397 for (i = 0; i < new_frags - 1; i++) { 1398 __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); 1399 head = (struct page *)page_private(head); 1400 } 1401 __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); 1402 skb_shinfo(skb)->nr_frags = new_frags; 1403 1404 release: 1405 skb_zcopy_clear(skb, false); 1406 return 0; 1407 } 1408 EXPORT_SYMBOL_GPL(skb_copy_ubufs); 1409 1410 /** 1411 * skb_clone - duplicate an sk_buff 1412 * @skb: buffer to clone 1413 * @gfp_mask: allocation priority 1414 * 1415 * Duplicate an &sk_buff. The new one is not owned by a socket. Both 1416 * copies share the same packet data but not structure. 
The new 1417 * buffer has a reference count of 1. If the allocation fails the 1418 * function returns %NULL otherwise the new buffer is returned. 1419 * 1420 * If this function is called from an interrupt gfp_mask() must be 1421 * %GFP_ATOMIC. 1422 */ 1423 1424 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) 1425 { 1426 struct sk_buff_fclones *fclones = container_of(skb, 1427 struct sk_buff_fclones, 1428 skb1); 1429 struct sk_buff *n; 1430 1431 if (skb_orphan_frags(skb, gfp_mask)) 1432 return NULL; 1433 1434 if (skb->fclone == SKB_FCLONE_ORIG && 1435 refcount_read(&fclones->fclone_ref) == 1) { 1436 n = &fclones->skb2; 1437 refcount_set(&fclones->fclone_ref, 2); 1438 } else { 1439 if (skb_pfmemalloc(skb)) 1440 gfp_mask |= __GFP_MEMALLOC; 1441 1442 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 1443 if (!n) 1444 return NULL; 1445 1446 n->fclone = SKB_FCLONE_UNAVAILABLE; 1447 } 1448 1449 return __skb_clone(n, skb); 1450 } 1451 EXPORT_SYMBOL(skb_clone); 1452 1453 void skb_headers_offset_update(struct sk_buff *skb, int off) 1454 { 1455 /* Only adjust this if it actually is csum_start rather than csum */ 1456 if (skb->ip_summed == CHECKSUM_PARTIAL) 1457 skb->csum_start += off; 1458 /* {transport,network,mac}_header and tail are relative to skb->head */ 1459 skb->transport_header += off; 1460 skb->network_header += off; 1461 if (skb_mac_header_was_set(skb)) 1462 skb->mac_header += off; 1463 skb->inner_transport_header += off; 1464 skb->inner_network_header += off; 1465 skb->inner_mac_header += off; 1466 } 1467 EXPORT_SYMBOL(skb_headers_offset_update); 1468 1469 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) 1470 { 1471 __copy_skb_header(new, old); 1472 1473 skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; 1474 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; 1475 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 1476 } 1477 EXPORT_SYMBOL(skb_copy_header); 1478 1479 static inline int skb_alloc_rx_flag(const struct sk_buff *skb) 1480 { 1481 if (skb_pfmemalloc(skb)) 1482 return SKB_ALLOC_RX; 1483 return 0; 1484 } 1485 1486 /** 1487 * skb_copy - create private copy of an sk_buff 1488 * @skb: buffer to copy 1489 * @gfp_mask: allocation priority 1490 * 1491 * Make a copy of both an &sk_buff and its data. This is used when the 1492 * caller wishes to modify the data and needs a private copy of the 1493 * data to alter. Returns %NULL on failure or the pointer to the buffer 1494 * on success. The returned buffer has a reference count of 1. 1495 * 1496 * As by-product this function converts non-linear &sk_buff to linear 1497 * one, so that &sk_buff becomes completely private and caller is allowed 1498 * to modify all the data of returned buffer. This means that this 1499 * function is not recommended for use in circumstances when only 1500 * header is going to be modified. Use pskb_copy() instead. 
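 *
 * Illustrative sketch (not part of the original file): rewriting payload
 * bytes needs this full private copy, while header-only edits can use the
 * cheaper pskb_copy():
 *
 *	struct sk_buff *copy = skb_copy(skb, GFP_ATOMIC);  // private head + data
 *
 *	if (!copy)
 *		return -ENOMEM;
 *	// every byte reachable through copy->data may now be rewritten;
 *	// for header-only changes, pskb_copy(skb, GFP_ATOMIC) would do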
1501 */ 1502 1503 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 1504 { 1505 int headerlen = skb_headroom(skb); 1506 unsigned int size = skb_end_offset(skb) + skb->data_len; 1507 struct sk_buff *n = __alloc_skb(size, gfp_mask, 1508 skb_alloc_rx_flag(skb), NUMA_NO_NODE); 1509 1510 if (!n) 1511 return NULL; 1512 1513 /* Set the data pointer */ 1514 skb_reserve(n, headerlen); 1515 /* Set the tail pointer and length */ 1516 skb_put(n, skb->len); 1517 1518 BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); 1519 1520 skb_copy_header(n, skb); 1521 return n; 1522 } 1523 EXPORT_SYMBOL(skb_copy); 1524 1525 /** 1526 * __pskb_copy_fclone - create copy of an sk_buff with private head. 1527 * @skb: buffer to copy 1528 * @headroom: headroom of new skb 1529 * @gfp_mask: allocation priority 1530 * @fclone: if true allocate the copy of the skb from the fclone 1531 * cache instead of the head cache; it is recommended to set this 1532 * to true for the cases where the copy will likely be cloned 1533 * 1534 * Make a copy of both an &sk_buff and part of its data, located 1535 * in header. Fragmented data remain shared. This is used when 1536 * the caller wishes to modify only header of &sk_buff and needs 1537 * private copy of the header to alter. Returns %NULL on failure 1538 * or the pointer to the buffer on success. 1539 * The returned buffer has a reference count of 1. 1540 */ 1541 1542 struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, 1543 gfp_t gfp_mask, bool fclone) 1544 { 1545 unsigned int size = skb_headlen(skb) + headroom; 1546 int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); 1547 struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); 1548 1549 if (!n) 1550 goto out; 1551 1552 /* Set the data pointer */ 1553 skb_reserve(n, headroom); 1554 /* Set the tail pointer and length */ 1555 skb_put(n, skb_headlen(skb)); 1556 /* Copy the bytes */ 1557 skb_copy_from_linear_data(skb, n->data, n->len); 1558 1559 n->truesize += skb->data_len; 1560 n->data_len = skb->data_len; 1561 n->len = skb->len; 1562 1563 if (skb_shinfo(skb)->nr_frags) { 1564 int i; 1565 1566 if (skb_orphan_frags(skb, gfp_mask) || 1567 skb_zerocopy_clone(n, skb, gfp_mask)) { 1568 kfree_skb(n); 1569 n = NULL; 1570 goto out; 1571 } 1572 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1573 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; 1574 skb_frag_ref(skb, i); 1575 } 1576 skb_shinfo(n)->nr_frags = i; 1577 } 1578 1579 if (skb_has_frag_list(skb)) { 1580 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 1581 skb_clone_fraglist(n); 1582 } 1583 1584 skb_copy_header(n, skb); 1585 out: 1586 return n; 1587 } 1588 EXPORT_SYMBOL(__pskb_copy_fclone); 1589 1590 /** 1591 * pskb_expand_head - reallocate header of &sk_buff 1592 * @skb: buffer to reallocate 1593 * @nhead: room to add at head 1594 * @ntail: room to add at tail 1595 * @gfp_mask: allocation priority 1596 * 1597 * Expands (or creates identical copy, if @nhead and @ntail are zero) 1598 * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have 1599 * reference count of 1. Returns zero in the case of success or error, 1600 * if expansion failed. In the last case, &sk_buff is not changed. 1601 * 1602 * All the pointers pointing into skb header may change and must be 1603 * reloaded after call to this function. 
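 *
 * Illustrative sketch (not part of the original file): growing headroom
 * before pushing an extra encapsulation header; TUNNEL_HLEN is a
 * hypothetical size and in-tree code usually goes through helpers such
 * as skb_cow_head() instead of open-coding this:
 *
 *	if (skb_headroom(skb) < TUNNEL_HLEN &&
 *	    pskb_expand_head(skb, TUNNEL_HLEN - skb_headroom(skb), 0,
 *			     GFP_ATOMIC))
 *		goto drop;			// expansion failed, skb unchanged
 *	hdr = skb_push(skb, TUNNEL_HLEN);	// re-derive pointers after the call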
1604 */ 1605 1606 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 1607 gfp_t gfp_mask) 1608 { 1609 int i, osize = skb_end_offset(skb); 1610 int size = osize + nhead + ntail; 1611 long off; 1612 u8 *data; 1613 1614 BUG_ON(nhead < 0); 1615 1616 BUG_ON(skb_shared(skb)); 1617 1618 size = SKB_DATA_ALIGN(size); 1619 1620 if (skb_pfmemalloc(skb)) 1621 gfp_mask |= __GFP_MEMALLOC; 1622 data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 1623 gfp_mask, NUMA_NO_NODE, NULL); 1624 if (!data) 1625 goto nodata; 1626 size = SKB_WITH_OVERHEAD(ksize(data)); 1627 1628 /* Copy only real data... and, alas, header. This should be 1629 * optimized for the cases when header is void. 1630 */ 1631 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); 1632 1633 memcpy((struct skb_shared_info *)(data + size), 1634 skb_shinfo(skb), 1635 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); 1636 1637 /* 1638 * if shinfo is shared we must drop the old head gracefully, but if it 1639 * is not we can just drop the old head and let the existing refcount 1640 * be since all we did is relocate the values 1641 */ 1642 if (skb_cloned(skb)) { 1643 if (skb_orphan_frags(skb, gfp_mask)) 1644 goto nofrags; 1645 if (skb_zcopy(skb)) 1646 refcount_inc(&skb_uarg(skb)->refcnt); 1647 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 1648 skb_frag_ref(skb, i); 1649 1650 if (skb_has_frag_list(skb)) 1651 skb_clone_fraglist(skb); 1652 1653 skb_release_data(skb); 1654 } else { 1655 skb_free_head(skb); 1656 } 1657 off = (data + nhead) - skb->head; 1658 1659 skb->head = data; 1660 skb->head_frag = 0; 1661 skb->data += off; 1662 #ifdef NET_SKBUFF_DATA_USES_OFFSET 1663 skb->end = size; 1664 off = nhead; 1665 #else 1666 skb->end = skb->head + size; 1667 #endif 1668 skb->tail += off; 1669 skb_headers_offset_update(skb, nhead); 1670 skb->cloned = 0; 1671 skb->hdr_len = 0; 1672 skb->nohdr = 0; 1673 atomic_set(&skb_shinfo(skb)->dataref, 1); 1674 1675 skb_metadata_clear(skb); 1676 1677 /* It is not generally safe to change skb->truesize. 1678 * For the moment, we really care of rx path, or 1679 * when skb is orphaned (not attached to a socket). 1680 */ 1681 if (!skb->sk || skb->destructor == sock_edemux) 1682 skb->truesize += size - osize; 1683 1684 return 0; 1685 1686 nofrags: 1687 kfree(data); 1688 nodata: 1689 return -ENOMEM; 1690 } 1691 EXPORT_SYMBOL(pskb_expand_head); 1692 1693 /* Make private copy of skb with writable head and some headroom */ 1694 1695 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) 1696 { 1697 struct sk_buff *skb2; 1698 int delta = headroom - skb_headroom(skb); 1699 1700 if (delta <= 0) 1701 skb2 = pskb_copy(skb, GFP_ATOMIC); 1702 else { 1703 skb2 = skb_clone(skb, GFP_ATOMIC); 1704 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, 1705 GFP_ATOMIC)) { 1706 kfree_skb(skb2); 1707 skb2 = NULL; 1708 } 1709 } 1710 return skb2; 1711 } 1712 EXPORT_SYMBOL(skb_realloc_headroom); 1713 1714 /** 1715 * skb_copy_expand - copy and expand sk_buff 1716 * @skb: buffer to copy 1717 * @newheadroom: new free bytes at head 1718 * @newtailroom: new free bytes at tail 1719 * @gfp_mask: allocation priority 1720 * 1721 * Make a copy of both an &sk_buff and its data and while doing so 1722 * allocate additional space. 1723 * 1724 * This is used when the caller wishes to modify the data and needs a 1725 * private copy of the data to alter as well as more space for new fields. 
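 *
 * Illustrative sketch (not part of the original file): building an
 * expanded private copy with room for a new header and trailer;
 * NEW_HLEN and NEW_TLEN are hypothetical sizes:
 *
 *	struct sk_buff *nskb;
 *
 *	nskb = skb_copy_expand(skb, skb_headroom(skb) + NEW_HLEN,
 *			       NEW_TLEN, GFP_ATOMIC);
 *	if (!nskb)
 *		return -ENOMEM;
 *	skb_push(nskb, NEW_HLEN);		// now guaranteed to fit
 *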
1726 * Returns %NULL on failure or the pointer to the buffer 1727 * on success. The returned buffer has a reference count of 1. 1728 * 1729 * You must pass %GFP_ATOMIC as the allocation priority if this function 1730 * is called from an interrupt. 1731 */ 1732 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 1733 int newheadroom, int newtailroom, 1734 gfp_t gfp_mask) 1735 { 1736 /* 1737 * Allocate the copy buffer 1738 */ 1739 struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, 1740 gfp_mask, skb_alloc_rx_flag(skb), 1741 NUMA_NO_NODE); 1742 int oldheadroom = skb_headroom(skb); 1743 int head_copy_len, head_copy_off; 1744 1745 if (!n) 1746 return NULL; 1747 1748 skb_reserve(n, newheadroom); 1749 1750 /* Set the tail pointer and length */ 1751 skb_put(n, skb->len); 1752 1753 head_copy_len = oldheadroom; 1754 head_copy_off = 0; 1755 if (newheadroom <= head_copy_len) 1756 head_copy_len = newheadroom; 1757 else 1758 head_copy_off = newheadroom - head_copy_len; 1759 1760 /* Copy the linear header and data. */ 1761 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 1762 skb->len + head_copy_len)); 1763 1764 skb_copy_header(n, skb); 1765 1766 skb_headers_offset_update(n, newheadroom - oldheadroom); 1767 1768 return n; 1769 } 1770 EXPORT_SYMBOL(skb_copy_expand); 1771 1772 /** 1773 * __skb_pad - zero pad the tail of an skb 1774 * @skb: buffer to pad 1775 * @pad: space to pad 1776 * @free_on_error: free buffer on error 1777 * 1778 * Ensure that a buffer is followed by a padding area that is zero 1779 * filled. Used by network drivers which may DMA or transfer data 1780 * beyond the buffer end onto the wire. 1781 * 1782 * May return error in out of memory cases. The skb is freed on error 1783 * if @free_on_error is true. 1784 */ 1785 1786 int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) 1787 { 1788 int err; 1789 int ntail; 1790 1791 /* If the skbuff is non linear tailroom is always zero.. */ 1792 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { 1793 memset(skb->data+skb->len, 0, pad); 1794 return 0; 1795 } 1796 1797 ntail = skb->data_len + pad - (skb->end - skb->tail); 1798 if (likely(skb_cloned(skb) || ntail > 0)) { 1799 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); 1800 if (unlikely(err)) 1801 goto free_skb; 1802 } 1803 1804 /* FIXME: The use of this function with non-linear skb's really needs 1805 * to be audited. 1806 */ 1807 err = skb_linearize(skb); 1808 if (unlikely(err)) 1809 goto free_skb; 1810 1811 memset(skb->data + skb->len, 0, pad); 1812 return 0; 1813 1814 free_skb: 1815 if (free_on_error) 1816 kfree_skb(skb); 1817 return err; 1818 } 1819 EXPORT_SYMBOL(__skb_pad); 1820 1821 /** 1822 * pskb_put - add data to the tail of a potentially fragmented buffer 1823 * @skb: start of the buffer to use 1824 * @tail: tail fragment of the buffer to use 1825 * @len: amount of data to add 1826 * 1827 * This function extends the used data area of the potentially 1828 * fragmented buffer. @tail must be the last fragment of @skb -- or 1829 * @skb itself. If this would exceed the total buffer size the kernel 1830 * will panic. A pointer to the first byte of the extra data is 1831 * returned. 
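 *
 * Illustrative sketch (not part of the original file): after something
 * like skb_cow_data() has returned the writable tail fragment in
 * "trailer", padding can be appended without touching the other
 * fragments (padlen is hypothetical, error handling elided):
 *
 *	pad = pskb_put(skb, trailer, padlen);	// skb->len grows by padlen
 *	memset(pad, 0, padlen);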
 */
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
	if (tail != skb) {
		skb->data_len += len;
		skb->len += len;
	}
	return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 * skb_put - add data to a buffer
 * @skb: buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the buffer. If this would
 * exceed the total buffer size the kernel will panic. A pointer to the
 * first byte of the extra data is returned.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
	void *tmp = skb_tail_pointer(skb);
	SKB_LINEAR_ASSERT(skb);
	skb->tail += len;
	skb->len += len;
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));
	return tmp;
}
EXPORT_SYMBOL(skb_put);

/**
 * skb_push - add data to the start of a buffer
 * @skb: buffer to use
 * @len: amount of data to add
 *
 * This function extends the used data area of the buffer at the buffer
 * start. If this would exceed the total buffer headroom the kernel will
 * panic. A pointer to the first byte of the extra data is returned.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->data -= len;
	skb->len += len;
	if (unlikely(skb->data < skb->head))
		skb_under_panic(skb, len, __builtin_return_address(0));
	return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 * skb_pull - remove data from the start of a buffer
 * @skb: buffer to use
 * @len: amount of data to remove
 *
 * This function removes data from the start of a buffer, returning
 * the memory to the headroom. A pointer to the next data in the buffer
 * is returned. Once the data has been pulled future pushes will overwrite
 * the old data.
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
	return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 * skb_trim - remove end from a buffer
 * @skb: buffer to alter
 * @len: new length
 *
 * Cut the length of a buffer down by removing data from the tail. If
 * the buffer is already under the length specified it is not modified.
 * The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	if (skb->len > len)
		__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
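 *
 * Illustrative sketch (not part of the original file): callers normally
 * go through pskb_trim(), e.g. to drop a trailing Ethernet FCS:
 *
 *	if (pskb_trim(skb, skb->len - ETH_FCS_LEN))	// may reallocate the head
 *		goto drop;
 *	// any cached header pointers must be re-derived after this point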
1917 */ 1918 1919 int ___pskb_trim(struct sk_buff *skb, unsigned int len) 1920 { 1921 struct sk_buff **fragp; 1922 struct sk_buff *frag; 1923 int offset = skb_headlen(skb); 1924 int nfrags = skb_shinfo(skb)->nr_frags; 1925 int i; 1926 int err; 1927 1928 if (skb_cloned(skb) && 1929 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) 1930 return err; 1931 1932 i = 0; 1933 if (offset >= len) 1934 goto drop_pages; 1935 1936 for (; i < nfrags; i++) { 1937 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); 1938 1939 if (end < len) { 1940 offset = end; 1941 continue; 1942 } 1943 1944 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); 1945 1946 drop_pages: 1947 skb_shinfo(skb)->nr_frags = i; 1948 1949 for (; i < nfrags; i++) 1950 skb_frag_unref(skb, i); 1951 1952 if (skb_has_frag_list(skb)) 1953 skb_drop_fraglist(skb); 1954 goto done; 1955 } 1956 1957 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); 1958 fragp = &frag->next) { 1959 int end = offset + frag->len; 1960 1961 if (skb_shared(frag)) { 1962 struct sk_buff *nfrag; 1963 1964 nfrag = skb_clone(frag, GFP_ATOMIC); 1965 if (unlikely(!nfrag)) 1966 return -ENOMEM; 1967 1968 nfrag->next = frag->next; 1969 consume_skb(frag); 1970 frag = nfrag; 1971 *fragp = frag; 1972 } 1973 1974 if (end < len) { 1975 offset = end; 1976 continue; 1977 } 1978 1979 if (end > len && 1980 unlikely((err = pskb_trim(frag, len - offset)))) 1981 return err; 1982 1983 if (frag->next) 1984 skb_drop_list(&frag->next); 1985 break; 1986 } 1987 1988 done: 1989 if (len > skb_headlen(skb)) { 1990 skb->data_len -= skb->len - len; 1991 skb->len = len; 1992 } else { 1993 skb->len = len; 1994 skb->data_len = 0; 1995 skb_set_tail_pointer(skb, len); 1996 } 1997 1998 if (!skb->sk || skb->destructor == sock_edemux) 1999 skb_condense(skb); 2000 return 0; 2001 } 2002 EXPORT_SYMBOL(___pskb_trim); 2003 2004 /* Note : use pskb_trim_rcsum() instead of calling this directly 2005 */ 2006 int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) 2007 { 2008 if (skb->ip_summed == CHECKSUM_COMPLETE) { 2009 int delta = skb->len - len; 2010 2011 skb->csum = csum_block_sub(skb->csum, 2012 skb_checksum(skb, len, delta, 0), 2013 len); 2014 } 2015 return __pskb_trim(skb, len); 2016 } 2017 EXPORT_SYMBOL(pskb_trim_rcsum_slow); 2018 2019 /** 2020 * __pskb_pull_tail - advance tail of skb header 2021 * @skb: buffer to reallocate 2022 * @delta: number of bytes to advance tail 2023 * 2024 * The function makes a sense only on a fragmented &sk_buff, 2025 * it expands header moving its tail forward and copying necessary 2026 * data from fragmented part. 2027 * 2028 * &sk_buff MUST have reference count of 1. 2029 * 2030 * Returns %NULL (and &sk_buff does not change) if pull failed 2031 * or value of new tail of skb in the case of success. 2032 * 2033 * All the pointers pointing into skb header may change and must be 2034 * reloaded after call to this function. 2035 */ 2036 2037 /* Moves tail of skb head forward, copying data from fragmented part, 2038 * when it is necessary. 2039 * 1. It may fail due to malloc failure. 2040 * 2. It may change skb pointers. 2041 * 2042 * It is pretty complicated. Luckily, it is called only in exceptional cases. 2043 */ 2044 void *__pskb_pull_tail(struct sk_buff *skb, int delta) 2045 { 2046 /* If skb has not enough free space at tail, get new one 2047 * plus 128 bytes for future expansions. If we have enough 2048 * room at tail, reallocate without expansion only if skb is cloned. 
2049 */ 2050 int i, k, eat = (skb->tail + delta) - skb->end; 2051 2052 if (eat > 0 || skb_cloned(skb)) { 2053 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 2054 GFP_ATOMIC)) 2055 return NULL; 2056 } 2057 2058 BUG_ON(skb_copy_bits(skb, skb_headlen(skb), 2059 skb_tail_pointer(skb), delta)); 2060 2061 /* Optimization: no fragments, no reasons to preestimate 2062 * size of pulled pages. Superb. 2063 */ 2064 if (!skb_has_frag_list(skb)) 2065 goto pull_pages; 2066 2067 /* Estimate size of pulled pages. */ 2068 eat = delta; 2069 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2070 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2071 2072 if (size >= eat) 2073 goto pull_pages; 2074 eat -= size; 2075 } 2076 2077 /* If we need update frag list, we are in troubles. 2078 * Certainly, it is possible to add an offset to skb data, 2079 * but taking into account that pulling is expected to 2080 * be very rare operation, it is worth to fight against 2081 * further bloating skb head and crucify ourselves here instead. 2082 * Pure masohism, indeed. 8)8) 2083 */ 2084 if (eat) { 2085 struct sk_buff *list = skb_shinfo(skb)->frag_list; 2086 struct sk_buff *clone = NULL; 2087 struct sk_buff *insp = NULL; 2088 2089 do { 2090 if (list->len <= eat) { 2091 /* Eaten as whole. */ 2092 eat -= list->len; 2093 list = list->next; 2094 insp = list; 2095 } else { 2096 /* Eaten partially. */ 2097 2098 if (skb_shared(list)) { 2099 /* Sucks! We need to fork list. :-( */ 2100 clone = skb_clone(list, GFP_ATOMIC); 2101 if (!clone) 2102 return NULL; 2103 insp = list->next; 2104 list = clone; 2105 } else { 2106 /* This may be pulled without 2107 * problems. */ 2108 insp = list; 2109 } 2110 if (!pskb_pull(list, eat)) { 2111 kfree_skb(clone); 2112 return NULL; 2113 } 2114 break; 2115 } 2116 } while (eat); 2117 2118 /* Free pulled out fragments. */ 2119 while ((list = skb_shinfo(skb)->frag_list) != insp) { 2120 skb_shinfo(skb)->frag_list = list->next; 2121 kfree_skb(list); 2122 } 2123 /* And insert new clone at head. */ 2124 if (clone) { 2125 clone->next = list; 2126 skb_shinfo(skb)->frag_list = clone; 2127 } 2128 } 2129 /* Success! Now we may commit changes to skb data. */ 2130 2131 pull_pages: 2132 eat = delta; 2133 k = 0; 2134 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2135 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2136 2137 if (size <= eat) { 2138 skb_frag_unref(skb, i); 2139 eat -= size; 2140 } else { 2141 skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; 2142 2143 *frag = skb_shinfo(skb)->frags[i]; 2144 if (eat) { 2145 skb_frag_off_add(frag, eat); 2146 skb_frag_size_sub(frag, eat); 2147 if (!i) 2148 goto end; 2149 eat = 0; 2150 } 2151 k++; 2152 } 2153 } 2154 skb_shinfo(skb)->nr_frags = k; 2155 2156 end: 2157 skb->tail += delta; 2158 skb->data_len -= delta; 2159 2160 if (!skb->data_len) 2161 skb_zcopy_clear(skb, false); 2162 2163 return skb_tail_pointer(skb); 2164 } 2165 EXPORT_SYMBOL(__pskb_pull_tail); 2166 2167 /** 2168 * skb_copy_bits - copy bits from skb to kernel buffer 2169 * @skb: source skb 2170 * @offset: offset in source 2171 * @to: destination buffer 2172 * @len: number of bytes to copy 2173 * 2174 * Copy the specified number of bytes from the source skb to the 2175 * destination buffer. 2176 * 2177 * CAUTION ! : 2178 * If its prototype is ever changed, 2179 * check arch/{*}/net/{*}.S files, 2180 * since it is called from BPF assembly code. 
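 *
 * Hedged caller-side example ("off" is a caller-chosen offset): copy a
 * small, possibly non-linear region into a local buffer:
 *
 *	u8 buf[16];
 *
 *	if (skb_copy_bits(skb, off, buf, sizeof(buf)) < 0)
 *		goto drop;		// the requested region exceeds skb->len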
2181 */ 2182 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 2183 { 2184 int start = skb_headlen(skb); 2185 struct sk_buff *frag_iter; 2186 int i, copy; 2187 2188 if (offset > (int)skb->len - len) 2189 goto fault; 2190 2191 /* Copy header. */ 2192 if ((copy = start - offset) > 0) { 2193 if (copy > len) 2194 copy = len; 2195 skb_copy_from_linear_data_offset(skb, offset, to, copy); 2196 if ((len -= copy) == 0) 2197 return 0; 2198 offset += copy; 2199 to += copy; 2200 } 2201 2202 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2203 int end; 2204 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 2205 2206 WARN_ON(start > offset + len); 2207 2208 end = start + skb_frag_size(f); 2209 if ((copy = end - offset) > 0) { 2210 u32 p_off, p_len, copied; 2211 struct page *p; 2212 u8 *vaddr; 2213 2214 if (copy > len) 2215 copy = len; 2216 2217 skb_frag_foreach_page(f, 2218 skb_frag_off(f) + offset - start, 2219 copy, p, p_off, p_len, copied) { 2220 vaddr = kmap_atomic(p); 2221 memcpy(to + copied, vaddr + p_off, p_len); 2222 kunmap_atomic(vaddr); 2223 } 2224 2225 if ((len -= copy) == 0) 2226 return 0; 2227 offset += copy; 2228 to += copy; 2229 } 2230 start = end; 2231 } 2232 2233 skb_walk_frags(skb, frag_iter) { 2234 int end; 2235 2236 WARN_ON(start > offset + len); 2237 2238 end = start + frag_iter->len; 2239 if ((copy = end - offset) > 0) { 2240 if (copy > len) 2241 copy = len; 2242 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 2243 goto fault; 2244 if ((len -= copy) == 0) 2245 return 0; 2246 offset += copy; 2247 to += copy; 2248 } 2249 start = end; 2250 } 2251 2252 if (!len) 2253 return 0; 2254 2255 fault: 2256 return -EFAULT; 2257 } 2258 EXPORT_SYMBOL(skb_copy_bits); 2259 2260 /* 2261 * Callback from splice_to_pipe(), if we need to release some pages 2262 * at the end of the spd in case we error'ed out in filling the pipe. 2263 */ 2264 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 2265 { 2266 put_page(spd->pages[i]); 2267 } 2268 2269 static struct page *linear_to_page(struct page *page, unsigned int *len, 2270 unsigned int *offset, 2271 struct sock *sk) 2272 { 2273 struct page_frag *pfrag = sk_page_frag(sk); 2274 2275 if (!sk_page_frag_refill(sk, pfrag)) 2276 return NULL; 2277 2278 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); 2279 2280 memcpy(page_address(pfrag->page) + pfrag->offset, 2281 page_address(page) + *offset, *len); 2282 *offset = pfrag->offset; 2283 pfrag->offset += *len; 2284 2285 return pfrag->page; 2286 } 2287 2288 static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 2289 struct page *page, 2290 unsigned int offset) 2291 { 2292 return spd->nr_pages && 2293 spd->pages[spd->nr_pages - 1] == page && 2294 (spd->partial[spd->nr_pages - 1].offset + 2295 spd->partial[spd->nr_pages - 1].len == offset); 2296 } 2297 2298 /* 2299 * Fill page/offset/length into spd, if it can hold more pages. 
2300 */ 2301 static bool spd_fill_page(struct splice_pipe_desc *spd, 2302 struct pipe_inode_info *pipe, struct page *page, 2303 unsigned int *len, unsigned int offset, 2304 bool linear, 2305 struct sock *sk) 2306 { 2307 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 2308 return true; 2309 2310 if (linear) { 2311 page = linear_to_page(page, len, &offset, sk); 2312 if (!page) 2313 return true; 2314 } 2315 if (spd_can_coalesce(spd, page, offset)) { 2316 spd->partial[spd->nr_pages - 1].len += *len; 2317 return false; 2318 } 2319 get_page(page); 2320 spd->pages[spd->nr_pages] = page; 2321 spd->partial[spd->nr_pages].len = *len; 2322 spd->partial[spd->nr_pages].offset = offset; 2323 spd->nr_pages++; 2324 2325 return false; 2326 } 2327 2328 static bool __splice_segment(struct page *page, unsigned int poff, 2329 unsigned int plen, unsigned int *off, 2330 unsigned int *len, 2331 struct splice_pipe_desc *spd, bool linear, 2332 struct sock *sk, 2333 struct pipe_inode_info *pipe) 2334 { 2335 if (!*len) 2336 return true; 2337 2338 /* skip this segment if already processed */ 2339 if (*off >= plen) { 2340 *off -= plen; 2341 return false; 2342 } 2343 2344 /* ignore any bits we already processed */ 2345 poff += *off; 2346 plen -= *off; 2347 *off = 0; 2348 2349 do { 2350 unsigned int flen = min(*len, plen); 2351 2352 if (spd_fill_page(spd, pipe, page, &flen, poff, 2353 linear, sk)) 2354 return true; 2355 poff += flen; 2356 plen -= flen; 2357 *len -= flen; 2358 } while (*len && plen); 2359 2360 return false; 2361 } 2362 2363 /* 2364 * Map linear and fragment data from the skb to spd. It reports true if the 2365 * pipe is full or if we already spliced the requested length. 2366 */ 2367 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 2368 unsigned int *offset, unsigned int *len, 2369 struct splice_pipe_desc *spd, struct sock *sk) 2370 { 2371 int seg; 2372 struct sk_buff *iter; 2373 2374 /* map the linear part : 2375 * If skb->head_frag is set, this 'linear' part is backed by a 2376 * fragment, and if the head is not shared with any clones then 2377 * we can avoid a copy since we own the head portion of this page. 2378 */ 2379 if (__splice_segment(virt_to_page(skb->data), 2380 (unsigned long) skb->data & (PAGE_SIZE - 1), 2381 skb_headlen(skb), 2382 offset, len, spd, 2383 skb_head_is_locked(skb), 2384 sk, pipe)) 2385 return true; 2386 2387 /* 2388 * then map the fragments 2389 */ 2390 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 2391 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 2392 2393 if (__splice_segment(skb_frag_page(f), 2394 skb_frag_off(f), skb_frag_size(f), 2395 offset, len, spd, false, sk, pipe)) 2396 return true; 2397 } 2398 2399 skb_walk_frags(skb, iter) { 2400 if (*offset >= iter->len) { 2401 *offset -= iter->len; 2402 continue; 2403 } 2404 /* __skb_splice_bits() only fails if the output has no room 2405 * left, so no point in going over the frag_list for the error 2406 * case. 2407 */ 2408 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) 2409 return true; 2410 } 2411 2412 return false; 2413 } 2414 2415 /* 2416 * Map data from the skb to a pipe. Should handle both the linear part, 2417 * the fragments, and the frag list. 
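 *
 * Hedged caller-side sketch: a protocol's ->splice_read() handler typically
 * hands each matching skb to this helper and accounts the bytes it moved
 * ("want" and "spliced" are hypothetical counters):
 *
 *	copied = skb_splice_bits(skb, sk, offset, pipe, want, flags);
 *	if (copied > 0) {
 *		spliced += copied;
 *		want -= copied;
 *	}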
2418 */ 2419 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 2420 struct pipe_inode_info *pipe, unsigned int tlen, 2421 unsigned int flags) 2422 { 2423 struct partial_page partial[MAX_SKB_FRAGS]; 2424 struct page *pages[MAX_SKB_FRAGS]; 2425 struct splice_pipe_desc spd = { 2426 .pages = pages, 2427 .partial = partial, 2428 .nr_pages_max = MAX_SKB_FRAGS, 2429 .ops = &nosteal_pipe_buf_ops, 2430 .spd_release = sock_spd_release, 2431 }; 2432 int ret = 0; 2433 2434 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 2435 2436 if (spd.nr_pages) 2437 ret = splice_to_pipe(pipe, &spd); 2438 2439 return ret; 2440 } 2441 EXPORT_SYMBOL_GPL(skb_splice_bits); 2442 2443 /* Send skb data on a socket. Socket must be locked. */ 2444 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 2445 int len) 2446 { 2447 unsigned int orig_len = len; 2448 struct sk_buff *head = skb; 2449 unsigned short fragidx; 2450 int slen, ret; 2451 2452 do_frag_list: 2453 2454 /* Deal with head data */ 2455 while (offset < skb_headlen(skb) && len) { 2456 struct kvec kv; 2457 struct msghdr msg; 2458 2459 slen = min_t(int, len, skb_headlen(skb) - offset); 2460 kv.iov_base = skb->data + offset; 2461 kv.iov_len = slen; 2462 memset(&msg, 0, sizeof(msg)); 2463 msg.msg_flags = MSG_DONTWAIT; 2464 2465 ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); 2466 if (ret <= 0) 2467 goto error; 2468 2469 offset += ret; 2470 len -= ret; 2471 } 2472 2473 /* All the data was skb head? */ 2474 if (!len) 2475 goto out; 2476 2477 /* Make offset relative to start of frags */ 2478 offset -= skb_headlen(skb); 2479 2480 /* Find where we are in frag list */ 2481 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2482 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2483 2484 if (offset < skb_frag_size(frag)) 2485 break; 2486 2487 offset -= skb_frag_size(frag); 2488 } 2489 2490 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2491 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2492 2493 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 2494 2495 while (slen) { 2496 ret = kernel_sendpage_locked(sk, skb_frag_page(frag), 2497 skb_frag_off(frag) + offset, 2498 slen, MSG_DONTWAIT); 2499 if (ret <= 0) 2500 goto error; 2501 2502 len -= ret; 2503 offset += ret; 2504 slen -= ret; 2505 } 2506 2507 offset = 0; 2508 } 2509 2510 if (len) { 2511 /* Process any frag lists */ 2512 2513 if (skb == head) { 2514 if (skb_has_frag_list(skb)) { 2515 skb = skb_shinfo(skb)->frag_list; 2516 goto do_frag_list; 2517 } 2518 } else if (skb->next) { 2519 skb = skb->next; 2520 goto do_frag_list; 2521 } 2522 } 2523 2524 out: 2525 return orig_len - len; 2526 2527 error: 2528 return orig_len == len ? ret : orig_len - len; 2529 } 2530 EXPORT_SYMBOL_GPL(skb_send_sock_locked); 2531 2532 /** 2533 * skb_store_bits - store bits from kernel buffer to skb 2534 * @skb: destination buffer 2535 * @offset: offset in destination 2536 * @from: source buffer 2537 * @len: number of bytes to copy 2538 * 2539 * Copy the specified number of bytes from the source buffer to the 2540 * destination skb. This function handles all the messy bits of 2541 * traversing fragment lists and such. 
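 *
 * Hedged example ("off" and "patch" are caller-chosen; the skb must already
 * be long enough and writable, e.g. after skb_ensure_writable()):
 *
 *	if (skb_store_bits(skb, off, patch, patch_len))
 *		goto drop;		// region does not fit inside the skb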
2542 */ 2543 2544 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 2545 { 2546 int start = skb_headlen(skb); 2547 struct sk_buff *frag_iter; 2548 int i, copy; 2549 2550 if (offset > (int)skb->len - len) 2551 goto fault; 2552 2553 if ((copy = start - offset) > 0) { 2554 if (copy > len) 2555 copy = len; 2556 skb_copy_to_linear_data_offset(skb, offset, from, copy); 2557 if ((len -= copy) == 0) 2558 return 0; 2559 offset += copy; 2560 from += copy; 2561 } 2562 2563 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2564 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2565 int end; 2566 2567 WARN_ON(start > offset + len); 2568 2569 end = start + skb_frag_size(frag); 2570 if ((copy = end - offset) > 0) { 2571 u32 p_off, p_len, copied; 2572 struct page *p; 2573 u8 *vaddr; 2574 2575 if (copy > len) 2576 copy = len; 2577 2578 skb_frag_foreach_page(frag, 2579 skb_frag_off(frag) + offset - start, 2580 copy, p, p_off, p_len, copied) { 2581 vaddr = kmap_atomic(p); 2582 memcpy(vaddr + p_off, from + copied, p_len); 2583 kunmap_atomic(vaddr); 2584 } 2585 2586 if ((len -= copy) == 0) 2587 return 0; 2588 offset += copy; 2589 from += copy; 2590 } 2591 start = end; 2592 } 2593 2594 skb_walk_frags(skb, frag_iter) { 2595 int end; 2596 2597 WARN_ON(start > offset + len); 2598 2599 end = start + frag_iter->len; 2600 if ((copy = end - offset) > 0) { 2601 if (copy > len) 2602 copy = len; 2603 if (skb_store_bits(frag_iter, offset - start, 2604 from, copy)) 2605 goto fault; 2606 if ((len -= copy) == 0) 2607 return 0; 2608 offset += copy; 2609 from += copy; 2610 } 2611 start = end; 2612 } 2613 if (!len) 2614 return 0; 2615 2616 fault: 2617 return -EFAULT; 2618 } 2619 EXPORT_SYMBOL(skb_store_bits); 2620 2621 /* Checksum skb data. */ 2622 __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, 2623 __wsum csum, const struct skb_checksum_ops *ops) 2624 { 2625 int start = skb_headlen(skb); 2626 int i, copy = start - offset; 2627 struct sk_buff *frag_iter; 2628 int pos = 0; 2629 2630 /* Checksum header. 
*/ 2631 if (copy > 0) { 2632 if (copy > len) 2633 copy = len; 2634 csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, 2635 skb->data + offset, copy, csum); 2636 if ((len -= copy) == 0) 2637 return csum; 2638 offset += copy; 2639 pos = copy; 2640 } 2641 2642 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2643 int end; 2644 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2645 2646 WARN_ON(start > offset + len); 2647 2648 end = start + skb_frag_size(frag); 2649 if ((copy = end - offset) > 0) { 2650 u32 p_off, p_len, copied; 2651 struct page *p; 2652 __wsum csum2; 2653 u8 *vaddr; 2654 2655 if (copy > len) 2656 copy = len; 2657 2658 skb_frag_foreach_page(frag, 2659 skb_frag_off(frag) + offset - start, 2660 copy, p, p_off, p_len, copied) { 2661 vaddr = kmap_atomic(p); 2662 csum2 = INDIRECT_CALL_1(ops->update, 2663 csum_partial_ext, 2664 vaddr + p_off, p_len, 0); 2665 kunmap_atomic(vaddr); 2666 csum = INDIRECT_CALL_1(ops->combine, 2667 csum_block_add_ext, csum, 2668 csum2, pos, p_len); 2669 pos += p_len; 2670 } 2671 2672 if (!(len -= copy)) 2673 return csum; 2674 offset += copy; 2675 } 2676 start = end; 2677 } 2678 2679 skb_walk_frags(skb, frag_iter) { 2680 int end; 2681 2682 WARN_ON(start > offset + len); 2683 2684 end = start + frag_iter->len; 2685 if ((copy = end - offset) > 0) { 2686 __wsum csum2; 2687 if (copy > len) 2688 copy = len; 2689 csum2 = __skb_checksum(frag_iter, offset - start, 2690 copy, 0, ops); 2691 csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, 2692 csum, csum2, pos, copy); 2693 if ((len -= copy) == 0) 2694 return csum; 2695 offset += copy; 2696 pos += copy; 2697 } 2698 start = end; 2699 } 2700 BUG_ON(len); 2701 2702 return csum; 2703 } 2704 EXPORT_SYMBOL(__skb_checksum); 2705 2706 __wsum skb_checksum(const struct sk_buff *skb, int offset, 2707 int len, __wsum csum) 2708 { 2709 const struct skb_checksum_ops ops = { 2710 .update = csum_partial_ext, 2711 .combine = csum_block_add_ext, 2712 }; 2713 2714 return __skb_checksum(skb, offset, len, csum, &ops); 2715 } 2716 EXPORT_SYMBOL(skb_checksum); 2717 2718 /* Both of above in one bottle. */ 2719 2720 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 2721 u8 *to, int len) 2722 { 2723 int start = skb_headlen(skb); 2724 int i, copy = start - offset; 2725 struct sk_buff *frag_iter; 2726 int pos = 0; 2727 __wsum csum = 0; 2728 2729 /* Copy header. 
*/ 2730 if (copy > 0) { 2731 if (copy > len) 2732 copy = len; 2733 csum = csum_partial_copy_nocheck(skb->data + offset, to, 2734 copy); 2735 if ((len -= copy) == 0) 2736 return csum; 2737 offset += copy; 2738 to += copy; 2739 pos = copy; 2740 } 2741 2742 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2743 int end; 2744 2745 WARN_ON(start > offset + len); 2746 2747 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2748 if ((copy = end - offset) > 0) { 2749 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2750 u32 p_off, p_len, copied; 2751 struct page *p; 2752 __wsum csum2; 2753 u8 *vaddr; 2754 2755 if (copy > len) 2756 copy = len; 2757 2758 skb_frag_foreach_page(frag, 2759 skb_frag_off(frag) + offset - start, 2760 copy, p, p_off, p_len, copied) { 2761 vaddr = kmap_atomic(p); 2762 csum2 = csum_partial_copy_nocheck(vaddr + p_off, 2763 to + copied, 2764 p_len); 2765 kunmap_atomic(vaddr); 2766 csum = csum_block_add(csum, csum2, pos); 2767 pos += p_len; 2768 } 2769 2770 if (!(len -= copy)) 2771 return csum; 2772 offset += copy; 2773 to += copy; 2774 } 2775 start = end; 2776 } 2777 2778 skb_walk_frags(skb, frag_iter) { 2779 __wsum csum2; 2780 int end; 2781 2782 WARN_ON(start > offset + len); 2783 2784 end = start + frag_iter->len; 2785 if ((copy = end - offset) > 0) { 2786 if (copy > len) 2787 copy = len; 2788 csum2 = skb_copy_and_csum_bits(frag_iter, 2789 offset - start, 2790 to, copy); 2791 csum = csum_block_add(csum, csum2, pos); 2792 if ((len -= copy) == 0) 2793 return csum; 2794 offset += copy; 2795 to += copy; 2796 pos += copy; 2797 } 2798 start = end; 2799 } 2800 BUG_ON(len); 2801 return csum; 2802 } 2803 EXPORT_SYMBOL(skb_copy_and_csum_bits); 2804 2805 __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) 2806 { 2807 __sum16 sum; 2808 2809 sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); 2810 /* See comments in __skb_checksum_complete(). */ 2811 if (likely(!sum)) { 2812 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 2813 !skb->csum_complete_sw) 2814 netdev_rx_csum_fault(skb->dev, skb); 2815 } 2816 if (!skb_shared(skb)) 2817 skb->csum_valid = !sum; 2818 return sum; 2819 } 2820 EXPORT_SYMBOL(__skb_checksum_complete_head); 2821 2822 /* This function assumes skb->csum already holds pseudo header's checksum, 2823 * which has been changed from the hardware checksum, for example, by 2824 * __skb_checksum_validate_complete(). And, the original skb->csum must 2825 * have been validated unsuccessfully for CHECKSUM_COMPLETE case. 2826 * 2827 * It returns non-zero if the recomputed checksum is still invalid, otherwise 2828 * zero. The new checksum is stored back into skb->csum unless the skb is 2829 * shared. 2830 */ 2831 __sum16 __skb_checksum_complete(struct sk_buff *skb) 2832 { 2833 __wsum csum; 2834 __sum16 sum; 2835 2836 csum = skb_checksum(skb, 0, skb->len, 0); 2837 2838 sum = csum_fold(csum_add(skb->csum, csum)); 2839 /* This check is inverted, because we already knew the hardware 2840 * checksum is invalid before calling this function. So, if the 2841 * re-computed checksum is valid instead, then we have a mismatch 2842 * between the original skb->csum and skb_checksum(). This means either 2843 * the original hardware checksum is incorrect or we screw up skb->csum 2844 * when moving skb->data around. 
2845 */ 2846 if (likely(!sum)) { 2847 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 2848 !skb->csum_complete_sw) 2849 netdev_rx_csum_fault(skb->dev, skb); 2850 } 2851 2852 if (!skb_shared(skb)) { 2853 /* Save full packet checksum */ 2854 skb->csum = csum; 2855 skb->ip_summed = CHECKSUM_COMPLETE; 2856 skb->csum_complete_sw = 1; 2857 skb->csum_valid = !sum; 2858 } 2859 2860 return sum; 2861 } 2862 EXPORT_SYMBOL(__skb_checksum_complete); 2863 2864 static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) 2865 { 2866 net_warn_ratelimited( 2867 "%s: attempt to compute crc32c without libcrc32c.ko\n", 2868 __func__); 2869 return 0; 2870 } 2871 2872 static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, 2873 int offset, int len) 2874 { 2875 net_warn_ratelimited( 2876 "%s: attempt to compute crc32c without libcrc32c.ko\n", 2877 __func__); 2878 return 0; 2879 } 2880 2881 static const struct skb_checksum_ops default_crc32c_ops = { 2882 .update = warn_crc32c_csum_update, 2883 .combine = warn_crc32c_csum_combine, 2884 }; 2885 2886 const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = 2887 &default_crc32c_ops; 2888 EXPORT_SYMBOL(crc32c_csum_stub); 2889 2890 /** 2891 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() 2892 * @from: source buffer 2893 * 2894 * Calculates the amount of linear headroom needed in the 'to' skb passed 2895 * into skb_zerocopy(). 2896 */ 2897 unsigned int 2898 skb_zerocopy_headlen(const struct sk_buff *from) 2899 { 2900 unsigned int hlen = 0; 2901 2902 if (!from->head_frag || 2903 skb_headlen(from) < L1_CACHE_BYTES || 2904 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) 2905 hlen = skb_headlen(from); 2906 2907 if (skb_has_frag_list(from)) 2908 hlen = from->len; 2909 2910 return hlen; 2911 } 2912 EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); 2913 2914 /** 2915 * skb_zerocopy - Zero copy skb to skb 2916 * @to: destination buffer 2917 * @from: source buffer 2918 * @len: number of bytes to copy from source buffer 2919 * @hlen: size of linear headroom in destination buffer 2920 * 2921 * Copies up to `len` bytes from `from` to `to` by creating references 2922 * to the frags in the source buffer. 2923 * 2924 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the 2925 * headroom in the `to` buffer. 
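 *
 * Hedged usage sketch showing how callers typically pair the two helpers
 * ("extra" is a hypothetical amount of additional headroom):
 *
 *	hlen = skb_zerocopy_headlen(from);
 *	to = alloc_skb(hlen + extra, GFP_ATOMIC);
 *	if (to) {
 *		skb_reserve(to, extra);
 *		err = skb_zerocopy(to, from, from->len, hlen);
 *	}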
2926 * 2927 * Return value: 2928 * 0: everything is OK 2929 * -ENOMEM: couldn't orphan frags of @from due to lack of memory 2930 * -EFAULT: skb_copy_bits() found some problem with skb geometry 2931 */ 2932 int 2933 skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) 2934 { 2935 int i, j = 0; 2936 int plen = 0; /* length of skb->head fragment */ 2937 int ret; 2938 struct page *page; 2939 unsigned int offset; 2940 2941 BUG_ON(!from->head_frag && !hlen); 2942 2943 /* dont bother with small payloads */ 2944 if (len <= skb_tailroom(to)) 2945 return skb_copy_bits(from, 0, skb_put(to, len), len); 2946 2947 if (hlen) { 2948 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); 2949 if (unlikely(ret)) 2950 return ret; 2951 len -= hlen; 2952 } else { 2953 plen = min_t(int, skb_headlen(from), len); 2954 if (plen) { 2955 page = virt_to_head_page(from->head); 2956 offset = from->data - (unsigned char *)page_address(page); 2957 __skb_fill_page_desc(to, 0, page, offset, plen); 2958 get_page(page); 2959 j = 1; 2960 len -= plen; 2961 } 2962 } 2963 2964 to->truesize += len + plen; 2965 to->len += len + plen; 2966 to->data_len += len + plen; 2967 2968 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { 2969 skb_tx_error(from); 2970 return -ENOMEM; 2971 } 2972 skb_zerocopy_clone(to, from, GFP_ATOMIC); 2973 2974 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { 2975 int size; 2976 2977 if (!len) 2978 break; 2979 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; 2980 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), 2981 len); 2982 skb_frag_size_set(&skb_shinfo(to)->frags[j], size); 2983 len -= size; 2984 skb_frag_ref(to, j); 2985 j++; 2986 } 2987 skb_shinfo(to)->nr_frags = j; 2988 2989 return 0; 2990 } 2991 EXPORT_SYMBOL_GPL(skb_zerocopy); 2992 2993 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 2994 { 2995 __wsum csum; 2996 long csstart; 2997 2998 if (skb->ip_summed == CHECKSUM_PARTIAL) 2999 csstart = skb_checksum_start_offset(skb); 3000 else 3001 csstart = skb_headlen(skb); 3002 3003 BUG_ON(csstart > skb_headlen(skb)); 3004 3005 skb_copy_from_linear_data(skb, to, csstart); 3006 3007 csum = 0; 3008 if (csstart != skb->len) 3009 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 3010 skb->len - csstart); 3011 3012 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3013 long csstuff = csstart + skb->csum_offset; 3014 3015 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 3016 } 3017 } 3018 EXPORT_SYMBOL(skb_copy_and_csum_dev); 3019 3020 /** 3021 * skb_dequeue - remove from the head of the queue 3022 * @list: list to dequeue from 3023 * 3024 * Remove the head of the list. The list lock is taken so the function 3025 * may be used safely with other locking list functions. The head item is 3026 * returned or %NULL if the list is empty. 3027 */ 3028 3029 struct sk_buff *skb_dequeue(struct sk_buff_head *list) 3030 { 3031 unsigned long flags; 3032 struct sk_buff *result; 3033 3034 spin_lock_irqsave(&list->lock, flags); 3035 result = __skb_dequeue(list); 3036 spin_unlock_irqrestore(&list->lock, flags); 3037 return result; 3038 } 3039 EXPORT_SYMBOL(skb_dequeue); 3040 3041 /** 3042 * skb_dequeue_tail - remove from the tail of the queue 3043 * @list: list to dequeue from 3044 * 3045 * Remove the tail of the list. The list lock is taken so the function 3046 * may be used safely with other locking list functions. The tail item is 3047 * returned or %NULL if the list is empty. 
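 *
 * Hedged example of the locked queue helpers used together on a
 * caller-owned queue ("rxq" and "process" are hypothetical):
 *
 *	struct sk_buff_head rxq;
 *
 *	skb_queue_head_init(&rxq);
 *	skb_queue_tail(&rxq, skb);		// producer side
 *	...
 *	while ((skb = skb_dequeue(&rxq)) != NULL)
 *		process(skb);			// consumer side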
3048 */ 3049 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) 3050 { 3051 unsigned long flags; 3052 struct sk_buff *result; 3053 3054 spin_lock_irqsave(&list->lock, flags); 3055 result = __skb_dequeue_tail(list); 3056 spin_unlock_irqrestore(&list->lock, flags); 3057 return result; 3058 } 3059 EXPORT_SYMBOL(skb_dequeue_tail); 3060 3061 /** 3062 * skb_queue_purge - empty a list 3063 * @list: list to empty 3064 * 3065 * Delete all buffers on an &sk_buff list. Each buffer is removed from 3066 * the list and one reference dropped. This function takes the list 3067 * lock and is atomic with respect to other list locking functions. 3068 */ 3069 void skb_queue_purge(struct sk_buff_head *list) 3070 { 3071 struct sk_buff *skb; 3072 while ((skb = skb_dequeue(list)) != NULL) 3073 kfree_skb(skb); 3074 } 3075 EXPORT_SYMBOL(skb_queue_purge); 3076 3077 /** 3078 * skb_rbtree_purge - empty a skb rbtree 3079 * @root: root of the rbtree to empty 3080 * Return value: the sum of truesizes of all purged skbs. 3081 * 3082 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from 3083 * the list and one reference dropped. This function does not take 3084 * any lock. Synchronization should be handled by the caller (e.g., TCP 3085 * out-of-order queue is protected by the socket lock). 3086 */ 3087 unsigned int skb_rbtree_purge(struct rb_root *root) 3088 { 3089 struct rb_node *p = rb_first(root); 3090 unsigned int sum = 0; 3091 3092 while (p) { 3093 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 3094 3095 p = rb_next(p); 3096 rb_erase(&skb->rbnode, root); 3097 sum += skb->truesize; 3098 kfree_skb(skb); 3099 } 3100 return sum; 3101 } 3102 3103 /** 3104 * skb_queue_head - queue a buffer at the list head 3105 * @list: list to use 3106 * @newsk: buffer to queue 3107 * 3108 * Queue a buffer at the start of the list. This function takes the 3109 * list lock and can be used safely with other locking &sk_buff functions 3110 * safely. 3111 * 3112 * A buffer cannot be placed on two lists at the same time. 3113 */ 3114 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 3115 { 3116 unsigned long flags; 3117 3118 spin_lock_irqsave(&list->lock, flags); 3119 __skb_queue_head(list, newsk); 3120 spin_unlock_irqrestore(&list->lock, flags); 3121 } 3122 EXPORT_SYMBOL(skb_queue_head); 3123 3124 /** 3125 * skb_queue_tail - queue a buffer at the list tail 3126 * @list: list to use 3127 * @newsk: buffer to queue 3128 * 3129 * Queue a buffer at the tail of the list. This function takes the 3130 * list lock and can be used safely with other locking &sk_buff functions 3131 * safely. 3132 * 3133 * A buffer cannot be placed on two lists at the same time. 3134 */ 3135 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 3136 { 3137 unsigned long flags; 3138 3139 spin_lock_irqsave(&list->lock, flags); 3140 __skb_queue_tail(list, newsk); 3141 spin_unlock_irqrestore(&list->lock, flags); 3142 } 3143 EXPORT_SYMBOL(skb_queue_tail); 3144 3145 /** 3146 * skb_unlink - remove a buffer from a list 3147 * @skb: buffer to remove 3148 * @list: list to use 3149 * 3150 * Remove a packet from a list. The list locks are taken and this 3151 * function is atomic with respect to other list locked calls 3152 * 3153 * You must know what list the SKB is on. 
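 *
 * Hedged example ("pending" is a hypothetical caller-owned queue that the
 * buffer is known to be on):
 *
 *	skb_unlink(skb, &pending);	// takes pending.lock internally
 *	kfree_skb(skb);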
3154 */ 3155 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 3156 { 3157 unsigned long flags; 3158 3159 spin_lock_irqsave(&list->lock, flags); 3160 __skb_unlink(skb, list); 3161 spin_unlock_irqrestore(&list->lock, flags); 3162 } 3163 EXPORT_SYMBOL(skb_unlink); 3164 3165 /** 3166 * skb_append - append a buffer 3167 * @old: buffer to insert after 3168 * @newsk: buffer to insert 3169 * @list: list to use 3170 * 3171 * Place a packet after a given packet in a list. The list locks are taken 3172 * and this function is atomic with respect to other list locked calls. 3173 * A buffer cannot be placed on two lists at the same time. 3174 */ 3175 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 3176 { 3177 unsigned long flags; 3178 3179 spin_lock_irqsave(&list->lock, flags); 3180 __skb_queue_after(list, old, newsk); 3181 spin_unlock_irqrestore(&list->lock, flags); 3182 } 3183 EXPORT_SYMBOL(skb_append); 3184 3185 static inline void skb_split_inside_header(struct sk_buff *skb, 3186 struct sk_buff* skb1, 3187 const u32 len, const int pos) 3188 { 3189 int i; 3190 3191 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 3192 pos - len); 3193 /* And move data appendix as is. */ 3194 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 3195 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 3196 3197 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 3198 skb_shinfo(skb)->nr_frags = 0; 3199 skb1->data_len = skb->data_len; 3200 skb1->len += skb1->data_len; 3201 skb->data_len = 0; 3202 skb->len = len; 3203 skb_set_tail_pointer(skb, len); 3204 } 3205 3206 static inline void skb_split_no_header(struct sk_buff *skb, 3207 struct sk_buff* skb1, 3208 const u32 len, int pos) 3209 { 3210 int i, k = 0; 3211 const int nfrags = skb_shinfo(skb)->nr_frags; 3212 3213 skb_shinfo(skb)->nr_frags = 0; 3214 skb1->len = skb1->data_len = skb->len - len; 3215 skb->len = len; 3216 skb->data_len = len - pos; 3217 3218 for (i = 0; i < nfrags; i++) { 3219 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 3220 3221 if (pos + size > len) { 3222 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 3223 3224 if (pos < len) { 3225 /* Split frag. 3226 * We have two variants in this case: 3227 * 1. Move all the frag to the second 3228 * part, if it is possible. F.e. 3229 * this approach is mandatory for TUX, 3230 * where splitting is expensive. 3231 * 2. Split is accurately. We make this. 3232 */ 3233 skb_frag_ref(skb, i); 3234 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); 3235 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 3236 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 3237 skb_shinfo(skb)->nr_frags++; 3238 } 3239 k++; 3240 } else 3241 skb_shinfo(skb)->nr_frags++; 3242 pos += size; 3243 } 3244 skb_shinfo(skb1)->nr_frags = k; 3245 } 3246 3247 /** 3248 * skb_split - Split fragmented skb to two parts at length len. 3249 * @skb: the buffer to split 3250 * @skb1: the buffer to receive the second part 3251 * @len: new length for skb 3252 */ 3253 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 3254 { 3255 int pos = skb_headlen(skb); 3256 3257 skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & 3258 SKBTX_SHARED_FRAG; 3259 skb_zerocopy_clone(skb1, skb, 0); 3260 if (len < pos) /* Split line is inside header. */ 3261 skb_split_inside_header(skb, skb1, len, pos); 3262 else /* Second chunk has no header, nothing to copy. 
*/ 3263 skb_split_no_header(skb, skb1, len, pos); 3264 } 3265 EXPORT_SYMBOL(skb_split); 3266 3267 /* Shifting from/to a cloned skb is a no-go. 3268 * 3269 * Caller cannot keep skb_shinfo related pointers past calling here! 3270 */ 3271 static int skb_prepare_for_shift(struct sk_buff *skb) 3272 { 3273 return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 3274 } 3275 3276 /** 3277 * skb_shift - Shifts paged data partially from skb to another 3278 * @tgt: buffer into which tail data gets added 3279 * @skb: buffer from which the paged data comes from 3280 * @shiftlen: shift up to this many bytes 3281 * 3282 * Attempts to shift up to shiftlen worth of bytes, which may be less than 3283 * the length of the skb, from skb to tgt. Returns number bytes shifted. 3284 * It's up to caller to free skb if everything was shifted. 3285 * 3286 * If @tgt runs out of frags, the whole operation is aborted. 3287 * 3288 * Skb cannot include anything else but paged data while tgt is allowed 3289 * to have non-paged data as well. 3290 * 3291 * TODO: full sized shift could be optimized but that would need 3292 * specialized skb free'er to handle frags without up-to-date nr_frags. 3293 */ 3294 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) 3295 { 3296 int from, to, merge, todo; 3297 skb_frag_t *fragfrom, *fragto; 3298 3299 BUG_ON(shiftlen > skb->len); 3300 3301 if (skb_headlen(skb)) 3302 return 0; 3303 if (skb_zcopy(tgt) || skb_zcopy(skb)) 3304 return 0; 3305 3306 todo = shiftlen; 3307 from = 0; 3308 to = skb_shinfo(tgt)->nr_frags; 3309 fragfrom = &skb_shinfo(skb)->frags[from]; 3310 3311 /* Actual merge is delayed until the point when we know we can 3312 * commit all, so that we don't have to undo partial changes 3313 */ 3314 if (!to || 3315 !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), 3316 skb_frag_off(fragfrom))) { 3317 merge = -1; 3318 } else { 3319 merge = to - 1; 3320 3321 todo -= skb_frag_size(fragfrom); 3322 if (todo < 0) { 3323 if (skb_prepare_for_shift(skb) || 3324 skb_prepare_for_shift(tgt)) 3325 return 0; 3326 3327 /* All previous frag pointers might be stale! 
*/ 3328 fragfrom = &skb_shinfo(skb)->frags[from]; 3329 fragto = &skb_shinfo(tgt)->frags[merge]; 3330 3331 skb_frag_size_add(fragto, shiftlen); 3332 skb_frag_size_sub(fragfrom, shiftlen); 3333 skb_frag_off_add(fragfrom, shiftlen); 3334 3335 goto onlymerged; 3336 } 3337 3338 from++; 3339 } 3340 3341 /* Skip full, not-fitting skb to avoid expensive operations */ 3342 if ((shiftlen == skb->len) && 3343 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 3344 return 0; 3345 3346 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 3347 return 0; 3348 3349 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 3350 if (to == MAX_SKB_FRAGS) 3351 return 0; 3352 3353 fragfrom = &skb_shinfo(skb)->frags[from]; 3354 fragto = &skb_shinfo(tgt)->frags[to]; 3355 3356 if (todo >= skb_frag_size(fragfrom)) { 3357 *fragto = *fragfrom; 3358 todo -= skb_frag_size(fragfrom); 3359 from++; 3360 to++; 3361 3362 } else { 3363 __skb_frag_ref(fragfrom); 3364 skb_frag_page_copy(fragto, fragfrom); 3365 skb_frag_off_copy(fragto, fragfrom); 3366 skb_frag_size_set(fragto, todo); 3367 3368 skb_frag_off_add(fragfrom, todo); 3369 skb_frag_size_sub(fragfrom, todo); 3370 todo = 0; 3371 3372 to++; 3373 break; 3374 } 3375 } 3376 3377 /* Ready to "commit" this state change to tgt */ 3378 skb_shinfo(tgt)->nr_frags = to; 3379 3380 if (merge >= 0) { 3381 fragfrom = &skb_shinfo(skb)->frags[0]; 3382 fragto = &skb_shinfo(tgt)->frags[merge]; 3383 3384 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 3385 __skb_frag_unref(fragfrom); 3386 } 3387 3388 /* Reposition in the original skb */ 3389 to = 0; 3390 while (from < skb_shinfo(skb)->nr_frags) 3391 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 3392 skb_shinfo(skb)->nr_frags = to; 3393 3394 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 3395 3396 onlymerged: 3397 /* Most likely the tgt won't ever need its checksum anymore, skb on 3398 * the other hand might need it if it needs to be resent 3399 */ 3400 tgt->ip_summed = CHECKSUM_PARTIAL; 3401 skb->ip_summed = CHECKSUM_PARTIAL; 3402 3403 /* Yak, is it really working this way? Some helper please? */ 3404 skb->len -= shiftlen; 3405 skb->data_len -= shiftlen; 3406 skb->truesize -= shiftlen; 3407 tgt->len += shiftlen; 3408 tgt->data_len += shiftlen; 3409 tgt->truesize += shiftlen; 3410 3411 return shiftlen; 3412 } 3413 3414 /** 3415 * skb_prepare_seq_read - Prepare a sequential read of skb data 3416 * @skb: the buffer to read 3417 * @from: lower offset of data to be read 3418 * @to: upper offset of data to be read 3419 * @st: state variable 3420 * 3421 * Initializes the specified state variable. Must be called before 3422 * invoking skb_seq_read() for the first time. 3423 */ 3424 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 3425 unsigned int to, struct skb_seq_state *st) 3426 { 3427 st->lower_offset = from; 3428 st->upper_offset = to; 3429 st->root_skb = st->cur_skb = skb; 3430 st->frag_idx = st->stepped_offset = 0; 3431 st->frag_data = NULL; 3432 } 3433 EXPORT_SYMBOL(skb_prepare_seq_read); 3434 3435 /** 3436 * skb_seq_read - Sequentially read skb data 3437 * @consumed: number of bytes consumed by the caller so far 3438 * @data: destination pointer for data to be returned 3439 * @st: state variable 3440 * 3441 * Reads a block of skb data at @consumed relative to the 3442 * lower offset specified to skb_prepare_seq_read(). 
Assigns 3443 * the head of the data block to @data and returns the length 3444 * of the block or 0 if the end of the skb data or the upper 3445 * offset has been reached. 3446 * 3447 * The caller is not required to consume all of the data 3448 * returned, i.e. @consumed is typically set to the number 3449 * of bytes already consumed and the next call to 3450 * skb_seq_read() will return the remaining part of the block. 3451 * 3452 * Note 1: The size of each block of data returned can be arbitrary, 3453 * this limitation is the cost for zerocopy sequential 3454 * reads of potentially non linear data. 3455 * 3456 * Note 2: Fragment lists within fragments are not implemented 3457 * at the moment, state->root_skb could be replaced with 3458 * a stack for this purpose. 3459 */ 3460 unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 3461 struct skb_seq_state *st) 3462 { 3463 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 3464 skb_frag_t *frag; 3465 3466 if (unlikely(abs_offset >= st->upper_offset)) { 3467 if (st->frag_data) { 3468 kunmap_atomic(st->frag_data); 3469 st->frag_data = NULL; 3470 } 3471 return 0; 3472 } 3473 3474 next_skb: 3475 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 3476 3477 if (abs_offset < block_limit && !st->frag_data) { 3478 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 3479 return block_limit - abs_offset; 3480 } 3481 3482 if (st->frag_idx == 0 && !st->frag_data) 3483 st->stepped_offset += skb_headlen(st->cur_skb); 3484 3485 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 3486 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 3487 block_limit = skb_frag_size(frag) + st->stepped_offset; 3488 3489 if (abs_offset < block_limit) { 3490 if (!st->frag_data) 3491 st->frag_data = kmap_atomic(skb_frag_page(frag)); 3492 3493 *data = (u8 *) st->frag_data + skb_frag_off(frag) + 3494 (abs_offset - st->stepped_offset); 3495 3496 return block_limit - abs_offset; 3497 } 3498 3499 if (st->frag_data) { 3500 kunmap_atomic(st->frag_data); 3501 st->frag_data = NULL; 3502 } 3503 3504 st->frag_idx++; 3505 st->stepped_offset += skb_frag_size(frag); 3506 } 3507 3508 if (st->frag_data) { 3509 kunmap_atomic(st->frag_data); 3510 st->frag_data = NULL; 3511 } 3512 3513 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 3514 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 3515 st->frag_idx = 0; 3516 goto next_skb; 3517 } else if (st->cur_skb->next) { 3518 st->cur_skb = st->cur_skb->next; 3519 st->frag_idx = 0; 3520 goto next_skb; 3521 } 3522 3523 return 0; 3524 } 3525 EXPORT_SYMBOL(skb_seq_read); 3526 3527 /** 3528 * skb_abort_seq_read - Abort a sequential read of skb data 3529 * @st: state variable 3530 * 3531 * Must be called if skb_seq_read() was not called until it 3532 * returned 0. 
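 *
 * Hedged example of the whole sequential-read cycle ("from", "to" and
 * "scan" are caller-chosen):
 *
 *	struct skb_seq_state st;
 *	unsigned int len, consumed = 0;
 *	const u8 *data;
 *
 *	skb_prepare_seq_read(skb, from, to, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		if (!scan(data, len)) {
 *			skb_abort_seq_read(&st);	// stopped before the end
 *			break;
 *		}
 *		consumed += len;
 *	}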
3533 */ 3534 void skb_abort_seq_read(struct skb_seq_state *st) 3535 { 3536 if (st->frag_data) 3537 kunmap_atomic(st->frag_data); 3538 } 3539 EXPORT_SYMBOL(skb_abort_seq_read); 3540 3541 #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 3542 3543 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 3544 struct ts_config *conf, 3545 struct ts_state *state) 3546 { 3547 return skb_seq_read(offset, text, TS_SKB_CB(state)); 3548 } 3549 3550 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 3551 { 3552 skb_abort_seq_read(TS_SKB_CB(state)); 3553 } 3554 3555 /** 3556 * skb_find_text - Find a text pattern in skb data 3557 * @skb: the buffer to look in 3558 * @from: search offset 3559 * @to: search limit 3560 * @config: textsearch configuration 3561 * 3562 * Finds a pattern in the skb data according to the specified 3563 * textsearch configuration. Use textsearch_next() to retrieve 3564 * subsequent occurrences of the pattern. Returns the offset 3565 * to the first occurrence or UINT_MAX if no match was found. 3566 */ 3567 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 3568 unsigned int to, struct ts_config *config) 3569 { 3570 struct ts_state state; 3571 unsigned int ret; 3572 3573 config->get_next_block = skb_ts_get_next_block; 3574 config->finish = skb_ts_finish; 3575 3576 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); 3577 3578 ret = textsearch_find(config, &state); 3579 return (ret <= to - from ? ret : UINT_MAX); 3580 } 3581 EXPORT_SYMBOL(skb_find_text); 3582 3583 int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 3584 int offset, size_t size) 3585 { 3586 int i = skb_shinfo(skb)->nr_frags; 3587 3588 if (skb_can_coalesce(skb, i, page, offset)) { 3589 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 3590 } else if (i < MAX_SKB_FRAGS) { 3591 get_page(page); 3592 skb_fill_page_desc(skb, i, page, offset, size); 3593 } else { 3594 return -EMSGSIZE; 3595 } 3596 3597 return 0; 3598 } 3599 EXPORT_SYMBOL_GPL(skb_append_pagefrags); 3600 3601 /** 3602 * skb_pull_rcsum - pull skb and update receive checksum 3603 * @skb: buffer to update 3604 * @len: length of data pulled 3605 * 3606 * This function performs an skb_pull on the packet and updates 3607 * the CHECKSUM_COMPLETE checksum. It should be used on 3608 * receive path processing instead of skb_pull unless you know 3609 * that the checksum difference is zero (e.g., a valid IP header) 3610 * or you are setting ip_summed to CHECKSUM_NONE. 
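 *
 * Hedged receive-path example ("hdr_len" is the size of a header the caller
 * has just parsed and wants to strip):
 *
 *	if (!pskb_may_pull(skb, hdr_len))
 *		goto drop;
 *	skb_pull_rcsum(skb, hdr_len);	// keeps CHECKSUM_COMPLETE consistent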
3611 */ 3612 void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 3613 { 3614 unsigned char *data = skb->data; 3615 3616 BUG_ON(len > skb->len); 3617 __skb_pull(skb, len); 3618 skb_postpull_rcsum(skb, data, len); 3619 return skb->data; 3620 } 3621 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 3622 3623 static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) 3624 { 3625 skb_frag_t head_frag; 3626 struct page *page; 3627 3628 page = virt_to_head_page(frag_skb->head); 3629 __skb_frag_set_page(&head_frag, page); 3630 skb_frag_off_set(&head_frag, frag_skb->data - 3631 (unsigned char *)page_address(page)); 3632 skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); 3633 return head_frag; 3634 } 3635 3636 struct sk_buff *skb_segment_list(struct sk_buff *skb, 3637 netdev_features_t features, 3638 unsigned int offset) 3639 { 3640 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; 3641 unsigned int tnl_hlen = skb_tnl_header_len(skb); 3642 unsigned int delta_truesize = 0; 3643 unsigned int delta_len = 0; 3644 struct sk_buff *tail = NULL; 3645 struct sk_buff *nskb; 3646 3647 skb_push(skb, -skb_network_offset(skb) + offset); 3648 3649 skb_shinfo(skb)->frag_list = NULL; 3650 3651 do { 3652 nskb = list_skb; 3653 list_skb = list_skb->next; 3654 3655 if (!tail) 3656 skb->next = nskb; 3657 else 3658 tail->next = nskb; 3659 3660 tail = nskb; 3661 3662 delta_len += nskb->len; 3663 delta_truesize += nskb->truesize; 3664 3665 skb_push(nskb, -skb_network_offset(nskb) + offset); 3666 3667 skb_release_head_state(nskb); 3668 __copy_skb_header(nskb, skb); 3669 3670 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); 3671 skb_copy_from_linear_data_offset(skb, -tnl_hlen, 3672 nskb->data - tnl_hlen, 3673 offset + tnl_hlen); 3674 3675 if (skb_needs_linearize(nskb, features) && 3676 __skb_linearize(nskb)) 3677 goto err_linearize; 3678 3679 } while (list_skb); 3680 3681 skb->truesize = skb->truesize - delta_truesize; 3682 skb->data_len = skb->data_len - delta_len; 3683 skb->len = skb->len - delta_len; 3684 3685 skb_gso_reset(skb); 3686 3687 skb->prev = tail; 3688 3689 if (skb_needs_linearize(skb, features) && 3690 __skb_linearize(skb)) 3691 goto err_linearize; 3692 3693 skb_get(skb); 3694 3695 return skb; 3696 3697 err_linearize: 3698 kfree_skb_list(skb->next); 3699 skb->next = NULL; 3700 return ERR_PTR(-ENOMEM); 3701 } 3702 EXPORT_SYMBOL_GPL(skb_segment_list); 3703 3704 int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) 3705 { 3706 if (unlikely(p->len + skb->len >= 65536)) 3707 return -E2BIG; 3708 3709 if (NAPI_GRO_CB(p)->last == p) 3710 skb_shinfo(p)->frag_list = skb; 3711 else 3712 NAPI_GRO_CB(p)->last->next = skb; 3713 3714 skb_pull(skb, skb_gro_offset(skb)); 3715 3716 NAPI_GRO_CB(p)->last = skb; 3717 NAPI_GRO_CB(p)->count++; 3718 p->data_len += skb->len; 3719 p->truesize += skb->truesize; 3720 p->len += skb->len; 3721 3722 NAPI_GRO_CB(skb)->same_flow = 1; 3723 3724 return 0; 3725 } 3726 3727 /** 3728 * skb_segment - Perform protocol segmentation on skb. 3729 * @head_skb: buffer to segment 3730 * @features: features for the output path (see dev->features) 3731 * 3732 * This function performs segmentation on the given skb. It returns 3733 * a pointer to the first in a list of new skbs for the segments. 3734 * In case of error it returns ERR_PTR(err). 
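 *
 * Hedged caller-side sketch (loosely modelled on the software GSO path;
 * "xmit_one" is hypothetical and error handling is trimmed):
 *
 *	segs = skb_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	consume_skb(skb);			// original superseded by the list
 *	for (; segs; segs = next) {
 *		next = segs->next;
 *		segs->next = NULL;
 *		xmit_one(segs);
 *	}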
3735 */ 3736 struct sk_buff *skb_segment(struct sk_buff *head_skb, 3737 netdev_features_t features) 3738 { 3739 struct sk_buff *segs = NULL; 3740 struct sk_buff *tail = NULL; 3741 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; 3742 skb_frag_t *frag = skb_shinfo(head_skb)->frags; 3743 unsigned int mss = skb_shinfo(head_skb)->gso_size; 3744 unsigned int doffset = head_skb->data - skb_mac_header(head_skb); 3745 struct sk_buff *frag_skb = head_skb; 3746 unsigned int offset = doffset; 3747 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 3748 unsigned int partial_segs = 0; 3749 unsigned int headroom; 3750 unsigned int len = head_skb->len; 3751 __be16 proto; 3752 bool csum, sg; 3753 int nfrags = skb_shinfo(head_skb)->nr_frags; 3754 int err = -ENOMEM; 3755 int i = 0; 3756 int pos; 3757 3758 if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && 3759 (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { 3760 /* gso_size is untrusted, and we have a frag_list with a linear 3761 * non head_frag head. 3762 * 3763 * (we assume checking the first list_skb member suffices; 3764 * i.e if either of the list_skb members have non head_frag 3765 * head, then the first one has too). 3766 * 3767 * If head_skb's headlen does not fit requested gso_size, it 3768 * means that the frag_list members do NOT terminate on exact 3769 * gso_size boundaries. Hence we cannot perform skb_frag_t page 3770 * sharing. Therefore we must fallback to copying the frag_list 3771 * skbs; we do so by disabling SG. 3772 */ 3773 if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) 3774 features &= ~NETIF_F_SG; 3775 } 3776 3777 __skb_push(head_skb, doffset); 3778 proto = skb_network_protocol(head_skb, NULL); 3779 if (unlikely(!proto)) 3780 return ERR_PTR(-EINVAL); 3781 3782 sg = !!(features & NETIF_F_SG); 3783 csum = !!can_checksum_protocol(features, proto); 3784 3785 if (sg && csum && (mss != GSO_BY_FRAGS)) { 3786 if (!(features & NETIF_F_GSO_PARTIAL)) { 3787 struct sk_buff *iter; 3788 unsigned int frag_len; 3789 3790 if (!list_skb || 3791 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 3792 goto normal; 3793 3794 /* If we get here then all the required 3795 * GSO features except frag_list are supported. 3796 * Try to split the SKB to multiple GSO SKBs 3797 * with no frag_list. 3798 * Currently we can do that only when the buffers don't 3799 * have a linear part and all the buffers except 3800 * the last are of the same length. 3801 */ 3802 frag_len = list_skb->len; 3803 skb_walk_frags(head_skb, iter) { 3804 if (frag_len != iter->len && iter->next) 3805 goto normal; 3806 if (skb_headlen(iter) && !iter->head_frag) 3807 goto normal; 3808 3809 len -= iter->len; 3810 } 3811 3812 if (len != frag_len) 3813 goto normal; 3814 } 3815 3816 /* GSO partial only requires that we trim off any excess that 3817 * doesn't fit into an MSS sized block, so take care of that 3818 * now. 
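 *
 * For instance (illustrative numbers only): with mss = 1448 and a
 * 65200-byte payload, partial_segs = 65200 / 1448 = 45 and mss is scaled
 * up to 45 * 1448 = 65160, so the loop below emits large "partial"
 * super-segments; the gso_size/gso_segs fix-ups after the loop then tell
 * the device how to slice them back into 1448-byte frames, and a sub-mss
 * tail segment simply has its gso_size cleared.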
3819 */ 3820 partial_segs = len / mss; 3821 if (partial_segs > 1) 3822 mss *= partial_segs; 3823 else 3824 partial_segs = 0; 3825 } 3826 3827 normal: 3828 headroom = skb_headroom(head_skb); 3829 pos = skb_headlen(head_skb); 3830 3831 do { 3832 struct sk_buff *nskb; 3833 skb_frag_t *nskb_frag; 3834 int hsize; 3835 int size; 3836 3837 if (unlikely(mss == GSO_BY_FRAGS)) { 3838 len = list_skb->len; 3839 } else { 3840 len = head_skb->len - offset; 3841 if (len > mss) 3842 len = mss; 3843 } 3844 3845 hsize = skb_headlen(head_skb) - offset; 3846 if (hsize < 0) 3847 hsize = 0; 3848 if (hsize > len || !sg) 3849 hsize = len; 3850 3851 if (!hsize && i >= nfrags && skb_headlen(list_skb) && 3852 (skb_headlen(list_skb) == len || sg)) { 3853 BUG_ON(skb_headlen(list_skb) > len); 3854 3855 i = 0; 3856 nfrags = skb_shinfo(list_skb)->nr_frags; 3857 frag = skb_shinfo(list_skb)->frags; 3858 frag_skb = list_skb; 3859 pos += skb_headlen(list_skb); 3860 3861 while (pos < offset + len) { 3862 BUG_ON(i >= nfrags); 3863 3864 size = skb_frag_size(frag); 3865 if (pos + size > offset + len) 3866 break; 3867 3868 i++; 3869 pos += size; 3870 frag++; 3871 } 3872 3873 nskb = skb_clone(list_skb, GFP_ATOMIC); 3874 list_skb = list_skb->next; 3875 3876 if (unlikely(!nskb)) 3877 goto err; 3878 3879 if (unlikely(pskb_trim(nskb, len))) { 3880 kfree_skb(nskb); 3881 goto err; 3882 } 3883 3884 hsize = skb_end_offset(nskb); 3885 if (skb_cow_head(nskb, doffset + headroom)) { 3886 kfree_skb(nskb); 3887 goto err; 3888 } 3889 3890 nskb->truesize += skb_end_offset(nskb) - hsize; 3891 skb_release_head_state(nskb); 3892 __skb_push(nskb, doffset); 3893 } else { 3894 nskb = __alloc_skb(hsize + doffset + headroom, 3895 GFP_ATOMIC, skb_alloc_rx_flag(head_skb), 3896 NUMA_NO_NODE); 3897 3898 if (unlikely(!nskb)) 3899 goto err; 3900 3901 skb_reserve(nskb, headroom); 3902 __skb_put(nskb, doffset); 3903 } 3904 3905 if (segs) 3906 tail->next = nskb; 3907 else 3908 segs = nskb; 3909 tail = nskb; 3910 3911 __copy_skb_header(nskb, head_skb); 3912 3913 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); 3914 skb_reset_mac_len(nskb); 3915 3916 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, 3917 nskb->data - tnl_hlen, 3918 doffset + tnl_hlen); 3919 3920 if (nskb->len == len + doffset) 3921 goto perform_csum_check; 3922 3923 if (!sg) { 3924 if (!csum) { 3925 if (!nskb->remcsum_offload) 3926 nskb->ip_summed = CHECKSUM_NONE; 3927 SKB_GSO_CB(nskb)->csum = 3928 skb_copy_and_csum_bits(head_skb, offset, 3929 skb_put(nskb, 3930 len), 3931 len); 3932 SKB_GSO_CB(nskb)->csum_start = 3933 skb_headroom(nskb) + doffset; 3934 } else { 3935 skb_copy_bits(head_skb, offset, 3936 skb_put(nskb, len), 3937 len); 3938 } 3939 continue; 3940 } 3941 3942 nskb_frag = skb_shinfo(nskb)->frags; 3943 3944 skb_copy_from_linear_data_offset(head_skb, offset, 3945 skb_put(nskb, hsize), hsize); 3946 3947 skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & 3948 SKBTX_SHARED_FRAG; 3949 3950 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 3951 skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) 3952 goto err; 3953 3954 while (pos < offset + len) { 3955 if (i >= nfrags) { 3956 i = 0; 3957 nfrags = skb_shinfo(list_skb)->nr_frags; 3958 frag = skb_shinfo(list_skb)->frags; 3959 frag_skb = list_skb; 3960 if (!skb_headlen(list_skb)) { 3961 BUG_ON(!nfrags); 3962 } else { 3963 BUG_ON(!list_skb->head_frag); 3964 3965 /* to make room for head_frag. 
*/ 3966 i--; 3967 frag--; 3968 } 3969 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 3970 skb_zerocopy_clone(nskb, frag_skb, 3971 GFP_ATOMIC)) 3972 goto err; 3973 3974 list_skb = list_skb->next; 3975 } 3976 3977 if (unlikely(skb_shinfo(nskb)->nr_frags >= 3978 MAX_SKB_FRAGS)) { 3979 net_warn_ratelimited( 3980 "skb_segment: too many frags: %u %u\n", 3981 pos, mss); 3982 err = -EINVAL; 3983 goto err; 3984 } 3985 3986 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; 3987 __skb_frag_ref(nskb_frag); 3988 size = skb_frag_size(nskb_frag); 3989 3990 if (pos < offset) { 3991 skb_frag_off_add(nskb_frag, offset - pos); 3992 skb_frag_size_sub(nskb_frag, offset - pos); 3993 } 3994 3995 skb_shinfo(nskb)->nr_frags++; 3996 3997 if (pos + size <= offset + len) { 3998 i++; 3999 frag++; 4000 pos += size; 4001 } else { 4002 skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); 4003 goto skip_fraglist; 4004 } 4005 4006 nskb_frag++; 4007 } 4008 4009 skip_fraglist: 4010 nskb->data_len = len - hsize; 4011 nskb->len += nskb->data_len; 4012 nskb->truesize += nskb->data_len; 4013 4014 perform_csum_check: 4015 if (!csum) { 4016 if (skb_has_shared_frag(nskb) && 4017 __skb_linearize(nskb)) 4018 goto err; 4019 4020 if (!nskb->remcsum_offload) 4021 nskb->ip_summed = CHECKSUM_NONE; 4022 SKB_GSO_CB(nskb)->csum = 4023 skb_checksum(nskb, doffset, 4024 nskb->len - doffset, 0); 4025 SKB_GSO_CB(nskb)->csum_start = 4026 skb_headroom(nskb) + doffset; 4027 } 4028 } while ((offset += len) < head_skb->len); 4029 4030 /* Some callers want to get the end of the list. 4031 * Put it in segs->prev to avoid walking the list. 4032 * (see validate_xmit_skb_list() for example) 4033 */ 4034 segs->prev = tail; 4035 4036 if (partial_segs) { 4037 struct sk_buff *iter; 4038 int type = skb_shinfo(head_skb)->gso_type; 4039 unsigned short gso_size = skb_shinfo(head_skb)->gso_size; 4040 4041 /* Update type to add partial and then remove dodgy if set */ 4042 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; 4043 type &= ~SKB_GSO_DODGY; 4044 4045 /* Update GSO info and prepare to start updating headers on 4046 * our way back down the stack of protocols. 4047 */ 4048 for (iter = segs; iter; iter = iter->next) { 4049 skb_shinfo(iter)->gso_size = gso_size; 4050 skb_shinfo(iter)->gso_segs = partial_segs; 4051 skb_shinfo(iter)->gso_type = type; 4052 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; 4053 } 4054 4055 if (tail->len - doffset <= gso_size) 4056 skb_shinfo(tail)->gso_size = 0; 4057 else if (tail != segs) 4058 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); 4059 } 4060 4061 /* Following permits correct backpressure, for protocols 4062 * using skb_set_owner_w(). 4063 * Idea is to tranfert ownership from head_skb to last segment. 
4064 */ 4065 if (head_skb->destructor == sock_wfree) { 4066 swap(tail->truesize, head_skb->truesize); 4067 swap(tail->destructor, head_skb->destructor); 4068 swap(tail->sk, head_skb->sk); 4069 } 4070 return segs; 4071 4072 err: 4073 kfree_skb_list(segs); 4074 return ERR_PTR(err); 4075 } 4076 EXPORT_SYMBOL_GPL(skb_segment); 4077 4078 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) 4079 { 4080 struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); 4081 unsigned int offset = skb_gro_offset(skb); 4082 unsigned int headlen = skb_headlen(skb); 4083 unsigned int len = skb_gro_len(skb); 4084 unsigned int delta_truesize; 4085 struct sk_buff *lp; 4086 4087 if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) 4088 return -E2BIG; 4089 4090 lp = NAPI_GRO_CB(p)->last; 4091 pinfo = skb_shinfo(lp); 4092 4093 if (headlen <= offset) { 4094 skb_frag_t *frag; 4095 skb_frag_t *frag2; 4096 int i = skbinfo->nr_frags; 4097 int nr_frags = pinfo->nr_frags + i; 4098 4099 if (nr_frags > MAX_SKB_FRAGS) 4100 goto merge; 4101 4102 offset -= headlen; 4103 pinfo->nr_frags = nr_frags; 4104 skbinfo->nr_frags = 0; 4105 4106 frag = pinfo->frags + nr_frags; 4107 frag2 = skbinfo->frags + i; 4108 do { 4109 *--frag = *--frag2; 4110 } while (--i); 4111 4112 skb_frag_off_add(frag, offset); 4113 skb_frag_size_sub(frag, offset); 4114 4115 /* all fragments truesize : remove (head size + sk_buff) */ 4116 delta_truesize = skb->truesize - 4117 SKB_TRUESIZE(skb_end_offset(skb)); 4118 4119 skb->truesize -= skb->data_len; 4120 skb->len -= skb->data_len; 4121 skb->data_len = 0; 4122 4123 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; 4124 goto done; 4125 } else if (skb->head_frag) { 4126 int nr_frags = pinfo->nr_frags; 4127 skb_frag_t *frag = pinfo->frags + nr_frags; 4128 struct page *page = virt_to_head_page(skb->head); 4129 unsigned int first_size = headlen - offset; 4130 unsigned int first_offset; 4131 4132 if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) 4133 goto merge; 4134 4135 first_offset = skb->data - 4136 (unsigned char *)page_address(page) + 4137 offset; 4138 4139 pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; 4140 4141 __skb_frag_set_page(frag, page); 4142 skb_frag_off_set(frag, first_offset); 4143 skb_frag_size_set(frag, first_size); 4144 4145 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); 4146 /* We dont need to clear skbinfo->nr_frags here */ 4147 4148 delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 4149 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; 4150 goto done; 4151 } 4152 4153 merge: 4154 delta_truesize = skb->truesize; 4155 if (offset > headlen) { 4156 unsigned int eat = offset - headlen; 4157 4158 skb_frag_off_add(&skbinfo->frags[0], eat); 4159 skb_frag_size_sub(&skbinfo->frags[0], eat); 4160 skb->data_len -= eat; 4161 skb->len -= eat; 4162 offset = headlen; 4163 } 4164 4165 __skb_pull(skb, offset); 4166 4167 if (NAPI_GRO_CB(p)->last == p) 4168 skb_shinfo(p)->frag_list = skb; 4169 else 4170 NAPI_GRO_CB(p)->last->next = skb; 4171 NAPI_GRO_CB(p)->last = skb; 4172 __skb_header_release(skb); 4173 lp = p; 4174 4175 done: 4176 NAPI_GRO_CB(p)->count++; 4177 p->data_len += len; 4178 p->truesize += delta_truesize; 4179 p->len += len; 4180 if (lp != p) { 4181 lp->data_len += len; 4182 lp->truesize += delta_truesize; 4183 lp->len += len; 4184 } 4185 NAPI_GRO_CB(skb)->same_flow = 1; 4186 return 0; 4187 } 4188 4189 #ifdef CONFIG_SKB_EXTENSIONS 4190 #define SKB_EXT_ALIGN_VALUE 8 4191 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), 
SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) 4192 4193 static const u8 skb_ext_type_len[] = { 4194 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4195 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), 4196 #endif 4197 #ifdef CONFIG_XFRM 4198 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), 4199 #endif 4200 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4201 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), 4202 #endif 4203 #if IS_ENABLED(CONFIG_MPTCP) 4204 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), 4205 #endif 4206 }; 4207 4208 static __always_inline unsigned int skb_ext_total_length(void) 4209 { 4210 return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + 4211 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4212 skb_ext_type_len[SKB_EXT_BRIDGE_NF] + 4213 #endif 4214 #ifdef CONFIG_XFRM 4215 skb_ext_type_len[SKB_EXT_SEC_PATH] + 4216 #endif 4217 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4218 skb_ext_type_len[TC_SKB_EXT] + 4219 #endif 4220 #if IS_ENABLED(CONFIG_MPTCP) 4221 skb_ext_type_len[SKB_EXT_MPTCP] + 4222 #endif 4223 0; 4224 } 4225 4226 static void skb_extensions_init(void) 4227 { 4228 BUILD_BUG_ON(SKB_EXT_NUM >= 8); 4229 BUILD_BUG_ON(skb_ext_total_length() > 255); 4230 4231 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", 4232 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 4233 0, 4234 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4235 NULL); 4236 } 4237 #else 4238 static void skb_extensions_init(void) {} 4239 #endif 4240 4241 void __init skb_init(void) 4242 { 4243 skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", 4244 sizeof(struct sk_buff), 4245 0, 4246 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4247 offsetof(struct sk_buff, cb), 4248 sizeof_field(struct sk_buff, cb), 4249 NULL); 4250 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 4251 sizeof(struct sk_buff_fclones), 4252 0, 4253 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4254 NULL); 4255 skb_extensions_init(); 4256 } 4257 4258 static int 4259 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, 4260 unsigned int recursion_level) 4261 { 4262 int start = skb_headlen(skb); 4263 int i, copy = start - offset; 4264 struct sk_buff *frag_iter; 4265 int elt = 0; 4266 4267 if (unlikely(recursion_level >= 24)) 4268 return -EMSGSIZE; 4269 4270 if (copy > 0) { 4271 if (copy > len) 4272 copy = len; 4273 sg_set_buf(sg, skb->data + offset, copy); 4274 elt++; 4275 if ((len -= copy) == 0) 4276 return elt; 4277 offset += copy; 4278 } 4279 4280 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 4281 int end; 4282 4283 WARN_ON(start > offset + len); 4284 4285 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 4286 if ((copy = end - offset) > 0) { 4287 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 4288 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4289 return -EMSGSIZE; 4290 4291 if (copy > len) 4292 copy = len; 4293 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 4294 skb_frag_off(frag) + offset - start); 4295 elt++; 4296 if (!(len -= copy)) 4297 return elt; 4298 offset += copy; 4299 } 4300 start = end; 4301 } 4302 4303 skb_walk_frags(skb, frag_iter) { 4304 int end, ret; 4305 4306 WARN_ON(start > offset + len); 4307 4308 end = start + frag_iter->len; 4309 if ((copy = end - offset) > 0) { 4310 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4311 return -EMSGSIZE; 4312 4313 if (copy > len) 4314 copy = len; 4315 ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, 4316 copy, recursion_level + 1); 4317 if (unlikely(ret < 0)) 4318 return ret; 4319 elt += ret; 4320 if ((len -= copy) == 0) 4321 return elt; 4322 
offset += copy; 4323 } 4324 start = end; 4325 } 4326 BUG_ON(len); 4327 return elt; 4328 } 4329 4330 /** 4331 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 4332 * @skb: Socket buffer containing the buffers to be mapped 4333 * @sg: The scatter-gather list to map into 4334 * @offset: The offset into the buffer's contents to start mapping 4335 * @len: Length of buffer space to be mapped 4336 * 4337 * Fill the specified scatter-gather list with mappings/pointers into a 4338 * region of the buffer space attached to a socket buffer. Returns either 4339 * the number of scatterlist items used, or -EMSGSIZE if the contents 4340 * could not fit. 4341 */ 4342 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 4343 { 4344 int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); 4345 4346 if (nsg <= 0) 4347 return nsg; 4348 4349 sg_mark_end(&sg[nsg - 1]); 4350 4351 return nsg; 4352 } 4353 EXPORT_SYMBOL_GPL(skb_to_sgvec); 4354 4355 /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the 4356 * given sglist, without marking the sg entry that contains the last skb data as the end. 4357 * So the caller can manipulate the sg list at will when padding new data after 4358 * the first call, without calling sg_unmark_end to expand the sg list. 4359 * 4360 * Scenario to use skb_to_sgvec_nomark: 4361 * 1. sg_init_table 4362 * 2. skb_to_sgvec_nomark(payload1) 4363 * 3. skb_to_sgvec_nomark(payload2) 4364 * 4365 * This is equivalent to: 4366 * 1. sg_init_table 4367 * 2. skb_to_sgvec(payload1) 4368 * 3. sg_unmark_end 4369 * 4. skb_to_sgvec(payload2) 4370 * 4371 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark 4372 * is preferable. 4373 */ 4374 int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, 4375 int offset, int len) 4376 { 4377 return __skb_to_sgvec(skb, sg, offset, len, 0); 4378 } 4379 EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); 4380 4381 4382 4383 /** 4384 * skb_cow_data - Check that a socket buffer's data buffers are writable 4385 * @skb: The socket buffer to check. 4386 * @tailbits: Amount of trailing space to be added 4387 * @trailer: Returned pointer to the skb where the @tailbits space begins 4388 * 4389 * Make sure that the data buffers attached to a socket buffer are 4390 * writable. If they are not, private copies are made of the data buffers 4391 * and the socket buffer is set to use these instead. 4392 * 4393 * If @tailbits is given, make sure that there is space to write @tailbits 4394 * bytes of data beyond the current end of the socket buffer. @trailer will be 4395 * set to point to the skb in which this space begins. 4396 * 4397 * The number of scatterlist elements required to completely map the 4398 * COW'd and extended socket buffer will be returned. 4399 */ 4400 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 4401 { 4402 int copyflag; 4403 int elt; 4404 struct sk_buff *skb1, **skb_p; 4405 4406 /* If skb is cloned or its head is paged, reallocate 4407 * head pulling out all the pages (pages are considered not writable 4408 * at the moment even if they are anonymous). 4409 */ 4410 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 4411 !__pskb_pull_tail(skb, __skb_pagelen(skb))) 4412 return -ENOMEM; 4413 4414 /* Easy case. Most packets will go this way. */ 4415 if (!skb_has_frag_list(skb)) { 4416 /* A bit of trouble: not enough space for the trailer. 4417 * This should not happen when the stack is tuned to generate 4418 * good frames.
OK, on a miss we reallocate and reserve even more 4419 * space; 128 bytes is fair. */ 4420 4421 if (skb_tailroom(skb) < tailbits && 4422 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 4423 return -ENOMEM; 4424 4425 /* Voila! */ 4426 *trailer = skb; 4427 return 1; 4428 } 4429 4430 /* Misery. We are in trouble, going to mince the fragments... */ 4431 4432 elt = 1; 4433 skb_p = &skb_shinfo(skb)->frag_list; 4434 copyflag = 0; 4435 4436 while ((skb1 = *skb_p) != NULL) { 4437 int ntail = 0; 4438 4439 /* The fragment has been partially pulled by someone; 4440 * this can happen on input. Copy it and everything 4441 * after it. */ 4442 4443 if (skb_shared(skb1)) 4444 copyflag = 1; 4445 4446 /* If the skb is the last one, worry about the trailer. */ 4447 4448 if (skb1->next == NULL && tailbits) { 4449 if (skb_shinfo(skb1)->nr_frags || 4450 skb_has_frag_list(skb1) || 4451 skb_tailroom(skb1) < tailbits) 4452 ntail = tailbits + 128; 4453 } 4454 4455 if (copyflag || 4456 skb_cloned(skb1) || 4457 ntail || 4458 skb_shinfo(skb1)->nr_frags || 4459 skb_has_frag_list(skb1)) { 4460 struct sk_buff *skb2; 4461 4462 /* Fuck, we are miserable poor guys... */ 4463 if (ntail == 0) 4464 skb2 = skb_copy(skb1, GFP_ATOMIC); 4465 else 4466 skb2 = skb_copy_expand(skb1, 4467 skb_headroom(skb1), 4468 ntail, 4469 GFP_ATOMIC); 4470 if (unlikely(skb2 == NULL)) 4471 return -ENOMEM; 4472 4473 if (skb1->sk) 4474 skb_set_owner_w(skb2, skb1->sk); 4475 4476 /* Looking around. Are we still alive? 4477 * OK, link the new skb, drop the old one. */ 4478 4479 skb2->next = skb1->next; 4480 *skb_p = skb2; 4481 kfree_skb(skb1); 4482 skb1 = skb2; 4483 } 4484 elt++; 4485 *trailer = skb1; 4486 skb_p = &skb1->next; 4487 } 4488 4489 return elt; 4490 } 4491 EXPORT_SYMBOL_GPL(skb_cow_data); 4492 4493 static void sock_rmem_free(struct sk_buff *skb) 4494 { 4495 struct sock *sk = skb->sk; 4496 4497 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 4498 } 4499 4500 static void skb_set_err_queue(struct sk_buff *skb) 4501 { 4502 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. 4503 * So, it is safe to (mis)use it to mark skbs on the error queue.
4504 */ 4505 skb->pkt_type = PACKET_OUTGOING; 4506 BUILD_BUG_ON(PACKET_OUTGOING == 0); 4507 } 4508 4509 /* 4510 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 4511 */ 4512 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 4513 { 4514 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 4515 (unsigned int)READ_ONCE(sk->sk_rcvbuf)) 4516 return -ENOMEM; 4517 4518 skb_orphan(skb); 4519 skb->sk = sk; 4520 skb->destructor = sock_rmem_free; 4521 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 4522 skb_set_err_queue(skb); 4523 4524 /* before exiting rcu section, make sure dst is refcounted */ 4525 skb_dst_force(skb); 4526 4527 skb_queue_tail(&sk->sk_error_queue, skb); 4528 if (!sock_flag(sk, SOCK_DEAD)) 4529 sk->sk_error_report(sk); 4530 return 0; 4531 } 4532 EXPORT_SYMBOL(sock_queue_err_skb); 4533 4534 static bool is_icmp_err_skb(const struct sk_buff *skb) 4535 { 4536 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || 4537 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); 4538 } 4539 4540 struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 4541 { 4542 struct sk_buff_head *q = &sk->sk_error_queue; 4543 struct sk_buff *skb, *skb_next = NULL; 4544 bool icmp_next = false; 4545 unsigned long flags; 4546 4547 spin_lock_irqsave(&q->lock, flags); 4548 skb = __skb_dequeue(q); 4549 if (skb && (skb_next = skb_peek(q))) { 4550 icmp_next = is_icmp_err_skb(skb_next); 4551 if (icmp_next) 4552 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; 4553 } 4554 spin_unlock_irqrestore(&q->lock, flags); 4555 4556 if (is_icmp_err_skb(skb) && !icmp_next) 4557 sk->sk_err = 0; 4558 4559 if (skb_next) 4560 sk->sk_error_report(sk); 4561 4562 return skb; 4563 } 4564 EXPORT_SYMBOL(sock_dequeue_err_skb); 4565 4566 /** 4567 * skb_clone_sk - create clone of skb, and take reference to socket 4568 * @skb: the skb to clone 4569 * 4570 * This function creates a clone of a buffer that holds a reference on 4571 * sk_refcnt. Buffers created via this function are meant to be 4572 * returned using sock_queue_err_skb, or free via kfree_skb. 4573 * 4574 * When passing buffers allocated with this function to sock_queue_err_skb 4575 * it is necessary to wrap the call with sock_hold/sock_put in order to 4576 * prevent the socket from being released prior to being enqueued on 4577 * the sk_error_queue. 4578 */ 4579 struct sk_buff *skb_clone_sk(struct sk_buff *skb) 4580 { 4581 struct sock *sk = skb->sk; 4582 struct sk_buff *clone; 4583 4584 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) 4585 return NULL; 4586 4587 clone = skb_clone(skb, GFP_ATOMIC); 4588 if (!clone) { 4589 sock_put(sk); 4590 return NULL; 4591 } 4592 4593 clone->sk = sk; 4594 clone->destructor = sock_efree; 4595 4596 return clone; 4597 } 4598 EXPORT_SYMBOL(skb_clone_sk); 4599 4600 static void __skb_complete_tx_timestamp(struct sk_buff *skb, 4601 struct sock *sk, 4602 int tstype, 4603 bool opt_stats) 4604 { 4605 struct sock_exterr_skb *serr; 4606 int err; 4607 4608 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); 4609 4610 serr = SKB_EXT_ERR(skb); 4611 memset(serr, 0, sizeof(*serr)); 4612 serr->ee.ee_errno = ENOMSG; 4613 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 4614 serr->ee.ee_info = tstype; 4615 serr->opt_stats = opt_stats; 4616 serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; 4617 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { 4618 serr->ee.ee_data = skb_shinfo(skb)->tskey; 4619 if (sk->sk_protocol == IPPROTO_TCP && 4620 sk->sk_type == SOCK_STREAM) 4621 serr->ee.ee_data -= sk->sk_tskey; 4622 } 4623 4624 err = sock_queue_err_skb(sk, skb); 4625 4626 if (err) 4627 kfree_skb(skb); 4628 } 4629 4630 static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) 4631 { 4632 bool ret; 4633 4634 if (likely(sysctl_tstamp_allow_data || tsonly)) 4635 return true; 4636 4637 read_lock_bh(&sk->sk_callback_lock); 4638 ret = sk->sk_socket && sk->sk_socket->file && 4639 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); 4640 read_unlock_bh(&sk->sk_callback_lock); 4641 return ret; 4642 } 4643 4644 void skb_complete_tx_timestamp(struct sk_buff *skb, 4645 struct skb_shared_hwtstamps *hwtstamps) 4646 { 4647 struct sock *sk = skb->sk; 4648 4649 if (!skb_may_tx_timestamp(sk, false)) 4650 goto err; 4651 4652 /* Take a reference to prevent skb_orphan() from freeing the socket, 4653 * but only if the socket refcount is not zero. 4654 */ 4655 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4656 *skb_hwtstamps(skb) = *hwtstamps; 4657 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); 4658 sock_put(sk); 4659 return; 4660 } 4661 4662 err: 4663 kfree_skb(skb); 4664 } 4665 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 4666 4667 void __skb_tstamp_tx(struct sk_buff *orig_skb, 4668 struct skb_shared_hwtstamps *hwtstamps, 4669 struct sock *sk, int tstype) 4670 { 4671 struct sk_buff *skb; 4672 bool tsonly, opt_stats = false; 4673 4674 if (!sk) 4675 return; 4676 4677 if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && 4678 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) 4679 return; 4680 4681 tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; 4682 if (!skb_may_tx_timestamp(sk, tsonly)) 4683 return; 4684 4685 if (tsonly) { 4686 #ifdef CONFIG_INET 4687 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && 4688 sk->sk_protocol == IPPROTO_TCP && 4689 sk->sk_type == SOCK_STREAM) { 4690 skb = tcp_get_timestamping_opt_stats(sk, orig_skb); 4691 opt_stats = true; 4692 } else 4693 #endif 4694 skb = alloc_skb(0, GFP_ATOMIC); 4695 } else { 4696 skb = skb_clone(orig_skb, GFP_ATOMIC); 4697 } 4698 if (!skb) 4699 return; 4700 4701 if (tsonly) { 4702 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & 4703 SKBTX_ANY_TSTAMP; 4704 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; 4705 } 4706 4707 if (hwtstamps) 4708 *skb_hwtstamps(skb) = *hwtstamps; 4709 else 4710 skb->tstamp = ktime_get_real(); 4711 4712 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); 4713 } 4714 EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 4715 4716 void skb_tstamp_tx(struct sk_buff *orig_skb, 4717 struct skb_shared_hwtstamps *hwtstamps) 4718 { 4719 return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, 4720 SCM_TSTAMP_SND); 4721 } 4722 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 4723 4724 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 4725 { 4726 struct sock *sk = skb->sk; 4727 struct sock_exterr_skb *serr; 4728 int err = 1; 4729 4730 skb->wifi_acked_valid = 1; 4731 skb->wifi_acked = acked; 4732 4733 serr = SKB_EXT_ERR(skb); 4734 memset(serr, 0, sizeof(*serr)); 4735 serr->ee.ee_errno = ENOMSG; 4736 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 4737 4738 /* Take a reference to prevent skb_orphan() from freeing the socket, 4739 * but only if the socket refcount is not zero. 
4740 */ 4741 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4742 err = sock_queue_err_skb(sk, skb); 4743 sock_put(sk); 4744 } 4745 if (err) 4746 kfree_skb(skb); 4747 } 4748 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 4749 4750 /** 4751 * skb_partial_csum_set - set up and verify partial csum values for packet 4752 * @skb: the skb to set 4753 * @start: the number of bytes after skb->data to start checksumming. 4754 * @off: the offset from start to place the checksum. 4755 * 4756 * For untrusted partially-checksummed packets, we need to make sure the values 4757 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 4758 * 4759 * This function checks and sets those values and skb->ip_summed: if this 4760 * returns false you should drop the packet. 4761 */ 4762 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 4763 { 4764 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); 4765 u32 csum_start = skb_headroom(skb) + (u32)start; 4766 4767 if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { 4768 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", 4769 start, off, skb_headroom(skb), skb_headlen(skb)); 4770 return false; 4771 } 4772 skb->ip_summed = CHECKSUM_PARTIAL; 4773 skb->csum_start = csum_start; 4774 skb->csum_offset = off; 4775 skb_set_transport_header(skb, start); 4776 return true; 4777 } 4778 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 4779 4780 static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, 4781 unsigned int max) 4782 { 4783 if (skb_headlen(skb) >= len) 4784 return 0; 4785 4786 /* If we need to pullup then pullup to the max, so we 4787 * won't need to do it again. 4788 */ 4789 if (max > skb->len) 4790 max = skb->len; 4791 4792 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) 4793 return -ENOMEM; 4794 4795 if (skb_headlen(skb) < len) 4796 return -EPROTO; 4797 4798 return 0; 4799 } 4800 4801 #define MAX_TCP_HDR_LEN (15 * 4) 4802 4803 static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, 4804 typeof(IPPROTO_IP) proto, 4805 unsigned int off) 4806 { 4807 int err; 4808 4809 switch (proto) { 4810 case IPPROTO_TCP: 4811 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), 4812 off + MAX_TCP_HDR_LEN); 4813 if (!err && !skb_partial_csum_set(skb, off, 4814 offsetof(struct tcphdr, 4815 check))) 4816 err = -EPROTO; 4817 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; 4818 4819 case IPPROTO_UDP: 4820 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), 4821 off + sizeof(struct udphdr)); 4822 if (!err && !skb_partial_csum_set(skb, off, 4823 offsetof(struct udphdr, 4824 check))) 4825 err = -EPROTO; 4826 return err ? ERR_PTR(err) : &udp_hdr(skb)->check; 4827 } 4828 4829 return ERR_PTR(-EPROTO); 4830 } 4831 4832 /* This value should be large enough to cover a tagged ethernet header plus 4833 * maximally sized IP and TCP or UDP headers. 
4834 */ 4835 #define MAX_IP_HDR_LEN 128 4836 4837 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) 4838 { 4839 unsigned int off; 4840 bool fragment; 4841 __sum16 *csum; 4842 int err; 4843 4844 fragment = false; 4845 4846 err = skb_maybe_pull_tail(skb, 4847 sizeof(struct iphdr), 4848 MAX_IP_HDR_LEN); 4849 if (err < 0) 4850 goto out; 4851 4852 if (ip_is_fragment(ip_hdr(skb))) 4853 fragment = true; 4854 4855 off = ip_hdrlen(skb); 4856 4857 err = -EPROTO; 4858 4859 if (fragment) 4860 goto out; 4861 4862 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); 4863 if (IS_ERR(csum)) 4864 return PTR_ERR(csum); 4865 4866 if (recalculate) 4867 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, 4868 ip_hdr(skb)->daddr, 4869 skb->len - off, 4870 ip_hdr(skb)->protocol, 0); 4871 err = 0; 4872 4873 out: 4874 return err; 4875 } 4876 4877 /* This value should be large enough to cover a tagged ethernet header plus 4878 * an IPv6 header, all options, and a maximal TCP or UDP header. 4879 */ 4880 #define MAX_IPV6_HDR_LEN 256 4881 4882 #define OPT_HDR(type, skb, off) \ 4883 (type *)(skb_network_header(skb) + (off)) 4884 4885 static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) 4886 { 4887 int err; 4888 u8 nexthdr; 4889 unsigned int off; 4890 unsigned int len; 4891 bool fragment; 4892 bool done; 4893 __sum16 *csum; 4894 4895 fragment = false; 4896 done = false; 4897 4898 off = sizeof(struct ipv6hdr); 4899 4900 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); 4901 if (err < 0) 4902 goto out; 4903 4904 nexthdr = ipv6_hdr(skb)->nexthdr; 4905 4906 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); 4907 while (off <= len && !done) { 4908 switch (nexthdr) { 4909 case IPPROTO_DSTOPTS: 4910 case IPPROTO_HOPOPTS: 4911 case IPPROTO_ROUTING: { 4912 struct ipv6_opt_hdr *hp; 4913 4914 err = skb_maybe_pull_tail(skb, 4915 off + 4916 sizeof(struct ipv6_opt_hdr), 4917 MAX_IPV6_HDR_LEN); 4918 if (err < 0) 4919 goto out; 4920 4921 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); 4922 nexthdr = hp->nexthdr; 4923 off += ipv6_optlen(hp); 4924 break; 4925 } 4926 case IPPROTO_AH: { 4927 struct ip_auth_hdr *hp; 4928 4929 err = skb_maybe_pull_tail(skb, 4930 off + 4931 sizeof(struct ip_auth_hdr), 4932 MAX_IPV6_HDR_LEN); 4933 if (err < 0) 4934 goto out; 4935 4936 hp = OPT_HDR(struct ip_auth_hdr, skb, off); 4937 nexthdr = hp->nexthdr; 4938 off += ipv6_authlen(hp); 4939 break; 4940 } 4941 case IPPROTO_FRAGMENT: { 4942 struct frag_hdr *hp; 4943 4944 err = skb_maybe_pull_tail(skb, 4945 off + 4946 sizeof(struct frag_hdr), 4947 MAX_IPV6_HDR_LEN); 4948 if (err < 0) 4949 goto out; 4950 4951 hp = OPT_HDR(struct frag_hdr, skb, off); 4952 4953 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) 4954 fragment = true; 4955 4956 nexthdr = hp->nexthdr; 4957 off += sizeof(struct frag_hdr); 4958 break; 4959 } 4960 default: 4961 done = true; 4962 break; 4963 } 4964 } 4965 4966 err = -EPROTO; 4967 4968 if (!done || fragment) 4969 goto out; 4970 4971 csum = skb_checksum_setup_ip(skb, nexthdr, off); 4972 if (IS_ERR(csum)) 4973 return PTR_ERR(csum); 4974 4975 if (recalculate) 4976 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 4977 &ipv6_hdr(skb)->daddr, 4978 skb->len - off, nexthdr, 0); 4979 err = 0; 4980 4981 out: 4982 return err; 4983 } 4984 4985 /** 4986 * skb_checksum_setup - set up partial checksum offset 4987 * @skb: the skb to set up 4988 * @recalculate: if true the pseudo-header checksum will be recalculated 4989 */ 4990 int skb_checksum_setup(struct sk_buff *skb, bool recalculate) 4991 { 
4992 int err; 4993 4994 switch (skb->protocol) { 4995 case htons(ETH_P_IP): 4996 err = skb_checksum_setup_ipv4(skb, recalculate); 4997 break; 4998 4999 case htons(ETH_P_IPV6): 5000 err = skb_checksum_setup_ipv6(skb, recalculate); 5001 break; 5002 5003 default: 5004 err = -EPROTO; 5005 break; 5006 } 5007 5008 return err; 5009 } 5010 EXPORT_SYMBOL(skb_checksum_setup); 5011 5012 /** 5013 * skb_checksum_maybe_trim - maybe trims the given skb 5014 * @skb: the skb to check 5015 * @transport_len: the data length beyond the network header 5016 * 5017 * Checks whether the given skb has data beyond the given transport length. 5018 * If so, returns a cloned skb trimmed to this transport length. 5019 * Otherwise returns the provided skb. Returns NULL in error cases 5020 * (e.g. transport_len exceeds skb length or out-of-memory). 5021 * 5022 * Caller needs to set the skb transport header and free any returned skb if it 5023 * differs from the provided skb. 5024 */ 5025 static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, 5026 unsigned int transport_len) 5027 { 5028 struct sk_buff *skb_chk; 5029 unsigned int len = skb_transport_offset(skb) + transport_len; 5030 int ret; 5031 5032 if (skb->len < len) 5033 return NULL; 5034 else if (skb->len == len) 5035 return skb; 5036 5037 skb_chk = skb_clone(skb, GFP_ATOMIC); 5038 if (!skb_chk) 5039 return NULL; 5040 5041 ret = pskb_trim_rcsum(skb_chk, len); 5042 if (ret) { 5043 kfree_skb(skb_chk); 5044 return NULL; 5045 } 5046 5047 return skb_chk; 5048 } 5049 5050 /** 5051 * skb_checksum_trimmed - validate checksum of an skb 5052 * @skb: the skb to check 5053 * @transport_len: the data length beyond the network header 5054 * @skb_chkf: checksum function to use 5055 * 5056 * Applies the given checksum function skb_chkf to the provided skb. 5057 * Returns a checked and maybe trimmed skb. Returns NULL on error. 5058 * 5059 * If the skb has data beyond the given transport length, then a 5060 * trimmed & cloned skb is checked and returned. 5061 * 5062 * Caller needs to set the skb transport header and free any returned skb if it 5063 * differs from the provided skb. 
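 *
 * Example (a sketch; my_l4_csum_check() stands in for a real checksum
 * helper and is assumed to return 0 when the checksum is valid):
 *
 *        skb_chk = skb_checksum_trimmed(skb, transport_len, my_l4_csum_check);
 *        if (!skb_chk)
 *                goto drop;
 *        ...
 *        if (skb_chk != skb)
 *                kfree_skb(skb_chk);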
5064 */ 5065 struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, 5066 unsigned int transport_len, 5067 __sum16(*skb_chkf)(struct sk_buff *skb)) 5068 { 5069 struct sk_buff *skb_chk; 5070 unsigned int offset = skb_transport_offset(skb); 5071 __sum16 ret; 5072 5073 skb_chk = skb_checksum_maybe_trim(skb, transport_len); 5074 if (!skb_chk) 5075 goto err; 5076 5077 if (!pskb_may_pull(skb_chk, offset)) 5078 goto err; 5079 5080 skb_pull_rcsum(skb_chk, offset); 5081 ret = skb_chkf(skb_chk); 5082 skb_push_rcsum(skb_chk, offset); 5083 5084 if (ret) 5085 goto err; 5086 5087 return skb_chk; 5088 5089 err: 5090 if (skb_chk && skb_chk != skb) 5091 kfree_skb(skb_chk); 5092 5093 return NULL; 5094 5095 } 5096 EXPORT_SYMBOL(skb_checksum_trimmed); 5097 5098 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 5099 { 5100 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 5101 skb->dev->name); 5102 } 5103 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 5104 5105 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 5106 { 5107 if (head_stolen) { 5108 skb_release_head_state(skb); 5109 kmem_cache_free(skbuff_head_cache, skb); 5110 } else { 5111 __kfree_skb(skb); 5112 } 5113 } 5114 EXPORT_SYMBOL(kfree_skb_partial); 5115 5116 /** 5117 * skb_try_coalesce - try to merge skb to prior one 5118 * @to: prior buffer 5119 * @from: buffer to add 5120 * @fragstolen: pointer to boolean 5121 * @delta_truesize: how much more was allocated than was requested 5122 */ 5123 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 5124 bool *fragstolen, int *delta_truesize) 5125 { 5126 struct skb_shared_info *to_shinfo, *from_shinfo; 5127 int i, delta, len = from->len; 5128 5129 *fragstolen = false; 5130 5131 if (skb_cloned(to)) 5132 return false; 5133 5134 if (len <= skb_tailroom(to)) { 5135 if (len) 5136 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 5137 *delta_truesize = 0; 5138 return true; 5139 } 5140 5141 to_shinfo = skb_shinfo(to); 5142 from_shinfo = skb_shinfo(from); 5143 if (to_shinfo->frag_list || from_shinfo->frag_list) 5144 return false; 5145 if (skb_zcopy(to) || skb_zcopy(from)) 5146 return false; 5147 5148 if (skb_headlen(from) != 0) { 5149 struct page *page; 5150 unsigned int offset; 5151 5152 if (to_shinfo->nr_frags + 5153 from_shinfo->nr_frags >= MAX_SKB_FRAGS) 5154 return false; 5155 5156 if (skb_head_is_locked(from)) 5157 return false; 5158 5159 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 5160 5161 page = virt_to_head_page(from->head); 5162 offset = from->data - (unsigned char *)page_address(page); 5163 5164 skb_fill_page_desc(to, to_shinfo->nr_frags, 5165 page, offset, skb_headlen(from)); 5166 *fragstolen = true; 5167 } else { 5168 if (to_shinfo->nr_frags + 5169 from_shinfo->nr_frags > MAX_SKB_FRAGS) 5170 return false; 5171 5172 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); 5173 } 5174 5175 WARN_ON_ONCE(delta < len); 5176 5177 memcpy(to_shinfo->frags + to_shinfo->nr_frags, 5178 from_shinfo->frags, 5179 from_shinfo->nr_frags * sizeof(skb_frag_t)); 5180 to_shinfo->nr_frags += from_shinfo->nr_frags; 5181 5182 if (!skb_cloned(from)) 5183 from_shinfo->nr_frags = 0; 5184 5185 /* if the skb is not cloned this does nothing 5186 * since we set nr_frags to 0. 
5187 */ 5188 for (i = 0; i < from_shinfo->nr_frags; i++) 5189 __skb_frag_ref(&from_shinfo->frags[i]); 5190 5191 to->truesize += delta; 5192 to->len += len; 5193 to->data_len += len; 5194 5195 *delta_truesize = delta; 5196 return true; 5197 } 5198 EXPORT_SYMBOL(skb_try_coalesce); 5199 5200 /** 5201 * skb_scrub_packet - scrub an skb 5202 * 5203 * @skb: buffer to clean 5204 * @xnet: packet is crossing netns 5205 * 5206 * skb_scrub_packet can be used after encapsulating or decapsulting a packet 5207 * into/from a tunnel. Some information have to be cleared during these 5208 * operations. 5209 * skb_scrub_packet can also be used to clean a skb before injecting it in 5210 * another namespace (@xnet == true). We have to clear all information in the 5211 * skb that could impact namespace isolation. 5212 */ 5213 void skb_scrub_packet(struct sk_buff *skb, bool xnet) 5214 { 5215 skb->pkt_type = PACKET_HOST; 5216 skb->skb_iif = 0; 5217 skb->ignore_df = 0; 5218 skb_dst_drop(skb); 5219 skb_ext_reset(skb); 5220 nf_reset_ct(skb); 5221 nf_reset_trace(skb); 5222 5223 #ifdef CONFIG_NET_SWITCHDEV 5224 skb->offload_fwd_mark = 0; 5225 skb->offload_l3_fwd_mark = 0; 5226 #endif 5227 5228 if (!xnet) 5229 return; 5230 5231 ipvs_reset(skb); 5232 skb->mark = 0; 5233 skb->tstamp = 0; 5234 } 5235 EXPORT_SYMBOL_GPL(skb_scrub_packet); 5236 5237 /** 5238 * skb_gso_transport_seglen - Return length of individual segments of a gso packet 5239 * 5240 * @skb: GSO skb 5241 * 5242 * skb_gso_transport_seglen is used to determine the real size of the 5243 * individual segments, including Layer4 headers (TCP/UDP). 5244 * 5245 * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 5246 */ 5247 static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) 5248 { 5249 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5250 unsigned int thlen = 0; 5251 5252 if (skb->encapsulation) { 5253 thlen = skb_inner_transport_header(skb) - 5254 skb_transport_header(skb); 5255 5256 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 5257 thlen += inner_tcp_hdrlen(skb); 5258 } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { 5259 thlen = tcp_hdrlen(skb); 5260 } else if (unlikely(skb_is_gso_sctp(skb))) { 5261 thlen = sizeof(struct sctphdr); 5262 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { 5263 thlen = sizeof(struct udphdr); 5264 } 5265 /* UFO sets gso_size to the size of the fragmentation 5266 * payload, i.e. the size of the L4 (UDP) header is already 5267 * accounted for. 5268 */ 5269 return thlen + shinfo->gso_size; 5270 } 5271 5272 /** 5273 * skb_gso_network_seglen - Return length of individual segments of a gso packet 5274 * 5275 * @skb: GSO skb 5276 * 5277 * skb_gso_network_seglen is used to determine the real size of the 5278 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). 5279 * 5280 * The MAC/L2 header is not accounted for. 5281 */ 5282 static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) 5283 { 5284 unsigned int hdr_len = skb_transport_header(skb) - 5285 skb_network_header(skb); 5286 5287 return hdr_len + skb_gso_transport_seglen(skb); 5288 } 5289 5290 /** 5291 * skb_gso_mac_seglen - Return length of individual segments of a gso packet 5292 * 5293 * @skb: GSO skb 5294 * 5295 * skb_gso_mac_seglen is used to determine the real size of the 5296 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 5297 * headers (TCP/UDP). 
5298 */ 5299 static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) 5300 { 5301 unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 5302 5303 return hdr_len + skb_gso_transport_seglen(skb); 5304 } 5305 5306 /** 5307 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS 5308 * 5309 * There are a couple of instances where we have a GSO skb, and we 5310 * want to determine what size it would be after it is segmented. 5311 * 5312 * We might want to check: 5313 * - L3+L4+payload size (e.g. IP forwarding) 5314 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) 5315 * 5316 * This is a helper to do that correctly considering GSO_BY_FRAGS. 5317 * 5318 * @skb: GSO skb 5319 * 5320 * @seg_len: The segmented length (from skb_gso_*_seglen). In the 5321 * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. 5322 * 5323 * @max_len: The maximum permissible length. 5324 * 5325 * Returns true if the segmented length <= max length. 5326 */ 5327 static inline bool skb_gso_size_check(const struct sk_buff *skb, 5328 unsigned int seg_len, 5329 unsigned int max_len) { 5330 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5331 const struct sk_buff *iter; 5332 5333 if (shinfo->gso_size != GSO_BY_FRAGS) 5334 return seg_len <= max_len; 5335 5336 /* Undo this so we can re-use header sizes */ 5337 seg_len -= GSO_BY_FRAGS; 5338 5339 skb_walk_frags(skb, iter) { 5340 if (seg_len + skb_headlen(iter) > max_len) 5341 return false; 5342 } 5343 5344 return true; 5345 } 5346 5347 /** 5348 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? 5349 * 5350 * @skb: GSO skb 5351 * @mtu: MTU to validate against 5352 * 5353 * skb_gso_validate_network_len validates if a given skb will fit a 5354 * wanted MTU once split. It considers L3 headers, L4 headers, and the 5355 * payload. 5356 */ 5357 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) 5358 { 5359 return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); 5360 } 5361 EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); 5362 5363 /** 5364 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? 5365 * 5366 * @skb: GSO skb 5367 * @len: length to validate against 5368 * 5369 * skb_gso_validate_mac_len validates if a given skb will fit a wanted 5370 * length once split, including L2, L3 and L4 headers and the payload. 
5371 */ 5372 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) 5373 { 5374 return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); 5375 } 5376 EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); 5377 5378 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) 5379 { 5380 int mac_len, meta_len; 5381 void *meta; 5382 5383 if (skb_cow(skb, skb_headroom(skb)) < 0) { 5384 kfree_skb(skb); 5385 return NULL; 5386 } 5387 5388 mac_len = skb->data - skb_mac_header(skb); 5389 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { 5390 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), 5391 mac_len - VLAN_HLEN - ETH_TLEN); 5392 } 5393 5394 meta_len = skb_metadata_len(skb); 5395 if (meta_len) { 5396 meta = skb_metadata_end(skb) - meta_len; 5397 memmove(meta + VLAN_HLEN, meta, meta_len); 5398 } 5399 5400 skb->mac_header += VLAN_HLEN; 5401 return skb; 5402 } 5403 5404 struct sk_buff *skb_vlan_untag(struct sk_buff *skb) 5405 { 5406 struct vlan_hdr *vhdr; 5407 u16 vlan_tci; 5408 5409 if (unlikely(skb_vlan_tag_present(skb))) { 5410 /* vlan_tci is already set-up so leave this for another time */ 5411 return skb; 5412 } 5413 5414 skb = skb_share_check(skb, GFP_ATOMIC); 5415 if (unlikely(!skb)) 5416 goto err_free; 5417 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ 5418 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) 5419 goto err_free; 5420 5421 vhdr = (struct vlan_hdr *)skb->data; 5422 vlan_tci = ntohs(vhdr->h_vlan_TCI); 5423 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); 5424 5425 skb_pull_rcsum(skb, VLAN_HLEN); 5426 vlan_set_encap_proto(skb, vhdr); 5427 5428 skb = skb_reorder_vlan_header(skb); 5429 if (unlikely(!skb)) 5430 goto err_free; 5431 5432 skb_reset_network_header(skb); 5433 skb_reset_transport_header(skb); 5434 skb_reset_mac_len(skb); 5435 5436 return skb; 5437 5438 err_free: 5439 kfree_skb(skb); 5440 return NULL; 5441 } 5442 EXPORT_SYMBOL(skb_vlan_untag); 5443 5444 int skb_ensure_writable(struct sk_buff *skb, int write_len) 5445 { 5446 if (!pskb_may_pull(skb, write_len)) 5447 return -ENOMEM; 5448 5449 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 5450 return 0; 5451 5452 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 5453 } 5454 EXPORT_SYMBOL(skb_ensure_writable); 5455 5456 /* remove VLAN header from packet and update csum accordingly. 5457 * expects a non skb_vlan_tag_present skb with a vlan tag payload 5458 */ 5459 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 5460 { 5461 struct vlan_hdr *vhdr; 5462 int offset = skb->data - skb_mac_header(skb); 5463 int err; 5464 5465 if (WARN_ONCE(offset, 5466 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", 5467 offset)) { 5468 return -EINVAL; 5469 } 5470 5471 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 5472 if (unlikely(err)) 5473 return err; 5474 5475 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5476 5477 vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); 5478 *vlan_tci = ntohs(vhdr->h_vlan_TCI); 5479 5480 memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); 5481 __skb_pull(skb, VLAN_HLEN); 5482 5483 vlan_set_encap_proto(skb, vhdr); 5484 skb->mac_header += VLAN_HLEN; 5485 5486 if (skb_network_offset(skb) < ETH_HLEN) 5487 skb_set_network_header(skb, ETH_HLEN); 5488 5489 skb_reset_mac_len(skb); 5490 5491 return err; 5492 } 5493 EXPORT_SYMBOL(__skb_vlan_pop); 5494 5495 /* Pop a vlan tag either from hwaccel or from payload. 5496 * Expects skb->data at mac header. 
5497 */ 5498 int skb_vlan_pop(struct sk_buff *skb) 5499 { 5500 u16 vlan_tci; 5501 __be16 vlan_proto; 5502 int err; 5503 5504 if (likely(skb_vlan_tag_present(skb))) { 5505 __vlan_hwaccel_clear_tag(skb); 5506 } else { 5507 if (unlikely(!eth_type_vlan(skb->protocol))) 5508 return 0; 5509 5510 err = __skb_vlan_pop(skb, &vlan_tci); 5511 if (err) 5512 return err; 5513 } 5514 /* move next vlan tag to hw accel tag */ 5515 if (likely(!eth_type_vlan(skb->protocol))) 5516 return 0; 5517 5518 vlan_proto = skb->protocol; 5519 err = __skb_vlan_pop(skb, &vlan_tci); 5520 if (unlikely(err)) 5521 return err; 5522 5523 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5524 return 0; 5525 } 5526 EXPORT_SYMBOL(skb_vlan_pop); 5527 5528 /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). 5529 * Expects skb->data at mac header. 5530 */ 5531 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 5532 { 5533 if (skb_vlan_tag_present(skb)) { 5534 int offset = skb->data - skb_mac_header(skb); 5535 int err; 5536 5537 if (WARN_ONCE(offset, 5538 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", 5539 offset)) { 5540 return -EINVAL; 5541 } 5542 5543 err = __vlan_insert_tag(skb, skb->vlan_proto, 5544 skb_vlan_tag_get(skb)); 5545 if (err) 5546 return err; 5547 5548 skb->protocol = skb->vlan_proto; 5549 skb->mac_len += VLAN_HLEN; 5550 5551 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5552 } 5553 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5554 return 0; 5555 } 5556 EXPORT_SYMBOL(skb_vlan_push); 5557 5558 /** 5559 * skb_eth_pop() - Drop the Ethernet header at the head of a packet 5560 * 5561 * @skb: Socket buffer to modify 5562 * 5563 * Drop the Ethernet header of @skb. 5564 * 5565 * Expects that skb->data points to the mac header and that no VLAN tags are 5566 * present. 5567 * 5568 * Returns 0 on success, -errno otherwise. 5569 */ 5570 int skb_eth_pop(struct sk_buff *skb) 5571 { 5572 if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || 5573 skb_network_offset(skb) < ETH_HLEN) 5574 return -EPROTO; 5575 5576 skb_pull_rcsum(skb, ETH_HLEN); 5577 skb_reset_mac_header(skb); 5578 skb_reset_mac_len(skb); 5579 5580 return 0; 5581 } 5582 EXPORT_SYMBOL(skb_eth_pop); 5583 5584 /** 5585 * skb_eth_push() - Add a new Ethernet header at the head of a packet 5586 * 5587 * @skb: Socket buffer to modify 5588 * @dst: Destination MAC address of the new header 5589 * @src: Source MAC address of the new header 5590 * 5591 * Prepend @skb with a new Ethernet header. 5592 * 5593 * Expects that skb->data points to the mac header, which must be empty. 5594 * 5595 * Returns 0 on success, -errno otherwise. 5596 */ 5597 int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, 5598 const unsigned char *src) 5599 { 5600 struct ethhdr *eth; 5601 int err; 5602 5603 if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) 5604 return -EPROTO; 5605 5606 err = skb_cow_head(skb, sizeof(*eth)); 5607 if (err < 0) 5608 return err; 5609 5610 skb_push(skb, sizeof(*eth)); 5611 skb_reset_mac_header(skb); 5612 skb_reset_mac_len(skb); 5613 5614 eth = eth_hdr(skb); 5615 ether_addr_copy(eth->h_dest, dst); 5616 ether_addr_copy(eth->h_source, src); 5617 eth->h_proto = skb->protocol; 5618 5619 skb_postpush_rcsum(skb, eth, sizeof(*eth)); 5620 5621 return 0; 5622 } 5623 EXPORT_SYMBOL(skb_eth_push); 5624 5625 /* Update the ethertype of hdr and the skb csum value if required. 
*/ 5626 static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, 5627 __be16 ethertype) 5628 { 5629 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5630 __be16 diff[] = { ~hdr->h_proto, ethertype }; 5631 5632 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5633 } 5634 5635 hdr->h_proto = ethertype; 5636 } 5637 5638 /** 5639 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of 5640 * the packet 5641 * 5642 * @skb: buffer 5643 * @mpls_lse: MPLS label stack entry to push 5644 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) 5645 * @mac_len: length of the MAC header 5646 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is 5647 * ethernet 5648 * 5649 * Expects skb->data at mac header. 5650 * 5651 * Returns 0 on success, -errno otherwise. 5652 */ 5653 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, 5654 int mac_len, bool ethernet) 5655 { 5656 struct mpls_shim_hdr *lse; 5657 int err; 5658 5659 if (unlikely(!eth_p_mpls(mpls_proto))) 5660 return -EINVAL; 5661 5662 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ 5663 if (skb->encapsulation) 5664 return -EINVAL; 5665 5666 err = skb_cow_head(skb, MPLS_HLEN); 5667 if (unlikely(err)) 5668 return err; 5669 5670 if (!skb->inner_protocol) { 5671 skb_set_inner_network_header(skb, skb_network_offset(skb)); 5672 skb_set_inner_protocol(skb, skb->protocol); 5673 } 5674 5675 skb_push(skb, MPLS_HLEN); 5676 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 5677 mac_len); 5678 skb_reset_mac_header(skb); 5679 skb_set_network_header(skb, mac_len); 5680 skb_reset_mac_len(skb); 5681 5682 lse = mpls_hdr(skb); 5683 lse->label_stack_entry = mpls_lse; 5684 skb_postpush_rcsum(skb, lse, MPLS_HLEN); 5685 5686 if (ethernet && mac_len >= ETH_HLEN) 5687 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); 5688 skb->protocol = mpls_proto; 5689 5690 return 0; 5691 } 5692 EXPORT_SYMBOL_GPL(skb_mpls_push); 5693 5694 /** 5695 * skb_mpls_pop() - pop the outermost MPLS header 5696 * 5697 * @skb: buffer 5698 * @next_proto: ethertype of header after popped MPLS header 5699 * @mac_len: length of the MAC header 5700 * @ethernet: flag to indicate if the packet is ethernet 5701 * 5702 * Expects skb->data at mac header. 5703 * 5704 * Returns 0 on success, -errno otherwise. 5705 */ 5706 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, 5707 bool ethernet) 5708 { 5709 int err; 5710 5711 if (unlikely(!eth_p_mpls(skb->protocol))) 5712 return 0; 5713 5714 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); 5715 if (unlikely(err)) 5716 return err; 5717 5718 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); 5719 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 5720 mac_len); 5721 5722 __skb_pull(skb, MPLS_HLEN); 5723 skb_reset_mac_header(skb); 5724 skb_set_network_header(skb, mac_len); 5725 5726 if (ethernet && mac_len >= ETH_HLEN) { 5727 struct ethhdr *hdr; 5728 5729 /* use mpls_hdr() to get ethertype to account for VLANs. */ 5730 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 5731 skb_mod_eth_type(skb, hdr, next_proto); 5732 } 5733 skb->protocol = next_proto; 5734 5735 return 0; 5736 } 5737 EXPORT_SYMBOL_GPL(skb_mpls_pop); 5738 5739 /** 5740 * skb_mpls_update_lse() - modify outermost MPLS header and update csum 5741 * 5742 * @skb: buffer 5743 * @mpls_lse: new MPLS label stack entry to update to 5744 * 5745 * Expects skb->data at mac header. 5746 * 5747 * Returns 0 on success, -errno otherwise. 
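 *
 * skb_mpls_dec_ttl() below is an in-tree user: it rebuilds the label stack
 * entry with a decremented TTL and writes it back through this helper so
 * that any CHECKSUM_COMPLETE value stays consistent.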
5748 */ 5749 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) 5750 { 5751 int err; 5752 5753 if (unlikely(!eth_p_mpls(skb->protocol))) 5754 return -EINVAL; 5755 5756 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 5757 if (unlikely(err)) 5758 return err; 5759 5760 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5761 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; 5762 5763 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5764 } 5765 5766 mpls_hdr(skb)->label_stack_entry = mpls_lse; 5767 5768 return 0; 5769 } 5770 EXPORT_SYMBOL_GPL(skb_mpls_update_lse); 5771 5772 /** 5773 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header 5774 * 5775 * @skb: buffer 5776 * 5777 * Expects skb->data at mac header. 5778 * 5779 * Returns 0 on success, -errno otherwise. 5780 */ 5781 int skb_mpls_dec_ttl(struct sk_buff *skb) 5782 { 5783 u32 lse; 5784 u8 ttl; 5785 5786 if (unlikely(!eth_p_mpls(skb->protocol))) 5787 return -EINVAL; 5788 5789 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); 5790 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; 5791 if (!--ttl) 5792 return -EINVAL; 5793 5794 lse &= ~MPLS_LS_TTL_MASK; 5795 lse |= ttl << MPLS_LS_TTL_SHIFT; 5796 5797 return skb_mpls_update_lse(skb, cpu_to_be32(lse)); 5798 } 5799 EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); 5800 5801 /** 5802 * alloc_skb_with_frags - allocate skb with page frags 5803 * 5804 * @header_len: size of linear part 5805 * @data_len: needed length in frags 5806 * @max_page_order: max page order desired. 5807 * @errcode: pointer to error code if any 5808 * @gfp_mask: allocation mask 5809 * 5810 * This can be used to allocate a paged skb, given a maximal order for frags. 5811 */ 5812 struct sk_buff *alloc_skb_with_frags(unsigned long header_len, 5813 unsigned long data_len, 5814 int max_page_order, 5815 int *errcode, 5816 gfp_t gfp_mask) 5817 { 5818 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; 5819 unsigned long chunk; 5820 struct sk_buff *skb; 5821 struct page *page; 5822 int i; 5823 5824 *errcode = -EMSGSIZE; 5825 /* Note this test could be relaxed, if we succeed to allocate 5826 * high order pages... 
5827 */ 5828 if (npages > MAX_SKB_FRAGS) 5829 return NULL; 5830 5831 *errcode = -ENOBUFS; 5832 skb = alloc_skb(header_len, gfp_mask); 5833 if (!skb) 5834 return NULL; 5835 5836 skb->truesize += npages << PAGE_SHIFT; 5837 5838 for (i = 0; npages > 0; i++) { 5839 int order = max_page_order; 5840 5841 while (order) { 5842 if (npages >= 1 << order) { 5843 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 5844 __GFP_COMP | 5845 __GFP_NOWARN, 5846 order); 5847 if (page) 5848 goto fill_page; 5849 /* Do not retry other high order allocations */ 5850 order = 1; 5851 max_page_order = 0; 5852 } 5853 order--; 5854 } 5855 page = alloc_page(gfp_mask); 5856 if (!page) 5857 goto failure; 5858 fill_page: 5859 chunk = min_t(unsigned long, data_len, 5860 PAGE_SIZE << order); 5861 skb_fill_page_desc(skb, i, page, 0, chunk); 5862 data_len -= chunk; 5863 npages -= 1 << order; 5864 } 5865 return skb; 5866 5867 failure: 5868 kfree_skb(skb); 5869 return NULL; 5870 } 5871 EXPORT_SYMBOL(alloc_skb_with_frags); 5872 5873 /* carve out the first off bytes from skb when off < headlen */ 5874 static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, 5875 const int headlen, gfp_t gfp_mask) 5876 { 5877 int i; 5878 int size = skb_end_offset(skb); 5879 int new_hlen = headlen - off; 5880 u8 *data; 5881 5882 size = SKB_DATA_ALIGN(size); 5883 5884 if (skb_pfmemalloc(skb)) 5885 gfp_mask |= __GFP_MEMALLOC; 5886 data = kmalloc_reserve(size + 5887 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 5888 gfp_mask, NUMA_NO_NODE, NULL); 5889 if (!data) 5890 return -ENOMEM; 5891 5892 size = SKB_WITH_OVERHEAD(ksize(data)); 5893 5894 /* Copy real data, and all frags */ 5895 skb_copy_from_linear_data_offset(skb, off, data, new_hlen); 5896 skb->len -= off; 5897 5898 memcpy((struct skb_shared_info *)(data + size), 5899 skb_shinfo(skb), 5900 offsetof(struct skb_shared_info, 5901 frags[skb_shinfo(skb)->nr_frags])); 5902 if (skb_cloned(skb)) { 5903 /* drop the old head gracefully */ 5904 if (skb_orphan_frags(skb, gfp_mask)) { 5905 kfree(data); 5906 return -ENOMEM; 5907 } 5908 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 5909 skb_frag_ref(skb, i); 5910 if (skb_has_frag_list(skb)) 5911 skb_clone_fraglist(skb); 5912 skb_release_data(skb); 5913 } else { 5914 /* we can reuse existing recount- all we did was 5915 * relocate values 5916 */ 5917 skb_free_head(skb); 5918 } 5919 5920 skb->head = data; 5921 skb->data = data; 5922 skb->head_frag = 0; 5923 #ifdef NET_SKBUFF_DATA_USES_OFFSET 5924 skb->end = size; 5925 #else 5926 skb->end = skb->head + size; 5927 #endif 5928 skb_set_tail_pointer(skb, skb_headlen(skb)); 5929 skb_headers_offset_update(skb, 0); 5930 skb->cloned = 0; 5931 skb->hdr_len = 0; 5932 skb->nohdr = 0; 5933 atomic_set(&skb_shinfo(skb)->dataref, 1); 5934 5935 return 0; 5936 } 5937 5938 static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); 5939 5940 /* carve out the first eat bytes from skb's frag_list. May recurse into 5941 * pskb_carve() 5942 */ 5943 static int pskb_carve_frag_list(struct sk_buff *skb, 5944 struct skb_shared_info *shinfo, int eat, 5945 gfp_t gfp_mask) 5946 { 5947 struct sk_buff *list = shinfo->frag_list; 5948 struct sk_buff *clone = NULL; 5949 struct sk_buff *insp = NULL; 5950 5951 do { 5952 if (!list) { 5953 pr_err("Not enough bytes to eat. Want %d\n", eat); 5954 return -EFAULT; 5955 } 5956 if (list->len <= eat) { 5957 /* Eaten as whole. */ 5958 eat -= list->len; 5959 list = list->next; 5960 insp = list; 5961 } else { 5962 /* Eaten partially. 
*/ 5963 if (skb_shared(list)) { 5964 clone = skb_clone(list, gfp_mask); 5965 if (!clone) 5966 return -ENOMEM; 5967 insp = list->next; 5968 list = clone; 5969 } else { 5970 /* This may be pulled without problems. */ 5971 insp = list; 5972 } 5973 if (pskb_carve(list, eat, gfp_mask) < 0) { 5974 kfree_skb(clone); 5975 return -ENOMEM; 5976 } 5977 break; 5978 } 5979 } while (eat); 5980 5981 /* Free pulled out fragments. */ 5982 while ((list = shinfo->frag_list) != insp) { 5983 shinfo->frag_list = list->next; 5984 kfree_skb(list); 5985 } 5986 /* And insert new clone at head. */ 5987 if (clone) { 5988 clone->next = list; 5989 shinfo->frag_list = clone; 5990 } 5991 return 0; 5992 } 5993 5994 /* carve off first len bytes from skb. Split line (off) is in the 5995 * non-linear part of skb 5996 */ 5997 static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, 5998 int pos, gfp_t gfp_mask) 5999 { 6000 int i, k = 0; 6001 int size = skb_end_offset(skb); 6002 u8 *data; 6003 const int nfrags = skb_shinfo(skb)->nr_frags; 6004 struct skb_shared_info *shinfo; 6005 6006 size = SKB_DATA_ALIGN(size); 6007 6008 if (skb_pfmemalloc(skb)) 6009 gfp_mask |= __GFP_MEMALLOC; 6010 data = kmalloc_reserve(size + 6011 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 6012 gfp_mask, NUMA_NO_NODE, NULL); 6013 if (!data) 6014 return -ENOMEM; 6015 6016 size = SKB_WITH_OVERHEAD(ksize(data)); 6017 6018 memcpy((struct skb_shared_info *)(data + size), 6019 skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); 6020 if (skb_orphan_frags(skb, gfp_mask)) { 6021 kfree(data); 6022 return -ENOMEM; 6023 } 6024 shinfo = (struct skb_shared_info *)(data + size); 6025 for (i = 0; i < nfrags; i++) { 6026 int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); 6027 6028 if (pos + fsize > off) { 6029 shinfo->frags[k] = skb_shinfo(skb)->frags[i]; 6030 6031 if (pos < off) { 6032 /* Split frag. 6033 * We have two variants in this case: 6034 * 1. Move all the frag to the second 6035 * part, if it is possible. F.e. 6036 * this approach is mandatory for TUX, 6037 * where splitting is expensive. 6038 * 2. Split is accurately. We make this. 6039 */ 6040 skb_frag_off_add(&shinfo->frags[0], off - pos); 6041 skb_frag_size_sub(&shinfo->frags[0], off - pos); 6042 } 6043 skb_frag_ref(skb, i); 6044 k++; 6045 } 6046 pos += fsize; 6047 } 6048 shinfo->nr_frags = k; 6049 if (skb_has_frag_list(skb)) 6050 skb_clone_fraglist(skb); 6051 6052 /* split line is in frag list */ 6053 if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { 6054 /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. 
*/ 6055 if (skb_has_frag_list(skb)) 6056 kfree_skb_list(skb_shinfo(skb)->frag_list); 6057 kfree(data); 6058 return -ENOMEM; 6059 } 6060 skb_release_data(skb); 6061 6062 skb->head = data; 6063 skb->head_frag = 0; 6064 skb->data = data; 6065 #ifdef NET_SKBUFF_DATA_USES_OFFSET 6066 skb->end = size; 6067 #else 6068 skb->end = skb->head + size; 6069 #endif 6070 skb_reset_tail_pointer(skb); 6071 skb_headers_offset_update(skb, 0); 6072 skb->cloned = 0; 6073 skb->hdr_len = 0; 6074 skb->nohdr = 0; 6075 skb->len -= off; 6076 skb->data_len = skb->len; 6077 atomic_set(&skb_shinfo(skb)->dataref, 1); 6078 return 0; 6079 } 6080 6081 /* remove len bytes from the beginning of the skb */ 6082 static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) 6083 { 6084 int headlen = skb_headlen(skb); 6085 6086 if (len < headlen) 6087 return pskb_carve_inside_header(skb, len, headlen, gfp); 6088 else 6089 return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); 6090 } 6091 6092 /* Extract to_copy bytes starting at off from skb, and return this in 6093 * a new skb 6094 */ 6095 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, 6096 int to_copy, gfp_t gfp) 6097 { 6098 struct sk_buff *clone = skb_clone(skb, gfp); 6099 6100 if (!clone) 6101 return NULL; 6102 6103 if (pskb_carve(clone, off, gfp) < 0 || 6104 pskb_trim(clone, to_copy)) { 6105 kfree_skb(clone); 6106 return NULL; 6107 } 6108 return clone; 6109 } 6110 EXPORT_SYMBOL(pskb_extract); 6111 6112 /** 6113 * skb_condense - try to get rid of fragments/frag_list if possible 6114 * @skb: buffer 6115 * 6116 * Can be used to save memory before skb is added to a busy queue. 6117 * If packet has bytes in frags and enough tail room in skb->head, 6118 * pull all of them, so that we can free the frags right now and adjust 6119 * truesize. 6120 * Notes: 6121 * We do not reallocate skb->head thus can not fail. 6122 * Caller must re-evaluate skb->truesize if needed. 6123 */ 6124 void skb_condense(struct sk_buff *skb) 6125 { 6126 if (skb->data_len) { 6127 if (skb->data_len > skb->end - skb->tail || 6128 skb_cloned(skb)) 6129 return; 6130 6131 /* Nice, we can free page frag(s) right now */ 6132 __pskb_pull_tail(skb, skb->data_len); 6133 } 6134 /* At this point, skb->truesize might be over estimated, 6135 * because skb had a fragment, and fragments do not tell 6136 * their truesize. 6137 * When we pulled its content into skb->head, fragment 6138 * was freed, but __pskb_pull_tail() could not possibly 6139 * adjust skb->truesize, not knowing the frag truesize. 6140 */ 6141 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 6142 } 6143 6144 #ifdef CONFIG_SKB_EXTENSIONS 6145 static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) 6146 { 6147 return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); 6148 } 6149 6150 /** 6151 * __skb_ext_alloc - allocate a new skb extensions storage 6152 * 6153 * @flags: See kmalloc(). 6154 * 6155 * Returns the newly allocated pointer. The pointer can later attached to a 6156 * skb via __skb_ext_set(). 6157 * Note: caller must handle the skb_ext as an opaque data. 
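 *
 * Example (a sketch; sp is a caller-local struct sec_path pointer):
 *
 *        struct skb_ext *ext = __skb_ext_alloc(GFP_KERNEL);
 *
 *        if (!ext)
 *                return -ENOMEM;
 *        ...
 *        sp = __skb_ext_set(skb, SKB_EXT_SEC_PATH, ext);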
6158 */ 6159 struct skb_ext *__skb_ext_alloc(gfp_t flags) 6160 { 6161 struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); 6162 6163 if (new) { 6164 memset(new->offset, 0, sizeof(new->offset)); 6165 refcount_set(&new->refcnt, 1); 6166 } 6167 6168 return new; 6169 } 6170 6171 static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, 6172 unsigned int old_active) 6173 { 6174 struct skb_ext *new; 6175 6176 if (refcount_read(&old->refcnt) == 1) 6177 return old; 6178 6179 new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); 6180 if (!new) 6181 return NULL; 6182 6183 memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); 6184 refcount_set(&new->refcnt, 1); 6185 6186 #ifdef CONFIG_XFRM 6187 if (old_active & (1 << SKB_EXT_SEC_PATH)) { 6188 struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); 6189 unsigned int i; 6190 6191 for (i = 0; i < sp->len; i++) 6192 xfrm_state_hold(sp->xvec[i]); 6193 } 6194 #endif 6195 __skb_ext_put(old); 6196 return new; 6197 } 6198 6199 /** 6200 * __skb_ext_set - attach the specified extension storage to this skb 6201 * @skb: buffer 6202 * @id: extension id 6203 * @ext: extension storage previously allocated via __skb_ext_alloc() 6204 * 6205 * Existing extensions, if any, are cleared. 6206 * 6207 * Returns the pointer to the extension. 6208 */ 6209 void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, 6210 struct skb_ext *ext) 6211 { 6212 unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); 6213 6214 skb_ext_put(skb); 6215 newlen = newoff + skb_ext_type_len[id]; 6216 ext->chunks = newlen; 6217 ext->offset[id] = newoff; 6218 skb->extensions = ext; 6219 skb->active_extensions = 1 << id; 6220 return skb_ext_get_ptr(ext, id); 6221 } 6222 6223 /** 6224 * skb_ext_add - allocate space for given extension, COW if needed 6225 * @skb: buffer 6226 * @id: extension to allocate space for 6227 * 6228 * Allocates enough space for the given extension. 6229 * If the extension is already present, a pointer to that extension 6230 * is returned. 6231 * 6232 * If the skb was cloned, COW applies and the returned memory can be 6233 * modified without changing the extension space of clones buffers. 6234 * 6235 * Returns pointer to the extension or NULL on allocation failure. 
6236 */ 6237 void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) 6238 { 6239 struct skb_ext *new, *old = NULL; 6240 unsigned int newlen, newoff; 6241 6242 if (skb->active_extensions) { 6243 old = skb->extensions; 6244 6245 new = skb_ext_maybe_cow(old, skb->active_extensions); 6246 if (!new) 6247 return NULL; 6248 6249 if (__skb_ext_exist(new, id)) 6250 goto set_active; 6251 6252 newoff = new->chunks; 6253 } else { 6254 newoff = SKB_EXT_CHUNKSIZEOF(*new); 6255 6256 new = __skb_ext_alloc(GFP_ATOMIC); 6257 if (!new) 6258 return NULL; 6259 } 6260 6261 newlen = newoff + skb_ext_type_len[id]; 6262 new->chunks = newlen; 6263 new->offset[id] = newoff; 6264 set_active: 6265 skb->extensions = new; 6266 skb->active_extensions |= 1 << id; 6267 return skb_ext_get_ptr(new, id); 6268 } 6269 EXPORT_SYMBOL(skb_ext_add); 6270 6271 #ifdef CONFIG_XFRM 6272 static void skb_ext_put_sp(struct sec_path *sp) 6273 { 6274 unsigned int i; 6275 6276 for (i = 0; i < sp->len; i++) 6277 xfrm_state_put(sp->xvec[i]); 6278 } 6279 #endif 6280 6281 void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) 6282 { 6283 struct skb_ext *ext = skb->extensions; 6284 6285 skb->active_extensions &= ~(1 << id); 6286 if (skb->active_extensions == 0) { 6287 skb->extensions = NULL; 6288 __skb_ext_put(ext); 6289 #ifdef CONFIG_XFRM 6290 } else if (id == SKB_EXT_SEC_PATH && 6291 refcount_read(&ext->refcnt) == 1) { 6292 struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); 6293 6294 skb_ext_put_sp(sp); 6295 sp->len = 0; 6296 #endif 6297 } 6298 } 6299 EXPORT_SYMBOL(__skb_ext_del); 6300 6301 void __skb_ext_put(struct skb_ext *ext) 6302 { 6303 /* If this is last clone, nothing can increment 6304 * it after check passes. Avoids one atomic op. 6305 */ 6306 if (refcount_read(&ext->refcnt) == 1) 6307 goto free_now; 6308 6309 if (!refcount_dec_and_test(&ext->refcnt)) 6310 return; 6311 free_now: 6312 #ifdef CONFIG_XFRM 6313 if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) 6314 skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); 6315 #endif 6316 6317 kmem_cache_free(skbuff_ext_cache, ext); 6318 } 6319 EXPORT_SYMBOL(__skb_ext_put); 6320 #endif /* CONFIG_SKB_EXTENSIONS */ 6321