// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 * Authors:	Pavel Emelyanov <xemul@openvz.org>
 *		Started as consolidation of ipv4/ip_fragment.c,
 *		ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
};

#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))

static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}
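
/* Illustrative example of the run invariants above (all offsets are
 * made up, not from the original source). Suppose fragments arrive
 * covering byte ranges [0,1200), [1200,2400) and [3600,4800). The second
 * is adjacent to the first, so fragrun_append_to_last() folds it into the
 * first run; the third leaves a hole and gets its own rb-tree node via
 * fragrun_create():
 *
 *	rb_fragments:	[0,1200) run head	[3600,4800) run head
 *			frag_run_len = 2400	frag_run_len = 1200
 *			next_frag ---> [1200,2400), whose next_frag is NULL
 */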

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements.
 * Value: 0xff if the frame should be dropped,
 *        0 or INET_ECN_CE value, to be ORed into the final iph->tos field.
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_fini(struct inet_frags *f)
{
	/* We must wait until all pending inet_frag_destroy_rcu() callbacks
	 * have completed.
	 */
	rcu_barrier();

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;

	/* If we cannot cancel the timer, it means this frag_queue
	 * is already disappearing, and we have nothing to do.
	 * Otherwise, we own a refcount until the end of this function.
	 */
	if (!del_timer(&fq->timer))
		return;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		refcount_dec(&fq->refcnt);
	}
	spin_unlock_bh(&fq->lock);

	inet_frag_put(fq);
}

void inet_frags_exit_net(struct netns_frags *nf)
{
	nf->high_thresh = 0; /* prevent creation of new frags */

	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
EXPORT_SYMBOL(inet_frags_exit_net);

void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct netns_frags *nf = fq->net;

		fq->flags |= INET_FRAG_COMPLETE;
		rhashtable_remove_fast(&nf->rhashtable, &fq->node,
				       nf->f->rhash_params);
		refcount_dec(&fq->refcnt);
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->net->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}

unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;

	while (p) {
		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

		p = rb_next(p);
		rb_erase(&skb->rbnode, root);
		while (skb) {
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb(skb);
			skb = next;
		}
	}
	return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);
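
/* Illustrative sketch of how a protocol registers with this layer,
 * loosely modeled on ipv4/ip_fragment.c. All my_* names below are
 * hypothetical placeholders, not symbols defined anywhere:
 *
 *	static struct inet_frags my_frags = {
 *		.qsize		  = sizeof(struct my_frag_queue),
 *		.constructor	  = my_constructor,
 *		.frag_expire	  = my_expire,
 *		.frags_cache_name = "my_frag_cache",
 *		.rhash_params	  = my_rhash_params,
 *	};
 *
 *	module init:	err = inet_frags_init(&my_frags);
 *	module exit:	inet_frags_fini(&my_frags);
 *			(after every netns has run inet_frags_exit_net())
 */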

void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;
	struct inet_frags *f;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	nf = q->net;
	f = nf->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(nf, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	/* One reference for the hash table, one for the expiration timer,
	 * and one for the caller of inet_frag_find().
	 */
	refcount_set(&q->refcnt, 3);

	return q;
}

static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = nf->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + nf->timeout);

	*prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
	struct inet_frag_queue *fq = NULL, *prev;

	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
		return NULL;

	rcu_read_lock();

	prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(nf, key, &prev);
	if (prev && !IS_ERR(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();
	return fq;
}
EXPORT_SYMBOL(inet_frag_find);
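
/* Illustrative sketch of the caller side of inet_frag_find(), following
 * the pattern used by ipv4/ip_fragment.c. The queue comes back with a
 * reference held for the caller (see the refcount of 3 set in
 * inet_frag_alloc()), which must be dropped with inet_frag_put():
 *
 *	struct inet_frag_queue *q;
 *
 *	q = inet_frag_find(nf, &key);
 *	if (!q)
 *		return -ENOMEM;	(over high_thresh, OOM, or lost a race)
 *
 *	spin_lock(&q->lock);
 *	... insert the fragment, reassemble if complete ...
 *	spin_unlock(&q->lock);
 *	inet_frag_put(q);
 */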

int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID 3089:
	 * when reassembling an IPv6 datagram, if one or more of its
	 * constituent fragments is determined to be an overlapping fragment,
	 * the entire datagram (and any constituent fragments) MUST be
	 * silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = curr->ip_defrag_offset +
					FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);

void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->net, delta);

	/* If the first fragment is fragmented itself, we split
	 * it into two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->net, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
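
/* Illustrative sketch of how the helpers above combine in a reassembly
 * path, loosely following ip_frag_queue() in ipv4/ip_fragment.c; error
 * paths and protocol details are elided, q->lock is held by the caller,
 * and prev_tail is q->fragments_tail sampled just before the insert:
 *
 *	err = inet_frag_queue_insert(q, skb, offset, end);
 *	if (err == IPFRAG_DUP) {
 *		kfree_skb(skb);			(drop skb, keep the queue)
 *	} else if (err == IPFRAG_OVERLAP) {
 *		inet_frag_kill(q);		(discard the whole datagram)
 *		kfree_skb(skb);
 *	} else if (... all fragments have arrived ...) {
 *		void *reasm_data = inet_frag_reasm_prepare(q, skb, prev_tail);
 *
 *		if (reasm_data)
 *			inet_frag_reasm_finish(q, skb, reasm_data);
 *	}
 */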

void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data)
{
	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			*nextp = fp;
			nextp = &fp->next;
			fp->prev = NULL;
			memset(&fp->rbnode, 0, sizeof(fp->rbnode));
			fp->sk = NULL;
			head->data_len += fp->len;
			head->len += fp->len;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);
			head->truesize += fp->truesize;
			fp = FRAG_CB(fp)->next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->net, head->truesize);

	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->net, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
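
/* Illustrative sketch of inet_frag_pull_head() on the expiry path,
 * loosely following ip_expire() in ipv4/ip_fragment.c: the first
 * fragment is detached from the dying queue so an ICMP time-exceeded
 * error can be generated from it after the queue has been killed:
 *
 *	head = inet_frag_pull_head(q);
 *	if (!head)
 *		goto out;
 *	head->dev = ...;	(protocol-specific device lookup)
 *	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
 *	kfree_skb(head);
 */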