/*
 * inet fragments management
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Pavel Emelyanov <xemul@openvz.org>
 *		Started as consolidation of ipv4/ip_fragment.c,
 *		ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
};

#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))

static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}

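/* A small worked example of the "run" layout above (editorial sketch, not
 * part of the original code): with 1200-byte fragments arriving in order at
 * offsets 0, 1200 and 2400, fragrun_append_to_last() coalesces them into a
 * single rb-tree node whose head carries frag_run_len == 3600, while a later
 * fragment at offset 4800 (i.e. after a hole) gets a node of its own via
 * fragrun_create(). inet_frag_queue_insert() below chooses between the two.
 */
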
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *	   0 or INET_ECN_CE value, to be ORed into the final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);

int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_fini(struct inet_frags *f)
{
	/* We must wait until all inet_frag_destroy_rcu() calls have completed. */
	rcu_barrier();

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;

	/* If we can not cancel the timer, it means this frag_queue
	 * is already disappearing, so we have nothing to do.
	 * Otherwise, we own a refcount until the end of this function.
	 */
	if (!del_timer(&fq->timer))
		return;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		refcount_dec(&fq->refcnt);
	}
	spin_unlock_bh(&fq->lock);

	inet_frag_put(fq);
}

void inet_frags_exit_net(struct netns_frags *nf)
{
	nf->high_thresh = 0; /* prevent creation of new frags */

	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
EXPORT_SYMBOL(inet_frags_exit_net);

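/* Editorial overview (a sketch inferred from this file, not an authoritative
 * contract): inet_frag_alloc() starts every queue with three references -
 * one for the expiry timer armed in inet_frag_create(), one for the
 * rhashtable entry, and one handed back to the caller of inet_frag_find().
 * inet_frag_kill() below drops the timer and hashtable references; once the
 * caller's inet_frag_put() releases the last one, the queue is freed via
 * inet_frag_destroy_rcu() after an RCU grace period. A typical reassembler
 * looks the queue up with inet_frag_find(), feeds each fragment to
 * inet_frag_queue_insert() (which reports IPFRAG_OK, IPFRAG_DUP or
 * IPFRAG_OVERLAP), and finally calls inet_frag_reasm_prepare() followed by
 * inet_frag_reasm_finish() to stitch the fragments into a frag_list.
 */
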
void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct netns_frags *nf = fq->net;

		fq->flags |= INET_FRAG_COMPLETE;
		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
		refcount_dec(&fq->refcnt);
	}
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->net->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}

unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;

	while (p) {
		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

		p = rb_next(p);
		rb_erase(&skb->rbnode, root);
		while (skb) {
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb(skb);
			skb = next;
		}
	}
	return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;
	struct inet_frags *f;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	nf = q->net;
	f = nf->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(nf, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	refcount_set(&q->refcnt, 3);

	return q;
}

static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = nf->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + nf->timeout);

	*prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
	struct inet_frag_queue *fq = NULL, *prev;

	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
		return NULL;

	rcu_read_lock();

	prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(nf, key, &prev);
	if (prev && !IS_ERR(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();
	return fq;
}
EXPORT_SYMBOL(inet_frag_find);

int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 * When reassembling an IPv6 datagram, if
	 *   one or more of its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = curr->ip_defrag_offset +
					FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);

void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->net, delta);

	/* If the first fragment is fragmented itself, we split
	 * it into two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->net, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);

void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data)
{
	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			*nextp = fp;
			nextp = &fp->next;
			fp->prev = NULL;
			memset(&fp->rbnode, 0, sizeof(fp->rbnode));
			fp->sk = NULL;
			head->data_len += fp->len;
			head->len += fp->len;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);
			head->truesize += fp->truesize;
			fp = FRAG_CB(fp)->next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->net, head->truesize);

	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->net, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);