/*
 *		INETPEER - A storage for permanent information about peers
 *
 *  This source is covered by the GNU GPL, the same as all kernel sources.
 *
 *  Authors:	Andrey V. Savochkin <saw@msu.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <net/ip.h>
#include <net/inetpeer.h>

/*
 *  Theory of operations.
 *  We keep one entry for each peer IP address.  The nodes contain long-living
 *  information about the peer which doesn't depend on routes.
 *  At this moment this information consists only of an ID field for the next
 *  outgoing IP packet.  This field is incremented with each packet as encoded
 *  in the inet_getid() function (include/net/inetpeer.h).
 *  At the time of writing these notes, the IP packet identifier is generated
 *  to be unpredictable by this code only for packets subjected (actually or
 *  potentially) to defragmentation.  I.e. DF packets smaller than the PMTU
 *  use a constant ID and do not use this code (see ip_select_ident() in
 *  include/net/ip.h).
 *
 *  Route cache entries hold references to our nodes.
 *  New cache entries get references via lookup by destination IP address in
 *  the AVL tree.  The reference is grabbed only when it's needed, i.e. only
 *  when we try to output an IP packet which needs an unpredictable ID (see
 *  __ip_select_ident() in net/ipv4/route.c).
 *  Nodes are removed only when their reference counter goes to 0.
 *  Once that has happened, the node may be removed when a sufficient amount
 *  of time has passed since its last use.  The least-recently-used entry can
 *  also be removed if the pool is overloaded, i.e. if the total number of
 *  entries is greater than or equal to the threshold.
 *
 *  The node pool is organised as an AVL tree.
 *  Such an implementation has been chosen not just for fun.  It's a way to
 *  prevent easy and efficient DoS attacks by creating hash collisions.  A huge
 *  number of long-living nodes in a single hash slot would significantly delay
 *  lookups performed with disabled BHs.
 *
 *  Serialisation issues.
 *  1.  Nodes may appear in the tree only with the pool lock held.
 *  2.  Nodes may disappear from the tree only with the pool lock held
 *      AND the reference count being 0.
 *  3.  Nodes appear on and disappear from the unused node list only under
 *      "unused_peers.lock".
 *  4.  The per-pool total counter (inet_peer_base.total) is modified under
 *      that pool's lock.
 *  5.  struct inet_peer fields modification:
 *	avl_left, avl_right, avl_height: pool lock
 *	unused: unused node list lock
 *	refcnt: atomically against modifications on other CPUs;
 *		usually under some other lock to prevent node disappearing
 *	dtime: unused node list lock
 *	daddr: unchangeable
 *	ip_id_count: atomic value (no lock needed)
 */
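/*
 * A minimal sketch of the intended call pattern, for illustration only (the
 * surrounding control flow is hypothetical and not taken from this file):
 *
 *	struct inet_peer *peer;
 *
 *	peer = inet_getpeer(&addr, 1);     (create the node if it is missing)
 *	if (peer != NULL) {
 *		... use only the atomic/immutable fields listed above ...
 *		inet_putpeer(peer);        (node may move to the unused list)
 *	}
 */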
static struct kmem_cache *peer_cachep __read_mostly;

#define node_height(x) x->avl_height

#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
static const struct inet_peer peer_fake_node = {
        .avl_left       = peer_avl_empty_rcu,
        .avl_right      = peer_avl_empty_rcu,
        .avl_height     = 0
};

struct inet_peer_base {
        struct inet_peer __rcu *root;
        spinlock_t      lock;
        int             total;
};

static struct inet_peer_base v4_peers = {
        .root           = peer_avl_empty_rcu,
        .lock           = __SPIN_LOCK_UNLOCKED(v4_peers.lock),
        .total          = 0,
};

static struct inet_peer_base v6_peers = {
        .root           = peer_avl_empty_rcu,
        .lock           = __SPIN_LOCK_UNLOCKED(v6_peers.lock),
        .total          = 0,
};

#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */

/* Exported for sysctl_net_ipv4.  */
int inet_peer_threshold __read_mostly = 65536 + 128;   /* start to throw entries more
                                                         * aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ;          /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;      /* usual time to live: 10 min */
int inet_peer_gc_mintime __read_mostly = 10 * HZ;
int inet_peer_gc_maxtime __read_mostly = 120 * HZ;

static struct {
        struct list_head        list;
        spinlock_t              lock;
} unused_peers = {
        .list   = LIST_HEAD_INIT(unused_peers.list),
        .lock   = __SPIN_LOCK_UNLOCKED(unused_peers.lock),
};

static void peer_check_expire(unsigned long dummy);
static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);


/* Called from ip_output.c:ip_init  */
void __init inet_initpeers(void)
{
        struct sysinfo si;

        /* Use the straight interface to information about memory. */
        si_meminfo(&si);
        /* The values below were suggested by Alexey Kuznetsov
         * <kuznet@ms2.inr.ac.ru>.  I don't have any opinion about the values
         * myself.  --SAW
         */
        if (si.totalram <= (32768*1024)/PAGE_SIZE)
                inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
        if (si.totalram <= (16384*1024)/PAGE_SIZE)
                inet_peer_threshold >>= 1; /* about 512KB */
        if (si.totalram <= (8192*1024)/PAGE_SIZE)
                inet_peer_threshold >>= 2; /* about 128KB */

        peer_cachep = kmem_cache_create("inet_peer_cache",
                        sizeof(struct inet_peer),
                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                        NULL);

        /* All the timers started at system startup tend
           to synchronize.  Perturb them a bit.
         */
        peer_periodic_timer.expires = jiffies
                + net_random() % inet_peer_gc_maxtime
                + inet_peer_gc_maxtime;
        add_timer(&peer_periodic_timer);
}

/* Called with or without local BH being disabled. */
static void unlink_from_unused(struct inet_peer *p)
{
        if (!list_empty(&p->unused)) {
                spin_lock_bh(&unused_peers.lock);
                list_del_init(&p->unused);
                spin_unlock_bh(&unused_peers.lock);
        }
}

static int addr_compare(const struct inetpeer_addr *a,
                        const struct inetpeer_addr *b)
{
        int i, n = (a->family == AF_INET ? 1 : 4);

        for (i = 0; i < n; i++) {
                if (a->a6[i] == b->a6[i])
                        continue;
                if (a->a6[i] < b->a6[i])
                        return -1;
                return 1;
        }

        return 0;
}

/*
 * Called with local BH disabled and the pool lock held.
 */
#define lookup(_daddr, _stack, _base)                           \
({                                                              \
        struct inet_peer *u;                                    \
        struct inet_peer __rcu **v;                             \
                                                                \
        stackptr = _stack;                                      \
        *stackptr++ = &_base->root;                             \
        for (u = rcu_dereference_protected(_base->root,         \
                        lockdep_is_held(&_base->lock));         \
             u != peer_avl_empty; ) {                           \
                int cmp = addr_compare(_daddr, &u->daddr);      \
                if (cmp == 0)                                   \
                        break;                                  \
                if (cmp == -1)                                  \
                        v = &u->avl_left;                       \
                else                                            \
                        v = &u->avl_right;                      \
                *stackptr++ = v;                                \
                u = rcu_dereference_protected(*v,               \
                        lockdep_is_held(&_base->lock));         \
        }                                                       \
        u;                                                      \
})
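/*
 * Note on the lookup() macro above: it expects the caller to have declared
 *	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 * in the enclosing scope.  The traversed path is recorded in stack[] so that
 * peer_avl_rebalance() can later walk it bottom-up; see link_to_pool() and
 * unlink_from_pool().
 */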
/*
 * Called with rcu_read_lock_bh()
 * Because we hold no lock against a writer, it's quite possible we fall
 * in an endless loop.
 * But every pointer we follow is guaranteed to be valid thanks to RCU.
 * We exit from this function if the number of links exceeds PEER_MAXDEPTH
 */
static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
                                       struct inet_peer_base *base)
{
        struct inet_peer *u = rcu_dereference_bh(base->root);
        int count = 0;

        while (u != peer_avl_empty) {
                int cmp = addr_compare(daddr, &u->daddr);
                if (cmp == 0) {
                        /* Before taking a reference, check if this entry was
                         * deleted: unlink_from_pool() sets refcnt=-1 to make
                         * a distinction between an unused entry (refcnt=0)
                         * and a freed one.
                         */
                        if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
                                u = NULL;
                        return u;
                }
                if (cmp == -1)
                        u = rcu_dereference_bh(u->avl_left);
                else
                        u = rcu_dereference_bh(u->avl_right);
                if (unlikely(++count == PEER_MAXDEPTH))
                        break;
        }
        return NULL;
}

/* Called with local BH disabled and the pool lock held. */
#define lookup_rightempty(start, base)                          \
({                                                              \
        struct inet_peer *u;                                    \
        struct inet_peer __rcu **v;                             \
        *stackptr++ = &start->avl_left;                         \
        v = &start->avl_left;                                   \
        for (u = rcu_dereference_protected(*v,                  \
                        lockdep_is_held(&base->lock));          \
             u->avl_right != peer_avl_empty_rcu; ) {            \
                v = &u->avl_right;                              \
                *stackptr++ = v;                                \
                u = rcu_dereference_protected(*v,               \
                        lockdep_is_held(&base->lock));          \
        }                                                       \
        u;                                                      \
})
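/*
 * lookup_rightempty() above returns the rightmost node of start's left
 * subtree, i.e. start's in-order predecessor.  unlink_from_pool() uses it
 * as the replacement node when deleting an entry whose left child exists.
 */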
/* Called with local BH disabled and the pool lock held.
 * Variable names are the proof of operation correctness.
 * Look into mm/map_avl.c for a more detailed description of the ideas.
 */
static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
                               struct inet_peer __rcu ***stackend,
                               struct inet_peer_base *base)
{
        struct inet_peer __rcu **nodep;
        struct inet_peer *node, *l, *r;
        int lh, rh;

        while (stackend > stack) {
                nodep = *--stackend;
                node = rcu_dereference_protected(*nodep,
                                lockdep_is_held(&base->lock));
                l = rcu_dereference_protected(node->avl_left,
                                lockdep_is_held(&base->lock));
                r = rcu_dereference_protected(node->avl_right,
                                lockdep_is_held(&base->lock));
                lh = node_height(l);
                rh = node_height(r);
                if (lh > rh + 1) { /* l: RH+2 */
                        struct inet_peer *ll, *lr, *lrl, *lrr;
                        int lrh;
                        ll = rcu_dereference_protected(l->avl_left,
                                        lockdep_is_held(&base->lock));
                        lr = rcu_dereference_protected(l->avl_right,
                                        lockdep_is_held(&base->lock));
                        lrh = node_height(lr);
                        if (lrh <= node_height(ll)) {   /* ll: RH+1 */
                                RCU_INIT_POINTER(node->avl_left, lr);   /* lr: RH or RH+1 */
                                RCU_INIT_POINTER(node->avl_right, r);   /* r: RH */
                                node->avl_height = lrh + 1; /* RH+1 or RH+2 */
                                RCU_INIT_POINTER(l->avl_left, ll);      /* ll: RH+1 */
                                RCU_INIT_POINTER(l->avl_right, node);   /* node: RH+1 or RH+2 */
                                l->avl_height = node->avl_height + 1;
                                RCU_INIT_POINTER(*nodep, l);
                        } else { /* ll: RH, lr: RH+1 */
                                lrl = rcu_dereference_protected(lr->avl_left,
                                        lockdep_is_held(&base->lock));  /* lrl: RH or RH-1 */
                                lrr = rcu_dereference_protected(lr->avl_right,
                                        lockdep_is_held(&base->lock));  /* lrr: RH or RH-1 */
                                RCU_INIT_POINTER(node->avl_left, lrr);  /* lrr: RH or RH-1 */
                                RCU_INIT_POINTER(node->avl_right, r);   /* r: RH */
                                node->avl_height = rh + 1; /* node: RH+1 */
                                RCU_INIT_POINTER(l->avl_left, ll);      /* ll: RH */
                                RCU_INIT_POINTER(l->avl_right, lrl);    /* lrl: RH or RH-1 */
                                l->avl_height = rh + 1; /* l: RH+1 */
                                RCU_INIT_POINTER(lr->avl_left, l);      /* l: RH+1 */
                                RCU_INIT_POINTER(lr->avl_right, node);  /* node: RH+1 */
                                lr->avl_height = rh + 2;
                                RCU_INIT_POINTER(*nodep, lr);
                        }
                } else if (rh > lh + 1) { /* r: LH+2 */
                        struct inet_peer *rr, *rl, *rlr, *rll;
                        int rlh;
                        rr = rcu_dereference_protected(r->avl_right,
                                        lockdep_is_held(&base->lock));
                        rl = rcu_dereference_protected(r->avl_left,
                                        lockdep_is_held(&base->lock));
                        rlh = node_height(rl);
                        if (rlh <= node_height(rr)) {   /* rr: LH+1 */
                                RCU_INIT_POINTER(node->avl_right, rl);  /* rl: LH or LH+1 */
                                RCU_INIT_POINTER(node->avl_left, l);    /* l: LH */
                                node->avl_height = rlh + 1; /* LH+1 or LH+2 */
                                RCU_INIT_POINTER(r->avl_right, rr);     /* rr: LH+1 */
                                RCU_INIT_POINTER(r->avl_left, node);    /* node: LH+1 or LH+2 */
                                r->avl_height = node->avl_height + 1;
                                RCU_INIT_POINTER(*nodep, r);
                        } else { /* rr: LH, rl: LH+1 */
                                rlr = rcu_dereference_protected(rl->avl_right,
                                        lockdep_is_held(&base->lock));  /* rlr: LH or LH-1 */
                                rll = rcu_dereference_protected(rl->avl_left,
                                        lockdep_is_held(&base->lock));  /* rll: LH or LH-1 */
                                RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
                                RCU_INIT_POINTER(node->avl_left, l);    /* l: LH */
                                node->avl_height = lh + 1; /* node: LH+1 */
                                RCU_INIT_POINTER(r->avl_right, rr);     /* rr: LH */
                                RCU_INIT_POINTER(r->avl_left, rlr);     /* rlr: LH or LH-1 */
                                r->avl_height = lh + 1; /* r: LH+1 */
                                RCU_INIT_POINTER(rl->avl_right, r);     /* r: LH+1 */
                                RCU_INIT_POINTER(rl->avl_left, node);   /* node: LH+1 */
                                rl->avl_height = lh + 2;
                                RCU_INIT_POINTER(*nodep, rl);
                        }
                } else {
                        node->avl_height = (lh > rh ? lh : rh) + 1;
                }
        }
}
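/*
 * Shape of the two left-heavy cases handled above (the right-heavy cases
 * are mirror images); this comment is only for orientation:
 *
 * Single rotation, taken when lrh <= node_height(ll):
 *
 *        node                  l
 *        /  \                /   \
 *       l    r      ->     ll    node
 *      / \                       /  \
 *    ll   lr                    lr    r
 *
 * Double rotation, taken when lr is the higher child of l:
 *
 *        node                     lr
 *        /  \                   /    \
 *       l    r      ->        l       node
 *      / \                   / \      /  \
 *    ll   lr               ll  lrl  lrr    r
 *        /  \
 *      lrl  lrr
 */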
/* Called with local BH disabled and the pool lock held. */
#define link_to_pool(n, base)                                   \
do {                                                            \
        n->avl_height = 1;                                      \
        n->avl_left = peer_avl_empty_rcu;                       \
        n->avl_right = peer_avl_empty_rcu;                      \
        /* lockless readers can catch us now */                 \
        rcu_assign_pointer(**--stackptr, n);                    \
        peer_avl_rebalance(stack, stackptr, base);              \
} while (0)

static void inetpeer_free_rcu(struct rcu_head *head)
{
        kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}

/* May be called with local BH enabled. */
static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
{
        int do_free;

        do_free = 0;

        spin_lock_bh(&base->lock);
        /* Check the reference counter.  It was artificially incremented by 1
         * in cleanup_once() to prevent the node from suddenly disappearing.
         * If we can atomically (because of lockless readers) take this last
         * reference, it's safe to remove the node and free it later.
         * We use refcnt=-1 to alert lockless readers that this entry is
         * deleted.
         */
        if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
                struct inet_peer __rcu **stack[PEER_MAXDEPTH];
                struct inet_peer __rcu ***stackptr, ***delp;
                if (lookup(&p->daddr, stack, base) != p)
                        BUG();
                delp = stackptr - 1; /* *delp[0] == p */
                if (p->avl_left == peer_avl_empty_rcu) {
                        *delp[0] = p->avl_right;
                        --stackptr;
                } else {
                        /* look for a node to insert instead of p */
                        struct inet_peer *t;
                        t = lookup_rightempty(p, base);
                        BUG_ON(rcu_dereference_protected(*stackptr[-1],
                                        lockdep_is_held(&base->lock)) != t);
                        **--stackptr = t->avl_left;
                        /* t is removed, t->daddr > x->daddr for any
                         * x in p->avl_left subtree.
                         * Put t in the old place of p. */
                        RCU_INIT_POINTER(*delp[0], t);
                        t->avl_left = p->avl_left;
                        t->avl_right = p->avl_right;
                        t->avl_height = p->avl_height;
                        BUG_ON(delp[1] != &p->avl_left);
                        delp[1] = &t->avl_left; /* was &p->avl_left */
                }
                peer_avl_rebalance(stack, stackptr, base);
                base->total--;
                do_free = 1;
        }
        spin_unlock_bh(&base->lock);

        if (do_free)
                call_rcu_bh(&p->rcu, inetpeer_free_rcu);
        else
                /* The node is used again.  Decrease the reference counter
                 * back.  The loop "cleanup -> unlink_from_unused
                 *   -> unlink_from_pool -> putpeer -> link_to_unused
                 *   -> cleanup (for the same node)"
                 * doesn't really exist because the entry will have a
                 * recent deletion time and will not be cleaned again soon.
                 */
                inet_putpeer(p);
}
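/*
 * The refcnt transition 1 -> -1 above pairs with the
 * atomic_add_unless(&u->refcnt, 1, -1) check in lookup_rcu_bh(): once
 * unlink_from_pool() has claimed the last reference, a lockless reader can
 * no longer resurrect the node and reports a miss instead.
 */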
static struct inet_peer_base *family_to_base(int family)
{
        return (family == AF_INET ? &v4_peers : &v6_peers);
}

static struct inet_peer_base *peer_to_base(struct inet_peer *p)
{
        return family_to_base(p->daddr.family);
}

/* May be called with local BH enabled. */
static int cleanup_once(unsigned long ttl)
{
        struct inet_peer *p = NULL;

        /* Remove the first entry from the list of unused nodes. */
        spin_lock_bh(&unused_peers.lock);
        if (!list_empty(&unused_peers.list)) {
                __u32 delta;

                p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
                delta = (__u32)jiffies - p->dtime;

                if (delta < ttl) {
                        /* Do not prune fresh entries. */
                        spin_unlock_bh(&unused_peers.lock);
                        return -1;
                }

                list_del_init(&p->unused);

                /* Grab an extra reference to prevent the node from
                 * disappearing before the unlink_from_pool() call. */
                atomic_inc(&p->refcnt);
        }
        spin_unlock_bh(&unused_peers.lock);

        if (p == NULL)
                /* It means that the total number of USED entries has
                 * grown over inet_peer_threshold.  It shouldn't really
                 * happen because of entry limits in the route cache. */
                return -1;

        unlink_from_pool(p, peer_to_base(p));
        return 0;
}

/* Called with or without local BH being disabled. */
struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
{
        struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
        struct inet_peer_base *base = family_to_base(daddr->family);
        struct inet_peer *p;

        /* Look up the address quickly and locklessly.
         * Because of a concurrent writer, we might not find an existing entry.
         */
        rcu_read_lock_bh();
        p = lookup_rcu_bh(daddr, base);
        rcu_read_unlock_bh();

        if (p) {
                /* The existing node has been found.
                 * Remove the entry from the unused list if it was there.
                 */
                unlink_from_unused(p);
                return p;
        }

        /* Retry an exact lookup, this time taking the lock beforehand.
         * At least, nodes should be hot in our cache.
         */
        spin_lock_bh(&base->lock);
        p = lookup(daddr, stack, base);
        if (p != peer_avl_empty) {
                atomic_inc(&p->refcnt);
                spin_unlock_bh(&base->lock);
                /* Remove the entry from the unused list if it was there. */
                unlink_from_unused(p);
                return p;
        }
        p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
        if (p) {
                p->daddr = *daddr;
                atomic_set(&p->refcnt, 1);
                atomic_set(&p->rid, 0);
                atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
                p->tcp_ts_stamp = 0;
                INIT_LIST_HEAD(&p->unused);

                /* Link the node. */
                link_to_pool(p, base);
                base->total++;
        }
        spin_unlock_bh(&base->lock);

        if (base->total >= inet_peer_threshold)
                /* Remove one less-recently-used entry. */
                cleanup_once(0);

        return p;
}
EXPORT_SYMBOL_GPL(inet_getpeer);

static int compute_total(void)
{
        return v4_peers.total + v6_peers.total;
}

/* Called with local BH disabled. */
static void peer_check_expire(unsigned long dummy)
{
        unsigned long now = jiffies;
        int ttl, total;

        total = compute_total();
        if (total >= inet_peer_threshold)
                ttl = inet_peer_minttl;
        else
                ttl = inet_peer_maxttl
                                - (inet_peer_maxttl - inet_peer_minttl) / HZ *
                                        total / inet_peer_threshold * HZ;
        while (!cleanup_once(ttl)) {
                if (jiffies != now)
                        break;
        }

        /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
         * interval depending on the total number of entries (more entries,
         * less interval). */
        total = compute_total();
        if (total >= inet_peer_threshold)
                peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
        else
                peer_periodic_timer.expires = jiffies
                        + inet_peer_gc_maxtime
                        - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
                                total / inet_peer_threshold * HZ;
        add_timer(&peer_periodic_timer);
}
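/*
 * Worked example of the interpolation above, assuming the default
 * inet_peer_threshold of 65536 + 128 = 65664 (it may be scaled down at boot
 * depending on available memory): with total == 0 the next run is scheduled
 * inet_peer_gc_maxtime (120 s) ahead; with total == 32832 (half the
 * threshold) it is 120 - 110/2 = 65 s ahead; at or above the threshold the
 * minimum interval inet_peer_gc_mintime (10 s) is used.  The ttl passed to
 * cleanup_once() is interpolated the same way between inet_peer_maxttl and
 * inet_peer_minttl.
 */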
void inet_putpeer(struct inet_peer *p)
{
        local_bh_disable();

        if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
                list_add_tail(&p->unused, &unused_peers.list);
                p->dtime = (__u32)jiffies;
                spin_unlock(&unused_peers.lock);
        }

        local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_putpeer);