/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
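	/* unwind the proc entries registered above, newest first */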
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
	.async = true,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu)
			fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
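				/* drop the reference the exception cache held on the old route */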
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *) dst;

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			refcount_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_table_id = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		return;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		return;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		return;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		return;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		return;
	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
		       const struct sk_buff *skb)
{
	struct net *net = fi->fib_net;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;
			memset(&hash_keys, 0, sizeof(hash_keys));
			skb_flow_dissect_flow_keys(skb, &keys, flag);
			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			hash_keys.ports.src = keys.ports.src;
			hash_keys.ports.dst = keys.ports.dst;
			hash_keys.basic.ip_proto = keys.basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi, NULL, skb);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct ip_tunnel_info *tun_info;
	struct flowi4 fl4;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	int err = -EINVAL;
	struct net *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which may not be detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST)
		goto brd_input;

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nh, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
fib_result res;
	int err;

	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic was moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network acquires a lot of useless route cache entries, like
	 * SDR messages from all over the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (in_dev)
			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use
		 * the default one, but do not route via a gateway in this case.
		 * Yes, it is a hack.
2174 */ 2175 if (fi && res->prefixlen < 4) 2176 fi = NULL; 2177 } else if ((type == RTN_LOCAL) && (orig_oif != 0) && 2178 (orig_oif != dev_out->ifindex)) { 2179 /* For local routes that require a particular output interface 2180 * we do not want to cache the result. Caching the result 2181 * causes incorrect behaviour when there are multiple source 2182 * addresses on the interface, the end result being that if the 2183 * intended recipient is waiting on that interface for the 2184 * packet he won't receive it because it will be delivered on 2185 * the loopback interface and the IP_PKTINFO ipi_ifindex will 2186 * be set to the loopback interface as well. 2187 */ 2188 fi = NULL; 2189 } 2190 2191 fnhe = NULL; 2192 do_cache &= fi != NULL; 2193 if (do_cache) { 2194 struct rtable __rcu **prth; 2195 struct fib_nh *nh = &FIB_RES_NH(*res); 2196 2197 fnhe = find_exception(nh, fl4->daddr); 2198 if (fnhe) { 2199 prth = &fnhe->fnhe_rth_output; 2200 rth = rcu_dereference(*prth); 2201 if (rth && rth->dst.expires && 2202 time_after(jiffies, rth->dst.expires)) { 2203 ip_del_fnhe(nh, fl4->daddr); 2204 fnhe = NULL; 2205 } else { 2206 goto rt_cache; 2207 } 2208 } 2209 2210 if (unlikely(fl4->flowi4_flags & 2211 FLOWI_FLAG_KNOWN_NH && 2212 !(nh->nh_gw && 2213 nh->nh_scope == RT_SCOPE_LINK))) { 2214 do_cache = false; 2215 goto add; 2216 } 2217 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); 2218 rth = rcu_dereference(*prth); 2219 2220 rt_cache: 2221 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2222 return rth; 2223 } 2224 2225 add: 2226 rth = rt_dst_alloc(dev_out, flags, type, 2227 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2228 IN_DEV_CONF_GET(in_dev, NOXFRM), 2229 do_cache); 2230 if (!rth) 2231 return ERR_PTR(-ENOBUFS); 2232 2233 rth->rt_iif = orig_oif; 2234 if (res->table) 2235 rth->rt_table_id = res->table->tb_id; 2236 2237 RT_CACHE_STAT_INC(out_slow_tot); 2238 2239 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2240 if (flags & RTCF_LOCAL && 2241 !(dev_out->flags & IFF_LOOPBACK)) { 2242 rth->dst.output = ip_mc_output; 2243 RT_CACHE_STAT_INC(out_slow_mc); 2244 } 2245 #ifdef CONFIG_IP_MROUTE 2246 if (type == RTN_MULTICAST) { 2247 if (IN_DEV_MFORWARD(in_dev) && 2248 !ipv4_is_local_multicast(fl4->daddr)) { 2249 rth->dst.input = ip_mr_input; 2250 rth->dst.output = ip_mc_output; 2251 } 2252 } 2253 #endif 2254 } 2255 2256 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2257 lwtunnel_set_redirect(&rth->dst); 2258 2259 return rth; 2260 } 2261 2262 /* 2263 * Major route resolver routine. 2264 */ 2265 2266 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2267 const struct sk_buff *skb) 2268 { 2269 __u8 tos = RT_FL_TOS(fl4); 2270 struct fib_result res; 2271 struct rtable *rth; 2272 2273 res.tclassid = 0; 2274 res.fi = NULL; 2275 res.table = NULL; 2276 2277 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2278 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2279 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 
					 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);

struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong interface
		 *    if saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with a saddr
		 *    belonging to another interface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: the user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun; it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set the TTL to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are broken,
			 * because we are not allowed to build a multicast path
			 * with a loopback source address (the routing cache
			 * cannot know that the TTL is zero, so that the packet
			 * will not leave this host and the route is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead.
*/ 2357 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2358 rth = ERR_PTR(-ENETUNREACH); 2359 goto out; 2360 } 2361 if (ipv4_is_local_multicast(fl4->daddr) || 2362 ipv4_is_lbcast(fl4->daddr) || 2363 fl4->flowi4_proto == IPPROTO_IGMP) { 2364 if (!fl4->saddr) 2365 fl4->saddr = inet_select_addr(dev_out, 0, 2366 RT_SCOPE_LINK); 2367 goto make_route; 2368 } 2369 if (!fl4->saddr) { 2370 if (ipv4_is_multicast(fl4->daddr)) 2371 fl4->saddr = inet_select_addr(dev_out, 0, 2372 fl4->flowi4_scope); 2373 else if (!fl4->daddr) 2374 fl4->saddr = inet_select_addr(dev_out, 0, 2375 RT_SCOPE_HOST); 2376 } 2377 } 2378 2379 if (!fl4->daddr) { 2380 fl4->daddr = fl4->saddr; 2381 if (!fl4->daddr) 2382 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2383 dev_out = net->loopback_dev; 2384 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2385 res->type = RTN_LOCAL; 2386 flags |= RTCF_LOCAL; 2387 goto make_route; 2388 } 2389 2390 err = fib_lookup(net, fl4, res, 0); 2391 if (err) { 2392 res->fi = NULL; 2393 res->table = NULL; 2394 if (fl4->flowi4_oif && 2395 (ipv4_is_multicast(fl4->daddr) || 2396 !netif_index_is_l3_master(net, fl4->flowi4_oif))) { 2397 /* Apparently, routing tables are wrong. Assume, 2398 that the destination is on link. 2399 2400 WHY? DW. 2401 Because we are allowed to send to iface 2402 even if it has NO routes and NO assigned 2403 addresses. When oif is specified, routing 2404 tables are looked up with only one purpose: 2405 to catch if destination is gatewayed, rather than 2406 direct. Moreover, if MSG_DONTROUTE is set, 2407 we send packet, ignoring both routing tables 2408 and ifaddr state. --ANK 2409 2410 2411 We could make it even if oif is unknown, 2412 likely IPv6, but we do not. 2413 */ 2414 2415 if (fl4->saddr == 0) 2416 fl4->saddr = inet_select_addr(dev_out, 0, 2417 RT_SCOPE_LINK); 2418 res->type = RTN_UNICAST; 2419 goto make_route; 2420 } 2421 rth = ERR_PTR(err); 2422 goto out; 2423 } 2424 2425 if (res->type == RTN_LOCAL) { 2426 if (!fl4->saddr) { 2427 if (res->fi->fib_prefsrc) 2428 fl4->saddr = res->fi->fib_prefsrc; 2429 else 2430 fl4->saddr = fl4->daddr; 2431 } 2432 2433 /* L3 master device is the loopback for that domain */ 2434 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : 2435 net->loopback_dev; 2436 2437 /* make sure orig_oif points to fib result device even 2438 * though packet rx/tx happens over loopback or l3mdev 2439 */ 2440 orig_oif = FIB_RES_OIF(*res); 2441 2442 fl4->flowi4_oif = dev_out->ifindex; 2443 flags |= RTCF_LOCAL; 2444 goto make_route; 2445 } 2446 2447 fib_select_path(net, res, fl4, skb); 2448 2449 dev_out = FIB_RES_DEV(*res); 2450 fl4->flowi4_oif = dev_out->ifindex; 2451 2452 2453 make_route: 2454 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); 2455 2456 out: 2457 return rth; 2458 } 2459 2460 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2461 { 2462 return NULL; 2463 } 2464 2465 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2466 { 2467 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2468 2469 return mtu ? 
: dst->dev->mtu; 2470 } 2471 2472 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2473 struct sk_buff *skb, u32 mtu) 2474 { 2475 } 2476 2477 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2478 struct sk_buff *skb) 2479 { 2480 } 2481 2482 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2483 unsigned long old) 2484 { 2485 return NULL; 2486 } 2487 2488 static struct dst_ops ipv4_dst_blackhole_ops = { 2489 .family = AF_INET, 2490 .check = ipv4_blackhole_dst_check, 2491 .mtu = ipv4_blackhole_mtu, 2492 .default_advmss = ipv4_default_advmss, 2493 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2494 .redirect = ipv4_rt_blackhole_redirect, 2495 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2496 .neigh_lookup = ipv4_neigh_lookup, 2497 }; 2498 2499 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2500 { 2501 struct rtable *ort = (struct rtable *) dst_orig; 2502 struct rtable *rt; 2503 2504 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); 2505 if (rt) { 2506 struct dst_entry *new = &rt->dst; 2507 2508 new->__use = 1; 2509 new->input = dst_discard; 2510 new->output = dst_discard_out; 2511 2512 new->dev = net->loopback_dev; 2513 if (new->dev) 2514 dev_hold(new->dev); 2515 2516 rt->rt_is_input = ort->rt_is_input; 2517 rt->rt_iif = ort->rt_iif; 2518 rt->rt_pmtu = ort->rt_pmtu; 2519 2520 rt->rt_genid = rt_genid_ipv4(net); 2521 rt->rt_flags = ort->rt_flags; 2522 rt->rt_type = ort->rt_type; 2523 rt->rt_gateway = ort->rt_gateway; 2524 rt->rt_uses_gateway = ort->rt_uses_gateway; 2525 2526 INIT_LIST_HEAD(&rt->rt_uncached); 2527 } 2528 2529 dst_release(dst_orig); 2530 2531 return rt ? &rt->dst : ERR_PTR(-ENOMEM); 2532 } 2533 2534 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2535 const struct sock *sk) 2536 { 2537 struct rtable *rt = __ip_route_output_key(net, flp4); 2538 2539 if (IS_ERR(rt)) 2540 return rt; 2541 2542 if (flp4->flowi4_proto) 2543 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, 2544 flowi4_to_flowi(flp4), 2545 sk, 0); 2546 2547 return rt; 2548 } 2549 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2550 2551 /* called with rcu_read_lock held */ 2552 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, 2553 struct flowi4 *fl4, struct sk_buff *skb, u32 portid, 2554 u32 seq) 2555 { 2556 struct rtable *rt = skb_rtable(skb); 2557 struct rtmsg *r; 2558 struct nlmsghdr *nlh; 2559 unsigned long expires = 0; 2560 u32 error; 2561 u32 metrics[RTAX_MAX]; 2562 2563 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); 2564 if (!nlh) 2565 return -EMSGSIZE; 2566 2567 r = nlmsg_data(nlh); 2568 r->rtm_family = AF_INET; 2569 r->rtm_dst_len = 32; 2570 r->rtm_src_len = 0; 2571 r->rtm_tos = fl4->flowi4_tos; 2572 r->rtm_table = table_id < 256 ? 
table_id : RT_TABLE_COMPAT; 2573 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2574 goto nla_put_failure; 2575 r->rtm_type = rt->rt_type; 2576 r->rtm_scope = RT_SCOPE_UNIVERSE; 2577 r->rtm_protocol = RTPROT_UNSPEC; 2578 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2579 if (rt->rt_flags & RTCF_NOTIFY) 2580 r->rtm_flags |= RTM_F_NOTIFY; 2581 if (IPCB(skb)->flags & IPSKB_DOREDIRECT) 2582 r->rtm_flags |= RTCF_DOREDIRECT; 2583 2584 if (nla_put_in_addr(skb, RTA_DST, dst)) 2585 goto nla_put_failure; 2586 if (src) { 2587 r->rtm_src_len = 32; 2588 if (nla_put_in_addr(skb, RTA_SRC, src)) 2589 goto nla_put_failure; 2590 } 2591 if (rt->dst.dev && 2592 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2593 goto nla_put_failure; 2594 #ifdef CONFIG_IP_ROUTE_CLASSID 2595 if (rt->dst.tclassid && 2596 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2597 goto nla_put_failure; 2598 #endif 2599 if (!rt_is_input_route(rt) && 2600 fl4->saddr != src) { 2601 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) 2602 goto nla_put_failure; 2603 } 2604 if (rt->rt_uses_gateway && 2605 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) 2606 goto nla_put_failure; 2607 2608 expires = rt->dst.expires; 2609 if (expires) { 2610 unsigned long now = jiffies; 2611 2612 if (time_before(now, expires)) 2613 expires -= now; 2614 else 2615 expires = 0; 2616 } 2617 2618 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2619 if (rt->rt_pmtu && expires) 2620 metrics[RTAX_MTU - 1] = rt->rt_pmtu; 2621 if (rtnetlink_put_metrics(skb, metrics) < 0) 2622 goto nla_put_failure; 2623 2624 if (fl4->flowi4_mark && 2625 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2626 goto nla_put_failure; 2627 2628 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && 2629 nla_put_u32(skb, RTA_UID, 2630 from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) 2631 goto nla_put_failure; 2632 2633 error = rt->dst.error; 2634 2635 if (rt_is_input_route(rt)) { 2636 #ifdef CONFIG_IP_MROUTE 2637 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2638 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2639 int err = ipmr_get_route(net, skb, 2640 fl4->saddr, fl4->daddr, 2641 r, portid); 2642 2643 if (err <= 0) { 2644 if (err == 0) 2645 return 0; 2646 goto nla_put_failure; 2647 } 2648 } else 2649 #endif 2650 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) 2651 goto nla_put_failure; 2652 } 2653 2654 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2655 goto nla_put_failure; 2656 2657 nlmsg_end(skb, nlh); 2658 return 0; 2659 2660 nla_put_failure: 2661 nlmsg_cancel(skb, nlh); 2662 return -EMSGSIZE; 2663 } 2664 2665 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 2666 struct netlink_ext_ack *extack) 2667 { 2668 struct net *net = sock_net(in_skb->sk); 2669 struct rtmsg *rtm; 2670 struct nlattr *tb[RTA_MAX+1]; 2671 struct fib_result res = {}; 2672 struct rtable *rt = NULL; 2673 struct flowi4 fl4; 2674 __be32 dst = 0; 2675 __be32 src = 0; 2676 u32 iif; 2677 int err; 2678 int mark; 2679 struct sk_buff *skb; 2680 u32 table_id = RT_TABLE_MAIN; 2681 kuid_t uid; 2682 2683 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, 2684 extack); 2685 if (err < 0) 2686 goto errout; 2687 2688 rtm = nlmsg_data(nlh); 2689 2690 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2691 if (!skb) { 2692 err = -ENOBUFS; 2693 goto errout; 2694 } 2695 2696 /* Reserve room for dummy headers, this skb can pass 2697 through good chunk of routing engine. 
2698 */ 2699 skb_reset_mac_header(skb); 2700 skb_reset_network_header(skb); 2701 2702 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2703 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 2704 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2705 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2706 if (tb[RTA_UID]) 2707 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); 2708 else 2709 uid = (iif ? INVALID_UID : current_uid()); 2710 2711 /* Bugfix: need to give ip_route_input enough of an IP header to 2712 * not gag. 2713 */ 2714 ip_hdr(skb)->protocol = IPPROTO_UDP; 2715 ip_hdr(skb)->saddr = src; 2716 ip_hdr(skb)->daddr = dst; 2717 2718 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2719 2720 memset(&fl4, 0, sizeof(fl4)); 2721 fl4.daddr = dst; 2722 fl4.saddr = src; 2723 fl4.flowi4_tos = rtm->rtm_tos; 2724 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2725 fl4.flowi4_mark = mark; 2726 fl4.flowi4_uid = uid; 2727 2728 rcu_read_lock(); 2729 2730 if (iif) { 2731 struct net_device *dev; 2732 2733 dev = dev_get_by_index_rcu(net, iif); 2734 if (!dev) { 2735 err = -ENODEV; 2736 goto errout_free; 2737 } 2738 2739 skb->protocol = htons(ETH_P_IP); 2740 skb->dev = dev; 2741 skb->mark = mark; 2742 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, 2743 dev, &res); 2744 2745 rt = skb_rtable(skb); 2746 if (err == 0 && rt->dst.error) 2747 err = -rt->dst.error; 2748 } else { 2749 fl4.flowi4_iif = LOOPBACK_IFINDEX; 2750 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); 2751 err = 0; 2752 if (IS_ERR(rt)) 2753 err = PTR_ERR(rt); 2754 else 2755 skb_dst_set(skb, &rt->dst); 2756 } 2757 2758 if (err) 2759 goto errout_free; 2760 2761 if (rtm->rtm_flags & RTM_F_NOTIFY) 2762 rt->rt_flags |= RTCF_NOTIFY; 2763 2764 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 2765 table_id = rt->rt_table_id; 2766 2767 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 2768 if (!res.fi) { 2769 err = fib_props[res.type].error; 2770 if (!err) 2771 err = -EHOSTUNREACH; 2772 goto errout_free; 2773 } 2774 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 2775 nlh->nlmsg_seq, RTM_NEWROUTE, table_id, 2776 rt->rt_type, res.prefix, res.prefixlen, 2777 fl4.flowi4_tos, res.fi, 0); 2778 } else { 2779 err = rt_fill_info(net, dst, src, table_id, &fl4, skb, 2780 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); 2781 } 2782 if (err < 0) 2783 goto errout_free; 2784 2785 rcu_read_unlock(); 2786 2787 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 2788 errout: 2789 return err; 2790 2791 errout_free: 2792 rcu_read_unlock(); 2793 kfree_skb(skb); 2794 goto errout; 2795 } 2796 2797 void ip_rt_multicast_event(struct in_device *in_dev) 2798 { 2799 rt_cache_flush(dev_net(in_dev->dev)); 2800 } 2801 2802 #ifdef CONFIG_SYSCTL 2803 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 2804 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 2805 static int ip_rt_gc_elasticity __read_mostly = 8; 2806 2807 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, 2808 void __user *buffer, 2809 size_t *lenp, loff_t *ppos) 2810 { 2811 struct net *net = (struct net *)__ctl->extra1; 2812 2813 if (write) { 2814 rt_cache_flush(net); 2815 fnhe_genid_bump(net); 2816 return 0; 2817 } 2818 2819 return -EINVAL; 2820 } 2821 2822 static struct ctl_table ipv4_route_table[] = { 2823 { 2824 .procname = "gc_thresh", 2825 .data = &ipv4_dst_ops.gc_thresh, 2826 .maxlen = sizeof(int), 2827 .mode = 0644, 2828 .proc_handler = proc_dointvec, 2829 }, 2830 { 2831 .procname = "max_size", 2832 .data = &ip_rt_max_size, 2833 
.maxlen = sizeof(int), 2834 .mode = 0644, 2835 .proc_handler = proc_dointvec, 2836 }, 2837 { 2838 /* Deprecated. Use gc_min_interval_ms */ 2839 2840 .procname = "gc_min_interval", 2841 .data = &ip_rt_gc_min_interval, 2842 .maxlen = sizeof(int), 2843 .mode = 0644, 2844 .proc_handler = proc_dointvec_jiffies, 2845 }, 2846 { 2847 .procname = "gc_min_interval_ms", 2848 .data = &ip_rt_gc_min_interval, 2849 .maxlen = sizeof(int), 2850 .mode = 0644, 2851 .proc_handler = proc_dointvec_ms_jiffies, 2852 }, 2853 { 2854 .procname = "gc_timeout", 2855 .data = &ip_rt_gc_timeout, 2856 .maxlen = sizeof(int), 2857 .mode = 0644, 2858 .proc_handler = proc_dointvec_jiffies, 2859 }, 2860 { 2861 .procname = "gc_interval", 2862 .data = &ip_rt_gc_interval, 2863 .maxlen = sizeof(int), 2864 .mode = 0644, 2865 .proc_handler = proc_dointvec_jiffies, 2866 }, 2867 { 2868 .procname = "redirect_load", 2869 .data = &ip_rt_redirect_load, 2870 .maxlen = sizeof(int), 2871 .mode = 0644, 2872 .proc_handler = proc_dointvec, 2873 }, 2874 { 2875 .procname = "redirect_number", 2876 .data = &ip_rt_redirect_number, 2877 .maxlen = sizeof(int), 2878 .mode = 0644, 2879 .proc_handler = proc_dointvec, 2880 }, 2881 { 2882 .procname = "redirect_silence", 2883 .data = &ip_rt_redirect_silence, 2884 .maxlen = sizeof(int), 2885 .mode = 0644, 2886 .proc_handler = proc_dointvec, 2887 }, 2888 { 2889 .procname = "error_cost", 2890 .data = &ip_rt_error_cost, 2891 .maxlen = sizeof(int), 2892 .mode = 0644, 2893 .proc_handler = proc_dointvec, 2894 }, 2895 { 2896 .procname = "error_burst", 2897 .data = &ip_rt_error_burst, 2898 .maxlen = sizeof(int), 2899 .mode = 0644, 2900 .proc_handler = proc_dointvec, 2901 }, 2902 { 2903 .procname = "gc_elasticity", 2904 .data = &ip_rt_gc_elasticity, 2905 .maxlen = sizeof(int), 2906 .mode = 0644, 2907 .proc_handler = proc_dointvec, 2908 }, 2909 { 2910 .procname = "mtu_expires", 2911 .data = &ip_rt_mtu_expires, 2912 .maxlen = sizeof(int), 2913 .mode = 0644, 2914 .proc_handler = proc_dointvec_jiffies, 2915 }, 2916 { 2917 .procname = "min_pmtu", 2918 .data = &ip_rt_min_pmtu, 2919 .maxlen = sizeof(int), 2920 .mode = 0644, 2921 .proc_handler = proc_dointvec, 2922 }, 2923 { 2924 .procname = "min_adv_mss", 2925 .data = &ip_rt_min_advmss, 2926 .maxlen = sizeof(int), 2927 .mode = 0644, 2928 .proc_handler = proc_dointvec, 2929 }, 2930 { } 2931 }; 2932 2933 static struct ctl_table ipv4_route_flush_table[] = { 2934 { 2935 .procname = "flush", 2936 .maxlen = sizeof(int), 2937 .mode = 0200, 2938 .proc_handler = ipv4_sysctl_rtcache_flush, 2939 }, 2940 { }, 2941 }; 2942 2943 static __net_init int sysctl_route_net_init(struct net *net) 2944 { 2945 struct ctl_table *tbl; 2946 2947 tbl = ipv4_route_flush_table; 2948 if (!net_eq(net, &init_net)) { 2949 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 2950 if (!tbl) 2951 goto err_dup; 2952 2953 /* Don't export sysctls to unprivileged users */ 2954 if (net->user_ns != &init_user_ns) 2955 tbl[0].procname = NULL; 2956 } 2957 tbl[0].extra1 = net; 2958 2959 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 2960 if (!net->ipv4.route_hdr) 2961 goto err_reg; 2962 return 0; 2963 2964 err_reg: 2965 if (tbl != ipv4_route_flush_table) 2966 kfree(tbl); 2967 err_dup: 2968 return -ENOMEM; 2969 } 2970 2971 static __net_exit void sysctl_route_net_exit(struct net *net) 2972 { 2973 struct ctl_table *tbl; 2974 2975 tbl = net->ipv4.route_hdr->ctl_table_arg; 2976 unregister_net_sysctl_table(net->ipv4.route_hdr); 2977 BUG_ON(tbl == ipv4_route_flush_table); 2978 kfree(tbl); 
2979 } 2980 2981 static __net_initdata struct pernet_operations sysctl_route_ops = { 2982 .init = sysctl_route_net_init, 2983 .exit = sysctl_route_net_exit, 2984 .async = true, 2985 }; 2986 #endif 2987 2988 static __net_init int rt_genid_init(struct net *net) 2989 { 2990 atomic_set(&net->ipv4.rt_genid, 0); 2991 atomic_set(&net->fnhe_genid, 0); 2992 atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); 2993 return 0; 2994 } 2995 2996 static __net_initdata struct pernet_operations rt_genid_ops = { 2997 .init = rt_genid_init, 2998 .async = true, 2999 }; 3000 3001 static int __net_init ipv4_inetpeer_init(struct net *net) 3002 { 3003 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 3004 3005 if (!bp) 3006 return -ENOMEM; 3007 inet_peer_base_init(bp); 3008 net->ipv4.peers = bp; 3009 return 0; 3010 } 3011 3012 static void __net_exit ipv4_inetpeer_exit(struct net *net) 3013 { 3014 struct inet_peer_base *bp = net->ipv4.peers; 3015 3016 net->ipv4.peers = NULL; 3017 inetpeer_invalidate_tree(bp); 3018 kfree(bp); 3019 } 3020 3021 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { 3022 .init = ipv4_inetpeer_init, 3023 .exit = ipv4_inetpeer_exit, 3024 .async = true, 3025 }; 3026 3027 #ifdef CONFIG_IP_ROUTE_CLASSID 3028 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3029 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3030 3031 int __init ip_rt_init(void) 3032 { 3033 int cpu; 3034 3035 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); 3036 if (!ip_idents) 3037 panic("IP: failed to allocate ip_idents\n"); 3038 3039 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 3040 3041 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); 3042 if (!ip_tstamps) 3043 panic("IP: failed to allocate ip_tstamps\n"); 3044 3045 for_each_possible_cpu(cpu) { 3046 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); 3047 3048 INIT_LIST_HEAD(&ul->head); 3049 spin_lock_init(&ul->lock); 3050 } 3051 #ifdef CONFIG_IP_ROUTE_CLASSID 3052 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3053 if (!ip_rt_acct) 3054 panic("IP: failed to allocate ip_rt_acct\n"); 3055 #endif 3056 3057 ipv4_dst_ops.kmem_cachep = 3058 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3059 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3060 3061 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3062 3063 if (dst_entries_init(&ipv4_dst_ops) < 0) 3064 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3065 3066 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3067 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3068 3069 ipv4_dst_ops.gc_thresh = ~0; 3070 ip_rt_max_size = INT_MAX; 3071 3072 devinet_init(); 3073 ip_fib_init(); 3074 3075 if (ip_rt_proc_init()) 3076 pr_err("Unable to create route proc files\n"); 3077 #ifdef CONFIG_XFRM 3078 xfrm_init(); 3079 xfrm4_init(); 3080 #endif 3081 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 3082 RTNL_FLAG_DOIT_UNLOCKED); 3083 3084 #ifdef CONFIG_SYSCTL 3085 register_pernet_subsys(&sysctl_route_ops); 3086 #endif 3087 register_pernet_subsys(&rt_genid_ops); 3088 register_pernet_subsys(&ipv4_inetpeer_ops); 3089 return 0; 3090 } 3091 3092 #ifdef CONFIG_SYSCTL 3093 /* 3094 * We really need to sanitize the damn ipv4 init order, then all 3095 * this nonsense will go away. 3096 */ 3097 void __init ip_static_sysctl_init(void) 3098 { 3099 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3100 } 3101 #endif 3102
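
/*
 * A minimal usage sketch: how a caller (for instance an out-of-tree module)
 * might resolve an IPv4 output route with ip_route_output_flow(), which is
 * implemented and exported above, and then release it with ip_rt_put().
 * The function name example_resolve_route() and its parameters are
 * hypothetical, and the block is guarded by "#if 0" so it is never compiled
 * as part of route.c.
 */
#if 0
static int example_resolve_route(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;		/* destination to resolve */
	fl4.saddr = saddr;		/* may be 0; a source is then selected */
	fl4.flowi4_proto = IPPROTO_UDP;	/* setting a proto enables the xfrm
					 * policy lookup in ip_route_output_flow()
					 */

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* rt->dst.dev is the egress device; rt_gateway is the next hop, if any */
	pr_debug("%pI4 via dev %s\n", &fl4.daddr, rt->dst.dev->name);

	ip_rt_put(rt);			/* drop the reference taken by the lookup */
	return 0;
}
#endif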