/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

/* CONFIG_IP_PIMSM is a convenience alias: set when either PIM-SM v1 or v2
 * support is configured, so register-VIF code only needs one guard. */
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM	1
#endif

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *	Multicast router control variables
 */

/* A VIF slot is "live" iff it has a device bound to it. */
#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)

static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to original Alan's scheme. Hash table of resolved
   entries is changed only in process context and protected
   with weak lock mrt_lock. Queue of unresolved entries is protected
   with strong spinlock mfc_unres_lock.

   In this case data path is free of exclusive locks at all.
 */

/* Slab cache for struct mfc_cache entries (resolved and unresolved). */
static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct net *net,
			     struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

/* Single expiry timer shared by the whole unresolved queue. */
static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

/*
 * ipmr_del_tunnel - destroy the "dvmrp%d" IPIP tunnel backing a tunnel VIF.
 * @dev: the tunnel device being removed (closed first)
 * @v:   the vifctl that originally described the tunnel endpoints
 *
 * Rebuilds the same ip_tunnel_parm that ipmr_new_tunnel() used and asks
 * the "tunl0" master device to delete it via SIOCDELTUNNEL.
 */
static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
	struct net *net = dev_net(dev);

	dev_close(dev);

	dev = __dev_get_by_name(net, "tunl0");
	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			/* The ioctl expects a user pointer; temporarily widen
			 * the address limit so a kernel buffer is accepted. */
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
			set_fs(oldfs);
		}
	}
}

/*
 * ipmr_new_tunnel - create a "dvmrp%d" IPIP tunnel device for a tunnel VIF.
 * @net: namespace to create the tunnel in
 * @v:   vifctl carrying local/remote endpoints and the VIF index
 *
 * Asks the "tunl0" master device (SIOCADDTUNNEL) to create the tunnel,
 * then looks the new device up by name, enables multicast and opens it.
 * Returns the held device on success, NULL on failure.
 * Called under RTNL.
 */
static
struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
	struct net_device *dev;

	dev = __dev_get_by_name(net, "tunl0");

	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		int err;
		struct ifreq ifr;
		struct ip_tunnel_parm p;
		struct in_device *in_dev;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

set_fs(KERNEL_DS); 164 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 165 set_fs(oldfs); 166 } else 167 err = -EOPNOTSUPP; 168 169 dev = NULL; 170 171 if (err == 0 && 172 (dev = __dev_get_by_name(net, p.name)) != NULL) { 173 dev->flags |= IFF_MULTICAST; 174 175 in_dev = __in_dev_get_rtnl(dev); 176 if (in_dev == NULL) 177 goto failure; 178 179 ipv4_devconf_setall(in_dev); 180 IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; 181 182 if (dev_open(dev)) 183 goto failure; 184 dev_hold(dev); 185 } 186 } 187 return dev; 188 189 failure: 190 /* allow the register to be completed before unregistering. */ 191 rtnl_unlock(); 192 rtnl_lock(); 193 194 unregister_netdevice(dev); 195 return NULL; 196 } 197 198 #ifdef CONFIG_IP_PIMSM 199 200 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) 201 { 202 struct net *net = dev_net(dev); 203 204 read_lock(&mrt_lock); 205 dev->stats.tx_bytes += skb->len; 206 dev->stats.tx_packets++; 207 ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num, 208 IGMPMSG_WHOLEPKT); 209 read_unlock(&mrt_lock); 210 kfree_skb(skb); 211 return NETDEV_TX_OK; 212 } 213 214 static const struct net_device_ops reg_vif_netdev_ops = { 215 .ndo_start_xmit = reg_vif_xmit, 216 }; 217 218 static void reg_vif_setup(struct net_device *dev) 219 { 220 dev->type = ARPHRD_PIMREG; 221 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; 222 dev->flags = IFF_NOARP; 223 dev->netdev_ops = ®_vif_netdev_ops, 224 dev->destructor = free_netdev; 225 dev->features |= NETIF_F_NETNS_LOCAL; 226 } 227 228 static struct net_device *ipmr_reg_vif(struct net *net) 229 { 230 struct net_device *dev; 231 struct in_device *in_dev; 232 233 dev = alloc_netdev(0, "pimreg", reg_vif_setup); 234 235 if (dev == NULL) 236 return NULL; 237 238 dev_net_set(dev, net); 239 240 if (register_netdevice(dev)) { 241 free_netdev(dev); 242 return NULL; 243 } 244 dev->iflink = 0; 245 246 rcu_read_lock(); 247 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { 248 rcu_read_unlock(); 249 goto 
failure; 250 } 251 252 ipv4_devconf_setall(in_dev); 253 IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; 254 rcu_read_unlock(); 255 256 if (dev_open(dev)) 257 goto failure; 258 259 dev_hold(dev); 260 261 return dev; 262 263 failure: 264 /* allow the register to be completed before unregistering. */ 265 rtnl_unlock(); 266 rtnl_lock(); 267 268 unregister_netdevice(dev); 269 return NULL; 270 } 271 #endif 272 273 /* 274 * Delete a VIF entry 275 * @notify: Set to 1, if the caller is a notifier_call 276 */ 277 278 static int vif_delete(struct net *net, int vifi, int notify) 279 { 280 struct vif_device *v; 281 struct net_device *dev; 282 struct in_device *in_dev; 283 284 if (vifi < 0 || vifi >= net->ipv4.maxvif) 285 return -EADDRNOTAVAIL; 286 287 v = &net->ipv4.vif_table[vifi]; 288 289 write_lock_bh(&mrt_lock); 290 dev = v->dev; 291 v->dev = NULL; 292 293 if (!dev) { 294 write_unlock_bh(&mrt_lock); 295 return -EADDRNOTAVAIL; 296 } 297 298 #ifdef CONFIG_IP_PIMSM 299 if (vifi == net->ipv4.mroute_reg_vif_num) 300 net->ipv4.mroute_reg_vif_num = -1; 301 #endif 302 303 if (vifi+1 == net->ipv4.maxvif) { 304 int tmp; 305 for (tmp=vifi-1; tmp>=0; tmp--) { 306 if (VIF_EXISTS(net, tmp)) 307 break; 308 } 309 net->ipv4.maxvif = tmp+1; 310 } 311 312 write_unlock_bh(&mrt_lock); 313 314 dev_set_allmulti(dev, -1); 315 316 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 317 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 318 ip_rt_multicast_event(in_dev); 319 } 320 321 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 322 unregister_netdevice(dev); 323 324 dev_put(dev); 325 return 0; 326 } 327 328 static inline void ipmr_cache_free(struct mfc_cache *c) 329 { 330 release_net(mfc_net(c)); 331 kmem_cache_free(mrt_cachep, c); 332 } 333 334 /* Destroy an unresolved cache entry, killing queued skbs 335 and reporting error to netlink readers. 
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;
	struct net *net = mfc_net(c);

	atomic_dec(&net->ipv4.cache_resolve_queue_len);

	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			/* version==0 marks a queued netlink RTM_GETROUTE
			 * request (see ipmr_cache_unresolved callers);
			 * answer it with -ETIMEDOUT. */
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = NLMSG_DATA(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));

			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
		} else
			kfree_skb(skb);
	}

	ipmr_cache_free(c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
	unsigned long now;
	unsigned long expires;
	struct mfc_cache *c, **cp;

	/* Timer runs in softirq; never spin on the queue lock here,
	 * just retry shortly. */
	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
		return;
	}

	if (mfc_unres_queue == NULL)
		goto out;

	now = jiffies;
	expires = 10*HZ;	/* upper bound for the next re-arm */
	cp = &mfc_unres_queue;

	while ((c=*cp) != NULL) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			/* Not yet due: remember the soonest deadline. */
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			cp = &c->next;
			continue;
		}

		/* Expired: unlink and destroy (frees queued skbs too). */
		*cp = c->next;

		ipmr_destroy_unres(c);
	}

	if (mfc_unres_queue != NULL)
		mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock.
 */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
	int vifi;
	struct net *net = mfc_net(cache);

	/* Start from "no output interfaces"; 255 means "never forward". */
	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

	for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) {
		if (VIF_EXISTS(net, vifi) &&
		    ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}

/*
 * vif_add - install a new virtual interface from a MRT_ADD_VIF request.
 * @net:     namespace
 * @vifc:    user-supplied VIF description (index, flags, addresses, ...)
 * @mrtsock: non-zero when the request came from the mroute socket itself;
 *           otherwise the VIF is marked VIFF_STATIC
 *
 * Called under RTNL.  Returns 0 or a negative errno.
 */
static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
{
	int vifi = vifc->vifc_vifi;
	struct vif_device *v = &net->ipv4.vif_table[vifi];
	struct net_device *dev;
	struct in_device *in_dev;
	int err;

	/* Is vif busy ? */
	if (VIF_EXISTS(net, vifi))
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
	case VIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (net->ipv4.mroute_reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ipmr_reg_vif(net);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
			dev_put(dev);
			return err;
		}
		break;
#endif
	case VIFF_TUNNEL:
		dev = ipmr_new_tunnel(net, vifc);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			ipmr_del_tunnel(dev, vifc);
			dev_put(dev);
			return err;
		}
		break;
	case 0:
		/* Plain physical interface, located by its local address. */
		dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
		if (!dev)
			return -EADDRNOTAVAIL;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			dev_put(dev);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
		dev_put(dev);
		return -EADDRNOTAVAIL;
	}
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
	ip_rt_multicast_event(in_dev);

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit = vifc->vifc_rate_limit;
	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;
	v->flags = vifc->vifc_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold = vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	/* For devices we created ourselves, report the underlying link. */
	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		v->link = dev->iflink;

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	v->dev = dev;	/* publishing dev makes the slot live (VIF_EXISTS) */
#ifdef CONFIG_IP_PIMSM
	if (v->flags&VIFF_REGISTER)
		net->ipv4.mroute_reg_vif_num = vifi;
#endif
	if (vifi+1 > net->ipv4.maxvif)
		net->ipv4.maxvif = vifi+1;
	write_unlock_bh(&mrt_lock);
	return 0;
}

/*
 * ipmr_cache_find - look up a resolved (origin, group) route.
 * Caller must hold mrt_lock (read suffices); returns NULL if absent.
 */
static struct mfc_cache *ipmr_cache_find(struct net *net,
					 __be32 origin,
					 __be32 mcastgrp)
{
	int line = MFC_HASH(mcastgrp, origin);
	struct mfc_cache *c;

	for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) {
		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
			break;
	}
	return c;
}

/*
 *	Allocate a multicast cache entry
 *	(process context: may sleep in the slab allocator)
 */
static struct mfc_cache *ipmr_cache_alloc(struct net *net)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (c == NULL)
		return NULL;
	c->mfc_un.res.minvif = MAXVIFS;
	mfc_net_set(c, net);
	return c;
}

/* Atomic variant used from the data path for unresolved entries;
 * entry expires 10s after creation unless resolved. */
static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (c == NULL)
		return NULL;
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10*HZ;
	mfc_net_set(c, net);
	return c;
}

/*
 *	A cache entry has gone into a resolved state from queued
 *
 *	Drain the skbs that were parked on the unresolved entry @uc:
 *	queued netlink route queries (ip version==0 marker) are answered
 *	from the now-resolved entry @c, real packets are forwarded.
 */

static void
ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/*
	 *	Play the pending entries through our router
	 */

	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = (skb_tail_pointer(skb) -
						  (u8 *)nlh);
			} else {
				/* Route didn't fit: reply with an error. */
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = NLMSG_DATA(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}

			rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid);
		} else
			ip_mr_forward(skb, c, 0);
	}
}

/*
 *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */

static int ipmr_cache_report(struct net *net,
			     struct sk_buff *pkt, vifi_t vifi, int assert)
{
	struct sk_buff *skb;
	const int ihl = ip_hdrlen(pkt);
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	int ret;

#ifdef CONFIG_IP_PIMSM
	/* WHOLEPKT: ship a copy of the entire packet with room for an
	 * extra pseudo IP header in front. */
	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
#endif
		skb = alloc_skb(128, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix ihl, length etc.
		   And all this only to mangle msg->im_msgtype and
		   to set msg->im_mbz to "mbz" :-)
		 */
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
		skb_reset_transport_header(skb);
		msg = (struct igmpmsg *)skb_network_header(skb);
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
		msg->im_vif = net->ipv4.mroute_reg_vif_num;
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
	} else
#endif
	{

	/*
	 *	Copy the IP header
	 */

	skb->network_header = skb->tail;
	skb_put(skb, ihl);
	skb_copy_to_linear_data(skb, pkt->data, ihl);
	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
	msg = (struct igmpmsg *)skb_network_header(skb);
	msg->im_vif = vifi;
	skb_dst_set(skb, dst_clone(skb_dst(pkt)));

	/*
	 *	Add our header
	 */

	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
	igmp->type	=
	msg->im_msgtype = assert;
	igmp->code	=	0;
	ip_hdr(skb)->tot_len = htons(skb->len);		/* Fix the length */
	skb->transport_header = skb->network_header;
	}

	if (net->ipv4.mroute_sk == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to mrouted
	 */
	ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb);
	if (ret < 0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}

/*
 *	Queue a packet for resolution. It gets locked cache entry!
 */

static int
ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
{
	int err;
	struct mfc_cache *c;
	const struct iphdr *iph = ip_hdr(skb);

	spin_lock_bh(&mfc_unres_lock);
	for (c=mfc_unres_queue; c; c=c->next) {
		if (net_eq(mfc_net(c), net) &&
		    c->mfc_mcastgrp == iph->daddr &&
		    c->mfc_origin == iph->saddr)
			break;
	}

	if (c == NULL) {
		/*
		 *	Create a new entry if allowable
		 *	(at most 10 outstanding unresolved entries)
		 */

		if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 ||
		    (c = ipmr_cache_alloc_unres(net)) == NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/*
		 *	Reflect first query at mrouted.
		 */
		err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE);
		if (err < 0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&net->ipv4.cache_resolve_queue_len);
		c->next = mfc_unres_queue;
		mfc_unres_queue = c;

		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/*
	 *	See if we can append the packet
	 *	(cap the per-entry backlog at 4 skbs)
	 */
	if (c->mfc_un.unres.unresolved.qlen>3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

/*
 *	MFC cache manipulation by user space mroute daemon
 */

/* Remove a resolved (origin, group) entry; MRT_DEL_MFC. Under RTNL. */
static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
{
	int line;
	struct mfc_cache *c, **cp;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp = &net->ipv4.mfc_cache_array[line];
	     (c = *cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			/* Unlink under the writer lock so readers on the
			 * data path never see a dangling pointer. */
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
			return 0;
		}
	}
	return -ENOENT;
}

/*
 * ipmr_mfc_add - install or update an (origin, group) route; MRT_ADD_MFC.
 * @mrtsock: non-zero when the request came from the mroute socket;
 *           otherwise the entry is marked MFC_STATIC.
 *
 * If a matching unresolved entry existed, its queued packets are
 * released through the new route.  Called under RTNL.
 */
static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
{
	int line;
	struct mfc_cache *uc, *c, **cp;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp = &net->ipv4.mfc_cache_array[line];
	     (c = *cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
			break;
	}

	if (c != NULL) {
		/* Update in place. */
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(c, mfc->mfcc_ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
		return -EINVAL;

	c = ipmr_cache_alloc(net);
	if (c == NULL)
		return -ENOMEM;

	c->mfc_origin = mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent = mfc->mfcc_parent;
	ipmr_update_thresholds(c, mfc->mfcc_ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	c->next = net->ipv4.mfc_cache_array[line];
	net->ipv4.mfc_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check to see if we resolved a queued list. If so we
	 *	need to send on the frames and tidy up.
	 */
	spin_lock_bh(&mfc_unres_lock);
	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
	     cp = &uc->next) {
		if (net_eq(mfc_net(uc), net) &&
		    uc->mfc_origin == c->mfc_origin &&
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
			*cp = uc->next;
			atomic_dec(&net->ipv4.cache_resolve_queue_len);
			break;
		}
	}
	if (mfc_unres_queue == NULL)
		del_timer(&ipmr_expire_timer);
	spin_unlock_bh(&mfc_unres_lock);

	if (uc) {
		ipmr_cache_resolve(uc, c);
		ipmr_cache_free(uc);
	}
	return 0;
}

/*
 *	Close the multicast socket, and clear the vif tables etc
 *	(non-static entries survive).  Called under RTNL.
 */

static void mroute_clean_tables(struct net *net)
{
	int i;

	/*
	 *	Shut down all active vif entries
	 */
	for (i = 0; i < net->ipv4.maxvif; i++) {
		if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
			vif_delete(net, i, 0);
	}

	/*
	 *	Wipe the cache
	 */
	for (i=0; i<MFC_LINES; i++) {
		struct mfc_cache *c, **cp;

		cp = &net->ipv4.mfc_cache_array[i];
		while ((c = *cp) != NULL) {
			if (c->mfc_flags&MFC_STATIC) {
				cp = &c->next;
				continue;
			}
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
		}
	}

	/* Flush this namespace's unresolved entries as well. */
	if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) {
		struct mfc_cache *c, **cp;

		spin_lock_bh(&mfc_unres_lock);
		cp = &mfc_unres_queue;
		while ((c = *cp) != NULL) {
			if (!net_eq(mfc_net(c), net)) {
				cp = &c->next;
				continue;
			}
			*cp = c->next;

			ipmr_destroy_unres(c);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

/* Destructor installed by MRT_INIT via ip_ra_control(); runs when the
 * mroute socket goes away and undoes everything MRT_INIT set up. */
static void mrtsock_destruct(struct sock *sk)
{
	struct net *net = sock_net(sk);

	rtnl_lock();
	if (sk == net->ipv4.mroute_sk) {
		IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;

		write_lock_bh(&mrt_lock);
		net->ipv4.mroute_sk = NULL;
		write_unlock_bh(&mrt_lock);

		mroute_clean_tables(net);
	}
	rtnl_unlock();
}

/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */

/*
 * ip_mroute_setsockopt - handle the MRT_* setsockopt family.
 *
 * Only the registered mroute socket (or CAP_NET_ADMIN) may issue
 * anything other than MRT_INIT.  Returns 0 or a negative errno.
 */
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
{
	int ret;
	struct vifctl vif;
	struct mfcctl mfc;
	struct net *net = sock_net(sk);

	if (optname != MRT_INIT) {
		if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch (optname) {
	case MRT_INIT:
		/* Only a raw IGMP socket may become the mroute socket. */
		if (sk->sk_type != SOCK_RAW ||
		    inet_sk(sk)->num != IPPROTO_IGMP)
			return -EOPNOTSUPP;
		if (optlen != sizeof(int))
			return -ENOPROTOOPT;

		rtnl_lock();
		if (net->ipv4.mroute_sk) {
			rtnl_unlock();
			return -EADDRINUSE;
		}

		/* mrtsock_destruct() will clean up when the socket dies. */
		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
			write_lock_bh(&mrt_lock);
			net->ipv4.mroute_sk = sk;
			write_unlock_bh(&mrt_lock);

			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
		}
		rtnl_unlock();
		return ret;
	case MRT_DONE:
		if (sk != net->ipv4.mroute_sk)
			return -EACCES;
		return ip_ra_control(sk, 0, NULL);
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
		if (optlen != sizeof(vif))
			return -EINVAL;
		if (copy_from_user(&vif, optval, sizeof(vif)))
			return -EFAULT;
		if (vif.vifc_vifi >= MAXVIFS)
			return -ENFILE;
		rtnl_lock();
		if (optname == MRT_ADD_VIF) {
			ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
		} else {
			ret = vif_delete(net, vif.vifc_vifi, 0);
		}
		rtnl_unlock();
		return ret;

		/*
		 *	Manipulate the forwarding caches. These live
		 *	in a sort of kernel/user symbiosis.
		 */
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		if (optlen != sizeof(mfc))
			return -EINVAL;
		if (copy_from_user(&mfc, optval, sizeof(mfc)))
			return -EFAULT;
		rtnl_lock();
		if (optname == MRT_DEL_MFC)
			ret = ipmr_mfc_delete(net, &mfc);
		else
			ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk);
		rtnl_unlock();
		return ret;
		/*
		 *	Control PIM assert.
		 */
	case MRT_ASSERT:
	{
		int v;
		if (get_user(v,(int __user *)optval))
			return -EFAULT;
		net->ipv4.mroute_do_assert = (v) ? 1 : 0;
		return 0;
	}
#ifdef CONFIG_IP_PIMSM
	case MRT_PIM:
	{
		int v;

		if (get_user(v,(int __user *)optval))
			return -EFAULT;
		v = (v) ? 1 : 0;

		rtnl_lock();
		ret = 0;
		/* Enabling PIM mode implies assert mode as well. */
		if (v != net->ipv4.mroute_do_pim) {
			net->ipv4.mroute_do_pim = v;
			net->ipv4.mroute_do_assert = v;
		}
		rtnl_unlock();
		return ret;
	}
#endif
	/*
	 *	Spurious command, or MRT_VERSION which you cannot
	 *	set.
	 */
	default:
		return -ENOPROTOOPT;
	}
}

/*
 *	Getsock opt support for the multicast routing system.
1051 */ 1052 1053 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) 1054 { 1055 int olr; 1056 int val; 1057 struct net *net = sock_net(sk); 1058 1059 if (optname != MRT_VERSION && 1060 #ifdef CONFIG_IP_PIMSM 1061 optname!=MRT_PIM && 1062 #endif 1063 optname!=MRT_ASSERT) 1064 return -ENOPROTOOPT; 1065 1066 if (get_user(olr, optlen)) 1067 return -EFAULT; 1068 1069 olr = min_t(unsigned int, olr, sizeof(int)); 1070 if (olr < 0) 1071 return -EINVAL; 1072 1073 if (put_user(olr, optlen)) 1074 return -EFAULT; 1075 if (optname == MRT_VERSION) 1076 val = 0x0305; 1077 #ifdef CONFIG_IP_PIMSM 1078 else if (optname == MRT_PIM) 1079 val = net->ipv4.mroute_do_pim; 1080 #endif 1081 else 1082 val = net->ipv4.mroute_do_assert; 1083 if (copy_to_user(optval, &val, olr)) 1084 return -EFAULT; 1085 return 0; 1086 } 1087 1088 /* 1089 * The IP multicast ioctl support routines. 1090 */ 1091 1092 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) 1093 { 1094 struct sioc_sg_req sr; 1095 struct sioc_vif_req vr; 1096 struct vif_device *vif; 1097 struct mfc_cache *c; 1098 struct net *net = sock_net(sk); 1099 1100 switch (cmd) { 1101 case SIOCGETVIFCNT: 1102 if (copy_from_user(&vr, arg, sizeof(vr))) 1103 return -EFAULT; 1104 if (vr.vifi >= net->ipv4.maxvif) 1105 return -EINVAL; 1106 read_lock(&mrt_lock); 1107 vif = &net->ipv4.vif_table[vr.vifi]; 1108 if (VIF_EXISTS(net, vr.vifi)) { 1109 vr.icount = vif->pkt_in; 1110 vr.ocount = vif->pkt_out; 1111 vr.ibytes = vif->bytes_in; 1112 vr.obytes = vif->bytes_out; 1113 read_unlock(&mrt_lock); 1114 1115 if (copy_to_user(arg, &vr, sizeof(vr))) 1116 return -EFAULT; 1117 return 0; 1118 } 1119 read_unlock(&mrt_lock); 1120 return -EADDRNOTAVAIL; 1121 case SIOCGETSGCNT: 1122 if (copy_from_user(&sr, arg, sizeof(sr))) 1123 return -EFAULT; 1124 1125 read_lock(&mrt_lock); 1126 c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr); 1127 if (c) { 1128 sr.pktcnt = c->mfc_un.res.pkt; 1129 sr.bytecnt = 
c->mfc_un.res.bytes; 1130 sr.wrong_if = c->mfc_un.res.wrong_if; 1131 read_unlock(&mrt_lock); 1132 1133 if (copy_to_user(arg, &sr, sizeof(sr))) 1134 return -EFAULT; 1135 return 0; 1136 } 1137 read_unlock(&mrt_lock); 1138 return -EADDRNOTAVAIL; 1139 default: 1140 return -ENOIOCTLCMD; 1141 } 1142 } 1143 1144 1145 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) 1146 { 1147 struct net_device *dev = ptr; 1148 struct net *net = dev_net(dev); 1149 struct vif_device *v; 1150 int ct; 1151 1152 if (!net_eq(dev_net(dev), net)) 1153 return NOTIFY_DONE; 1154 1155 if (event != NETDEV_UNREGISTER) 1156 return NOTIFY_DONE; 1157 v = &net->ipv4.vif_table[0]; 1158 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { 1159 if (v->dev == dev) 1160 vif_delete(net, ct, 1); 1161 } 1162 return NOTIFY_DONE; 1163 } 1164 1165 1166 static struct notifier_block ip_mr_notifier = { 1167 .notifier_call = ipmr_device_event, 1168 }; 1169 1170 /* 1171 * Encapsulate a packet by attaching a valid IPIP header to it. 1172 * This avoids tunnel drivers and other mess and gives us the speed so 1173 * important for multicast video. 
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct iphdr *iph;
	struct iphdr *old_iph = ip_hdr(skb);

	/* Prepend a fresh outer IPIP header; the old header becomes the
	 * transport payload. */
	skb_push(skb, sizeof(struct iphdr));
	skb->transport_header = skb->network_header;
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);

	iph->version	=	4;
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
	ip_select_ident(iph, skb_dst(skb), NULL);
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}

/* Final step after the NF_INET_FORWARD hook: account, process IP
 * options, and hand the skb to the output path. */
static inline int ipmr_forward_finish(struct sk_buff *skb)
{
	struct ip_options * opt	= &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}

/*
 *	Processing handlers for ipmr_forward
 *
 *	ipmr_queue_xmit - send one copy of @skb out VIF @vifi.
 *	Consumes the skb on every path (success or failure).
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	struct net *net = mfc_net(c);
	const struct iphdr *iph = ip_hdr(skb);
	struct vif_device *vif = &net->ipv4.vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	int    encap = 0;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IP_PIMSM
	if (vif->flags & VIFF_REGISTER) {
		/* Register VIF: never transmitted, only reported upward. */
		vif->pkt_out++;
		vif->bytes_out += skb->len;
		vif->dev->stats.tx_bytes += skb->len;
		vif->dev->stats.tx_packets++;
		ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT);
		goto out_free;
	}
#endif

	if (vif->flags&VIFF_TUNNEL) {
		/* Route to the tunnel remote endpoint; leave room for the
		 * IPIP header we will prepend. */
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = vif->remote,
						.saddr = vif->local,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(net, &rt, &fl))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(net, &rt, &fl))
			goto out_free;
	}

	dev = rt->u.dst.dev;

	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		   allow to send ICMP, so that packets will disappear
		   to blackhole.
		 */

		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out += skb->len;

	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->u.dst);
	ip_decrease_ttl(ip_hdr(skb));

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		vif->dev->stats.tx_packets++;
		vif->dev->stats.tx_bytes += skb->len;
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/*
	 * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
	 * not only before forwarding, but after forwarding on all output
	 * interfaces. It is clear, if mrouter runs a multicasting
	 * program, it should receive packets not depending to what interface
	 * program is joined.
	 * If we will not make it, the program will have to join on all
	 * interfaces. On the other hand, multihoming host (or router, but
	 * not mrouter) cannot join to more than one interface - it will
	 * result in receiving multiple packets.
	 */
	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
	return;
}

/* Map a device back to its VIF index; -1 if the device has no VIF. */
static int ipmr_find_vif(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	int ct;
	for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) {
		if (net->ipv4.vif_table[ct].dev == dev)
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
	int psend = -1;
	int vif, ct;
	struct net *net = mfc_net(cache);

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

	/*
	 * Wrong interface: drop packet and (maybe) send PIM assert.
	 */
	if (net->ipv4.vif_table[vif].dev != skb->dev) {
		int true_vifi;

		if (skb_rtable(skb)->fl.iif == 0) {
			/* It is our own packet, looped back.
			   Very complicated situation...

			   The best workaround until routing daemons will be
			   fixed is not to redistribute packet, if it was
			   send through wrong interface. It means, that
			   multicast applications WILL NOT work for
			   (S,G), which have default multicast route pointing
			   to wrong oif. In any case, it is not a good
			   idea to use multicasting applications on router.
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;
		true_vifi = ipmr_find_vif(skb->dev);

		if (true_vifi >= 0 && net->ipv4.mroute_do_assert &&
		    /* pimsm uses asserts, when switching from RPT to SPT,
		       so that we cannot check that packet arrived on an oif.
		       It is bad, but otherwise we would need to move pretty
		       large chunk of pimd to kernel. Ough... --ANK
		     */
		    (net->ipv4.mroute_do_pim ||
		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies,
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF);
		}
		goto dont_forward;
	}

	net->ipv4.vif_table[vif].pkt_in++;
	net->ipv4.vif_table[vif].bytes_in += skb->len;

	/*
	 *	Forward the frame
	 *	(clone for every oif but the last; the last one either takes
	 *	the original skb or, for local delivery, a final clone)
	 */
	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					ipmr_queue_xmit(skb2, cache, psend);
			}
			psend = ct;
		}
	}
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (skb2)
				ipmr_queue_xmit(skb2, cache, psend);
		} else {
			ipmr_queue_xmit(skb, cache, psend);
			return 0;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
	return 0;
}


/*
 *	Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	struct net *net = dev_net(skb->dev);
	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;

	/* Packet is looped back after forward, it should not be
	   forwarded second time, but still can be delivered locally.
1428 */ 1429 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1430 goto dont_forward; 1431 1432 if (!local) { 1433 if (IPCB(skb)->opt.router_alert) { 1434 if (ip_call_ra_chain(skb)) 1435 return 0; 1436 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ 1437 /* IGMPv1 (and broken IGMPv2 implementations sort of 1438 Cisco IOS <= 11.2(8)) do not put router alert 1439 option to IGMP packets destined to routable 1440 groups. It is very bad, because it means 1441 that we can forward NO IGMP messages. 1442 */ 1443 read_lock(&mrt_lock); 1444 if (net->ipv4.mroute_sk) { 1445 nf_reset(skb); 1446 raw_rcv(net->ipv4.mroute_sk, skb); 1447 read_unlock(&mrt_lock); 1448 return 0; 1449 } 1450 read_unlock(&mrt_lock); 1451 } 1452 } 1453 1454 read_lock(&mrt_lock); 1455 cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1456 1457 /* 1458 * No usable cache entry 1459 */ 1460 if (cache == NULL) { 1461 int vif; 1462 1463 if (local) { 1464 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1465 ip_local_deliver(skb); 1466 if (skb2 == NULL) { 1467 read_unlock(&mrt_lock); 1468 return -ENOBUFS; 1469 } 1470 skb = skb2; 1471 } 1472 1473 vif = ipmr_find_vif(skb->dev); 1474 if (vif >= 0) { 1475 int err = ipmr_cache_unresolved(net, vif, skb); 1476 read_unlock(&mrt_lock); 1477 1478 return err; 1479 } 1480 read_unlock(&mrt_lock); 1481 kfree_skb(skb); 1482 return -ENODEV; 1483 } 1484 1485 ip_mr_forward(skb, cache, local); 1486 1487 read_unlock(&mrt_lock); 1488 1489 if (local) 1490 return ip_local_deliver(skb); 1491 1492 return 0; 1493 1494 dont_forward: 1495 if (local) 1496 return ip_local_deliver(skb); 1497 kfree_skb(skb); 1498 return 0; 1499 } 1500 1501 #ifdef CONFIG_IP_PIMSM 1502 static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) 1503 { 1504 struct net_device *reg_dev = NULL; 1505 struct iphdr *encap; 1506 struct net *net = dev_net(skb->dev); 1507 1508 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1509 /* 1510 Check that: 1511 a. 
packet is really destinted to a multicast group 1512 b. packet is not a NULL-REGISTER 1513 c. packet is not truncated 1514 */ 1515 if (!ipv4_is_multicast(encap->daddr) || 1516 encap->tot_len == 0 || 1517 ntohs(encap->tot_len) + pimlen > skb->len) 1518 return 1; 1519 1520 read_lock(&mrt_lock); 1521 if (net->ipv4.mroute_reg_vif_num >= 0) 1522 reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev; 1523 if (reg_dev) 1524 dev_hold(reg_dev); 1525 read_unlock(&mrt_lock); 1526 1527 if (reg_dev == NULL) 1528 return 1; 1529 1530 skb->mac_header = skb->network_header; 1531 skb_pull(skb, (u8*)encap - skb->data); 1532 skb_reset_network_header(skb); 1533 skb->dev = reg_dev; 1534 skb->protocol = htons(ETH_P_IP); 1535 skb->ip_summed = 0; 1536 skb->pkt_type = PACKET_HOST; 1537 skb_dst_drop(skb); 1538 reg_dev->stats.rx_bytes += skb->len; 1539 reg_dev->stats.rx_packets++; 1540 nf_reset(skb); 1541 netif_rx(skb); 1542 dev_put(reg_dev); 1543 1544 return 0; 1545 } 1546 #endif 1547 1548 #ifdef CONFIG_IP_PIMSM_V1 1549 /* 1550 * Handle IGMP messages of PIMv1 1551 */ 1552 1553 int pim_rcv_v1(struct sk_buff * skb) 1554 { 1555 struct igmphdr *pim; 1556 struct net *net = dev_net(skb->dev); 1557 1558 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) 1559 goto drop; 1560 1561 pim = igmp_hdr(skb); 1562 1563 if (!net->ipv4.mroute_do_pim || 1564 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1565 goto drop; 1566 1567 if (__pim_rcv(skb, sizeof(*pim))) { 1568 drop: 1569 kfree_skb(skb); 1570 } 1571 return 0; 1572 } 1573 #endif 1574 1575 #ifdef CONFIG_IP_PIMSM_V2 1576 static int pim_rcv(struct sk_buff * skb) 1577 { 1578 struct pimreghdr *pim; 1579 1580 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) 1581 goto drop; 1582 1583 pim = (struct pimreghdr *)skb_transport_header(skb); 1584 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1585 (pim->flags&PIM_NULL_REGISTER) || 1586 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1587 
csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1588 goto drop; 1589 1590 if (__pim_rcv(skb, sizeof(*pim))) { 1591 drop: 1592 kfree_skb(skb); 1593 } 1594 return 0; 1595 } 1596 #endif 1597 1598 static int 1599 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) 1600 { 1601 int ct; 1602 struct rtnexthop *nhp; 1603 struct net *net = mfc_net(c); 1604 struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev; 1605 u8 *b = skb_tail_pointer(skb); 1606 struct rtattr *mp_head; 1607 1608 if (dev) 1609 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); 1610 1611 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); 1612 1613 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 1614 if (c->mfc_un.res.ttls[ct] < 255) { 1615 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 1616 goto rtattr_failure; 1617 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 1618 nhp->rtnh_flags = 0; 1619 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 1620 nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex; 1621 nhp->rtnh_len = sizeof(*nhp); 1622 } 1623 } 1624 mp_head->rta_type = RTA_MULTIPATH; 1625 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; 1626 rtm->rtm_type = RTN_MULTICAST; 1627 return 1; 1628 1629 rtattr_failure: 1630 nlmsg_trim(skb, b); 1631 return -EMSGSIZE; 1632 } 1633 1634 int ipmr_get_route(struct net *net, 1635 struct sk_buff *skb, struct rtmsg *rtm, int nowait) 1636 { 1637 int err; 1638 struct mfc_cache *cache; 1639 struct rtable *rt = skb_rtable(skb); 1640 1641 read_lock(&mrt_lock); 1642 cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); 1643 1644 if (cache == NULL) { 1645 struct sk_buff *skb2; 1646 struct iphdr *iph; 1647 struct net_device *dev; 1648 int vif; 1649 1650 if (nowait) { 1651 read_unlock(&mrt_lock); 1652 return -EAGAIN; 1653 } 1654 1655 dev = skb->dev; 1656 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) { 1657 read_unlock(&mrt_lock); 1658 return -ENODEV; 1659 } 1660 skb2 = skb_clone(skb, 
GFP_ATOMIC); 1661 if (!skb2) { 1662 read_unlock(&mrt_lock); 1663 return -ENOMEM; 1664 } 1665 1666 skb_push(skb2, sizeof(struct iphdr)); 1667 skb_reset_network_header(skb2); 1668 iph = ip_hdr(skb2); 1669 iph->ihl = sizeof(struct iphdr) >> 2; 1670 iph->saddr = rt->rt_src; 1671 iph->daddr = rt->rt_dst; 1672 iph->version = 0; 1673 err = ipmr_cache_unresolved(net, vif, skb2); 1674 read_unlock(&mrt_lock); 1675 return err; 1676 } 1677 1678 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 1679 cache->mfc_flags |= MFC_NOTIFY; 1680 err = ipmr_fill_mroute(skb, cache, rtm); 1681 read_unlock(&mrt_lock); 1682 return err; 1683 } 1684 1685 #ifdef CONFIG_PROC_FS 1686 /* 1687 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 1688 */ 1689 struct ipmr_vif_iter { 1690 struct seq_net_private p; 1691 int ct; 1692 }; 1693 1694 static struct vif_device *ipmr_vif_seq_idx(struct net *net, 1695 struct ipmr_vif_iter *iter, 1696 loff_t pos) 1697 { 1698 for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) { 1699 if (!VIF_EXISTS(net, iter->ct)) 1700 continue; 1701 if (pos-- == 0) 1702 return &net->ipv4.vif_table[iter->ct]; 1703 } 1704 return NULL; 1705 } 1706 1707 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) 1708 __acquires(mrt_lock) 1709 { 1710 struct net *net = seq_file_net(seq); 1711 1712 read_lock(&mrt_lock); 1713 return *pos ? 
ipmr_vif_seq_idx(net, seq->private, *pos - 1) 1714 : SEQ_START_TOKEN; 1715 } 1716 1717 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1718 { 1719 struct ipmr_vif_iter *iter = seq->private; 1720 struct net *net = seq_file_net(seq); 1721 1722 ++*pos; 1723 if (v == SEQ_START_TOKEN) 1724 return ipmr_vif_seq_idx(net, iter, 0); 1725 1726 while (++iter->ct < net->ipv4.maxvif) { 1727 if (!VIF_EXISTS(net, iter->ct)) 1728 continue; 1729 return &net->ipv4.vif_table[iter->ct]; 1730 } 1731 return NULL; 1732 } 1733 1734 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) 1735 __releases(mrt_lock) 1736 { 1737 read_unlock(&mrt_lock); 1738 } 1739 1740 static int ipmr_vif_seq_show(struct seq_file *seq, void *v) 1741 { 1742 struct net *net = seq_file_net(seq); 1743 1744 if (v == SEQ_START_TOKEN) { 1745 seq_puts(seq, 1746 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); 1747 } else { 1748 const struct vif_device *vif = v; 1749 const char *name = vif->dev ? 
vif->dev->name : "none"; 1750 1751 seq_printf(seq, 1752 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", 1753 vif - net->ipv4.vif_table, 1754 name, vif->bytes_in, vif->pkt_in, 1755 vif->bytes_out, vif->pkt_out, 1756 vif->flags, vif->local, vif->remote); 1757 } 1758 return 0; 1759 } 1760 1761 static const struct seq_operations ipmr_vif_seq_ops = { 1762 .start = ipmr_vif_seq_start, 1763 .next = ipmr_vif_seq_next, 1764 .stop = ipmr_vif_seq_stop, 1765 .show = ipmr_vif_seq_show, 1766 }; 1767 1768 static int ipmr_vif_open(struct inode *inode, struct file *file) 1769 { 1770 return seq_open_net(inode, file, &ipmr_vif_seq_ops, 1771 sizeof(struct ipmr_vif_iter)); 1772 } 1773 1774 static const struct file_operations ipmr_vif_fops = { 1775 .owner = THIS_MODULE, 1776 .open = ipmr_vif_open, 1777 .read = seq_read, 1778 .llseek = seq_lseek, 1779 .release = seq_release_net, 1780 }; 1781 1782 struct ipmr_mfc_iter { 1783 struct seq_net_private p; 1784 struct mfc_cache **cache; 1785 int ct; 1786 }; 1787 1788 1789 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, 1790 struct ipmr_mfc_iter *it, loff_t pos) 1791 { 1792 struct mfc_cache *mfc; 1793 1794 it->cache = net->ipv4.mfc_cache_array; 1795 read_lock(&mrt_lock); 1796 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 1797 for (mfc = net->ipv4.mfc_cache_array[it->ct]; 1798 mfc; mfc = mfc->next) 1799 if (pos-- == 0) 1800 return mfc; 1801 read_unlock(&mrt_lock); 1802 1803 it->cache = &mfc_unres_queue; 1804 spin_lock_bh(&mfc_unres_lock); 1805 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) 1806 if (net_eq(mfc_net(mfc), net) && 1807 pos-- == 0) 1808 return mfc; 1809 spin_unlock_bh(&mfc_unres_lock); 1810 1811 it->cache = NULL; 1812 return NULL; 1813 } 1814 1815 1816 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) 1817 { 1818 struct ipmr_mfc_iter *it = seq->private; 1819 struct net *net = seq_file_net(seq); 1820 1821 it->cache = NULL; 1822 it->ct = 0; 1823 return *pos ? 
ipmr_mfc_seq_idx(net, seq->private, *pos - 1) 1824 : SEQ_START_TOKEN; 1825 } 1826 1827 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1828 { 1829 struct mfc_cache *mfc = v; 1830 struct ipmr_mfc_iter *it = seq->private; 1831 struct net *net = seq_file_net(seq); 1832 1833 ++*pos; 1834 1835 if (v == SEQ_START_TOKEN) 1836 return ipmr_mfc_seq_idx(net, seq->private, 0); 1837 1838 if (mfc->next) 1839 return mfc->next; 1840 1841 if (it->cache == &mfc_unres_queue) 1842 goto end_of_list; 1843 1844 BUG_ON(it->cache != net->ipv4.mfc_cache_array); 1845 1846 while (++it->ct < MFC_LINES) { 1847 mfc = net->ipv4.mfc_cache_array[it->ct]; 1848 if (mfc) 1849 return mfc; 1850 } 1851 1852 /* exhausted cache_array, show unresolved */ 1853 read_unlock(&mrt_lock); 1854 it->cache = &mfc_unres_queue; 1855 it->ct = 0; 1856 1857 spin_lock_bh(&mfc_unres_lock); 1858 mfc = mfc_unres_queue; 1859 while (mfc && !net_eq(mfc_net(mfc), net)) 1860 mfc = mfc->next; 1861 if (mfc) 1862 return mfc; 1863 1864 end_of_list: 1865 spin_unlock_bh(&mfc_unres_lock); 1866 it->cache = NULL; 1867 1868 return NULL; 1869 } 1870 1871 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) 1872 { 1873 struct ipmr_mfc_iter *it = seq->private; 1874 struct net *net = seq_file_net(seq); 1875 1876 if (it->cache == &mfc_unres_queue) 1877 spin_unlock_bh(&mfc_unres_lock); 1878 else if (it->cache == net->ipv4.mfc_cache_array) 1879 read_unlock(&mrt_lock); 1880 } 1881 1882 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 1883 { 1884 int n; 1885 struct net *net = seq_file_net(seq); 1886 1887 if (v == SEQ_START_TOKEN) { 1888 seq_puts(seq, 1889 "Group Origin Iif Pkts Bytes Wrong Oifs\n"); 1890 } else { 1891 const struct mfc_cache *mfc = v; 1892 const struct ipmr_mfc_iter *it = seq->private; 1893 1894 seq_printf(seq, "%08lX %08lX %-3hd", 1895 (unsigned long) mfc->mfc_mcastgrp, 1896 (unsigned long) mfc->mfc_origin, 1897 mfc->mfc_parent); 1898 1899 if (it->cache != &mfc_unres_queue) { 1900 
seq_printf(seq, " %8lu %8lu %8lu", 1901 mfc->mfc_un.res.pkt, 1902 mfc->mfc_un.res.bytes, 1903 mfc->mfc_un.res.wrong_if); 1904 for (n = mfc->mfc_un.res.minvif; 1905 n < mfc->mfc_un.res.maxvif; n++ ) { 1906 if (VIF_EXISTS(net, n) && 1907 mfc->mfc_un.res.ttls[n] < 255) 1908 seq_printf(seq, 1909 " %2d:%-3d", 1910 n, mfc->mfc_un.res.ttls[n]); 1911 } 1912 } else { 1913 /* unresolved mfc_caches don't contain 1914 * pkt, bytes and wrong_if values 1915 */ 1916 seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); 1917 } 1918 seq_putc(seq, '\n'); 1919 } 1920 return 0; 1921 } 1922 1923 static const struct seq_operations ipmr_mfc_seq_ops = { 1924 .start = ipmr_mfc_seq_start, 1925 .next = ipmr_mfc_seq_next, 1926 .stop = ipmr_mfc_seq_stop, 1927 .show = ipmr_mfc_seq_show, 1928 }; 1929 1930 static int ipmr_mfc_open(struct inode *inode, struct file *file) 1931 { 1932 return seq_open_net(inode, file, &ipmr_mfc_seq_ops, 1933 sizeof(struct ipmr_mfc_iter)); 1934 } 1935 1936 static const struct file_operations ipmr_mfc_fops = { 1937 .owner = THIS_MODULE, 1938 .open = ipmr_mfc_open, 1939 .read = seq_read, 1940 .llseek = seq_lseek, 1941 .release = seq_release_net, 1942 }; 1943 #endif 1944 1945 #ifdef CONFIG_IP_PIMSM_V2 1946 static const struct net_protocol pim_protocol = { 1947 .handler = pim_rcv, 1948 .netns_ok = 1, 1949 }; 1950 #endif 1951 1952 1953 /* 1954 * Setup for IP multicast routing 1955 */ 1956 static int __net_init ipmr_net_init(struct net *net) 1957 { 1958 int err = 0; 1959 1960 net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device), 1961 GFP_KERNEL); 1962 if (!net->ipv4.vif_table) { 1963 err = -ENOMEM; 1964 goto fail; 1965 } 1966 1967 /* Forwarding cache */ 1968 net->ipv4.mfc_cache_array = kcalloc(MFC_LINES, 1969 sizeof(struct mfc_cache *), 1970 GFP_KERNEL); 1971 if (!net->ipv4.mfc_cache_array) { 1972 err = -ENOMEM; 1973 goto fail_mfc_cache; 1974 } 1975 1976 #ifdef CONFIG_IP_PIMSM 1977 net->ipv4.mroute_reg_vif_num = -1; 1978 #endif 1979 1980 #ifdef CONFIG_PROC_FS 1981 
err = -ENOMEM; 1982 if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops)) 1983 goto proc_vif_fail; 1984 if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops)) 1985 goto proc_cache_fail; 1986 #endif 1987 return 0; 1988 1989 #ifdef CONFIG_PROC_FS 1990 proc_cache_fail: 1991 proc_net_remove(net, "ip_mr_vif"); 1992 proc_vif_fail: 1993 kfree(net->ipv4.mfc_cache_array); 1994 #endif 1995 fail_mfc_cache: 1996 kfree(net->ipv4.vif_table); 1997 fail: 1998 return err; 1999 } 2000 2001 static void __net_exit ipmr_net_exit(struct net *net) 2002 { 2003 #ifdef CONFIG_PROC_FS 2004 proc_net_remove(net, "ip_mr_cache"); 2005 proc_net_remove(net, "ip_mr_vif"); 2006 #endif 2007 kfree(net->ipv4.mfc_cache_array); 2008 kfree(net->ipv4.vif_table); 2009 } 2010 2011 static struct pernet_operations ipmr_net_ops = { 2012 .init = ipmr_net_init, 2013 .exit = ipmr_net_exit, 2014 }; 2015 2016 int __init ip_mr_init(void) 2017 { 2018 int err; 2019 2020 mrt_cachep = kmem_cache_create("ip_mrt_cache", 2021 sizeof(struct mfc_cache), 2022 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2023 NULL); 2024 if (!mrt_cachep) 2025 return -ENOMEM; 2026 2027 err = register_pernet_subsys(&ipmr_net_ops); 2028 if (err) 2029 goto reg_pernet_fail; 2030 2031 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); 2032 err = register_netdevice_notifier(&ip_mr_notifier); 2033 if (err) 2034 goto reg_notif_fail; 2035 #ifdef CONFIG_IP_PIMSM_V2 2036 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { 2037 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n"); 2038 err = -EAGAIN; 2039 goto add_proto_fail; 2040 } 2041 #endif 2042 return 0; 2043 2044 #ifdef CONFIG_IP_PIMSM_V2 2045 add_proto_fail: 2046 unregister_netdevice_notifier(&ip_mr_notifier); 2047 #endif 2048 reg_notif_fail: 2049 del_timer(&ipmr_expire_timer); 2050 unregister_pernet_subsys(&ipmr_net_ops); 2051 reg_pernet_fail: 2052 kmem_cache_destroy(mrt_cachep); 2053 return err; 2054 } 2055