1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * PACKET - implements raw packet sockets. 7 * 8 * Authors: Ross Biro 9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 10 * Alan Cox, <gw4pts@gw4pts.ampr.org> 11 * 12 * Fixes: 13 * Alan Cox : verify_area() now used correctly 14 * Alan Cox : new skbuff lists, look ma no backlogs! 15 * Alan Cox : tidied skbuff lists. 16 * Alan Cox : Now uses generic datagram routines I 17 * added. Also fixed the peek/read crash 18 * from all old Linux datagram code. 19 * Alan Cox : Uses the improved datagram code. 20 * Alan Cox : Added NULL's for socket options. 21 * Alan Cox : Re-commented the code. 22 * Alan Cox : Use new kernel side addressing 23 * Rob Janssen : Correct MTU usage. 24 * Dave Platt : Counter leaks caused by incorrect 25 * interrupt locking and some slightly 26 * dubious gcc output. Can you read 27 * compiler: it said _VOLATILE_ 28 * Richard Kooijman : Timestamp fixes. 29 * Alan Cox : New buffers. Use sk->mac.raw. 30 * Alan Cox : sendmsg/recvmsg support. 31 * Alan Cox : Protocol setting support 32 * Alexey Kuznetsov : Untied from IPv4 stack. 33 * Cyrus Durgin : Fixed kerneld for kmod. 34 * Michal Ostrowski : Module initialization cleanup. 35 * Ulises Alonso : Frame number limit removal and 36 * packet_set_ring memory leak. 37 * Eric Biederman : Allow for > 8 byte hardware addresses. 38 * The convention is that longer addresses 39 * will simply extend the hardware address 40 * byte arrays at the end of sockaddr_ll 41 * and packet_mreq. 42 * Johann Baudy : Added TX RING. 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction 44 * layer. 45 * Copyright (C) 2011, <lokec@ccs.neu.edu> 46 * 47 * 48 * This program is free software; you can redistribute it and/or 49 * modify it under the terms of the GNU General Public License 50 * as published by the Free Software Foundation; either version 51 * 2 of the License, or (at your option) any later version. 52 * 53 */ 54 55 #include <linux/types.h> 56 #include <linux/mm.h> 57 #include <linux/capability.h> 58 #include <linux/fcntl.h> 59 #include <linux/socket.h> 60 #include <linux/in.h> 61 #include <linux/inet.h> 62 #include <linux/netdevice.h> 63 #include <linux/if_packet.h> 64 #include <linux/wireless.h> 65 #include <linux/kernel.h> 66 #include <linux/kmod.h> 67 #include <linux/slab.h> 68 #include <linux/vmalloc.h> 69 #include <net/net_namespace.h> 70 #include <net/ip.h> 71 #include <net/protocol.h> 72 #include <linux/skbuff.h> 73 #include <net/sock.h> 74 #include <linux/errno.h> 75 #include <linux/timer.h> 76 #include <asm/uaccess.h> 77 #include <asm/ioctls.h> 78 #include <asm/page.h> 79 #include <asm/cacheflush.h> 80 #include <asm/io.h> 81 #include <linux/proc_fs.h> 82 #include <linux/seq_file.h> 83 #include <linux/poll.h> 84 #include <linux/module.h> 85 #include <linux/init.h> 86 #include <linux/mutex.h> 87 #include <linux/if_vlan.h> 88 #include <linux/virtio_net.h> 89 #include <linux/errqueue.h> 90 #include <linux/net_tstamp.h> 91 #include <linux/percpu.h> 92 #ifdef CONFIG_INET 93 #include <net/inet_common.h> 94 #endif 95 96 #include "internal.h" 97 98 /* 99 Assumptions: 100 - if device has no dev->hard_header routine, it adds and removes ll header 101 inside itself. In this case ll header is invisible outside of device, 102 but higher levels still should reserve dev->hard_header_len. 
103 Some devices are enough clever to reallocate skb, when header 104 will not fit to reserved space (tunnel), another ones are silly 105 (PPP). 106 - packet socket receives packets with pulled ll header, 107 so that SOCK_RAW should push it back. 108 109 On receive: 110 ----------- 111 112 Incoming, dev->hard_header!=NULL 113 mac_header -> ll header 114 data -> data 115 116 Outgoing, dev->hard_header!=NULL 117 mac_header -> ll header 118 data -> ll header 119 120 Incoming, dev->hard_header==NULL 121 mac_header -> UNKNOWN position. It is very likely, that it points to ll 122 header. PPP makes it, that is wrong, because introduce 123 assymetry between rx and tx paths. 124 data -> data 125 126 Outgoing, dev->hard_header==NULL 127 mac_header -> data. ll header is still not built! 128 data -> data 129 130 Resume 131 If dev->hard_header==NULL we are unlikely to restore sensible ll header. 132 133 134 On transmit: 135 ------------ 136 137 dev->hard_header != NULL 138 mac_header -> ll header 139 data -> ll header 140 141 dev->hard_header == NULL (ll header is added by device, we cannot control it) 142 mac_header -> data 143 data -> data 144 145 We should set nh.raw on output to correct posistion, 146 packet classifier depends on it. 147 */ 148 149 /* Private packet socket structures. */ 150 151 /* identical to struct packet_mreq except it has 152 * a longer address field. 153 */ 154 struct packet_mreq_max { 155 int mr_ifindex; 156 unsigned short mr_type; 157 unsigned short mr_alen; 158 unsigned char mr_address[MAX_ADDR_LEN]; 159 }; 160 161 union tpacket_uhdr { 162 struct tpacket_hdr *h1; 163 struct tpacket2_hdr *h2; 164 struct tpacket3_hdr *h3; 165 void *raw; 166 }; 167 168 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, 169 int closing, int tx_ring); 170 171 #define V3_ALIGNMENT (8) 172 173 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) 174 175 #define BLK_PLUS_PRIV(sz_of_priv) \ 176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) 177 178 #define PGV_FROM_VMALLOC 1 179 180 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) 181 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) 182 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) 183 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) 184 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) 185 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) 186 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x))) 187 188 struct packet_sock; 189 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); 190 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, 191 struct packet_type *pt, struct net_device *orig_dev); 192 193 static void *packet_previous_frame(struct packet_sock *po, 194 struct packet_ring_buffer *rb, 195 int status); 196 static void packet_increment_head(struct packet_ring_buffer *buff); 197 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *, 198 struct tpacket_block_desc *); 199 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, 200 struct packet_sock *); 201 static void prb_retire_current_block(struct tpacket_kbdq_core *, 202 struct packet_sock *, unsigned int status); 203 static int prb_queue_frozen(struct tpacket_kbdq_core *); 204 static void prb_open_block(struct tpacket_kbdq_core *, 205 struct tpacket_block_desc *); 206 static void prb_retire_rx_blk_timer_expired(unsigned long); 207 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); 208 static void prb_init_blk_timer(struct packet_sock *, 209 struct tpacket_kbdq_core *, 
210 void (*func) (unsigned long)); 211 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); 212 static void prb_clear_rxhash(struct tpacket_kbdq_core *, 213 struct tpacket3_hdr *); 214 static void prb_fill_vlan_info(struct tpacket_kbdq_core *, 215 struct tpacket3_hdr *); 216 static void packet_flush_mclist(struct sock *sk); 217 218 struct packet_skb_cb { 219 unsigned int origlen; 220 union { 221 struct sockaddr_pkt pkt; 222 struct sockaddr_ll ll; 223 } sa; 224 }; 225 226 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) 227 228 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) 229 #define GET_PBLOCK_DESC(x, bid) \ 230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) 231 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ 232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) 233 #define GET_NEXT_PRB_BLK_NUM(x) \ 234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ 235 ((x)->kactive_blk_num+1) : 0) 236 237 static void __fanout_unlink(struct sock *sk, struct packet_sock *po); 238 static void __fanout_link(struct sock *sk, struct packet_sock *po); 239 240 static int packet_direct_xmit(struct sk_buff *skb) 241 { 242 struct net_device *dev = skb->dev; 243 const struct net_device_ops *ops = dev->netdev_ops; 244 netdev_features_t features; 245 struct netdev_queue *txq; 246 int ret = NETDEV_TX_BUSY; 247 u16 queue_map; 248 249 if (unlikely(!netif_running(dev) || 250 !netif_carrier_ok(dev))) 251 goto drop; 252 253 features = netif_skb_features(skb); 254 if (skb_needs_linearize(skb, features) && 255 __skb_linearize(skb)) 256 goto drop; 257 258 queue_map = skb_get_queue_mapping(skb); 259 txq = netdev_get_tx_queue(dev, queue_map); 260 261 local_bh_disable(); 262 263 HARD_TX_LOCK(dev, txq, smp_processor_id()); 264 if (!netif_xmit_frozen_or_drv_stopped(txq)) { 265 ret = ops->ndo_start_xmit(skb, dev); 266 if (ret == NETDEV_TX_OK) 267 txq_trans_update(txq); 268 } 269 HARD_TX_UNLOCK(dev, txq); 270 271 local_bh_enable(); 272 273 if (!dev_xmit_complete(ret)) 274 kfree_skb(skb); 275 276 return ret; 277 drop: 278 atomic_long_inc(&dev->tx_dropped); 279 kfree_skb(skb); 280 return NET_XMIT_DROP; 281 } 282 283 static struct net_device *packet_cached_dev_get(struct packet_sock *po) 284 { 285 struct net_device *dev; 286 287 rcu_read_lock(); 288 dev = rcu_dereference(po->cached_dev); 289 if (likely(dev)) 290 dev_hold(dev); 291 rcu_read_unlock(); 292 293 return dev; 294 } 295 296 static void packet_cached_dev_assign(struct packet_sock *po, 297 struct net_device *dev) 298 { 299 rcu_assign_pointer(po->cached_dev, dev); 300 } 301 302 static void packet_cached_dev_reset(struct packet_sock *po) 303 { 304 RCU_INIT_POINTER(po->cached_dev, NULL); 305 } 306 307 static bool packet_use_direct_xmit(const struct packet_sock *po) 308 { 309 return po->xmit == packet_direct_xmit; 310 } 311 312 static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) 313 { 314 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; 315 } 316 317 static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) 318 { 319 const struct net_device_ops *ops = dev->netdev_ops; 320 u16 queue_index; 321 322 if (ops->ndo_select_queue) { 323 queue_index = ops->ndo_select_queue(dev, skb, NULL, 324 __packet_pick_tx_queue); 325 queue_index = netdev_cap_txqueue(dev, queue_index); 326 } else { 327 queue_index = __packet_pick_tx_queue(dev, skb); 328 } 329 330 skb_set_queue_mapping(skb, queue_index); 331 } 332 333 /* 
register_prot_hook must be invoked with the po->bind_lock held, 334 * or from a context in which asynchronous accesses to the packet 335 * socket is not possible (packet_create()). 336 */ 337 static void register_prot_hook(struct sock *sk) 338 { 339 struct packet_sock *po = pkt_sk(sk); 340 341 if (!po->running) { 342 if (po->fanout) 343 __fanout_link(sk, po); 344 else 345 dev_add_pack(&po->prot_hook); 346 347 sock_hold(sk); 348 po->running = 1; 349 } 350 } 351 352 /* {,__}unregister_prot_hook() must be invoked with the po->bind_lock 353 * held. If the sync parameter is true, we will temporarily drop 354 * the po->bind_lock and do a synchronize_net to make sure no 355 * asynchronous packet processing paths still refer to the elements 356 * of po->prot_hook. If the sync parameter is false, it is the 357 * callers responsibility to take care of this. 358 */ 359 static void __unregister_prot_hook(struct sock *sk, bool sync) 360 { 361 struct packet_sock *po = pkt_sk(sk); 362 363 po->running = 0; 364 365 if (po->fanout) 366 __fanout_unlink(sk, po); 367 else 368 __dev_remove_pack(&po->prot_hook); 369 370 __sock_put(sk); 371 372 if (sync) { 373 spin_unlock(&po->bind_lock); 374 synchronize_net(); 375 spin_lock(&po->bind_lock); 376 } 377 } 378 379 static void unregister_prot_hook(struct sock *sk, bool sync) 380 { 381 struct packet_sock *po = pkt_sk(sk); 382 383 if (po->running) 384 __unregister_prot_hook(sk, sync); 385 } 386 387 static inline __pure struct page *pgv_to_page(void *addr) 388 { 389 if (is_vmalloc_addr(addr)) 390 return vmalloc_to_page(addr); 391 return virt_to_page(addr); 392 } 393 394 static void __packet_set_status(struct packet_sock *po, void *frame, int status) 395 { 396 union tpacket_uhdr h; 397 398 h.raw = frame; 399 switch (po->tp_version) { 400 case TPACKET_V1: 401 h.h1->tp_status = status; 402 flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 403 break; 404 case TPACKET_V2: 405 h.h2->tp_status = status; 406 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 407 break; 408 case TPACKET_V3: 409 default: 410 WARN(1, "TPACKET version not supported.\n"); 411 BUG(); 412 } 413 414 smp_wmb(); 415 } 416 417 static int __packet_get_status(struct packet_sock *po, void *frame) 418 { 419 union tpacket_uhdr h; 420 421 smp_rmb(); 422 423 h.raw = frame; 424 switch (po->tp_version) { 425 case TPACKET_V1: 426 flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 427 return h.h1->tp_status; 428 case TPACKET_V2: 429 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 430 return h.h2->tp_status; 431 case TPACKET_V3: 432 default: 433 WARN(1, "TPACKET version not supported.\n"); 434 BUG(); 435 return 0; 436 } 437 } 438 439 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, 440 unsigned int flags) 441 { 442 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); 443 444 if (shhwtstamps && 445 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && 446 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) 447 return TP_STATUS_TS_RAW_HARDWARE; 448 449 if (ktime_to_timespec_cond(skb->tstamp, ts)) 450 return TP_STATUS_TS_SOFTWARE; 451 452 return 0; 453 } 454 455 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, 456 struct sk_buff *skb) 457 { 458 union tpacket_uhdr h; 459 struct timespec ts; 460 __u32 ts_status; 461 462 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) 463 return 0; 464 465 h.raw = frame; 466 switch (po->tp_version) { 467 case TPACKET_V1: 468 h.h1->tp_sec = ts.tv_sec; 469 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; 470 break; 471 case 
TPACKET_V2: 472 h.h2->tp_sec = ts.tv_sec; 473 h.h2->tp_nsec = ts.tv_nsec; 474 break; 475 case TPACKET_V3: 476 default: 477 WARN(1, "TPACKET version not supported.\n"); 478 BUG(); 479 } 480 481 /* one flush is safe, as both fields always lie on the same cacheline */ 482 flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); 483 smp_wmb(); 484 485 return ts_status; 486 } 487 488 static void *packet_lookup_frame(struct packet_sock *po, 489 struct packet_ring_buffer *rb, 490 unsigned int position, 491 int status) 492 { 493 unsigned int pg_vec_pos, frame_offset; 494 union tpacket_uhdr h; 495 496 pg_vec_pos = position / rb->frames_per_block; 497 frame_offset = position % rb->frames_per_block; 498 499 h.raw = rb->pg_vec[pg_vec_pos].buffer + 500 (frame_offset * rb->frame_size); 501 502 if (status != __packet_get_status(po, h.raw)) 503 return NULL; 504 505 return h.raw; 506 } 507 508 static void *packet_current_frame(struct packet_sock *po, 509 struct packet_ring_buffer *rb, 510 int status) 511 { 512 return packet_lookup_frame(po, rb, rb->head, status); 513 } 514 515 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) 516 { 517 del_timer_sync(&pkc->retire_blk_timer); 518 } 519 520 static void prb_shutdown_retire_blk_timer(struct packet_sock *po, 521 int tx_ring, 522 struct sk_buff_head *rb_queue) 523 { 524 struct tpacket_kbdq_core *pkc; 525 526 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : 527 GET_PBDQC_FROM_RB(&po->rx_ring); 528 529 spin_lock_bh(&rb_queue->lock); 530 pkc->delete_blk_timer = 1; 531 spin_unlock_bh(&rb_queue->lock); 532 533 prb_del_retire_blk_timer(pkc); 534 } 535 536 static void prb_init_blk_timer(struct packet_sock *po, 537 struct tpacket_kbdq_core *pkc, 538 void (*func) (unsigned long)) 539 { 540 init_timer(&pkc->retire_blk_timer); 541 pkc->retire_blk_timer.data = (long)po; 542 pkc->retire_blk_timer.function = func; 543 pkc->retire_blk_timer.expires = jiffies; 544 } 545 546 static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) 547 { 548 struct tpacket_kbdq_core *pkc; 549 550 if (tx_ring) 551 BUG(); 552 553 pkc = tx_ring ? 
GET_PBDQC_FROM_RB(&po->tx_ring) : 554 GET_PBDQC_FROM_RB(&po->rx_ring); 555 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); 556 } 557 558 static int prb_calc_retire_blk_tmo(struct packet_sock *po, 559 int blk_size_in_bytes) 560 { 561 struct net_device *dev; 562 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; 563 struct ethtool_cmd ecmd; 564 int err; 565 u32 speed; 566 567 rtnl_lock(); 568 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); 569 if (unlikely(!dev)) { 570 rtnl_unlock(); 571 return DEFAULT_PRB_RETIRE_TOV; 572 } 573 err = __ethtool_get_settings(dev, &ecmd); 574 speed = ethtool_cmd_speed(&ecmd); 575 rtnl_unlock(); 576 if (!err) { 577 /* 578 * If the link speed is so slow you don't really 579 * need to worry about perf anyways 580 */ 581 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { 582 return DEFAULT_PRB_RETIRE_TOV; 583 } else { 584 msec = 1; 585 div = speed / 1000; 586 } 587 } 588 589 mbits = (blk_size_in_bytes * 8) / (1024 * 1024); 590 591 if (div) 592 mbits /= div; 593 594 tmo = mbits * msec; 595 596 if (div) 597 return tmo+1; 598 return tmo; 599 } 600 601 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, 602 union tpacket_req_u *req_u) 603 { 604 p1->feature_req_word = req_u->req3.tp_feature_req_word; 605 } 606 607 static void init_prb_bdqc(struct packet_sock *po, 608 struct packet_ring_buffer *rb, 609 struct pgv *pg_vec, 610 union tpacket_req_u *req_u, int tx_ring) 611 { 612 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); 613 struct tpacket_block_desc *pbd; 614 615 memset(p1, 0x0, sizeof(*p1)); 616 617 p1->knxt_seq_num = 1; 618 p1->pkbdq = pg_vec; 619 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; 620 p1->pkblk_start = pg_vec[0].buffer; 621 p1->kblk_size = req_u->req3.tp_block_size; 622 p1->knum_blocks = req_u->req3.tp_block_nr; 623 p1->hdrlen = po->tp_hdrlen; 624 p1->version = po->tp_version; 625 p1->last_kactive_blk_num = 0; 626 po->stats.stats3.tp_freeze_q_cnt = 0; 627 if (req_u->req3.tp_retire_blk_tov) 628 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; 629 else 630 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, 631 req_u->req3.tp_block_size); 632 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); 633 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; 634 635 prb_init_ft_ops(p1, req_u); 636 prb_setup_retire_blk_timer(po, tx_ring); 637 prb_open_block(p1, pbd); 638 } 639 640 /* Do NOT update the last_blk_num first. 641 * Assumes sk_buff_head lock is held. 642 */ 643 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) 644 { 645 mod_timer(&pkc->retire_blk_timer, 646 jiffies + pkc->tov_in_jiffies); 647 pkc->last_kactive_blk_num = pkc->kactive_blk_num; 648 } 649 650 /* 651 * Timer logic: 652 * 1) We refresh the timer only when we open a block. 653 * By doing this we don't waste cycles refreshing the timer 654 * on packet-by-packet basis. 655 * 656 * With a 1MB block-size, on a 1Gbps line, it will take 657 * i) ~8 ms to fill a block + ii) memcpy etc. 658 * In this cut we are not accounting for the memcpy time. 659 * 660 * So, if the user sets the 'tmo' to 10ms then the timer 661 * will never fire while the block is still getting filled 662 * (which is what we want). However, the user could choose 663 * to close a block early and that's fine. 664 * 665 * But when the timer does fire, we check whether or not to refresh it. 666 * Since the tmo granularity is in msecs, it is not too expensive 667 * to refresh the timer, lets say every '8' msecs. 
668 * Either the user can set the 'tmo' or we can derive it based on 669 * a) line-speed and b) block-size. 670 * prb_calc_retire_blk_tmo() calculates the tmo. 671 * 672 */ 673 static void prb_retire_rx_blk_timer_expired(unsigned long data) 674 { 675 struct packet_sock *po = (struct packet_sock *)data; 676 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 677 unsigned int frozen; 678 struct tpacket_block_desc *pbd; 679 680 spin_lock(&po->sk.sk_receive_queue.lock); 681 682 frozen = prb_queue_frozen(pkc); 683 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 684 685 if (unlikely(pkc->delete_blk_timer)) 686 goto out; 687 688 /* We only need to plug the race when the block is partially filled. 689 * tpacket_rcv: 690 * lock(); increment BLOCK_NUM_PKTS; unlock() 691 * copy_bits() is in progress ... 692 * timer fires on other cpu: 693 * we can't retire the current block because copy_bits 694 * is in progress. 695 * 696 */ 697 if (BLOCK_NUM_PKTS(pbd)) { 698 while (atomic_read(&pkc->blk_fill_in_prog)) { 699 /* Waiting for skb_copy_bits to finish... */ 700 cpu_relax(); 701 } 702 } 703 704 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { 705 if (!frozen) { 706 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); 707 if (!prb_dispatch_next_block(pkc, po)) 708 goto refresh_timer; 709 else 710 goto out; 711 } else { 712 /* Case 1. Queue was frozen because user-space was 713 * lagging behind. 714 */ 715 if (prb_curr_blk_in_use(pkc, pbd)) { 716 /* 717 * Ok, user-space is still behind. 718 * So just refresh the timer. 719 */ 720 goto refresh_timer; 721 } else { 722 /* Case 2. queue was frozen,user-space caught up, 723 * now the link went idle && the timer fired. 724 * We don't have a block to close.So we open this 725 * block and restart the timer. 726 * opening a block thaws the queue,restarts timer 727 * Thawing/timer-refresh is a side effect. 728 */ 729 prb_open_block(pkc, pbd); 730 goto out; 731 } 732 } 733 } 734 735 refresh_timer: 736 _prb_refresh_rx_retire_blk_timer(pkc); 737 738 out: 739 spin_unlock(&po->sk.sk_receive_queue.lock); 740 } 741 742 static void prb_flush_block(struct tpacket_kbdq_core *pkc1, 743 struct tpacket_block_desc *pbd1, __u32 status) 744 { 745 /* Flush everything minus the block header */ 746 747 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 748 u8 *start, *end; 749 750 start = (u8 *)pbd1; 751 752 /* Skip the block header(we know header WILL fit in 4K) */ 753 start += PAGE_SIZE; 754 755 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); 756 for (; start < end; start += PAGE_SIZE) 757 flush_dcache_page(pgv_to_page(start)); 758 759 smp_wmb(); 760 #endif 761 762 /* Now update the block status. */ 763 764 BLOCK_STATUS(pbd1) = status; 765 766 /* Flush the block header */ 767 768 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 769 start = (u8 *)pbd1; 770 flush_dcache_page(pgv_to_page(start)); 771 772 smp_wmb(); 773 #endif 774 } 775 776 /* 777 * Side effect: 778 * 779 * 1) flush the block 780 * 2) Increment active_blk_num 781 * 782 * Note:We DONT refresh the timer on purpose. 783 * Because almost always the next block will be opened. 
784 */ 785 static void prb_close_block(struct tpacket_kbdq_core *pkc1, 786 struct tpacket_block_desc *pbd1, 787 struct packet_sock *po, unsigned int stat) 788 { 789 __u32 status = TP_STATUS_USER | stat; 790 791 struct tpacket3_hdr *last_pkt; 792 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; 793 794 if (po->stats.stats3.tp_drops) 795 status |= TP_STATUS_LOSING; 796 797 last_pkt = (struct tpacket3_hdr *)pkc1->prev; 798 last_pkt->tp_next_offset = 0; 799 800 /* Get the ts of the last pkt */ 801 if (BLOCK_NUM_PKTS(pbd1)) { 802 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; 803 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; 804 } else { 805 /* Ok, we tmo'd - so get the current time */ 806 struct timespec ts; 807 getnstimeofday(&ts); 808 h1->ts_last_pkt.ts_sec = ts.tv_sec; 809 h1->ts_last_pkt.ts_nsec = ts.tv_nsec; 810 } 811 812 smp_wmb(); 813 814 /* Flush the block */ 815 prb_flush_block(pkc1, pbd1, status); 816 817 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); 818 } 819 820 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) 821 { 822 pkc->reset_pending_on_curr_blk = 0; 823 } 824 825 /* 826 * Side effect of opening a block: 827 * 828 * 1) prb_queue is thawed. 829 * 2) retire_blk_timer is refreshed. 830 * 831 */ 832 static void prb_open_block(struct tpacket_kbdq_core *pkc1, 833 struct tpacket_block_desc *pbd1) 834 { 835 struct timespec ts; 836 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; 837 838 smp_rmb(); 839 840 /* We could have just memset this but we will lose the 841 * flexibility of making the priv area sticky 842 */ 843 844 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; 845 BLOCK_NUM_PKTS(pbd1) = 0; 846 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); 847 848 getnstimeofday(&ts); 849 850 h1->ts_first_pkt.ts_sec = ts.tv_sec; 851 h1->ts_first_pkt.ts_nsec = ts.tv_nsec; 852 853 pkc1->pkblk_start = (char *)pbd1; 854 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); 855 856 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); 857 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; 858 859 pbd1->version = pkc1->version; 860 pkc1->prev = pkc1->nxt_offset; 861 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; 862 863 prb_thaw_queue(pkc1); 864 _prb_refresh_rx_retire_blk_timer(pkc1); 865 866 smp_wmb(); 867 } 868 869 /* 870 * Queue freeze logic: 871 * 1) Assume tp_block_nr = 8 blocks. 872 * 2) At time 't0', user opens Rx ring. 873 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 874 * 4) user-space is either sleeping or processing block '0'. 875 * 5) tpacket_rcv is currently filling block '7', since there is no space left, 876 * it will close block-7,loop around and try to fill block '0'. 877 * call-flow: 878 * __packet_lookup_frame_in_block 879 * prb_retire_current_block() 880 * prb_dispatch_next_block() 881 * |->(BLOCK_STATUS == USER) evaluates to true 882 * 5.1) Since block-0 is currently in-use, we just freeze the queue. 883 * 6) Now there are two cases: 884 * 6.1) Link goes idle right after the queue is frozen. 885 * But remember, the last open_block() refreshed the timer. 886 * When this timer expires,it will refresh itself so that we can 887 * re-open block-0 in near future. 888 * 6.2) Link is busy and keeps on receiving packets. This is a simple 889 * case and __packet_lookup_frame_in_block will check if block-0 890 * is free and can now be re-used. 
891 */ 892 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, 893 struct packet_sock *po) 894 { 895 pkc->reset_pending_on_curr_blk = 1; 896 po->stats.stats3.tp_freeze_q_cnt++; 897 } 898 899 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) 900 901 /* 902 * If the next block is free then we will dispatch it 903 * and return a good offset. 904 * Else, we will freeze the queue. 905 * So, caller must check the return value. 906 */ 907 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, 908 struct packet_sock *po) 909 { 910 struct tpacket_block_desc *pbd; 911 912 smp_rmb(); 913 914 /* 1. Get current block num */ 915 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 916 917 /* 2. If this block is currently in_use then freeze the queue */ 918 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { 919 prb_freeze_queue(pkc, po); 920 return NULL; 921 } 922 923 /* 924 * 3. 925 * open this block and return the offset where the first packet 926 * needs to get stored. 927 */ 928 prb_open_block(pkc, pbd); 929 return (void *)pkc->nxt_offset; 930 } 931 932 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, 933 struct packet_sock *po, unsigned int status) 934 { 935 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 936 937 /* retire/close the current block */ 938 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { 939 /* 940 * Plug the case where copy_bits() is in progress on 941 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't 942 * have space to copy the pkt in the current block and 943 * called prb_retire_current_block() 944 * 945 * We don't need to worry about the TMO case because 946 * the timer-handler already handled this case. 947 */ 948 if (!(status & TP_STATUS_BLK_TMO)) { 949 while (atomic_read(&pkc->blk_fill_in_prog)) { 950 /* Waiting for skb_copy_bits to finish... 
*/ 951 cpu_relax(); 952 } 953 } 954 prb_close_block(pkc, pbd, po, status); 955 return; 956 } 957 } 958 959 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, 960 struct tpacket_block_desc *pbd) 961 { 962 return TP_STATUS_USER & BLOCK_STATUS(pbd); 963 } 964 965 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) 966 { 967 return pkc->reset_pending_on_curr_blk; 968 } 969 970 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) 971 { 972 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); 973 atomic_dec(&pkc->blk_fill_in_prog); 974 } 975 976 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, 977 struct tpacket3_hdr *ppd) 978 { 979 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); 980 } 981 982 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, 983 struct tpacket3_hdr *ppd) 984 { 985 ppd->hv1.tp_rxhash = 0; 986 } 987 988 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, 989 struct tpacket3_hdr *ppd) 990 { 991 if (vlan_tx_tag_present(pkc->skb)) { 992 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); 993 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); 994 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 995 } else { 996 ppd->hv1.tp_vlan_tci = 0; 997 ppd->hv1.tp_vlan_tpid = 0; 998 ppd->tp_status = TP_STATUS_AVAILABLE; 999 } 1000 } 1001 1002 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, 1003 struct tpacket3_hdr *ppd) 1004 { 1005 ppd->hv1.tp_padding = 0; 1006 prb_fill_vlan_info(pkc, ppd); 1007 1008 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) 1009 prb_fill_rxhash(pkc, ppd); 1010 else 1011 prb_clear_rxhash(pkc, ppd); 1012 } 1013 1014 static void prb_fill_curr_block(char *curr, 1015 struct tpacket_kbdq_core *pkc, 1016 struct tpacket_block_desc *pbd, 1017 unsigned int len) 1018 { 1019 struct tpacket3_hdr *ppd; 1020 1021 ppd = (struct tpacket3_hdr *)curr; 1022 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); 1023 pkc->prev = curr; 1024 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); 1025 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); 1026 BLOCK_NUM_PKTS(pbd) += 1; 1027 atomic_inc(&pkc->blk_fill_in_prog); 1028 prb_run_all_ft_ops(pkc, ppd); 1029 } 1030 1031 /* Assumes caller has the sk->rx_queue.lock */ 1032 static void *__packet_lookup_frame_in_block(struct packet_sock *po, 1033 struct sk_buff *skb, 1034 int status, 1035 unsigned int len 1036 ) 1037 { 1038 struct tpacket_kbdq_core *pkc; 1039 struct tpacket_block_desc *pbd; 1040 char *curr, *end; 1041 1042 pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 1043 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 1044 1045 /* Queue is frozen when user space is lagging behind */ 1046 if (prb_queue_frozen(pkc)) { 1047 /* 1048 * Check if that last block which caused the queue to freeze, 1049 * is still in_use by user-space. 1050 */ 1051 if (prb_curr_blk_in_use(pkc, pbd)) { 1052 /* Can't record this packet */ 1053 return NULL; 1054 } else { 1055 /* 1056 * Ok, the block was released by user-space. 1057 * Now let's open that block. 1058 * opening a block also thaws the queue. 1059 * Thawing is a side effect. 
1060 */ 1061 prb_open_block(pkc, pbd); 1062 } 1063 } 1064 1065 smp_mb(); 1066 curr = pkc->nxt_offset; 1067 pkc->skb = skb; 1068 end = (char *)pbd + pkc->kblk_size; 1069 1070 /* first try the current block */ 1071 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { 1072 prb_fill_curr_block(curr, pkc, pbd, len); 1073 return (void *)curr; 1074 } 1075 1076 /* Ok, close the current block */ 1077 prb_retire_current_block(pkc, po, 0); 1078 1079 /* Now, try to dispatch the next block */ 1080 curr = (char *)prb_dispatch_next_block(pkc, po); 1081 if (curr) { 1082 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 1083 prb_fill_curr_block(curr, pkc, pbd, len); 1084 return (void *)curr; 1085 } 1086 1087 /* 1088 * No free blocks are available.user_space hasn't caught up yet. 1089 * Queue was just frozen and now this packet will get dropped. 1090 */ 1091 return NULL; 1092 } 1093 1094 static void *packet_current_rx_frame(struct packet_sock *po, 1095 struct sk_buff *skb, 1096 int status, unsigned int len) 1097 { 1098 char *curr = NULL; 1099 switch (po->tp_version) { 1100 case TPACKET_V1: 1101 case TPACKET_V2: 1102 curr = packet_lookup_frame(po, &po->rx_ring, 1103 po->rx_ring.head, status); 1104 return curr; 1105 case TPACKET_V3: 1106 return __packet_lookup_frame_in_block(po, skb, status, len); 1107 default: 1108 WARN(1, "TPACKET version not supported\n"); 1109 BUG(); 1110 return NULL; 1111 } 1112 } 1113 1114 static void *prb_lookup_block(struct packet_sock *po, 1115 struct packet_ring_buffer *rb, 1116 unsigned int idx, 1117 int status) 1118 { 1119 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); 1120 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); 1121 1122 if (status != BLOCK_STATUS(pbd)) 1123 return NULL; 1124 return pbd; 1125 } 1126 1127 static int prb_previous_blk_num(struct packet_ring_buffer *rb) 1128 { 1129 unsigned int prev; 1130 if (rb->prb_bdqc.kactive_blk_num) 1131 prev = rb->prb_bdqc.kactive_blk_num-1; 1132 else 1133 prev = rb->prb_bdqc.knum_blocks-1; 1134 return prev; 1135 } 1136 1137 /* Assumes caller has held the rx_queue.lock */ 1138 static void *__prb_previous_block(struct packet_sock *po, 1139 struct packet_ring_buffer *rb, 1140 int status) 1141 { 1142 unsigned int previous = prb_previous_blk_num(rb); 1143 return prb_lookup_block(po, rb, previous, status); 1144 } 1145 1146 static void *packet_previous_rx_frame(struct packet_sock *po, 1147 struct packet_ring_buffer *rb, 1148 int status) 1149 { 1150 if (po->tp_version <= TPACKET_V2) 1151 return packet_previous_frame(po, rb, status); 1152 1153 return __prb_previous_block(po, rb, status); 1154 } 1155 1156 static void packet_increment_rx_head(struct packet_sock *po, 1157 struct packet_ring_buffer *rb) 1158 { 1159 switch (po->tp_version) { 1160 case TPACKET_V1: 1161 case TPACKET_V2: 1162 return packet_increment_head(rb); 1163 case TPACKET_V3: 1164 default: 1165 WARN(1, "TPACKET version not supported.\n"); 1166 BUG(); 1167 return; 1168 } 1169 } 1170 1171 static void *packet_previous_frame(struct packet_sock *po, 1172 struct packet_ring_buffer *rb, 1173 int status) 1174 { 1175 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; 1176 return packet_lookup_frame(po, rb, previous, status); 1177 } 1178 1179 static void packet_increment_head(struct packet_ring_buffer *buff) 1180 { 1181 buff->head = buff->head != buff->frame_max ? 
buff->head+1 : 0; 1182 } 1183 1184 static void packet_inc_pending(struct packet_ring_buffer *rb) 1185 { 1186 this_cpu_inc(*rb->pending_refcnt); 1187 } 1188 1189 static void packet_dec_pending(struct packet_ring_buffer *rb) 1190 { 1191 this_cpu_dec(*rb->pending_refcnt); 1192 } 1193 1194 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) 1195 { 1196 unsigned int refcnt = 0; 1197 int cpu; 1198 1199 /* We don't use pending refcount in rx_ring. */ 1200 if (rb->pending_refcnt == NULL) 1201 return 0; 1202 1203 for_each_possible_cpu(cpu) 1204 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); 1205 1206 return refcnt; 1207 } 1208 1209 static int packet_alloc_pending(struct packet_sock *po) 1210 { 1211 po->rx_ring.pending_refcnt = NULL; 1212 1213 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); 1214 if (unlikely(po->tx_ring.pending_refcnt == NULL)) 1215 return -ENOBUFS; 1216 1217 return 0; 1218 } 1219 1220 static void packet_free_pending(struct packet_sock *po) 1221 { 1222 free_percpu(po->tx_ring.pending_refcnt); 1223 } 1224 1225 static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) 1226 { 1227 struct sock *sk = &po->sk; 1228 bool has_room; 1229 1230 if (po->prot_hook.func != tpacket_rcv) 1231 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) 1232 <= sk->sk_rcvbuf; 1233 1234 spin_lock(&sk->sk_receive_queue.lock); 1235 if (po->tp_version == TPACKET_V3) 1236 has_room = prb_lookup_block(po, &po->rx_ring, 1237 po->rx_ring.prb_bdqc.kactive_blk_num, 1238 TP_STATUS_KERNEL); 1239 else 1240 has_room = packet_lookup_frame(po, &po->rx_ring, 1241 po->rx_ring.head, 1242 TP_STATUS_KERNEL); 1243 spin_unlock(&sk->sk_receive_queue.lock); 1244 1245 return has_room; 1246 } 1247 1248 static void packet_sock_destruct(struct sock *sk) 1249 { 1250 skb_queue_purge(&sk->sk_error_queue); 1251 1252 WARN_ON(atomic_read(&sk->sk_rmem_alloc)); 1253 WARN_ON(atomic_read(&sk->sk_wmem_alloc)); 1254 1255 if (!sock_flag(sk, SOCK_DEAD)) { 1256 pr_err("Attempt to release alive packet socket: %p\n", sk); 1257 return; 1258 } 1259 1260 sk_refcnt_debug_dec(sk); 1261 } 1262 1263 static int fanout_rr_next(struct packet_fanout *f, unsigned int num) 1264 { 1265 int x = atomic_read(&f->rr_cur) + 1; 1266 1267 if (x >= num) 1268 x = 0; 1269 1270 return x; 1271 } 1272 1273 static unsigned int fanout_demux_hash(struct packet_fanout *f, 1274 struct sk_buff *skb, 1275 unsigned int num) 1276 { 1277 return reciprocal_scale(skb_get_hash(skb), num); 1278 } 1279 1280 static unsigned int fanout_demux_lb(struct packet_fanout *f, 1281 struct sk_buff *skb, 1282 unsigned int num) 1283 { 1284 int cur, old; 1285 1286 cur = atomic_read(&f->rr_cur); 1287 while ((old = atomic_cmpxchg(&f->rr_cur, cur, 1288 fanout_rr_next(f, num))) != cur) 1289 cur = old; 1290 return cur; 1291 } 1292 1293 static unsigned int fanout_demux_cpu(struct packet_fanout *f, 1294 struct sk_buff *skb, 1295 unsigned int num) 1296 { 1297 return smp_processor_id() % num; 1298 } 1299 1300 static unsigned int fanout_demux_rnd(struct packet_fanout *f, 1301 struct sk_buff *skb, 1302 unsigned int num) 1303 { 1304 return prandom_u32_max(num); 1305 } 1306 1307 static unsigned int fanout_demux_rollover(struct packet_fanout *f, 1308 struct sk_buff *skb, 1309 unsigned int idx, unsigned int skip, 1310 unsigned int num) 1311 { 1312 unsigned int i, j; 1313 1314 i = j = min_t(int, f->next[idx], num - 1); 1315 do { 1316 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { 1317 if (i != j) 1318 f->next[idx] = i; 1319 return i; 1320 } 1321 if 
(++i == num) 1322 i = 0; 1323 } while (i != j); 1324 1325 return idx; 1326 } 1327 1328 static unsigned int fanout_demux_qm(struct packet_fanout *f, 1329 struct sk_buff *skb, 1330 unsigned int num) 1331 { 1332 return skb_get_queue_mapping(skb) % num; 1333 } 1334 1335 static bool fanout_has_flag(struct packet_fanout *f, u16 flag) 1336 { 1337 return f->flags & (flag >> 8); 1338 } 1339 1340 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, 1341 struct packet_type *pt, struct net_device *orig_dev) 1342 { 1343 struct packet_fanout *f = pt->af_packet_priv; 1344 unsigned int num = f->num_members; 1345 struct packet_sock *po; 1346 unsigned int idx; 1347 1348 if (!net_eq(dev_net(dev), read_pnet(&f->net)) || 1349 !num) { 1350 kfree_skb(skb); 1351 return 0; 1352 } 1353 1354 switch (f->type) { 1355 case PACKET_FANOUT_HASH: 1356 default: 1357 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { 1358 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); 1359 if (!skb) 1360 return 0; 1361 } 1362 idx = fanout_demux_hash(f, skb, num); 1363 break; 1364 case PACKET_FANOUT_LB: 1365 idx = fanout_demux_lb(f, skb, num); 1366 break; 1367 case PACKET_FANOUT_CPU: 1368 idx = fanout_demux_cpu(f, skb, num); 1369 break; 1370 case PACKET_FANOUT_RND: 1371 idx = fanout_demux_rnd(f, skb, num); 1372 break; 1373 case PACKET_FANOUT_QM: 1374 idx = fanout_demux_qm(f, skb, num); 1375 break; 1376 case PACKET_FANOUT_ROLLOVER: 1377 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); 1378 break; 1379 } 1380 1381 po = pkt_sk(f->arr[idx]); 1382 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && 1383 unlikely(!packet_rcv_has_room(po, skb))) { 1384 idx = fanout_demux_rollover(f, skb, idx, idx, num); 1385 po = pkt_sk(f->arr[idx]); 1386 } 1387 1388 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); 1389 } 1390 1391 DEFINE_MUTEX(fanout_mutex); 1392 EXPORT_SYMBOL_GPL(fanout_mutex); 1393 static LIST_HEAD(fanout_list); 1394 1395 static void __fanout_link(struct sock *sk, struct packet_sock *po) 1396 { 1397 struct packet_fanout *f = po->fanout; 1398 1399 spin_lock(&f->lock); 1400 f->arr[f->num_members] = sk; 1401 smp_wmb(); 1402 f->num_members++; 1403 spin_unlock(&f->lock); 1404 } 1405 1406 static void __fanout_unlink(struct sock *sk, struct packet_sock *po) 1407 { 1408 struct packet_fanout *f = po->fanout; 1409 int i; 1410 1411 spin_lock(&f->lock); 1412 for (i = 0; i < f->num_members; i++) { 1413 if (f->arr[i] == sk) 1414 break; 1415 } 1416 BUG_ON(i >= f->num_members); 1417 f->arr[i] = f->arr[f->num_members - 1]; 1418 f->num_members--; 1419 spin_unlock(&f->lock); 1420 } 1421 1422 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) 1423 { 1424 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout) 1425 return true; 1426 1427 return false; 1428 } 1429 1430 static int fanout_add(struct sock *sk, u16 id, u16 type_flags) 1431 { 1432 struct packet_sock *po = pkt_sk(sk); 1433 struct packet_fanout *f, *match; 1434 u8 type = type_flags & 0xff; 1435 u8 flags = type_flags >> 8; 1436 int err; 1437 1438 switch (type) { 1439 case PACKET_FANOUT_ROLLOVER: 1440 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) 1441 return -EINVAL; 1442 case PACKET_FANOUT_HASH: 1443 case PACKET_FANOUT_LB: 1444 case PACKET_FANOUT_CPU: 1445 case PACKET_FANOUT_RND: 1446 case PACKET_FANOUT_QM: 1447 break; 1448 default: 1449 return -EINVAL; 1450 } 1451 1452 if (!po->running) 1453 return -EINVAL; 1454 1455 if (po->fanout) 1456 return -EALREADY; 1457 1458 mutex_lock(&fanout_mutex); 1459 match = NULL; 
1460 list_for_each_entry(f, &fanout_list, list) { 1461 if (f->id == id && 1462 read_pnet(&f->net) == sock_net(sk)) { 1463 match = f; 1464 break; 1465 } 1466 } 1467 err = -EINVAL; 1468 if (match && match->flags != flags) 1469 goto out; 1470 if (!match) { 1471 err = -ENOMEM; 1472 match = kzalloc(sizeof(*match), GFP_KERNEL); 1473 if (!match) 1474 goto out; 1475 write_pnet(&match->net, sock_net(sk)); 1476 match->id = id; 1477 match->type = type; 1478 match->flags = flags; 1479 atomic_set(&match->rr_cur, 0); 1480 INIT_LIST_HEAD(&match->list); 1481 spin_lock_init(&match->lock); 1482 atomic_set(&match->sk_ref, 0); 1483 match->prot_hook.type = po->prot_hook.type; 1484 match->prot_hook.dev = po->prot_hook.dev; 1485 match->prot_hook.func = packet_rcv_fanout; 1486 match->prot_hook.af_packet_priv = match; 1487 match->prot_hook.id_match = match_fanout_group; 1488 dev_add_pack(&match->prot_hook); 1489 list_add(&match->list, &fanout_list); 1490 } 1491 err = -EINVAL; 1492 if (match->type == type && 1493 match->prot_hook.type == po->prot_hook.type && 1494 match->prot_hook.dev == po->prot_hook.dev) { 1495 err = -ENOSPC; 1496 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) { 1497 __dev_remove_pack(&po->prot_hook); 1498 po->fanout = match; 1499 atomic_inc(&match->sk_ref); 1500 __fanout_link(sk, po); 1501 err = 0; 1502 } 1503 } 1504 out: 1505 mutex_unlock(&fanout_mutex); 1506 return err; 1507 } 1508 1509 static void fanout_release(struct sock *sk) 1510 { 1511 struct packet_sock *po = pkt_sk(sk); 1512 struct packet_fanout *f; 1513 1514 f = po->fanout; 1515 if (!f) 1516 return; 1517 1518 mutex_lock(&fanout_mutex); 1519 po->fanout = NULL; 1520 1521 if (atomic_dec_and_test(&f->sk_ref)) { 1522 list_del(&f->list); 1523 dev_remove_pack(&f->prot_hook); 1524 kfree(f); 1525 } 1526 mutex_unlock(&fanout_mutex); 1527 } 1528 1529 static const struct proto_ops packet_ops; 1530 1531 static const struct proto_ops packet_ops_spkt; 1532 1533 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, 1534 struct packet_type *pt, struct net_device *orig_dev) 1535 { 1536 struct sock *sk; 1537 struct sockaddr_pkt *spkt; 1538 1539 /* 1540 * When we registered the protocol we saved the socket in the data 1541 * field for just this event. 1542 */ 1543 1544 sk = pt->af_packet_priv; 1545 1546 /* 1547 * Yank back the headers [hope the device set this 1548 * right or kerboom...] 1549 * 1550 * Incoming packets have ll header pulled, 1551 * push it back. 1552 * 1553 * For outgoing ones skb->data == skb_mac_header(skb) 1554 * so that this procedure is noop. 1555 */ 1556 1557 if (skb->pkt_type == PACKET_LOOPBACK) 1558 goto out; 1559 1560 if (!net_eq(dev_net(dev), sock_net(sk))) 1561 goto out; 1562 1563 skb = skb_share_check(skb, GFP_ATOMIC); 1564 if (skb == NULL) 1565 goto oom; 1566 1567 /* drop any routing info */ 1568 skb_dst_drop(skb); 1569 1570 /* drop conntrack reference */ 1571 nf_reset(skb); 1572 1573 spkt = &PACKET_SKB_CB(skb)->sa.pkt; 1574 1575 skb_push(skb, skb->data - skb_mac_header(skb)); 1576 1577 /* 1578 * The SOCK_PACKET socket receives _all_ frames. 1579 */ 1580 1581 spkt->spkt_family = dev->type; 1582 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); 1583 spkt->spkt_protocol = skb->protocol; 1584 1585 /* 1586 * Charge the memory to the socket. This is done specifically 1587 * to prevent sockets using all the memory up. 
1588 */ 1589 1590 if (sock_queue_rcv_skb(sk, skb) == 0) 1591 return 0; 1592 1593 out: 1594 kfree_skb(skb); 1595 oom: 1596 return 0; 1597 } 1598 1599 1600 /* 1601 * Output a raw packet to a device layer. This bypasses all the other 1602 * protocol layers and you must therefore supply it with a complete frame 1603 */ 1604 1605 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, 1606 struct msghdr *msg, size_t len) 1607 { 1608 struct sock *sk = sock->sk; 1609 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); 1610 struct sk_buff *skb = NULL; 1611 struct net_device *dev; 1612 __be16 proto = 0; 1613 int err; 1614 int extra_len = 0; 1615 1616 /* 1617 * Get and verify the address. 1618 */ 1619 1620 if (saddr) { 1621 if (msg->msg_namelen < sizeof(struct sockaddr)) 1622 return -EINVAL; 1623 if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) 1624 proto = saddr->spkt_protocol; 1625 } else 1626 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ 1627 1628 /* 1629 * Find the device first to size check it 1630 */ 1631 1632 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; 1633 retry: 1634 rcu_read_lock(); 1635 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); 1636 err = -ENODEV; 1637 if (dev == NULL) 1638 goto out_unlock; 1639 1640 err = -ENETDOWN; 1641 if (!(dev->flags & IFF_UP)) 1642 goto out_unlock; 1643 1644 /* 1645 * You may not queue a frame bigger than the mtu. This is the lowest level 1646 * raw protocol and you must do your own fragmentation at this level. 1647 */ 1648 1649 if (unlikely(sock_flag(sk, SOCK_NOFCS))) { 1650 if (!netif_supports_nofcs(dev)) { 1651 err = -EPROTONOSUPPORT; 1652 goto out_unlock; 1653 } 1654 extra_len = 4; /* We're doing our own CRC */ 1655 } 1656 1657 err = -EMSGSIZE; 1658 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) 1659 goto out_unlock; 1660 1661 if (!skb) { 1662 size_t reserved = LL_RESERVED_SPACE(dev); 1663 int tlen = dev->needed_tailroom; 1664 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; 1665 1666 rcu_read_unlock(); 1667 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); 1668 if (skb == NULL) 1669 return -ENOBUFS; 1670 /* FIXME: Save some space for broken drivers that write a hard 1671 * header at transmission time by themselves. PPP is the notable 1672 * one here. This should really be fixed at the driver level. 1673 */ 1674 skb_reserve(skb, reserved); 1675 skb_reset_network_header(skb); 1676 1677 /* Try to align data part correctly */ 1678 if (hhlen) { 1679 skb->data -= hhlen; 1680 skb->tail -= hhlen; 1681 if (len < hhlen) 1682 skb_reset_network_header(skb); 1683 } 1684 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); 1685 if (err) 1686 goto out_free; 1687 goto retry; 1688 } 1689 1690 if (len > (dev->mtu + dev->hard_header_len + extra_len)) { 1691 /* Earlier code assumed this would be a VLAN pkt, 1692 * double-check this now that we have the actual 1693 * packet in hand. 
1694 */ 1695 struct ethhdr *ehdr; 1696 skb_reset_mac_header(skb); 1697 ehdr = eth_hdr(skb); 1698 if (ehdr->h_proto != htons(ETH_P_8021Q)) { 1699 err = -EMSGSIZE; 1700 goto out_unlock; 1701 } 1702 } 1703 1704 skb->protocol = proto; 1705 skb->dev = dev; 1706 skb->priority = sk->sk_priority; 1707 skb->mark = sk->sk_mark; 1708 1709 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); 1710 1711 if (unlikely(extra_len == 4)) 1712 skb->no_fcs = 1; 1713 1714 skb_probe_transport_header(skb, 0); 1715 1716 dev_queue_xmit(skb); 1717 rcu_read_unlock(); 1718 return len; 1719 1720 out_unlock: 1721 rcu_read_unlock(); 1722 out_free: 1723 kfree_skb(skb); 1724 return err; 1725 } 1726 1727 static unsigned int run_filter(const struct sk_buff *skb, 1728 const struct sock *sk, 1729 unsigned int res) 1730 { 1731 struct sk_filter *filter; 1732 1733 rcu_read_lock(); 1734 filter = rcu_dereference(sk->sk_filter); 1735 if (filter != NULL) 1736 res = SK_RUN_FILTER(filter, skb); 1737 rcu_read_unlock(); 1738 1739 return res; 1740 } 1741 1742 /* 1743 * This function makes lazy skb cloning in hope that most of packets 1744 * are discarded by BPF. 1745 * 1746 * Note tricky part: we DO mangle shared skb! skb->data, skb->len 1747 * and skb->cb are mangled. It works because (and until) packets 1748 * falling here are owned by current CPU. Output packets are cloned 1749 * by dev_queue_xmit_nit(), input packets are processed by net_bh 1750 * sequencially, so that if we return skb to original state on exit, 1751 * we will not harm anyone. 1752 */ 1753 1754 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, 1755 struct packet_type *pt, struct net_device *orig_dev) 1756 { 1757 struct sock *sk; 1758 struct sockaddr_ll *sll; 1759 struct packet_sock *po; 1760 u8 *skb_head = skb->data; 1761 int skb_len = skb->len; 1762 unsigned int snaplen, res; 1763 1764 if (skb->pkt_type == PACKET_LOOPBACK) 1765 goto drop; 1766 1767 sk = pt->af_packet_priv; 1768 po = pkt_sk(sk); 1769 1770 if (!net_eq(dev_net(dev), sock_net(sk))) 1771 goto drop; 1772 1773 skb->dev = dev; 1774 1775 if (dev->header_ops) { 1776 /* The device has an explicit notion of ll header, 1777 * exported to higher levels. 1778 * 1779 * Otherwise, the device hides details of its frame 1780 * structure, so that corresponding packet head is 1781 * never delivered to user. 
1782 */ 1783 if (sk->sk_type != SOCK_DGRAM) 1784 skb_push(skb, skb->data - skb_mac_header(skb)); 1785 else if (skb->pkt_type == PACKET_OUTGOING) { 1786 /* Special case: outgoing packets have ll header at head */ 1787 skb_pull(skb, skb_network_offset(skb)); 1788 } 1789 } 1790 1791 snaplen = skb->len; 1792 1793 res = run_filter(skb, sk, snaplen); 1794 if (!res) 1795 goto drop_n_restore; 1796 if (snaplen > res) 1797 snaplen = res; 1798 1799 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 1800 goto drop_n_acct; 1801 1802 if (skb_shared(skb)) { 1803 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); 1804 if (nskb == NULL) 1805 goto drop_n_acct; 1806 1807 if (skb_head != skb->data) { 1808 skb->data = skb_head; 1809 skb->len = skb_len; 1810 } 1811 consume_skb(skb); 1812 skb = nskb; 1813 } 1814 1815 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > 1816 sizeof(skb->cb)); 1817 1818 sll = &PACKET_SKB_CB(skb)->sa.ll; 1819 sll->sll_family = AF_PACKET; 1820 sll->sll_hatype = dev->type; 1821 sll->sll_protocol = skb->protocol; 1822 sll->sll_pkttype = skb->pkt_type; 1823 if (unlikely(po->origdev)) 1824 sll->sll_ifindex = orig_dev->ifindex; 1825 else 1826 sll->sll_ifindex = dev->ifindex; 1827 1828 sll->sll_halen = dev_parse_header(skb, sll->sll_addr); 1829 1830 PACKET_SKB_CB(skb)->origlen = skb->len; 1831 1832 if (pskb_trim(skb, snaplen)) 1833 goto drop_n_acct; 1834 1835 skb_set_owner_r(skb, sk); 1836 skb->dev = NULL; 1837 skb_dst_drop(skb); 1838 1839 /* drop conntrack reference */ 1840 nf_reset(skb); 1841 1842 spin_lock(&sk->sk_receive_queue.lock); 1843 po->stats.stats1.tp_packets++; 1844 skb->dropcount = atomic_read(&sk->sk_drops); 1845 __skb_queue_tail(&sk->sk_receive_queue, skb); 1846 spin_unlock(&sk->sk_receive_queue.lock); 1847 sk->sk_data_ready(sk); 1848 return 0; 1849 1850 drop_n_acct: 1851 spin_lock(&sk->sk_receive_queue.lock); 1852 po->stats.stats1.tp_drops++; 1853 atomic_inc(&sk->sk_drops); 1854 spin_unlock(&sk->sk_receive_queue.lock); 1855 1856 drop_n_restore: 1857 if (skb_head != skb->data && skb_shared(skb)) { 1858 skb->data = skb_head; 1859 skb->len = skb_len; 1860 } 1861 drop: 1862 consume_skb(skb); 1863 return 0; 1864 } 1865 1866 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, 1867 struct packet_type *pt, struct net_device *orig_dev) 1868 { 1869 struct sock *sk; 1870 struct packet_sock *po; 1871 struct sockaddr_ll *sll; 1872 union tpacket_uhdr h; 1873 u8 *skb_head = skb->data; 1874 int skb_len = skb->len; 1875 unsigned int snaplen, res; 1876 unsigned long status = TP_STATUS_USER; 1877 unsigned short macoff, netoff, hdrlen; 1878 struct sk_buff *copy_skb = NULL; 1879 struct timespec ts; 1880 __u32 ts_status; 1881 1882 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. 1883 * We may add members to them until current aligned size without forcing 1884 * userspace to call getsockopt(..., PACKET_HDRLEN, ...). 
1885 */ 1886 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); 1887 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); 1888 1889 if (skb->pkt_type == PACKET_LOOPBACK) 1890 goto drop; 1891 1892 sk = pt->af_packet_priv; 1893 po = pkt_sk(sk); 1894 1895 if (!net_eq(dev_net(dev), sock_net(sk))) 1896 goto drop; 1897 1898 if (dev->header_ops) { 1899 if (sk->sk_type != SOCK_DGRAM) 1900 skb_push(skb, skb->data - skb_mac_header(skb)); 1901 else if (skb->pkt_type == PACKET_OUTGOING) { 1902 /* Special case: outgoing packets have ll header at head */ 1903 skb_pull(skb, skb_network_offset(skb)); 1904 } 1905 } 1906 1907 if (skb->ip_summed == CHECKSUM_PARTIAL) 1908 status |= TP_STATUS_CSUMNOTREADY; 1909 1910 snaplen = skb->len; 1911 1912 res = run_filter(skb, sk, snaplen); 1913 if (!res) 1914 goto drop_n_restore; 1915 if (snaplen > res) 1916 snaplen = res; 1917 1918 if (sk->sk_type == SOCK_DGRAM) { 1919 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + 1920 po->tp_reserve; 1921 } else { 1922 unsigned int maclen = skb_network_offset(skb); 1923 netoff = TPACKET_ALIGN(po->tp_hdrlen + 1924 (maclen < 16 ? 16 : maclen)) + 1925 po->tp_reserve; 1926 macoff = netoff - maclen; 1927 } 1928 if (po->tp_version <= TPACKET_V2) { 1929 if (macoff + snaplen > po->rx_ring.frame_size) { 1930 if (po->copy_thresh && 1931 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { 1932 if (skb_shared(skb)) { 1933 copy_skb = skb_clone(skb, GFP_ATOMIC); 1934 } else { 1935 copy_skb = skb_get(skb); 1936 skb_head = skb->data; 1937 } 1938 if (copy_skb) 1939 skb_set_owner_r(copy_skb, sk); 1940 } 1941 snaplen = po->rx_ring.frame_size - macoff; 1942 if ((int)snaplen < 0) 1943 snaplen = 0; 1944 } 1945 } 1946 spin_lock(&sk->sk_receive_queue.lock); 1947 h.raw = packet_current_rx_frame(po, skb, 1948 TP_STATUS_KERNEL, (macoff+snaplen)); 1949 if (!h.raw) 1950 goto ring_is_full; 1951 if (po->tp_version <= TPACKET_V2) { 1952 packet_increment_rx_head(po, &po->rx_ring); 1953 /* 1954 * LOSING will be reported till you read the stats, 1955 * because it's COR - Clear On Read. 1956 * Anyways, moving it for V1/V2 only as V3 doesn't need this 1957 * at packet level. 
1958 */ 1959 if (po->stats.stats1.tp_drops) 1960 status |= TP_STATUS_LOSING; 1961 } 1962 po->stats.stats1.tp_packets++; 1963 if (copy_skb) { 1964 status |= TP_STATUS_COPY; 1965 __skb_queue_tail(&sk->sk_receive_queue, copy_skb); 1966 } 1967 spin_unlock(&sk->sk_receive_queue.lock); 1968 1969 skb_copy_bits(skb, 0, h.raw + macoff, snaplen); 1970 1971 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) 1972 getnstimeofday(&ts); 1973 1974 status |= ts_status; 1975 1976 switch (po->tp_version) { 1977 case TPACKET_V1: 1978 h.h1->tp_len = skb->len; 1979 h.h1->tp_snaplen = snaplen; 1980 h.h1->tp_mac = macoff; 1981 h.h1->tp_net = netoff; 1982 h.h1->tp_sec = ts.tv_sec; 1983 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; 1984 hdrlen = sizeof(*h.h1); 1985 break; 1986 case TPACKET_V2: 1987 h.h2->tp_len = skb->len; 1988 h.h2->tp_snaplen = snaplen; 1989 h.h2->tp_mac = macoff; 1990 h.h2->tp_net = netoff; 1991 h.h2->tp_sec = ts.tv_sec; 1992 h.h2->tp_nsec = ts.tv_nsec; 1993 if (vlan_tx_tag_present(skb)) { 1994 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); 1995 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); 1996 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 1997 } else { 1998 h.h2->tp_vlan_tci = 0; 1999 h.h2->tp_vlan_tpid = 0; 2000 } 2001 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); 2002 hdrlen = sizeof(*h.h2); 2003 break; 2004 case TPACKET_V3: 2005 /* tp_nxt_offset,vlan are already populated above. 2006 * So DONT clear those fields here 2007 */ 2008 h.h3->tp_status |= status; 2009 h.h3->tp_len = skb->len; 2010 h.h3->tp_snaplen = snaplen; 2011 h.h3->tp_mac = macoff; 2012 h.h3->tp_net = netoff; 2013 h.h3->tp_sec = ts.tv_sec; 2014 h.h3->tp_nsec = ts.tv_nsec; 2015 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); 2016 hdrlen = sizeof(*h.h3); 2017 break; 2018 default: 2019 BUG(); 2020 } 2021 2022 sll = h.raw + TPACKET_ALIGN(hdrlen); 2023 sll->sll_halen = dev_parse_header(skb, sll->sll_addr); 2024 sll->sll_family = AF_PACKET; 2025 sll->sll_hatype = dev->type; 2026 sll->sll_protocol = skb->protocol; 2027 sll->sll_pkttype = skb->pkt_type; 2028 if (unlikely(po->origdev)) 2029 sll->sll_ifindex = orig_dev->ifindex; 2030 else 2031 sll->sll_ifindex = dev->ifindex; 2032 2033 smp_mb(); 2034 2035 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 2036 if (po->tp_version <= TPACKET_V2) { 2037 u8 *start, *end; 2038 2039 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + 2040 macoff + snaplen); 2041 2042 for (start = h.raw; start < end; start += PAGE_SIZE) 2043 flush_dcache_page(pgv_to_page(start)); 2044 } 2045 smp_wmb(); 2046 #endif 2047 2048 if (po->tp_version <= TPACKET_V2) 2049 __packet_set_status(po, h.raw, status); 2050 else 2051 prb_clear_blk_fill_status(&po->rx_ring); 2052 2053 sk->sk_data_ready(sk); 2054 2055 drop_n_restore: 2056 if (skb_head != skb->data && skb_shared(skb)) { 2057 skb->data = skb_head; 2058 skb->len = skb_len; 2059 } 2060 drop: 2061 kfree_skb(skb); 2062 return 0; 2063 2064 ring_is_full: 2065 po->stats.stats1.tp_drops++; 2066 spin_unlock(&sk->sk_receive_queue.lock); 2067 2068 sk->sk_data_ready(sk); 2069 kfree_skb(copy_skb); 2070 goto drop_n_restore; 2071 } 2072 2073 static void tpacket_destruct_skb(struct sk_buff *skb) 2074 { 2075 struct packet_sock *po = pkt_sk(skb->sk); 2076 2077 if (likely(po->tx_ring.pg_vec)) { 2078 void *ph; 2079 __u32 ts; 2080 2081 ph = skb_shinfo(skb)->destructor_arg; 2082 packet_dec_pending(&po->tx_ring); 2083 2084 ts = __packet_set_timestamp(po, ph, skb); 2085 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); 2086 } 2087 2088 sock_wfree(skb); 2089 
} 2090 2091 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, 2092 void *frame, struct net_device *dev, int size_max, 2093 __be16 proto, unsigned char *addr, int hlen) 2094 { 2095 union tpacket_uhdr ph; 2096 int to_write, offset, len, tp_len, nr_frags, len_max; 2097 struct socket *sock = po->sk.sk_socket; 2098 struct page *page; 2099 void *data; 2100 int err; 2101 2102 ph.raw = frame; 2103 2104 skb->protocol = proto; 2105 skb->dev = dev; 2106 skb->priority = po->sk.sk_priority; 2107 skb->mark = po->sk.sk_mark; 2108 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); 2109 skb_shinfo(skb)->destructor_arg = ph.raw; 2110 2111 switch (po->tp_version) { 2112 case TPACKET_V2: 2113 tp_len = ph.h2->tp_len; 2114 break; 2115 default: 2116 tp_len = ph.h1->tp_len; 2117 break; 2118 } 2119 if (unlikely(tp_len > size_max)) { 2120 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); 2121 return -EMSGSIZE; 2122 } 2123 2124 skb_reserve(skb, hlen); 2125 skb_reset_network_header(skb); 2126 2127 if (!packet_use_direct_xmit(po)) 2128 skb_probe_transport_header(skb, 0); 2129 if (unlikely(po->tp_tx_has_off)) { 2130 int off_min, off_max, off; 2131 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); 2132 off_max = po->tx_ring.frame_size - tp_len; 2133 if (sock->type == SOCK_DGRAM) { 2134 switch (po->tp_version) { 2135 case TPACKET_V2: 2136 off = ph.h2->tp_net; 2137 break; 2138 default: 2139 off = ph.h1->tp_net; 2140 break; 2141 } 2142 } else { 2143 switch (po->tp_version) { 2144 case TPACKET_V2: 2145 off = ph.h2->tp_mac; 2146 break; 2147 default: 2148 off = ph.h1->tp_mac; 2149 break; 2150 } 2151 } 2152 if (unlikely((off < off_min) || (off_max < off))) 2153 return -EINVAL; 2154 data = ph.raw + off; 2155 } else { 2156 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); 2157 } 2158 to_write = tp_len; 2159 2160 if (sock->type == SOCK_DGRAM) { 2161 err = dev_hard_header(skb, dev, ntohs(proto), addr, 2162 NULL, tp_len); 2163 if (unlikely(err < 0)) 2164 return -EINVAL; 2165 } else if (dev->hard_header_len) { 2166 /* net device doesn't like empty head */ 2167 if (unlikely(tp_len <= dev->hard_header_len)) { 2168 pr_err("packet size is too short (%d < %d)\n", 2169 tp_len, dev->hard_header_len); 2170 return -EINVAL; 2171 } 2172 2173 skb_push(skb, dev->hard_header_len); 2174 err = skb_store_bits(skb, 0, data, 2175 dev->hard_header_len); 2176 if (unlikely(err)) 2177 return err; 2178 2179 data += dev->hard_header_len; 2180 to_write -= dev->hard_header_len; 2181 } 2182 2183 offset = offset_in_page(data); 2184 len_max = PAGE_SIZE - offset; 2185 len = ((to_write > len_max) ? len_max : to_write); 2186 2187 skb->data_len = to_write; 2188 skb->len += to_write; 2189 skb->truesize += to_write; 2190 atomic_add(to_write, &po->sk.sk_wmem_alloc); 2191 2192 while (likely(to_write)) { 2193 nr_frags = skb_shinfo(skb)->nr_frags; 2194 2195 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { 2196 pr_err("Packet exceed the number of skb frags(%lu)\n", 2197 MAX_SKB_FRAGS); 2198 return -EFAULT; 2199 } 2200 2201 page = pgv_to_page(data); 2202 data += len; 2203 flush_dcache_page(page); 2204 get_page(page); 2205 skb_fill_page_desc(skb, nr_frags, page, offset, len); 2206 to_write -= len; 2207 offset = 0; 2208 len_max = PAGE_SIZE; 2209 len = ((to_write > len_max) ? 
len_max : to_write); 2210 } 2211 2212 return tp_len; 2213 } 2214 2215 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 2216 { 2217 struct sk_buff *skb; 2218 struct net_device *dev; 2219 __be16 proto; 2220 int err, reserve = 0; 2221 void *ph; 2222 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); 2223 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); 2224 int tp_len, size_max; 2225 unsigned char *addr; 2226 int len_sum = 0; 2227 int status = TP_STATUS_AVAILABLE; 2228 int hlen, tlen; 2229 2230 mutex_lock(&po->pg_vec_lock); 2231 2232 if (likely(saddr == NULL)) { 2233 dev = packet_cached_dev_get(po); 2234 proto = po->num; 2235 addr = NULL; 2236 } else { 2237 err = -EINVAL; 2238 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) 2239 goto out; 2240 if (msg->msg_namelen < (saddr->sll_halen 2241 + offsetof(struct sockaddr_ll, 2242 sll_addr))) 2243 goto out; 2244 proto = saddr->sll_protocol; 2245 addr = saddr->sll_addr; 2246 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); 2247 } 2248 2249 err = -ENXIO; 2250 if (unlikely(dev == NULL)) 2251 goto out; 2252 err = -ENETDOWN; 2253 if (unlikely(!(dev->flags & IFF_UP))) 2254 goto out_put; 2255 2256 reserve = dev->hard_header_len + VLAN_HLEN; 2257 size_max = po->tx_ring.frame_size 2258 - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); 2259 2260 if (size_max > dev->mtu + reserve) 2261 size_max = dev->mtu + reserve; 2262 2263 do { 2264 ph = packet_current_frame(po, &po->tx_ring, 2265 TP_STATUS_SEND_REQUEST); 2266 if (unlikely(ph == NULL)) { 2267 if (need_wait && need_resched()) 2268 schedule(); 2269 continue; 2270 } 2271 2272 status = TP_STATUS_SEND_REQUEST; 2273 hlen = LL_RESERVED_SPACE(dev); 2274 tlen = dev->needed_tailroom; 2275 skb = sock_alloc_send_skb(&po->sk, 2276 hlen + tlen + sizeof(struct sockaddr_ll), 2277 0, &err); 2278 2279 if (unlikely(skb == NULL)) 2280 goto out_status; 2281 2282 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, 2283 addr, hlen); 2284 if (tp_len > dev->mtu + dev->hard_header_len) { 2285 struct ethhdr *ehdr; 2286 /* Earlier code assumed this would be a VLAN pkt, 2287 * double-check this now that we have the actual 2288 * packet in hand. 
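			 * Only an 802.1Q tagged frame may legitimately be
			 * larger than dev->mtu + hard_header_len (by at most
			 * VLAN_HLEN); anything else is rejected below with
			 * -EMSGSIZE.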
2289 */ 2290 2291 skb_reset_mac_header(skb); 2292 ehdr = eth_hdr(skb); 2293 if (ehdr->h_proto != htons(ETH_P_8021Q)) 2294 tp_len = -EMSGSIZE; 2295 } 2296 if (unlikely(tp_len < 0)) { 2297 if (po->tp_loss) { 2298 __packet_set_status(po, ph, 2299 TP_STATUS_AVAILABLE); 2300 packet_increment_head(&po->tx_ring); 2301 kfree_skb(skb); 2302 continue; 2303 } else { 2304 status = TP_STATUS_WRONG_FORMAT; 2305 err = tp_len; 2306 goto out_status; 2307 } 2308 } 2309 2310 packet_pick_tx_queue(dev, skb); 2311 2312 skb->destructor = tpacket_destruct_skb; 2313 __packet_set_status(po, ph, TP_STATUS_SENDING); 2314 packet_inc_pending(&po->tx_ring); 2315 2316 status = TP_STATUS_SEND_REQUEST; 2317 err = po->xmit(skb); 2318 if (unlikely(err > 0)) { 2319 err = net_xmit_errno(err); 2320 if (err && __packet_get_status(po, ph) == 2321 TP_STATUS_AVAILABLE) { 2322 /* skb was destructed already */ 2323 skb = NULL; 2324 goto out_status; 2325 } 2326 /* 2327 * skb was dropped but not destructed yet; 2328 * let's treat it like congestion or err < 0 2329 */ 2330 err = 0; 2331 } 2332 packet_increment_head(&po->tx_ring); 2333 len_sum += tp_len; 2334 } while (likely((ph != NULL) || 2335 /* Note: packet_read_pending() might be slow if we have 2336 * to call it as it's per_cpu variable, but in fast-path 2337 * we already short-circuit the loop with the first 2338 * condition, and luckily don't have to go that path 2339 * anyway. 2340 */ 2341 (need_wait && packet_read_pending(&po->tx_ring)))); 2342 2343 err = len_sum; 2344 goto out_put; 2345 2346 out_status: 2347 __packet_set_status(po, ph, status); 2348 kfree_skb(skb); 2349 out_put: 2350 dev_put(dev); 2351 out: 2352 mutex_unlock(&po->pg_vec_lock); 2353 return err; 2354 } 2355 2356 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, 2357 size_t reserve, size_t len, 2358 size_t linear, int noblock, 2359 int *err) 2360 { 2361 struct sk_buff *skb; 2362 2363 /* Under a page? Don't bother with paged skb. */ 2364 if (prepad + len < PAGE_SIZE || !linear) 2365 linear = len; 2366 2367 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, 2368 err, 0); 2369 if (!skb) 2370 return NULL; 2371 2372 skb_reserve(skb, reserve); 2373 skb_put(skb, linear); 2374 skb->data_len = len - linear; 2375 skb->len += len - linear; 2376 2377 return skb; 2378 } 2379 2380 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) 2381 { 2382 struct sock *sk = sock->sk; 2383 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); 2384 struct sk_buff *skb; 2385 struct net_device *dev; 2386 __be16 proto; 2387 unsigned char *addr; 2388 int err, reserve = 0; 2389 struct virtio_net_hdr vnet_hdr = { 0 }; 2390 int offset = 0; 2391 int vnet_hdr_len; 2392 struct packet_sock *po = pkt_sk(sk); 2393 unsigned short gso_type = 0; 2394 int hlen, tlen; 2395 int extra_len = 0; 2396 2397 /* 2398 * Get and verify the address. 
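 *	If the caller supplied no sockaddr_ll, fall back to the device and
 *	protocol the socket is bound to (packet_cached_dev_get()/po->num);
 *	otherwise the interface index, protocol and destination address are
 *	all taken from msg->msg_name.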
2399 */ 2400 2401 if (likely(saddr == NULL)) { 2402 dev = packet_cached_dev_get(po); 2403 proto = po->num; 2404 addr = NULL; 2405 } else { 2406 err = -EINVAL; 2407 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) 2408 goto out; 2409 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) 2410 goto out; 2411 proto = saddr->sll_protocol; 2412 addr = saddr->sll_addr; 2413 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); 2414 } 2415 2416 err = -ENXIO; 2417 if (unlikely(dev == NULL)) 2418 goto out_unlock; 2419 err = -ENETDOWN; 2420 if (unlikely(!(dev->flags & IFF_UP))) 2421 goto out_unlock; 2422 2423 if (sock->type == SOCK_RAW) 2424 reserve = dev->hard_header_len; 2425 if (po->has_vnet_hdr) { 2426 vnet_hdr_len = sizeof(vnet_hdr); 2427 2428 err = -EINVAL; 2429 if (len < vnet_hdr_len) 2430 goto out_unlock; 2431 2432 len -= vnet_hdr_len; 2433 2434 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov, 2435 vnet_hdr_len); 2436 if (err < 0) 2437 goto out_unlock; 2438 2439 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && 2440 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > 2441 vnet_hdr.hdr_len)) 2442 vnet_hdr.hdr_len = vnet_hdr.csum_start + 2443 vnet_hdr.csum_offset + 2; 2444 2445 err = -EINVAL; 2446 if (vnet_hdr.hdr_len > len) 2447 goto out_unlock; 2448 2449 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { 2450 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { 2451 case VIRTIO_NET_HDR_GSO_TCPV4: 2452 gso_type = SKB_GSO_TCPV4; 2453 break; 2454 case VIRTIO_NET_HDR_GSO_TCPV6: 2455 gso_type = SKB_GSO_TCPV6; 2456 break; 2457 case VIRTIO_NET_HDR_GSO_UDP: 2458 gso_type = SKB_GSO_UDP; 2459 break; 2460 default: 2461 goto out_unlock; 2462 } 2463 2464 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) 2465 gso_type |= SKB_GSO_TCP_ECN; 2466 2467 if (vnet_hdr.gso_size == 0) 2468 goto out_unlock; 2469 2470 } 2471 } 2472 2473 if (unlikely(sock_flag(sk, SOCK_NOFCS))) { 2474 if (!netif_supports_nofcs(dev)) { 2475 err = -EPROTONOSUPPORT; 2476 goto out_unlock; 2477 } 2478 extra_len = 4; /* We're doing our own CRC */ 2479 } 2480 2481 err = -EMSGSIZE; 2482 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) 2483 goto out_unlock; 2484 2485 err = -ENOBUFS; 2486 hlen = LL_RESERVED_SPACE(dev); 2487 tlen = dev->needed_tailroom; 2488 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len, 2489 msg->msg_flags & MSG_DONTWAIT, &err); 2490 if (skb == NULL) 2491 goto out_unlock; 2492 2493 skb_set_network_header(skb, reserve); 2494 2495 err = -EINVAL; 2496 if (sock->type == SOCK_DGRAM && 2497 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) 2498 goto out_free; 2499 2500 /* Returns -EFAULT on error */ 2501 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); 2502 if (err) 2503 goto out_free; 2504 2505 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); 2506 2507 if (!gso_type && (len > dev->mtu + reserve + extra_len)) { 2508 /* Earlier code assumed this would be a VLAN pkt, 2509 * double-check this now that we have the actual 2510 * packet in hand. 
2511 */ 2512 struct ethhdr *ehdr; 2513 skb_reset_mac_header(skb); 2514 ehdr = eth_hdr(skb); 2515 if (ehdr->h_proto != htons(ETH_P_8021Q)) { 2516 err = -EMSGSIZE; 2517 goto out_free; 2518 } 2519 } 2520 2521 skb->protocol = proto; 2522 skb->dev = dev; 2523 skb->priority = sk->sk_priority; 2524 skb->mark = sk->sk_mark; 2525 2526 packet_pick_tx_queue(dev, skb); 2527 2528 if (po->has_vnet_hdr) { 2529 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { 2530 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start, 2531 vnet_hdr.csum_offset)) { 2532 err = -EINVAL; 2533 goto out_free; 2534 } 2535 } 2536 2537 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size; 2538 skb_shinfo(skb)->gso_type = gso_type; 2539 2540 /* Header must be checked, and gso_segs computed. */ 2541 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2542 skb_shinfo(skb)->gso_segs = 0; 2543 2544 len += vnet_hdr_len; 2545 } 2546 2547 if (!packet_use_direct_xmit(po)) 2548 skb_probe_transport_header(skb, reserve); 2549 if (unlikely(extra_len == 4)) 2550 skb->no_fcs = 1; 2551 2552 err = po->xmit(skb); 2553 if (err > 0 && (err = net_xmit_errno(err)) != 0) 2554 goto out_unlock; 2555 2556 dev_put(dev); 2557 2558 return len; 2559 2560 out_free: 2561 kfree_skb(skb); 2562 out_unlock: 2563 if (dev) 2564 dev_put(dev); 2565 out: 2566 return err; 2567 } 2568 2569 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, 2570 struct msghdr *msg, size_t len) 2571 { 2572 struct sock *sk = sock->sk; 2573 struct packet_sock *po = pkt_sk(sk); 2574 2575 if (po->tx_ring.pg_vec) 2576 return tpacket_snd(po, msg); 2577 else 2578 return packet_snd(sock, msg, len); 2579 } 2580 2581 /* 2582 * Close a PACKET socket. This is fairly simple. We immediately go 2583 * to 'closed' state and remove our protocol entry in the device list. 2584 */ 2585 2586 static int packet_release(struct socket *sock) 2587 { 2588 struct sock *sk = sock->sk; 2589 struct packet_sock *po; 2590 struct net *net; 2591 union tpacket_req_u req_u; 2592 2593 if (!sk) 2594 return 0; 2595 2596 net = sock_net(sk); 2597 po = pkt_sk(sk); 2598 2599 mutex_lock(&net->packet.sklist_lock); 2600 sk_del_node_init_rcu(sk); 2601 mutex_unlock(&net->packet.sklist_lock); 2602 2603 preempt_disable(); 2604 sock_prot_inuse_add(net, sk->sk_prot, -1); 2605 preempt_enable(); 2606 2607 spin_lock(&po->bind_lock); 2608 unregister_prot_hook(sk, false); 2609 packet_cached_dev_reset(po); 2610 2611 if (po->prot_hook.dev) { 2612 dev_put(po->prot_hook.dev); 2613 po->prot_hook.dev = NULL; 2614 } 2615 spin_unlock(&po->bind_lock); 2616 2617 packet_flush_mclist(sk); 2618 2619 if (po->rx_ring.pg_vec) { 2620 memset(&req_u, 0, sizeof(req_u)); 2621 packet_set_ring(sk, &req_u, 1, 0); 2622 } 2623 2624 if (po->tx_ring.pg_vec) { 2625 memset(&req_u, 0, sizeof(req_u)); 2626 packet_set_ring(sk, &req_u, 1, 1); 2627 } 2628 2629 fanout_release(sk); 2630 2631 synchronize_net(); 2632 /* 2633 * Now the socket is dead. No more input will appear. 2634 */ 2635 sock_orphan(sk); 2636 sock->sk = NULL; 2637 2638 /* Purge queues */ 2639 2640 skb_queue_purge(&sk->sk_receive_queue); 2641 packet_free_pending(po); 2642 sk_refcnt_debug_release(sk); 2643 2644 sock_put(sk); 2645 return 0; 2646 } 2647 2648 /* 2649 * Attach a packet hook. 
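 *	packet_do_bind() only tears down and re-registers the protocol hook
 *	when the protocol number or the target device actually changes.
 *	Binding to a device that is down leaves the socket unhooked and
 *	reports ENETDOWN through sk_err.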
2650 */ 2651 2652 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) 2653 { 2654 struct packet_sock *po = pkt_sk(sk); 2655 const struct net_device *dev_curr; 2656 __be16 proto_curr; 2657 bool need_rehook; 2658 2659 if (po->fanout) { 2660 if (dev) 2661 dev_put(dev); 2662 2663 return -EINVAL; 2664 } 2665 2666 lock_sock(sk); 2667 spin_lock(&po->bind_lock); 2668 2669 proto_curr = po->prot_hook.type; 2670 dev_curr = po->prot_hook.dev; 2671 2672 need_rehook = proto_curr != proto || dev_curr != dev; 2673 2674 if (need_rehook) { 2675 unregister_prot_hook(sk, true); 2676 2677 po->num = proto; 2678 po->prot_hook.type = proto; 2679 2680 if (po->prot_hook.dev) 2681 dev_put(po->prot_hook.dev); 2682 2683 po->prot_hook.dev = dev; 2684 2685 po->ifindex = dev ? dev->ifindex : 0; 2686 packet_cached_dev_assign(po, dev); 2687 } 2688 2689 if (proto == 0 || !need_rehook) 2690 goto out_unlock; 2691 2692 if (!dev || (dev->flags & IFF_UP)) { 2693 register_prot_hook(sk); 2694 } else { 2695 sk->sk_err = ENETDOWN; 2696 if (!sock_flag(sk, SOCK_DEAD)) 2697 sk->sk_error_report(sk); 2698 } 2699 2700 out_unlock: 2701 spin_unlock(&po->bind_lock); 2702 release_sock(sk); 2703 return 0; 2704 } 2705 2706 /* 2707 * Bind a packet socket to a device 2708 */ 2709 2710 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, 2711 int addr_len) 2712 { 2713 struct sock *sk = sock->sk; 2714 char name[15]; 2715 struct net_device *dev; 2716 int err = -ENODEV; 2717 2718 /* 2719 * Check legality 2720 */ 2721 2722 if (addr_len != sizeof(struct sockaddr)) 2723 return -EINVAL; 2724 strlcpy(name, uaddr->sa_data, sizeof(name)); 2725 2726 dev = dev_get_by_name(sock_net(sk), name); 2727 if (dev) 2728 err = packet_do_bind(sk, dev, pkt_sk(sk)->num); 2729 return err; 2730 } 2731 2732 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 2733 { 2734 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; 2735 struct sock *sk = sock->sk; 2736 struct net_device *dev = NULL; 2737 int err; 2738 2739 2740 /* 2741 * Check legality 2742 */ 2743 2744 if (addr_len < sizeof(struct sockaddr_ll)) 2745 return -EINVAL; 2746 if (sll->sll_family != AF_PACKET) 2747 return -EINVAL; 2748 2749 if (sll->sll_ifindex) { 2750 err = -ENODEV; 2751 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex); 2752 if (dev == NULL) 2753 goto out; 2754 } 2755 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); 2756 2757 out: 2758 return err; 2759 } 2760 2761 static struct proto packet_proto = { 2762 .name = "PACKET", 2763 .owner = THIS_MODULE, 2764 .obj_size = sizeof(struct packet_sock), 2765 }; 2766 2767 /* 2768 * Create a packet of type SOCK_PACKET. 
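 *	SOCK_RAW and SOCK_DGRAM sockets are created here as well;
 *	SOCK_PACKET merely selects the legacy packet_ops_spkt operations
 *	and the packet_rcv_spkt receive handler.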
2769 */ 2770 2771 static int packet_create(struct net *net, struct socket *sock, int protocol, 2772 int kern) 2773 { 2774 struct sock *sk; 2775 struct packet_sock *po; 2776 __be16 proto = (__force __be16)protocol; /* weird, but documented */ 2777 int err; 2778 2779 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 2780 return -EPERM; 2781 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && 2782 sock->type != SOCK_PACKET) 2783 return -ESOCKTNOSUPPORT; 2784 2785 sock->state = SS_UNCONNECTED; 2786 2787 err = -ENOBUFS; 2788 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); 2789 if (sk == NULL) 2790 goto out; 2791 2792 sock->ops = &packet_ops; 2793 if (sock->type == SOCK_PACKET) 2794 sock->ops = &packet_ops_spkt; 2795 2796 sock_init_data(sock, sk); 2797 2798 po = pkt_sk(sk); 2799 sk->sk_family = PF_PACKET; 2800 po->num = proto; 2801 po->xmit = dev_queue_xmit; 2802 2803 err = packet_alloc_pending(po); 2804 if (err) 2805 goto out2; 2806 2807 packet_cached_dev_reset(po); 2808 2809 sk->sk_destruct = packet_sock_destruct; 2810 sk_refcnt_debug_inc(sk); 2811 2812 /* 2813 * Attach a protocol block 2814 */ 2815 2816 spin_lock_init(&po->bind_lock); 2817 mutex_init(&po->pg_vec_lock); 2818 po->prot_hook.func = packet_rcv; 2819 2820 if (sock->type == SOCK_PACKET) 2821 po->prot_hook.func = packet_rcv_spkt; 2822 2823 po->prot_hook.af_packet_priv = sk; 2824 2825 if (proto) { 2826 po->prot_hook.type = proto; 2827 register_prot_hook(sk); 2828 } 2829 2830 mutex_lock(&net->packet.sklist_lock); 2831 sk_add_node_rcu(sk, &net->packet.sklist); 2832 mutex_unlock(&net->packet.sklist_lock); 2833 2834 preempt_disable(); 2835 sock_prot_inuse_add(net, &packet_proto, 1); 2836 preempt_enable(); 2837 2838 return 0; 2839 out2: 2840 sk_free(sk); 2841 out: 2842 return err; 2843 } 2844 2845 /* 2846 * Pull a packet from our receive queue and hand it to the user. 2847 * If necessary we block. 2848 */ 2849 2850 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, 2851 struct msghdr *msg, size_t len, int flags) 2852 { 2853 struct sock *sk = sock->sk; 2854 struct sk_buff *skb; 2855 int copied, err; 2856 int vnet_hdr_len = 0; 2857 2858 err = -EINVAL; 2859 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) 2860 goto out; 2861 2862 #if 0 2863 /* What error should we return now? EUNATTACH? */ 2864 if (pkt_sk(sk)->ifindex < 0) 2865 return -ENODEV; 2866 #endif 2867 2868 if (flags & MSG_ERRQUEUE) { 2869 err = sock_recv_errqueue(sk, msg, len, 2870 SOL_PACKET, PACKET_TX_TIMESTAMP); 2871 goto out; 2872 } 2873 2874 /* 2875 * Call the generic datagram receiver. This handles all sorts 2876 * of horrible races and re-entrancy so we can forget about it 2877 * in the protocol layers. 2878 * 2879 * Now it will return ENETDOWN, if device have just gone down, 2880 * but then it will block. 2881 */ 2882 2883 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); 2884 2885 /* 2886 * An error occurred so return it. Because skb_recv_datagram() 2887 * handles the blocking we don't see and worry about blocking 2888 * retries. 2889 */ 2890 2891 if (skb == NULL) 2892 goto out; 2893 2894 if (pkt_sk(sk)->has_vnet_hdr) { 2895 struct virtio_net_hdr vnet_hdr = { 0 }; 2896 2897 err = -EINVAL; 2898 vnet_hdr_len = sizeof(vnet_hdr); 2899 if (len < vnet_hdr_len) 2900 goto out_free; 2901 2902 len -= vnet_hdr_len; 2903 2904 if (skb_is_gso(skb)) { 2905 struct skb_shared_info *sinfo = skb_shinfo(skb); 2906 2907 /* This is a hint as to how much should be linear. 
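			 * skb_headlen() reports how much of the skb is
			 * already linear.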
*/ 2908 vnet_hdr.hdr_len = skb_headlen(skb); 2909 vnet_hdr.gso_size = sinfo->gso_size; 2910 if (sinfo->gso_type & SKB_GSO_TCPV4) 2911 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; 2912 else if (sinfo->gso_type & SKB_GSO_TCPV6) 2913 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; 2914 else if (sinfo->gso_type & SKB_GSO_UDP) 2915 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; 2916 else if (sinfo->gso_type & SKB_GSO_FCOE) 2917 goto out_free; 2918 else 2919 BUG(); 2920 if (sinfo->gso_type & SKB_GSO_TCP_ECN) 2921 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; 2922 } else 2923 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; 2924 2925 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2926 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; 2927 vnet_hdr.csum_start = skb_checksum_start_offset(skb); 2928 vnet_hdr.csum_offset = skb->csum_offset; 2929 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { 2930 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; 2931 } /* else everything is zero */ 2932 2933 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr, 2934 vnet_hdr_len); 2935 if (err < 0) 2936 goto out_free; 2937 } 2938 2939 /* You lose any data beyond the buffer you gave. If it worries 2940 * a user program they can ask the device for its MTU 2941 * anyway. 2942 */ 2943 copied = skb->len; 2944 if (copied > len) { 2945 copied = len; 2946 msg->msg_flags |= MSG_TRUNC; 2947 } 2948 2949 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 2950 if (err) 2951 goto out_free; 2952 2953 sock_recv_ts_and_drops(msg, sk, skb); 2954 2955 if (msg->msg_name) { 2956 /* If the address length field is there to be filled 2957 * in, we fill it in now. 2958 */ 2959 if (sock->type == SOCK_PACKET) { 2960 __sockaddr_check_size(sizeof(struct sockaddr_pkt)); 2961 msg->msg_namelen = sizeof(struct sockaddr_pkt); 2962 } else { 2963 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; 2964 msg->msg_namelen = sll->sll_halen + 2965 offsetof(struct sockaddr_ll, sll_addr); 2966 } 2967 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, 2968 msg->msg_namelen); 2969 } 2970 2971 if (pkt_sk(sk)->auxdata) { 2972 struct tpacket_auxdata aux; 2973 2974 aux.tp_status = TP_STATUS_USER; 2975 if (skb->ip_summed == CHECKSUM_PARTIAL) 2976 aux.tp_status |= TP_STATUS_CSUMNOTREADY; 2977 aux.tp_len = PACKET_SKB_CB(skb)->origlen; 2978 aux.tp_snaplen = skb->len; 2979 aux.tp_mac = 0; 2980 aux.tp_net = skb_network_offset(skb); 2981 if (vlan_tx_tag_present(skb)) { 2982 aux.tp_vlan_tci = vlan_tx_tag_get(skb); 2983 aux.tp_vlan_tpid = ntohs(skb->vlan_proto); 2984 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 2985 } else { 2986 aux.tp_vlan_tci = 0; 2987 aux.tp_vlan_tpid = 0; 2988 } 2989 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); 2990 } 2991 2992 /* 2993 * Free or return the buffer as appropriate. Again this 2994 * hides all the races and re-entrancy issues from us. 2995 */ 2996 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? 
skb->len : copied); 2997 2998 out_free: 2999 skb_free_datagram(sk, skb); 3000 out: 3001 return err; 3002 } 3003 3004 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, 3005 int *uaddr_len, int peer) 3006 { 3007 struct net_device *dev; 3008 struct sock *sk = sock->sk; 3009 3010 if (peer) 3011 return -EOPNOTSUPP; 3012 3013 uaddr->sa_family = AF_PACKET; 3014 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); 3015 rcu_read_lock(); 3016 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); 3017 if (dev) 3018 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); 3019 rcu_read_unlock(); 3020 *uaddr_len = sizeof(*uaddr); 3021 3022 return 0; 3023 } 3024 3025 static int packet_getname(struct socket *sock, struct sockaddr *uaddr, 3026 int *uaddr_len, int peer) 3027 { 3028 struct net_device *dev; 3029 struct sock *sk = sock->sk; 3030 struct packet_sock *po = pkt_sk(sk); 3031 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); 3032 3033 if (peer) 3034 return -EOPNOTSUPP; 3035 3036 sll->sll_family = AF_PACKET; 3037 sll->sll_ifindex = po->ifindex; 3038 sll->sll_protocol = po->num; 3039 sll->sll_pkttype = 0; 3040 rcu_read_lock(); 3041 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); 3042 if (dev) { 3043 sll->sll_hatype = dev->type; 3044 sll->sll_halen = dev->addr_len; 3045 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); 3046 } else { 3047 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ 3048 sll->sll_halen = 0; 3049 } 3050 rcu_read_unlock(); 3051 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; 3052 3053 return 0; 3054 } 3055 3056 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, 3057 int what) 3058 { 3059 switch (i->type) { 3060 case PACKET_MR_MULTICAST: 3061 if (i->alen != dev->addr_len) 3062 return -EINVAL; 3063 if (what > 0) 3064 return dev_mc_add(dev, i->addr); 3065 else 3066 return dev_mc_del(dev, i->addr); 3067 break; 3068 case PACKET_MR_PROMISC: 3069 return dev_set_promiscuity(dev, what); 3070 case PACKET_MR_ALLMULTI: 3071 return dev_set_allmulti(dev, what); 3072 case PACKET_MR_UNICAST: 3073 if (i->alen != dev->addr_len) 3074 return -EINVAL; 3075 if (what > 0) 3076 return dev_uc_add(dev, i->addr); 3077 else 3078 return dev_uc_del(dev, i->addr); 3079 break; 3080 default: 3081 break; 3082 } 3083 return 0; 3084 } 3085 3086 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) 3087 { 3088 for ( ; i; i = i->next) { 3089 if (i->ifindex == dev->ifindex) 3090 packet_dev_mc(dev, i, what); 3091 } 3092 } 3093 3094 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) 3095 { 3096 struct packet_sock *po = pkt_sk(sk); 3097 struct packet_mclist *ml, *i; 3098 struct net_device *dev; 3099 int err; 3100 3101 rtnl_lock(); 3102 3103 err = -ENODEV; 3104 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); 3105 if (!dev) 3106 goto done; 3107 3108 err = -EINVAL; 3109 if (mreq->mr_alen > dev->addr_len) 3110 goto done; 3111 3112 err = -ENOBUFS; 3113 i = kmalloc(sizeof(*i), GFP_KERNEL); 3114 if (i == NULL) 3115 goto done; 3116 3117 err = 0; 3118 for (ml = po->mclist; ml; ml = ml->next) { 3119 if (ml->ifindex == mreq->mr_ifindex && 3120 ml->type == mreq->mr_type && 3121 ml->alen == mreq->mr_alen && 3122 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { 3123 ml->count++; 3124 /* Free the new element ... 
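			 * an identical membership is already on the list
			 * and its refcount was bumped above.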
*/ 3125 kfree(i); 3126 goto done; 3127 } 3128 } 3129 3130 i->type = mreq->mr_type; 3131 i->ifindex = mreq->mr_ifindex; 3132 i->alen = mreq->mr_alen; 3133 memcpy(i->addr, mreq->mr_address, i->alen); 3134 i->count = 1; 3135 i->next = po->mclist; 3136 po->mclist = i; 3137 err = packet_dev_mc(dev, i, 1); 3138 if (err) { 3139 po->mclist = i->next; 3140 kfree(i); 3141 } 3142 3143 done: 3144 rtnl_unlock(); 3145 return err; 3146 } 3147 3148 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) 3149 { 3150 struct packet_mclist *ml, **mlp; 3151 3152 rtnl_lock(); 3153 3154 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { 3155 if (ml->ifindex == mreq->mr_ifindex && 3156 ml->type == mreq->mr_type && 3157 ml->alen == mreq->mr_alen && 3158 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { 3159 if (--ml->count == 0) { 3160 struct net_device *dev; 3161 *mlp = ml->next; 3162 dev = __dev_get_by_index(sock_net(sk), ml->ifindex); 3163 if (dev) 3164 packet_dev_mc(dev, ml, -1); 3165 kfree(ml); 3166 } 3167 rtnl_unlock(); 3168 return 0; 3169 } 3170 } 3171 rtnl_unlock(); 3172 return -EADDRNOTAVAIL; 3173 } 3174 3175 static void packet_flush_mclist(struct sock *sk) 3176 { 3177 struct packet_sock *po = pkt_sk(sk); 3178 struct packet_mclist *ml; 3179 3180 if (!po->mclist) 3181 return; 3182 3183 rtnl_lock(); 3184 while ((ml = po->mclist) != NULL) { 3185 struct net_device *dev; 3186 3187 po->mclist = ml->next; 3188 dev = __dev_get_by_index(sock_net(sk), ml->ifindex); 3189 if (dev != NULL) 3190 packet_dev_mc(dev, ml, -1); 3191 kfree(ml); 3192 } 3193 rtnl_unlock(); 3194 } 3195 3196 static int 3197 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) 3198 { 3199 struct sock *sk = sock->sk; 3200 struct packet_sock *po = pkt_sk(sk); 3201 int ret; 3202 3203 if (level != SOL_PACKET) 3204 return -ENOPROTOOPT; 3205 3206 switch (optname) { 3207 case PACKET_ADD_MEMBERSHIP: 3208 case PACKET_DROP_MEMBERSHIP: 3209 { 3210 struct packet_mreq_max mreq; 3211 int len = optlen; 3212 memset(&mreq, 0, sizeof(mreq)); 3213 if (len < sizeof(struct packet_mreq)) 3214 return -EINVAL; 3215 if (len > sizeof(mreq)) 3216 len = sizeof(mreq); 3217 if (copy_from_user(&mreq, optval, len)) 3218 return -EFAULT; 3219 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) 3220 return -EINVAL; 3221 if (optname == PACKET_ADD_MEMBERSHIP) 3222 ret = packet_mc_add(sk, &mreq); 3223 else 3224 ret = packet_mc_drop(sk, &mreq); 3225 return ret; 3226 } 3227 3228 case PACKET_RX_RING: 3229 case PACKET_TX_RING: 3230 { 3231 union tpacket_req_u req_u; 3232 int len; 3233 3234 switch (po->tp_version) { 3235 case TPACKET_V1: 3236 case TPACKET_V2: 3237 len = sizeof(req_u.req); 3238 break; 3239 case TPACKET_V3: 3240 default: 3241 len = sizeof(req_u.req3); 3242 break; 3243 } 3244 if (optlen < len) 3245 return -EINVAL; 3246 if (pkt_sk(sk)->has_vnet_hdr) 3247 return -EINVAL; 3248 if (copy_from_user(&req_u.req, optval, len)) 3249 return -EFAULT; 3250 return packet_set_ring(sk, &req_u, 0, 3251 optname == PACKET_TX_RING); 3252 } 3253 case PACKET_COPY_THRESH: 3254 { 3255 int val; 3256 3257 if (optlen != sizeof(val)) 3258 return -EINVAL; 3259 if (copy_from_user(&val, optval, sizeof(val))) 3260 return -EFAULT; 3261 3262 pkt_sk(sk)->copy_thresh = val; 3263 return 0; 3264 } 3265 case PACKET_VERSION: 3266 { 3267 int val; 3268 3269 if (optlen != sizeof(val)) 3270 return -EINVAL; 3271 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3272 return -EBUSY; 3273 if (copy_from_user(&val, 
optval, sizeof(val))) 3274 return -EFAULT; 3275 switch (val) { 3276 case TPACKET_V1: 3277 case TPACKET_V2: 3278 case TPACKET_V3: 3279 po->tp_version = val; 3280 return 0; 3281 default: 3282 return -EINVAL; 3283 } 3284 } 3285 case PACKET_RESERVE: 3286 { 3287 unsigned int val; 3288 3289 if (optlen != sizeof(val)) 3290 return -EINVAL; 3291 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3292 return -EBUSY; 3293 if (copy_from_user(&val, optval, sizeof(val))) 3294 return -EFAULT; 3295 po->tp_reserve = val; 3296 return 0; 3297 } 3298 case PACKET_LOSS: 3299 { 3300 unsigned int val; 3301 3302 if (optlen != sizeof(val)) 3303 return -EINVAL; 3304 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3305 return -EBUSY; 3306 if (copy_from_user(&val, optval, sizeof(val))) 3307 return -EFAULT; 3308 po->tp_loss = !!val; 3309 return 0; 3310 } 3311 case PACKET_AUXDATA: 3312 { 3313 int val; 3314 3315 if (optlen < sizeof(val)) 3316 return -EINVAL; 3317 if (copy_from_user(&val, optval, sizeof(val))) 3318 return -EFAULT; 3319 3320 po->auxdata = !!val; 3321 return 0; 3322 } 3323 case PACKET_ORIGDEV: 3324 { 3325 int val; 3326 3327 if (optlen < sizeof(val)) 3328 return -EINVAL; 3329 if (copy_from_user(&val, optval, sizeof(val))) 3330 return -EFAULT; 3331 3332 po->origdev = !!val; 3333 return 0; 3334 } 3335 case PACKET_VNET_HDR: 3336 { 3337 int val; 3338 3339 if (sock->type != SOCK_RAW) 3340 return -EINVAL; 3341 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3342 return -EBUSY; 3343 if (optlen < sizeof(val)) 3344 return -EINVAL; 3345 if (copy_from_user(&val, optval, sizeof(val))) 3346 return -EFAULT; 3347 3348 po->has_vnet_hdr = !!val; 3349 return 0; 3350 } 3351 case PACKET_TIMESTAMP: 3352 { 3353 int val; 3354 3355 if (optlen != sizeof(val)) 3356 return -EINVAL; 3357 if (copy_from_user(&val, optval, sizeof(val))) 3358 return -EFAULT; 3359 3360 po->tp_tstamp = val; 3361 return 0; 3362 } 3363 case PACKET_FANOUT: 3364 { 3365 int val; 3366 3367 if (optlen != sizeof(val)) 3368 return -EINVAL; 3369 if (copy_from_user(&val, optval, sizeof(val))) 3370 return -EFAULT; 3371 3372 return fanout_add(sk, val & 0xffff, val >> 16); 3373 } 3374 case PACKET_TX_HAS_OFF: 3375 { 3376 unsigned int val; 3377 3378 if (optlen != sizeof(val)) 3379 return -EINVAL; 3380 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) 3381 return -EBUSY; 3382 if (copy_from_user(&val, optval, sizeof(val))) 3383 return -EFAULT; 3384 po->tp_tx_has_off = !!val; 3385 return 0; 3386 } 3387 case PACKET_QDISC_BYPASS: 3388 { 3389 int val; 3390 3391 if (optlen != sizeof(val)) 3392 return -EINVAL; 3393 if (copy_from_user(&val, optval, sizeof(val))) 3394 return -EFAULT; 3395 3396 po->xmit = val ? 
packet_direct_xmit : dev_queue_xmit; 3397 return 0; 3398 } 3399 default: 3400 return -ENOPROTOOPT; 3401 } 3402 } 3403 3404 static int packet_getsockopt(struct socket *sock, int level, int optname, 3405 char __user *optval, int __user *optlen) 3406 { 3407 int len; 3408 int val, lv = sizeof(val); 3409 struct sock *sk = sock->sk; 3410 struct packet_sock *po = pkt_sk(sk); 3411 void *data = &val; 3412 union tpacket_stats_u st; 3413 3414 if (level != SOL_PACKET) 3415 return -ENOPROTOOPT; 3416 3417 if (get_user(len, optlen)) 3418 return -EFAULT; 3419 3420 if (len < 0) 3421 return -EINVAL; 3422 3423 switch (optname) { 3424 case PACKET_STATISTICS: 3425 spin_lock_bh(&sk->sk_receive_queue.lock); 3426 memcpy(&st, &po->stats, sizeof(st)); 3427 memset(&po->stats, 0, sizeof(po->stats)); 3428 spin_unlock_bh(&sk->sk_receive_queue.lock); 3429 3430 if (po->tp_version == TPACKET_V3) { 3431 lv = sizeof(struct tpacket_stats_v3); 3432 st.stats3.tp_packets += st.stats3.tp_drops; 3433 data = &st.stats3; 3434 } else { 3435 lv = sizeof(struct tpacket_stats); 3436 st.stats1.tp_packets += st.stats1.tp_drops; 3437 data = &st.stats1; 3438 } 3439 3440 break; 3441 case PACKET_AUXDATA: 3442 val = po->auxdata; 3443 break; 3444 case PACKET_ORIGDEV: 3445 val = po->origdev; 3446 break; 3447 case PACKET_VNET_HDR: 3448 val = po->has_vnet_hdr; 3449 break; 3450 case PACKET_VERSION: 3451 val = po->tp_version; 3452 break; 3453 case PACKET_HDRLEN: 3454 if (len > sizeof(int)) 3455 len = sizeof(int); 3456 if (copy_from_user(&val, optval, len)) 3457 return -EFAULT; 3458 switch (val) { 3459 case TPACKET_V1: 3460 val = sizeof(struct tpacket_hdr); 3461 break; 3462 case TPACKET_V2: 3463 val = sizeof(struct tpacket2_hdr); 3464 break; 3465 case TPACKET_V3: 3466 val = sizeof(struct tpacket3_hdr); 3467 break; 3468 default: 3469 return -EINVAL; 3470 } 3471 break; 3472 case PACKET_RESERVE: 3473 val = po->tp_reserve; 3474 break; 3475 case PACKET_LOSS: 3476 val = po->tp_loss; 3477 break; 3478 case PACKET_TIMESTAMP: 3479 val = po->tp_tstamp; 3480 break; 3481 case PACKET_FANOUT: 3482 val = (po->fanout ? 
3483 ((u32)po->fanout->id | 3484 ((u32)po->fanout->type << 16) | 3485 ((u32)po->fanout->flags << 24)) : 3486 0); 3487 break; 3488 case PACKET_TX_HAS_OFF: 3489 val = po->tp_tx_has_off; 3490 break; 3491 case PACKET_QDISC_BYPASS: 3492 val = packet_use_direct_xmit(po); 3493 break; 3494 default: 3495 return -ENOPROTOOPT; 3496 } 3497 3498 if (len > lv) 3499 len = lv; 3500 if (put_user(len, optlen)) 3501 return -EFAULT; 3502 if (copy_to_user(optval, data, len)) 3503 return -EFAULT; 3504 return 0; 3505 } 3506 3507 3508 static int packet_notifier(struct notifier_block *this, 3509 unsigned long msg, void *ptr) 3510 { 3511 struct sock *sk; 3512 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 3513 struct net *net = dev_net(dev); 3514 3515 rcu_read_lock(); 3516 sk_for_each_rcu(sk, &net->packet.sklist) { 3517 struct packet_sock *po = pkt_sk(sk); 3518 3519 switch (msg) { 3520 case NETDEV_UNREGISTER: 3521 if (po->mclist) 3522 packet_dev_mclist(dev, po->mclist, -1); 3523 /* fallthrough */ 3524 3525 case NETDEV_DOWN: 3526 if (dev->ifindex == po->ifindex) { 3527 spin_lock(&po->bind_lock); 3528 if (po->running) { 3529 __unregister_prot_hook(sk, false); 3530 sk->sk_err = ENETDOWN; 3531 if (!sock_flag(sk, SOCK_DEAD)) 3532 sk->sk_error_report(sk); 3533 } 3534 if (msg == NETDEV_UNREGISTER) { 3535 packet_cached_dev_reset(po); 3536 po->ifindex = -1; 3537 if (po->prot_hook.dev) 3538 dev_put(po->prot_hook.dev); 3539 po->prot_hook.dev = NULL; 3540 } 3541 spin_unlock(&po->bind_lock); 3542 } 3543 break; 3544 case NETDEV_UP: 3545 if (dev->ifindex == po->ifindex) { 3546 spin_lock(&po->bind_lock); 3547 if (po->num) 3548 register_prot_hook(sk); 3549 spin_unlock(&po->bind_lock); 3550 } 3551 break; 3552 } 3553 } 3554 rcu_read_unlock(); 3555 return NOTIFY_DONE; 3556 } 3557 3558 3559 static int packet_ioctl(struct socket *sock, unsigned int cmd, 3560 unsigned long arg) 3561 { 3562 struct sock *sk = sock->sk; 3563 3564 switch (cmd) { 3565 case SIOCOUTQ: 3566 { 3567 int amount = sk_wmem_alloc_get(sk); 3568 3569 return put_user(amount, (int __user *)arg); 3570 } 3571 case SIOCINQ: 3572 { 3573 struct sk_buff *skb; 3574 int amount = 0; 3575 3576 spin_lock_bh(&sk->sk_receive_queue.lock); 3577 skb = skb_peek(&sk->sk_receive_queue); 3578 if (skb) 3579 amount = skb->len; 3580 spin_unlock_bh(&sk->sk_receive_queue.lock); 3581 return put_user(amount, (int __user *)arg); 3582 } 3583 case SIOCGSTAMP: 3584 return sock_get_timestamp(sk, (struct timeval __user *)arg); 3585 case SIOCGSTAMPNS: 3586 return sock_get_timestampns(sk, (struct timespec __user *)arg); 3587 3588 #ifdef CONFIG_INET 3589 case SIOCADDRT: 3590 case SIOCDELRT: 3591 case SIOCDARP: 3592 case SIOCGARP: 3593 case SIOCSARP: 3594 case SIOCGIFADDR: 3595 case SIOCSIFADDR: 3596 case SIOCGIFBRDADDR: 3597 case SIOCSIFBRDADDR: 3598 case SIOCGIFNETMASK: 3599 case SIOCSIFNETMASK: 3600 case SIOCGIFDSTADDR: 3601 case SIOCSIFDSTADDR: 3602 case SIOCSIFFLAGS: 3603 return inet_dgram_ops.ioctl(sock, cmd, arg); 3604 #endif 3605 3606 default: 3607 return -ENOIOCTLCMD; 3608 } 3609 return 0; 3610 } 3611 3612 static unsigned int packet_poll(struct file *file, struct socket *sock, 3613 poll_table *wait) 3614 { 3615 struct sock *sk = sock->sk; 3616 struct packet_sock *po = pkt_sk(sk); 3617 unsigned int mask = datagram_poll(file, sock, wait); 3618 3619 spin_lock_bh(&sk->sk_receive_queue.lock); 3620 if (po->rx_ring.pg_vec) { 3621 if (!packet_previous_rx_frame(po, &po->rx_ring, 3622 TP_STATUS_KERNEL)) 3623 mask |= POLLIN | POLLRDNORM; 3624 } 3625 spin_unlock_bh(&sk->sk_receive_queue.lock); 3626 
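	/*
	 * The TX ring is the mirror image: the socket is reported writable
	 * as soon as the frame at the current head is TP_STATUS_AVAILABLE
	 * for user space to fill.
	 */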
spin_lock_bh(&sk->sk_write_queue.lock); 3627 if (po->tx_ring.pg_vec) { 3628 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) 3629 mask |= POLLOUT | POLLWRNORM; 3630 } 3631 spin_unlock_bh(&sk->sk_write_queue.lock); 3632 return mask; 3633 } 3634 3635 3636 /* Dirty? Well, I still did not learn better way to account 3637 * for user mmaps. 3638 */ 3639 3640 static void packet_mm_open(struct vm_area_struct *vma) 3641 { 3642 struct file *file = vma->vm_file; 3643 struct socket *sock = file->private_data; 3644 struct sock *sk = sock->sk; 3645 3646 if (sk) 3647 atomic_inc(&pkt_sk(sk)->mapped); 3648 } 3649 3650 static void packet_mm_close(struct vm_area_struct *vma) 3651 { 3652 struct file *file = vma->vm_file; 3653 struct socket *sock = file->private_data; 3654 struct sock *sk = sock->sk; 3655 3656 if (sk) 3657 atomic_dec(&pkt_sk(sk)->mapped); 3658 } 3659 3660 static const struct vm_operations_struct packet_mmap_ops = { 3661 .open = packet_mm_open, 3662 .close = packet_mm_close, 3663 }; 3664 3665 static void free_pg_vec(struct pgv *pg_vec, unsigned int order, 3666 unsigned int len) 3667 { 3668 int i; 3669 3670 for (i = 0; i < len; i++) { 3671 if (likely(pg_vec[i].buffer)) { 3672 if (is_vmalloc_addr(pg_vec[i].buffer)) 3673 vfree(pg_vec[i].buffer); 3674 else 3675 free_pages((unsigned long)pg_vec[i].buffer, 3676 order); 3677 pg_vec[i].buffer = NULL; 3678 } 3679 } 3680 kfree(pg_vec); 3681 } 3682 3683 static char *alloc_one_pg_vec_page(unsigned long order) 3684 { 3685 char *buffer; 3686 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | 3687 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; 3688 3689 buffer = (char *) __get_free_pages(gfp_flags, order); 3690 if (buffer) 3691 return buffer; 3692 3693 /* __get_free_pages failed, fall back to vmalloc */ 3694 buffer = vzalloc((1 << order) * PAGE_SIZE); 3695 if (buffer) 3696 return buffer; 3697 3698 /* vmalloc failed, lets dig into swap here */ 3699 gfp_flags &= ~__GFP_NORETRY; 3700 buffer = (char *) __get_free_pages(gfp_flags, order); 3701 if (buffer) 3702 return buffer; 3703 3704 /* complete and utter failure */ 3705 return NULL; 3706 } 3707 3708 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) 3709 { 3710 unsigned int block_nr = req->tp_block_nr; 3711 struct pgv *pg_vec; 3712 int i; 3713 3714 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL); 3715 if (unlikely(!pg_vec)) 3716 goto out; 3717 3718 for (i = 0; i < block_nr; i++) { 3719 pg_vec[i].buffer = alloc_one_pg_vec_page(order); 3720 if (unlikely(!pg_vec[i].buffer)) 3721 goto out_free_pgvec; 3722 } 3723 3724 out: 3725 return pg_vec; 3726 3727 out_free_pgvec: 3728 free_pg_vec(pg_vec, order, block_nr); 3729 pg_vec = NULL; 3730 goto out; 3731 } 3732 3733 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, 3734 int closing, int tx_ring) 3735 { 3736 struct pgv *pg_vec = NULL; 3737 struct packet_sock *po = pkt_sk(sk); 3738 int was_running, order = 0; 3739 struct packet_ring_buffer *rb; 3740 struct sk_buff_head *rb_queue; 3741 __be16 num; 3742 int err = -EINVAL; 3743 /* Added to avoid minimal code churn */ 3744 struct tpacket_req *req = &req_u->req; 3745 3746 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ 3747 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { 3748 WARN(1, "Tx-ring is not supported.\n"); 3749 goto out; 3750 } 3751 3752 rb = tx_ring ? &po->tx_ring : &po->rx_ring; 3753 rb_queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; 3754 3755 err = -EBUSY; 3756 if (!closing) { 3757 if (atomic_read(&po->mapped)) 3758 goto out; 3759 if (packet_read_pending(rb)) 3760 goto out; 3761 } 3762 3763 if (req->tp_block_nr) { 3764 /* Sanity tests and some calculations */ 3765 err = -EBUSY; 3766 if (unlikely(rb->pg_vec)) 3767 goto out; 3768 3769 switch (po->tp_version) { 3770 case TPACKET_V1: 3771 po->tp_hdrlen = TPACKET_HDRLEN; 3772 break; 3773 case TPACKET_V2: 3774 po->tp_hdrlen = TPACKET2_HDRLEN; 3775 break; 3776 case TPACKET_V3: 3777 po->tp_hdrlen = TPACKET3_HDRLEN; 3778 break; 3779 } 3780 3781 err = -EINVAL; 3782 if (unlikely((int)req->tp_block_size <= 0)) 3783 goto out; 3784 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) 3785 goto out; 3786 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 3787 po->tp_reserve)) 3788 goto out; 3789 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) 3790 goto out; 3791 3792 rb->frames_per_block = req->tp_block_size/req->tp_frame_size; 3793 if (unlikely(rb->frames_per_block <= 0)) 3794 goto out; 3795 if (unlikely((rb->frames_per_block * req->tp_block_nr) != 3796 req->tp_frame_nr)) 3797 goto out; 3798 3799 err = -ENOMEM; 3800 order = get_order(req->tp_block_size); 3801 pg_vec = alloc_pg_vec(req, order); 3802 if (unlikely(!pg_vec)) 3803 goto out; 3804 switch (po->tp_version) { 3805 case TPACKET_V3: 3806 /* Transmit path is not supported. We checked 3807 * it above but just being paranoid 3808 */ 3809 if (!tx_ring) 3810 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); 3811 break; 3812 default: 3813 break; 3814 } 3815 } 3816 /* Done */ 3817 else { 3818 err = -EINVAL; 3819 if (unlikely(req->tp_frame_nr)) 3820 goto out; 3821 } 3822 3823 lock_sock(sk); 3824 3825 /* Detach socket from network */ 3826 spin_lock(&po->bind_lock); 3827 was_running = po->running; 3828 num = po->num; 3829 if (was_running) { 3830 po->num = 0; 3831 __unregister_prot_hook(sk, false); 3832 } 3833 spin_unlock(&po->bind_lock); 3834 3835 synchronize_net(); 3836 3837 err = -EBUSY; 3838 mutex_lock(&po->pg_vec_lock); 3839 if (closing || atomic_read(&po->mapped) == 0) { 3840 err = 0; 3841 spin_lock_bh(&rb_queue->lock); 3842 swap(rb->pg_vec, pg_vec); 3843 rb->frame_max = (req->tp_frame_nr - 1); 3844 rb->head = 0; 3845 rb->frame_size = req->tp_frame_size; 3846 spin_unlock_bh(&rb_queue->lock); 3847 3848 swap(rb->pg_vec_order, order); 3849 swap(rb->pg_vec_len, req->tp_block_nr); 3850 3851 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; 3852 po->prot_hook.func = (po->rx_ring.pg_vec) ? 
3853 tpacket_rcv : packet_rcv; 3854 skb_queue_purge(rb_queue); 3855 if (atomic_read(&po->mapped)) 3856 pr_err("packet_mmap: vma is busy: %d\n", 3857 atomic_read(&po->mapped)); 3858 } 3859 mutex_unlock(&po->pg_vec_lock); 3860 3861 spin_lock(&po->bind_lock); 3862 if (was_running) { 3863 po->num = num; 3864 register_prot_hook(sk); 3865 } 3866 spin_unlock(&po->bind_lock); 3867 if (closing && (po->tp_version > TPACKET_V2)) { 3868 /* Because we don't support block-based V3 on tx-ring */ 3869 if (!tx_ring) 3870 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); 3871 } 3872 release_sock(sk); 3873 3874 if (pg_vec) 3875 free_pg_vec(pg_vec, order, req->tp_block_nr); 3876 out: 3877 return err; 3878 } 3879 3880 static int packet_mmap(struct file *file, struct socket *sock, 3881 struct vm_area_struct *vma) 3882 { 3883 struct sock *sk = sock->sk; 3884 struct packet_sock *po = pkt_sk(sk); 3885 unsigned long size, expected_size; 3886 struct packet_ring_buffer *rb; 3887 unsigned long start; 3888 int err = -EINVAL; 3889 int i; 3890 3891 if (vma->vm_pgoff) 3892 return -EINVAL; 3893 3894 mutex_lock(&po->pg_vec_lock); 3895 3896 expected_size = 0; 3897 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { 3898 if (rb->pg_vec) { 3899 expected_size += rb->pg_vec_len 3900 * rb->pg_vec_pages 3901 * PAGE_SIZE; 3902 } 3903 } 3904 3905 if (expected_size == 0) 3906 goto out; 3907 3908 size = vma->vm_end - vma->vm_start; 3909 if (size != expected_size) 3910 goto out; 3911 3912 start = vma->vm_start; 3913 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { 3914 if (rb->pg_vec == NULL) 3915 continue; 3916 3917 for (i = 0; i < rb->pg_vec_len; i++) { 3918 struct page *page; 3919 void *kaddr = rb->pg_vec[i].buffer; 3920 int pg_num; 3921 3922 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { 3923 page = pgv_to_page(kaddr); 3924 err = vm_insert_page(vma, start, page); 3925 if (unlikely(err)) 3926 goto out; 3927 start += PAGE_SIZE; 3928 kaddr += PAGE_SIZE; 3929 } 3930 } 3931 } 3932 3933 atomic_inc(&po->mapped); 3934 vma->vm_ops = &packet_mmap_ops; 3935 err = 0; 3936 3937 out: 3938 mutex_unlock(&po->pg_vec_lock); 3939 return err; 3940 } 3941 3942 static const struct proto_ops packet_ops_spkt = { 3943 .family = PF_PACKET, 3944 .owner = THIS_MODULE, 3945 .release = packet_release, 3946 .bind = packet_bind_spkt, 3947 .connect = sock_no_connect, 3948 .socketpair = sock_no_socketpair, 3949 .accept = sock_no_accept, 3950 .getname = packet_getname_spkt, 3951 .poll = datagram_poll, 3952 .ioctl = packet_ioctl, 3953 .listen = sock_no_listen, 3954 .shutdown = sock_no_shutdown, 3955 .setsockopt = sock_no_setsockopt, 3956 .getsockopt = sock_no_getsockopt, 3957 .sendmsg = packet_sendmsg_spkt, 3958 .recvmsg = packet_recvmsg, 3959 .mmap = sock_no_mmap, 3960 .sendpage = sock_no_sendpage, 3961 }; 3962 3963 static const struct proto_ops packet_ops = { 3964 .family = PF_PACKET, 3965 .owner = THIS_MODULE, 3966 .release = packet_release, 3967 .bind = packet_bind, 3968 .connect = sock_no_connect, 3969 .socketpair = sock_no_socketpair, 3970 .accept = sock_no_accept, 3971 .getname = packet_getname, 3972 .poll = packet_poll, 3973 .ioctl = packet_ioctl, 3974 .listen = sock_no_listen, 3975 .shutdown = sock_no_shutdown, 3976 .setsockopt = packet_setsockopt, 3977 .getsockopt = packet_getsockopt, 3978 .sendmsg = packet_sendmsg, 3979 .recvmsg = packet_recvmsg, 3980 .mmap = packet_mmap, 3981 .sendpage = sock_no_sendpage, 3982 }; 3983 3984 static const struct net_proto_family packet_family_ops = { 3985 .family = PF_PACKET, 3986 .create = packet_create, 
3987 .owner = THIS_MODULE, 3988 }; 3989 3990 static struct notifier_block packet_netdev_notifier = { 3991 .notifier_call = packet_notifier, 3992 }; 3993 3994 #ifdef CONFIG_PROC_FS 3995 3996 static void *packet_seq_start(struct seq_file *seq, loff_t *pos) 3997 __acquires(RCU) 3998 { 3999 struct net *net = seq_file_net(seq); 4000 4001 rcu_read_lock(); 4002 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); 4003 } 4004 4005 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4006 { 4007 struct net *net = seq_file_net(seq); 4008 return seq_hlist_next_rcu(v, &net->packet.sklist, pos); 4009 } 4010 4011 static void packet_seq_stop(struct seq_file *seq, void *v) 4012 __releases(RCU) 4013 { 4014 rcu_read_unlock(); 4015 } 4016 4017 static int packet_seq_show(struct seq_file *seq, void *v) 4018 { 4019 if (v == SEQ_START_TOKEN) 4020 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); 4021 else { 4022 struct sock *s = sk_entry(v); 4023 const struct packet_sock *po = pkt_sk(s); 4024 4025 seq_printf(seq, 4026 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", 4027 s, 4028 atomic_read(&s->sk_refcnt), 4029 s->sk_type, 4030 ntohs(po->num), 4031 po->ifindex, 4032 po->running, 4033 atomic_read(&s->sk_rmem_alloc), 4034 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), 4035 sock_i_ino(s)); 4036 } 4037 4038 return 0; 4039 } 4040 4041 static const struct seq_operations packet_seq_ops = { 4042 .start = packet_seq_start, 4043 .next = packet_seq_next, 4044 .stop = packet_seq_stop, 4045 .show = packet_seq_show, 4046 }; 4047 4048 static int packet_seq_open(struct inode *inode, struct file *file) 4049 { 4050 return seq_open_net(inode, file, &packet_seq_ops, 4051 sizeof(struct seq_net_private)); 4052 } 4053 4054 static const struct file_operations packet_seq_fops = { 4055 .owner = THIS_MODULE, 4056 .open = packet_seq_open, 4057 .read = seq_read, 4058 .llseek = seq_lseek, 4059 .release = seq_release_net, 4060 }; 4061 4062 #endif 4063 4064 static int __net_init packet_net_init(struct net *net) 4065 { 4066 mutex_init(&net->packet.sklist_lock); 4067 INIT_HLIST_HEAD(&net->packet.sklist); 4068 4069 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops)) 4070 return -ENOMEM; 4071 4072 return 0; 4073 } 4074 4075 static void __net_exit packet_net_exit(struct net *net) 4076 { 4077 remove_proc_entry("packet", net->proc_net); 4078 } 4079 4080 static struct pernet_operations packet_net_ops = { 4081 .init = packet_net_init, 4082 .exit = packet_net_exit, 4083 }; 4084 4085 4086 static void __exit packet_exit(void) 4087 { 4088 unregister_netdevice_notifier(&packet_netdev_notifier); 4089 unregister_pernet_subsys(&packet_net_ops); 4090 sock_unregister(PF_PACKET); 4091 proto_unregister(&packet_proto); 4092 } 4093 4094 static int __init packet_init(void) 4095 { 4096 int rc = proto_register(&packet_proto, 0); 4097 4098 if (rc != 0) 4099 goto out; 4100 4101 sock_register(&packet_family_ops); 4102 register_pernet_subsys(&packet_net_ops); 4103 register_netdevice_notifier(&packet_netdev_notifier); 4104 out: 4105 return rc; 4106 } 4107 4108 module_init(packet_init); 4109 module_exit(packet_exit); 4110 MODULE_LICENSE("GPL"); 4111 MODULE_ALIAS_NETPROTO(PF_PACKET); 4112
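/*
 * A minimal user-space sketch (kept under #if 0, so it is never built) of
 * one way the PACKET_VERSION / PACKET_RX_RING / mmap path implemented above
 * is typically driven.  The helper name and the ring geometry are purely
 * illustrative and assume a 4 KiB PAGE_SIZE; the sizes only have to satisfy
 * the checks in packet_set_ring() (block size a multiple of PAGE_SIZE, frame
 * size TPACKET_ALIGNMENT-aligned and large enough for the header, and
 * frames_per_block * tp_block_nr == tp_frame_nr).  Error handling is omitted.
 */
#if 0
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
#include <string.h>

static void *example_rx_ring_setup(int *pfd, size_t *plen)
{
	struct tpacket_req req;
	int ver = TPACKET_V2;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	/* The header layout must be chosen before any ring exists. */
	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;		/* one page per block    */
	req.tp_frame_size = 2048;		/* two frames per block  */
	req.tp_block_nr   = 64;
	req.tp_frame_nr   = (4096 / 2048) * 64;	/* frames/block * blocks */
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));

	/* packet_mmap() expects the whole ring to be mapped in one go. */
	*plen = (size_t)req.tp_block_size * req.tp_block_nr;
	*pfd = fd;
	return mmap(NULL, *plen, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}
#endif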