// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *		INET		An implementation of the TCP/IP protocol suite for the LINUX
 *				operating system.  INET is implemented using the BSD Socket
 *				interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header will
     not fit into the reserved space (tunnels); others are not (PPP).
   - A packet socket receives packets with the ll header already pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
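
/* Illustrative userspace sketch (editorial example, not part of this file):
 * the layout described above is what a packet(7) consumer observes.  With
 * SOCK_RAW the ll header is part of the received data; with SOCK_DGRAM it
 * has already been pulled and is only reported via sockaddr_ll.  A minimal
 * receive loop, assuming an Ethernet device:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = { 0 };
 *	socklen_t slen = sizeof(sll);
 *	char buf[2048];
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&sll, &slen);
 *	// SOCK_RAW: buf[0..13] is the Ethernet header, payload follows.
 *	// SOCK_DGRAM: buf starts at the network header; the link-layer
 *	// address is reported in sll.sll_addr/sll.sll_halen instead.
 */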

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	int cpu = raw_smp_processor_id();
	u16 queue_index;

#ifdef CONFIG_XPS
	skb->sender_cpu = cpu + 1;
#endif
	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = netdev_pick_tx(dev, skb, NULL);
	}

	return queue_index;
}

/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}

/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(const struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
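
/* Illustrative userspace sketch (editorial example, not part of this file):
 * __packet_set_status()/__packet_get_status() implement the kernel side of
 * the tp_status handshake with a mmap()ed ring.  A TPACKET_V2 consumer
 * typically polls on the same field from the other side:
 *
 *	struct tpacket2_hdr *hdr = frame;	// frame = ring + i * frame_size
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&(struct pollfd){ .fd = fd, .events = POLLIN }, 1, -1);
 *
 *	// consume hdr->tp_mac / hdr->tp_snaplen here ...
 *
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 *	__sync_synchronize();			// pairs with the kernel barriers
 *
 * The flush_dcache_page()/smp_wmb() calls above exist so this also works on
 * architectures without coherent data caches.
 */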

static void *packet_lookup_frame(const struct packet_sock *po,
				 const struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	} else
		return DEFAULT_PRB_RETIRE_TOV;

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}
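
/* Editorial note with a worked example (not in the original source): for a
 * user-supplied tp_block_size of 1 MiB on a 1 Gb/s link (speed == 1000),
 * prb_calc_retire_blk_tmo() above computes
 *
 *	mbits = (1048576 * 8) / (1024 * 1024) = 8
 *	div   = 1000 / 1000                   = 1
 *	tmo   = (8 / 1) * 1 + 1               = 9 ms
 *
 * i.e. roughly the time needed to fill one block at line rate, matching the
 * "~8 ms to fill a block" figure quoted in the timer-logic comment below.
 * On a 10 Gb/s link the same block gives 8 / 10 = 0 in integer arithmetic,
 * so the timeout becomes 1 ms.
 */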

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (atomic_read(&po->tp_drops))
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
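
/* Illustrative userspace sketch (editorial example, not part of this file):
 * the block machinery above is driven by a TPACKET_V3 RX ring configured
 * roughly like this (error handling omitted):
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 20,	// must hold BLK_PLUS_PRIV()
 *		.tp_block_nr	   = 8,
 *		.tp_frame_size	   = 2048,
 *		.tp_frame_nr	   = (1 << 20) / 2048 * 8,
 *		.tp_retire_blk_tov = 60,	// ms; 0 = derive from link speed
 *		.tp_sizeof_priv	   = 0,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Each block begins with a tpacket_block_desc; user space waits for
 * TP_STATUS_USER in its block_status and writes TP_STATUS_KERNEL back when
 * done, which is exactly what prb_freeze_queue()/prb_open_block() react to.
 */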

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);

	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;

	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(const struct packet_sock *po,
			      const struct packet_ring_buffer *rb,
			      unsigned int idx,
			      int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;

	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);

	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;

	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = READ_ONCE(po->rx_ring.frame_max) + 1;
	idx = READ_ONCE(po->rx_ring.head);
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(const struct packet_sock *po,
				 const struct sk_buff *skb)
{
	const struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
		int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
				   - (skb ? skb->truesize : 0);

		if (avail > (rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int pressure, ret;

	ret = __packet_rcv_has_room(po, skb);
	pressure = ret != ROOM_NORMAL;

	if (READ_ONCE(po->pressure) != pressure)
		WRITE_ONCE(po->pressure, pressure);

	return ret;
}

static void packet_rcv_try_clear_pressure(struct packet_sock *po)
{
	if (READ_ONCE(po->pressure) &&
	    __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
		WRITE_ONCE(po->pressure, 0);
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 *history = po->rollover->history;
	u32 victim, rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (READ_ONCE(history[i]) == rxhash)
			count++;

	victim = prandom_u32() % ROLLOVER_HLEN;

	/* Avoid dirtying the cache line if possible */
	if (READ_ONCE(history[victim]) != rxhash)
		WRITE_ONCE(history[victim], rxhash);

	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}
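
/* Editorial note (not in the original source): reciprocal_scale() maps the
 * full 32-bit flow hash onto [0, num) without a division, essentially
 * (u32)(((u64)hash * num) >> 32).  A quick worked example, assuming
 * num == 4 sockets in the group:
 *
 *	hash = 0x40000000  ->  (0x40000000ULL * 4) >> 32 = 1
 *	hash = 0xffffffff  ->  (0xffffffffULL * 4) >> 32 = 3
 *
 * so flows are spread across group members in proportion to how their
 * hashes cover the 32-bit space.
 */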

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static u16 fanout_next_id;

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	}
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	}
}

static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
	struct packet_fanout *f;

	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == candidate_id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			return false;
		}
	}
	return true;
}

static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
	u16 id = fanout_next_id;

	do {
		if (__fanout_id_is_free(sk, id)) {
			*new_id = id;
			fanout_next_id = id + 1;
			return true;
		}

		id++;
	} while (id != fanout_next_id);

	return false;
}
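
/* Illustrative userspace sketch (editorial example, not part of this file):
 * fanout_add() below is reached via setsockopt(PACKET_FANOUT).  Joining a
 * hash-based group with rollover from several processes might look like:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// bind fd to the shared ifindex/protocol first ...
 *	int group_id = 42;			// 16-bit id shared by members
 *	int arg = group_id |
 *		  ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_ROLLOVER) << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * The low 16 bits carry the group id, the high 16 bits the type and flags,
 * which is why fanout_add() splits its type_flags argument the way it does.
 */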

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
	}

	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		if (id != 0) {
			err = -EINVAL;
			goto out;
		}
		if (!fanout_find_new_id(sk, &id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		refcount_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;

	spin_lock(&po->bind_lock);
	if (po->running &&
	    match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			po->rollover = rollover;
			rollover = NULL;
			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
			__fanout_link(sk, po);
			err = 0;
		}
	}
	spin_unlock(&po->bind_lock);

	if (err && !refcount_read(&match->sk_ref)) {
		list_del(&match->list);
		kfree(match);
	}

out:
	kfree(rollover);
	mutex_unlock(&fanout_mutex);
	return err;
}

/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net()).
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	mutex_lock(&fanout_mutex);
	f = po->fanout;
	if (f) {
		po->fanout = NULL;

		if (refcount_dec_and_test(&f->sk_ref))
			list_del(&f->list);
		else
			f = NULL;
	}
	mutex_unlock(&fanout_mutex);

	return f;
}

static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset_ct(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}

static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
{
	if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
	    sock->type == SOCK_RAW) {
		skb_reset_mac_header(skb);
		skb->protocol = dev_parse_header_protocol(skb);
	}

	skb_probe_transport_header(skb);
}

/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	struct sockcm_cookie sockc;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (!dev_validate_header(dev, skb->data, len)) {
		err = -EINVAL;
		goto out_unlock;
	}
	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_unlock;
	}

	sockcm_init(&sockc, sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			goto out_unlock;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	skb->tstamp = sockc.transmit_time;

	skb_setup_tx_timestamp(skb, sockc.tsflags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	packet_parse_headers(skb, sock);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
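
/* Illustrative userspace sketch (editorial example, not part of this file):
 * packet_sendmsg_spkt() serves the legacy SOCK_PACKET path, where the
 * destination device is named in a sockaddr_pkt rather than bound to the
 * socket.  A minimal sender, assuming "eth0" and a fully built frame in buf:
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, buf, frame_len, 0, (struct sockaddr *)&spkt, sizeof(spkt));
 *
 * New code should prefer AF_PACKET with SOCK_RAW/SOCK_DGRAM and sockaddr_ll;
 * this path is kept for compatibility.
 */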

static unsigned int run_filter(struct sk_buff *skb,
			       const struct sock *sk,
			       unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = bpf_prog_run_clear_cb(filter->prog, skb);
	rcu_read_unlock();

	return res;
}

static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
			   size_t *len)
{
	struct virtio_net_hdr vnet_hdr;

	if (*len < sizeof(vnet_hdr))
		return -EINVAL;
	*len -= sizeof(vnet_hdr);

	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
		return -EINVAL;

	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
}

/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on
 * exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	bool is_drop_n_account = false;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
2073 */ 2074 if (sk->sk_type != SOCK_DGRAM) 2075 skb_push(skb, skb->data - skb_mac_header(skb)); 2076 else if (skb->pkt_type == PACKET_OUTGOING) { 2077 /* Special case: outgoing packets have ll header at head */ 2078 skb_pull(skb, skb_network_offset(skb)); 2079 } 2080 } 2081 2082 snaplen = skb->len; 2083 2084 res = run_filter(skb, sk, snaplen); 2085 if (!res) 2086 goto drop_n_restore; 2087 if (snaplen > res) 2088 snaplen = res; 2089 2090 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 2091 goto drop_n_acct; 2092 2093 if (skb_shared(skb)) { 2094 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); 2095 if (nskb == NULL) 2096 goto drop_n_acct; 2097 2098 if (skb_head != skb->data) { 2099 skb->data = skb_head; 2100 skb->len = skb_len; 2101 } 2102 consume_skb(skb); 2103 skb = nskb; 2104 } 2105 2106 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); 2107 2108 sll = &PACKET_SKB_CB(skb)->sa.ll; 2109 sll->sll_hatype = dev->type; 2110 sll->sll_pkttype = skb->pkt_type; 2111 if (unlikely(po->origdev)) 2112 sll->sll_ifindex = orig_dev->ifindex; 2113 else 2114 sll->sll_ifindex = dev->ifindex; 2115 2116 sll->sll_halen = dev_parse_header(skb, sll->sll_addr); 2117 2118 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). 2119 * Use their space for storing the original skb length. 2120 */ 2121 PACKET_SKB_CB(skb)->sa.origlen = skb->len; 2122 2123 if (pskb_trim(skb, snaplen)) 2124 goto drop_n_acct; 2125 2126 skb_set_owner_r(skb, sk); 2127 skb->dev = NULL; 2128 skb_dst_drop(skb); 2129 2130 /* drop conntrack reference */ 2131 nf_reset_ct(skb); 2132 2133 spin_lock(&sk->sk_receive_queue.lock); 2134 po->stats.stats1.tp_packets++; 2135 sock_skb_set_dropcount(sk, skb); 2136 __skb_queue_tail(&sk->sk_receive_queue, skb); 2137 spin_unlock(&sk->sk_receive_queue.lock); 2138 sk->sk_data_ready(sk); 2139 return 0; 2140 2141 drop_n_acct: 2142 is_drop_n_account = true; 2143 atomic_inc(&po->tp_drops); 2144 atomic_inc(&sk->sk_drops); 2145 2146 drop_n_restore: 2147 if (skb_head != skb->data && skb_shared(skb)) { 2148 skb->data = skb_head; 2149 skb->len = skb_len; 2150 } 2151 drop: 2152 if (!is_drop_n_account) 2153 consume_skb(skb); 2154 else 2155 kfree_skb(skb); 2156 return 0; 2157 } 2158 2159 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, 2160 struct packet_type *pt, struct net_device *orig_dev) 2161 { 2162 struct sock *sk; 2163 struct packet_sock *po; 2164 struct sockaddr_ll *sll; 2165 union tpacket_uhdr h; 2166 u8 *skb_head = skb->data; 2167 int skb_len = skb->len; 2168 unsigned int snaplen, res; 2169 unsigned long status = TP_STATUS_USER; 2170 unsigned short macoff, netoff, hdrlen; 2171 struct sk_buff *copy_skb = NULL; 2172 struct timespec ts; 2173 __u32 ts_status; 2174 bool is_drop_n_account = false; 2175 bool do_vnet = false; 2176 2177 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. 2178 * We may add members to them until current aligned size without forcing 2179 * userspace to call getsockopt(..., PACKET_HDRLEN, ...). 
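	 *
	 * Purely as an illustration, userspace would query that header length
	 * with something like:
	 *
	 *	int val = TPACKET_V2;	(or TPACKET_V3)
	 *	socklen_t len = sizeof(val);
	 *	getsockopt(fd, SOL_PACKET, PACKET_HDRLEN, &val, &len);
	 *
	 * which packet_getsockopt() answers with sizeof(struct tpacket2_hdr)
	 * or sizeof(struct tpacket3_hdr) for the requested version.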
2180 */ 2181 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); 2182 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); 2183 2184 if (skb->pkt_type == PACKET_LOOPBACK) 2185 goto drop; 2186 2187 sk = pt->af_packet_priv; 2188 po = pkt_sk(sk); 2189 2190 if (!net_eq(dev_net(dev), sock_net(sk))) 2191 goto drop; 2192 2193 if (dev->header_ops) { 2194 if (sk->sk_type != SOCK_DGRAM) 2195 skb_push(skb, skb->data - skb_mac_header(skb)); 2196 else if (skb->pkt_type == PACKET_OUTGOING) { 2197 /* Special case: outgoing packets have ll header at head */ 2198 skb_pull(skb, skb_network_offset(skb)); 2199 } 2200 } 2201 2202 snaplen = skb->len; 2203 2204 res = run_filter(skb, sk, snaplen); 2205 if (!res) 2206 goto drop_n_restore; 2207 2208 /* If we are flooded, just give up */ 2209 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { 2210 atomic_inc(&po->tp_drops); 2211 goto drop_n_restore; 2212 } 2213 2214 if (skb->ip_summed == CHECKSUM_PARTIAL) 2215 status |= TP_STATUS_CSUMNOTREADY; 2216 else if (skb->pkt_type != PACKET_OUTGOING && 2217 (skb->ip_summed == CHECKSUM_COMPLETE || 2218 skb_csum_unnecessary(skb))) 2219 status |= TP_STATUS_CSUM_VALID; 2220 2221 if (snaplen > res) 2222 snaplen = res; 2223 2224 if (sk->sk_type == SOCK_DGRAM) { 2225 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + 2226 po->tp_reserve; 2227 } else { 2228 unsigned int maclen = skb_network_offset(skb); 2229 netoff = TPACKET_ALIGN(po->tp_hdrlen + 2230 (maclen < 16 ? 16 : maclen)) + 2231 po->tp_reserve; 2232 if (po->has_vnet_hdr) { 2233 netoff += sizeof(struct virtio_net_hdr); 2234 do_vnet = true; 2235 } 2236 macoff = netoff - maclen; 2237 } 2238 if (po->tp_version <= TPACKET_V2) { 2239 if (macoff + snaplen > po->rx_ring.frame_size) { 2240 if (po->copy_thresh && 2241 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { 2242 if (skb_shared(skb)) { 2243 copy_skb = skb_clone(skb, GFP_ATOMIC); 2244 } else { 2245 copy_skb = skb_get(skb); 2246 skb_head = skb->data; 2247 } 2248 if (copy_skb) 2249 skb_set_owner_r(copy_skb, sk); 2250 } 2251 snaplen = po->rx_ring.frame_size - macoff; 2252 if ((int)snaplen < 0) { 2253 snaplen = 0; 2254 do_vnet = false; 2255 } 2256 } 2257 } else if (unlikely(macoff + snaplen > 2258 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { 2259 u32 nval; 2260 2261 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; 2262 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", 2263 snaplen, nval, macoff); 2264 snaplen = nval; 2265 if (unlikely((int)snaplen < 0)) { 2266 snaplen = 0; 2267 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; 2268 do_vnet = false; 2269 } 2270 } 2271 spin_lock(&sk->sk_receive_queue.lock); 2272 h.raw = packet_current_rx_frame(po, skb, 2273 TP_STATUS_KERNEL, (macoff+snaplen)); 2274 if (!h.raw) 2275 goto drop_n_account; 2276 if (po->tp_version <= TPACKET_V2) { 2277 packet_increment_rx_head(po, &po->rx_ring); 2278 /* 2279 * LOSING will be reported till you read the stats, 2280 * because it's COR - Clear On Read. 2281 * Anyways, moving it for V1/V2 only as V3 doesn't need this 2282 * at packet level. 
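		 * ("Reading the stats" here means getsockopt(PACKET_STATISTICS),
		 * which zeroes both the packet and drop counters.)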
2283 */ 2284 if (atomic_read(&po->tp_drops)) 2285 status |= TP_STATUS_LOSING; 2286 } 2287 2288 if (do_vnet && 2289 virtio_net_hdr_from_skb(skb, h.raw + macoff - 2290 sizeof(struct virtio_net_hdr), 2291 vio_le(), true, 0)) 2292 goto drop_n_account; 2293 2294 po->stats.stats1.tp_packets++; 2295 if (copy_skb) { 2296 status |= TP_STATUS_COPY; 2297 __skb_queue_tail(&sk->sk_receive_queue, copy_skb); 2298 } 2299 spin_unlock(&sk->sk_receive_queue.lock); 2300 2301 skb_copy_bits(skb, 0, h.raw + macoff, snaplen); 2302 2303 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) 2304 getnstimeofday(&ts); 2305 2306 status |= ts_status; 2307 2308 switch (po->tp_version) { 2309 case TPACKET_V1: 2310 h.h1->tp_len = skb->len; 2311 h.h1->tp_snaplen = snaplen; 2312 h.h1->tp_mac = macoff; 2313 h.h1->tp_net = netoff; 2314 h.h1->tp_sec = ts.tv_sec; 2315 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; 2316 hdrlen = sizeof(*h.h1); 2317 break; 2318 case TPACKET_V2: 2319 h.h2->tp_len = skb->len; 2320 h.h2->tp_snaplen = snaplen; 2321 h.h2->tp_mac = macoff; 2322 h.h2->tp_net = netoff; 2323 h.h2->tp_sec = ts.tv_sec; 2324 h.h2->tp_nsec = ts.tv_nsec; 2325 if (skb_vlan_tag_present(skb)) { 2326 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); 2327 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); 2328 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 2329 } else { 2330 h.h2->tp_vlan_tci = 0; 2331 h.h2->tp_vlan_tpid = 0; 2332 } 2333 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); 2334 hdrlen = sizeof(*h.h2); 2335 break; 2336 case TPACKET_V3: 2337 /* tp_nxt_offset,vlan are already populated above. 2338 * So DONT clear those fields here 2339 */ 2340 h.h3->tp_status |= status; 2341 h.h3->tp_len = skb->len; 2342 h.h3->tp_snaplen = snaplen; 2343 h.h3->tp_mac = macoff; 2344 h.h3->tp_net = netoff; 2345 h.h3->tp_sec = ts.tv_sec; 2346 h.h3->tp_nsec = ts.tv_nsec; 2347 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); 2348 hdrlen = sizeof(*h.h3); 2349 break; 2350 default: 2351 BUG(); 2352 } 2353 2354 sll = h.raw + TPACKET_ALIGN(hdrlen); 2355 sll->sll_halen = dev_parse_header(skb, sll->sll_addr); 2356 sll->sll_family = AF_PACKET; 2357 sll->sll_hatype = dev->type; 2358 sll->sll_protocol = skb->protocol; 2359 sll->sll_pkttype = skb->pkt_type; 2360 if (unlikely(po->origdev)) 2361 sll->sll_ifindex = orig_dev->ifindex; 2362 else 2363 sll->sll_ifindex = dev->ifindex; 2364 2365 smp_mb(); 2366 2367 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 2368 if (po->tp_version <= TPACKET_V2) { 2369 u8 *start, *end; 2370 2371 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + 2372 macoff + snaplen); 2373 2374 for (start = h.raw; start < end; start += PAGE_SIZE) 2375 flush_dcache_page(pgv_to_page(start)); 2376 } 2377 smp_wmb(); 2378 #endif 2379 2380 if (po->tp_version <= TPACKET_V2) { 2381 __packet_set_status(po, h.raw, status); 2382 sk->sk_data_ready(sk); 2383 } else { 2384 prb_clear_blk_fill_status(&po->rx_ring); 2385 } 2386 2387 drop_n_restore: 2388 if (skb_head != skb->data && skb_shared(skb)) { 2389 skb->data = skb_head; 2390 skb->len = skb_len; 2391 } 2392 drop: 2393 if (!is_drop_n_account) 2394 consume_skb(skb); 2395 else 2396 kfree_skb(skb); 2397 return 0; 2398 2399 drop_n_account: 2400 spin_unlock(&sk->sk_receive_queue.lock); 2401 atomic_inc(&po->tp_drops); 2402 is_drop_n_account = true; 2403 2404 sk->sk_data_ready(sk); 2405 kfree_skb(copy_skb); 2406 goto drop_n_restore; 2407 } 2408 2409 static void tpacket_destruct_skb(struct sk_buff *skb) 2410 { 2411 struct packet_sock *po = pkt_sk(skb->sk); 2412 2413 if 
(likely(po->tx_ring.pg_vec)) { 2414 void *ph; 2415 __u32 ts; 2416 2417 ph = skb_zcopy_get_nouarg(skb); 2418 packet_dec_pending(&po->tx_ring); 2419 2420 ts = __packet_set_timestamp(po, ph, skb); 2421 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); 2422 2423 if (!packet_read_pending(&po->tx_ring)) 2424 complete(&po->skb_completion); 2425 } 2426 2427 sock_wfree(skb); 2428 } 2429 2430 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) 2431 { 2432 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && 2433 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + 2434 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > 2435 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) 2436 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), 2437 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + 2438 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); 2439 2440 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) 2441 return -EINVAL; 2442 2443 return 0; 2444 } 2445 2446 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, 2447 struct virtio_net_hdr *vnet_hdr) 2448 { 2449 if (*len < sizeof(*vnet_hdr)) 2450 return -EINVAL; 2451 *len -= sizeof(*vnet_hdr); 2452 2453 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) 2454 return -EFAULT; 2455 2456 return __packet_snd_vnet_parse(vnet_hdr, *len); 2457 } 2458 2459 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, 2460 void *frame, struct net_device *dev, void *data, int tp_len, 2461 __be16 proto, unsigned char *addr, int hlen, int copylen, 2462 const struct sockcm_cookie *sockc) 2463 { 2464 union tpacket_uhdr ph; 2465 int to_write, offset, len, nr_frags, len_max; 2466 struct socket *sock = po->sk.sk_socket; 2467 struct page *page; 2468 int err; 2469 2470 ph.raw = frame; 2471 2472 skb->protocol = proto; 2473 skb->dev = dev; 2474 skb->priority = po->sk.sk_priority; 2475 skb->mark = po->sk.sk_mark; 2476 skb->tstamp = sockc->transmit_time; 2477 skb_setup_tx_timestamp(skb, sockc->tsflags); 2478 skb_zcopy_set_nouarg(skb, ph.raw); 2479 2480 skb_reserve(skb, hlen); 2481 skb_reset_network_header(skb); 2482 2483 to_write = tp_len; 2484 2485 if (sock->type == SOCK_DGRAM) { 2486 err = dev_hard_header(skb, dev, ntohs(proto), addr, 2487 NULL, tp_len); 2488 if (unlikely(err < 0)) 2489 return -EINVAL; 2490 } else if (copylen) { 2491 int hdrlen = min_t(int, copylen, tp_len); 2492 2493 skb_push(skb, dev->hard_header_len); 2494 skb_put(skb, copylen - dev->hard_header_len); 2495 err = skb_store_bits(skb, 0, data, hdrlen); 2496 if (unlikely(err)) 2497 return err; 2498 if (!dev_validate_header(dev, skb->data, hdrlen)) 2499 return -EINVAL; 2500 2501 data += hdrlen; 2502 to_write -= hdrlen; 2503 } 2504 2505 offset = offset_in_page(data); 2506 len_max = PAGE_SIZE - offset; 2507 len = ((to_write > len_max) ? len_max : to_write); 2508 2509 skb->data_len = to_write; 2510 skb->len += to_write; 2511 skb->truesize += to_write; 2512 refcount_add(to_write, &po->sk.sk_wmem_alloc); 2513 2514 while (likely(to_write)) { 2515 nr_frags = skb_shinfo(skb)->nr_frags; 2516 2517 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { 2518 pr_err("Packet exceed the number of skb frags(%lu)\n", 2519 MAX_SKB_FRAGS); 2520 return -EFAULT; 2521 } 2522 2523 page = pgv_to_page(data); 2524 data += len; 2525 flush_dcache_page(page); 2526 get_page(page); 2527 skb_fill_page_desc(skb, nr_frags, page, offset, len); 2528 to_write -= len; 2529 offset = 0; 2530 len_max = PAGE_SIZE; 2531 len = ((to_write > len_max) ? 
len_max : to_write); 2532 } 2533 2534 packet_parse_headers(skb, sock); 2535 2536 return tp_len; 2537 } 2538 2539 static int tpacket_parse_header(struct packet_sock *po, void *frame, 2540 int size_max, void **data) 2541 { 2542 union tpacket_uhdr ph; 2543 int tp_len, off; 2544 2545 ph.raw = frame; 2546 2547 switch (po->tp_version) { 2548 case TPACKET_V3: 2549 if (ph.h3->tp_next_offset != 0) { 2550 pr_warn_once("variable sized slot not supported"); 2551 return -EINVAL; 2552 } 2553 tp_len = ph.h3->tp_len; 2554 break; 2555 case TPACKET_V2: 2556 tp_len = ph.h2->tp_len; 2557 break; 2558 default: 2559 tp_len = ph.h1->tp_len; 2560 break; 2561 } 2562 if (unlikely(tp_len > size_max)) { 2563 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); 2564 return -EMSGSIZE; 2565 } 2566 2567 if (unlikely(po->tp_tx_has_off)) { 2568 int off_min, off_max; 2569 2570 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); 2571 off_max = po->tx_ring.frame_size - tp_len; 2572 if (po->sk.sk_type == SOCK_DGRAM) { 2573 switch (po->tp_version) { 2574 case TPACKET_V3: 2575 off = ph.h3->tp_net; 2576 break; 2577 case TPACKET_V2: 2578 off = ph.h2->tp_net; 2579 break; 2580 default: 2581 off = ph.h1->tp_net; 2582 break; 2583 } 2584 } else { 2585 switch (po->tp_version) { 2586 case TPACKET_V3: 2587 off = ph.h3->tp_mac; 2588 break; 2589 case TPACKET_V2: 2590 off = ph.h2->tp_mac; 2591 break; 2592 default: 2593 off = ph.h1->tp_mac; 2594 break; 2595 } 2596 } 2597 if (unlikely((off < off_min) || (off_max < off))) 2598 return -EINVAL; 2599 } else { 2600 off = po->tp_hdrlen - sizeof(struct sockaddr_ll); 2601 } 2602 2603 *data = frame + off; 2604 return tp_len; 2605 } 2606 2607 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 2608 { 2609 struct sk_buff *skb = NULL; 2610 struct net_device *dev; 2611 struct virtio_net_hdr *vnet_hdr = NULL; 2612 struct sockcm_cookie sockc; 2613 __be16 proto; 2614 int err, reserve = 0; 2615 void *ph; 2616 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); 2617 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); 2618 unsigned char *addr = NULL; 2619 int tp_len, size_max; 2620 void *data; 2621 int len_sum = 0; 2622 int status = TP_STATUS_AVAILABLE; 2623 int hlen, tlen, copylen = 0; 2624 long timeo = 0; 2625 2626 mutex_lock(&po->pg_vec_lock); 2627 2628 /* packet_sendmsg() check on tx_ring.pg_vec was lockless, 2629 * we need to confirm it under protection of pg_vec_lock. 
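	 * (packet_sendmsg() only dispatches here after seeing a non-NULL
	 * tx_ring.pg_vec, but the ring may have been torn down since, hence
	 * the -EBUSY check below.)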
2630 */ 2631 if (unlikely(!po->tx_ring.pg_vec)) { 2632 err = -EBUSY; 2633 goto out; 2634 } 2635 if (likely(saddr == NULL)) { 2636 dev = packet_cached_dev_get(po); 2637 proto = po->num; 2638 } else { 2639 err = -EINVAL; 2640 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) 2641 goto out; 2642 if (msg->msg_namelen < (saddr->sll_halen 2643 + offsetof(struct sockaddr_ll, 2644 sll_addr))) 2645 goto out; 2646 proto = saddr->sll_protocol; 2647 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); 2648 if (po->sk.sk_socket->type == SOCK_DGRAM) { 2649 if (dev && msg->msg_namelen < dev->addr_len + 2650 offsetof(struct sockaddr_ll, sll_addr)) 2651 goto out_put; 2652 addr = saddr->sll_addr; 2653 } 2654 } 2655 2656 err = -ENXIO; 2657 if (unlikely(dev == NULL)) 2658 goto out; 2659 err = -ENETDOWN; 2660 if (unlikely(!(dev->flags & IFF_UP))) 2661 goto out_put; 2662 2663 sockcm_init(&sockc, &po->sk); 2664 if (msg->msg_controllen) { 2665 err = sock_cmsg_send(&po->sk, msg, &sockc); 2666 if (unlikely(err)) 2667 goto out_put; 2668 } 2669 2670 if (po->sk.sk_socket->type == SOCK_RAW) 2671 reserve = dev->hard_header_len; 2672 size_max = po->tx_ring.frame_size 2673 - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); 2674 2675 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr) 2676 size_max = dev->mtu + reserve + VLAN_HLEN; 2677 2678 reinit_completion(&po->skb_completion); 2679 2680 do { 2681 ph = packet_current_frame(po, &po->tx_ring, 2682 TP_STATUS_SEND_REQUEST); 2683 if (unlikely(ph == NULL)) { 2684 if (need_wait && skb) { 2685 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); 2686 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); 2687 if (timeo <= 0) { 2688 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; 2689 goto out_put; 2690 } 2691 } 2692 /* check for additional frames */ 2693 continue; 2694 } 2695 2696 skb = NULL; 2697 tp_len = tpacket_parse_header(po, ph, size_max, &data); 2698 if (tp_len < 0) 2699 goto tpacket_error; 2700 2701 status = TP_STATUS_SEND_REQUEST; 2702 hlen = LL_RESERVED_SPACE(dev); 2703 tlen = dev->needed_tailroom; 2704 if (po->has_vnet_hdr) { 2705 vnet_hdr = data; 2706 data += sizeof(*vnet_hdr); 2707 tp_len -= sizeof(*vnet_hdr); 2708 if (tp_len < 0 || 2709 __packet_snd_vnet_parse(vnet_hdr, tp_len)) { 2710 tp_len = -EINVAL; 2711 goto tpacket_error; 2712 } 2713 copylen = __virtio16_to_cpu(vio_le(), 2714 vnet_hdr->hdr_len); 2715 } 2716 copylen = max_t(int, copylen, dev->hard_header_len); 2717 skb = sock_alloc_send_skb(&po->sk, 2718 hlen + tlen + sizeof(struct sockaddr_ll) + 2719 (copylen - dev->hard_header_len), 2720 !need_wait, &err); 2721 2722 if (unlikely(skb == NULL)) { 2723 /* we assume the socket was initially writeable ... 
*/ 2724 if (likely(len_sum > 0)) 2725 err = len_sum; 2726 goto out_status; 2727 } 2728 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, 2729 addr, hlen, copylen, &sockc); 2730 if (likely(tp_len >= 0) && 2731 tp_len > dev->mtu + reserve && 2732 !po->has_vnet_hdr && 2733 !packet_extra_vlan_len_allowed(dev, skb)) 2734 tp_len = -EMSGSIZE; 2735 2736 if (unlikely(tp_len < 0)) { 2737 tpacket_error: 2738 if (po->tp_loss) { 2739 __packet_set_status(po, ph, 2740 TP_STATUS_AVAILABLE); 2741 packet_increment_head(&po->tx_ring); 2742 kfree_skb(skb); 2743 continue; 2744 } else { 2745 status = TP_STATUS_WRONG_FORMAT; 2746 err = tp_len; 2747 goto out_status; 2748 } 2749 } 2750 2751 if (po->has_vnet_hdr) { 2752 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { 2753 tp_len = -EINVAL; 2754 goto tpacket_error; 2755 } 2756 virtio_net_hdr_set_proto(skb, vnet_hdr); 2757 } 2758 2759 skb->destructor = tpacket_destruct_skb; 2760 __packet_set_status(po, ph, TP_STATUS_SENDING); 2761 packet_inc_pending(&po->tx_ring); 2762 2763 status = TP_STATUS_SEND_REQUEST; 2764 err = po->xmit(skb); 2765 if (unlikely(err > 0)) { 2766 err = net_xmit_errno(err); 2767 if (err && __packet_get_status(po, ph) == 2768 TP_STATUS_AVAILABLE) { 2769 /* skb was destructed already */ 2770 skb = NULL; 2771 goto out_status; 2772 } 2773 /* 2774 * skb was dropped but not destructed yet; 2775 * let's treat it like congestion or err < 0 2776 */ 2777 err = 0; 2778 } 2779 packet_increment_head(&po->tx_ring); 2780 len_sum += tp_len; 2781 } while (likely((ph != NULL) || 2782 /* Note: packet_read_pending() might be slow if we have 2783 * to call it as it's per_cpu variable, but in fast-path 2784 * we already short-circuit the loop with the first 2785 * condition, and luckily don't have to go that path 2786 * anyway. 2787 */ 2788 (need_wait && packet_read_pending(&po->tx_ring)))); 2789 2790 err = len_sum; 2791 goto out_put; 2792 2793 out_status: 2794 __packet_set_status(po, ph, status); 2795 kfree_skb(skb); 2796 out_put: 2797 dev_put(dev); 2798 out: 2799 mutex_unlock(&po->pg_vec_lock); 2800 return err; 2801 } 2802 2803 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, 2804 size_t reserve, size_t len, 2805 size_t linear, int noblock, 2806 int *err) 2807 { 2808 struct sk_buff *skb; 2809 2810 /* Under a page? Don't bother with paged skb. */ 2811 if (prepad + len < PAGE_SIZE || !linear) 2812 linear = len; 2813 2814 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, 2815 err, 0); 2816 if (!skb) 2817 return NULL; 2818 2819 skb_reserve(skb, reserve); 2820 skb_put(skb, linear); 2821 skb->data_len = len - linear; 2822 skb->len += len - linear; 2823 2824 return skb; 2825 } 2826 2827 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) 2828 { 2829 struct sock *sk = sock->sk; 2830 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); 2831 struct sk_buff *skb; 2832 struct net_device *dev; 2833 __be16 proto; 2834 unsigned char *addr = NULL; 2835 int err, reserve = 0; 2836 struct sockcm_cookie sockc; 2837 struct virtio_net_hdr vnet_hdr = { 0 }; 2838 int offset = 0; 2839 struct packet_sock *po = pkt_sk(sk); 2840 bool has_vnet_hdr = false; 2841 int hlen, tlen, linear; 2842 int extra_len = 0; 2843 2844 /* 2845 * Get and verify the address. 
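	 *	(with no msg_name supplied, the cached bound device and the
	 *	 socket's protocol number are used instead)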
2846 */ 2847 2848 if (likely(saddr == NULL)) { 2849 dev = packet_cached_dev_get(po); 2850 proto = po->num; 2851 } else { 2852 err = -EINVAL; 2853 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) 2854 goto out; 2855 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) 2856 goto out; 2857 proto = saddr->sll_protocol; 2858 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); 2859 if (sock->type == SOCK_DGRAM) { 2860 if (dev && msg->msg_namelen < dev->addr_len + 2861 offsetof(struct sockaddr_ll, sll_addr)) 2862 goto out_unlock; 2863 addr = saddr->sll_addr; 2864 } 2865 } 2866 2867 err = -ENXIO; 2868 if (unlikely(dev == NULL)) 2869 goto out_unlock; 2870 err = -ENETDOWN; 2871 if (unlikely(!(dev->flags & IFF_UP))) 2872 goto out_unlock; 2873 2874 sockcm_init(&sockc, sk); 2875 sockc.mark = sk->sk_mark; 2876 if (msg->msg_controllen) { 2877 err = sock_cmsg_send(sk, msg, &sockc); 2878 if (unlikely(err)) 2879 goto out_unlock; 2880 } 2881 2882 if (sock->type == SOCK_RAW) 2883 reserve = dev->hard_header_len; 2884 if (po->has_vnet_hdr) { 2885 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); 2886 if (err) 2887 goto out_unlock; 2888 has_vnet_hdr = true; 2889 } 2890 2891 if (unlikely(sock_flag(sk, SOCK_NOFCS))) { 2892 if (!netif_supports_nofcs(dev)) { 2893 err = -EPROTONOSUPPORT; 2894 goto out_unlock; 2895 } 2896 extra_len = 4; /* We're doing our own CRC */ 2897 } 2898 2899 err = -EMSGSIZE; 2900 if (!vnet_hdr.gso_type && 2901 (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) 2902 goto out_unlock; 2903 2904 err = -ENOBUFS; 2905 hlen = LL_RESERVED_SPACE(dev); 2906 tlen = dev->needed_tailroom; 2907 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); 2908 linear = max(linear, min_t(int, len, dev->hard_header_len)); 2909 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, 2910 msg->msg_flags & MSG_DONTWAIT, &err); 2911 if (skb == NULL) 2912 goto out_unlock; 2913 2914 skb_reset_network_header(skb); 2915 2916 err = -EINVAL; 2917 if (sock->type == SOCK_DGRAM) { 2918 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); 2919 if (unlikely(offset < 0)) 2920 goto out_free; 2921 } else if (reserve) { 2922 skb_reserve(skb, -reserve); 2923 if (len < reserve + sizeof(struct ipv6hdr) && 2924 dev->min_header_len != dev->hard_header_len) 2925 skb_reset_network_header(skb); 2926 } 2927 2928 /* Returns -EFAULT on error */ 2929 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); 2930 if (err) 2931 goto out_free; 2932 2933 if (sock->type == SOCK_RAW && 2934 !dev_validate_header(dev, skb->data, len)) { 2935 err = -EINVAL; 2936 goto out_free; 2937 } 2938 2939 skb_setup_tx_timestamp(skb, sockc.tsflags); 2940 2941 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && 2942 !packet_extra_vlan_len_allowed(dev, skb)) { 2943 err = -EMSGSIZE; 2944 goto out_free; 2945 } 2946 2947 skb->protocol = proto; 2948 skb->dev = dev; 2949 skb->priority = sk->sk_priority; 2950 skb->mark = sockc.mark; 2951 skb->tstamp = sockc.transmit_time; 2952 2953 if (has_vnet_hdr) { 2954 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); 2955 if (err) 2956 goto out_free; 2957 len += sizeof(vnet_hdr); 2958 virtio_net_hdr_set_proto(skb, &vnet_hdr); 2959 } 2960 2961 packet_parse_headers(skb, sock); 2962 2963 if (unlikely(extra_len == 4)) 2964 skb->no_fcs = 1; 2965 2966 err = po->xmit(skb); 2967 if (err > 0 && (err = net_xmit_errno(err)) != 0) 2968 goto out_unlock; 2969 2970 dev_put(dev); 2971 2972 return len; 2973 2974 out_free: 2975 kfree_skb(skb); 2976 out_unlock: 2977 
if (dev) 2978 dev_put(dev); 2979 out: 2980 return err; 2981 } 2982 2983 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) 2984 { 2985 struct sock *sk = sock->sk; 2986 struct packet_sock *po = pkt_sk(sk); 2987 2988 if (po->tx_ring.pg_vec) 2989 return tpacket_snd(po, msg); 2990 else 2991 return packet_snd(sock, msg, len); 2992 } 2993 2994 /* 2995 * Close a PACKET socket. This is fairly simple. We immediately go 2996 * to 'closed' state and remove our protocol entry in the device list. 2997 */ 2998 2999 static int packet_release(struct socket *sock) 3000 { 3001 struct sock *sk = sock->sk; 3002 struct packet_sock *po; 3003 struct packet_fanout *f; 3004 struct net *net; 3005 union tpacket_req_u req_u; 3006 3007 if (!sk) 3008 return 0; 3009 3010 net = sock_net(sk); 3011 po = pkt_sk(sk); 3012 3013 mutex_lock(&net->packet.sklist_lock); 3014 sk_del_node_init_rcu(sk); 3015 mutex_unlock(&net->packet.sklist_lock); 3016 3017 preempt_disable(); 3018 sock_prot_inuse_add(net, sk->sk_prot, -1); 3019 preempt_enable(); 3020 3021 spin_lock(&po->bind_lock); 3022 unregister_prot_hook(sk, false); 3023 packet_cached_dev_reset(po); 3024 3025 if (po->prot_hook.dev) { 3026 dev_put(po->prot_hook.dev); 3027 po->prot_hook.dev = NULL; 3028 } 3029 spin_unlock(&po->bind_lock); 3030 3031 packet_flush_mclist(sk); 3032 3033 lock_sock(sk); 3034 if (po->rx_ring.pg_vec) { 3035 memset(&req_u, 0, sizeof(req_u)); 3036 packet_set_ring(sk, &req_u, 1, 0); 3037 } 3038 3039 if (po->tx_ring.pg_vec) { 3040 memset(&req_u, 0, sizeof(req_u)); 3041 packet_set_ring(sk, &req_u, 1, 1); 3042 } 3043 release_sock(sk); 3044 3045 f = fanout_release(sk); 3046 3047 synchronize_net(); 3048 3049 kfree(po->rollover); 3050 if (f) { 3051 fanout_release_data(f); 3052 kfree(f); 3053 } 3054 /* 3055 * Now the socket is dead. No more input will appear. 3056 */ 3057 sock_orphan(sk); 3058 sock->sk = NULL; 3059 3060 /* Purge queues */ 3061 3062 skb_queue_purge(&sk->sk_receive_queue); 3063 packet_free_pending(po); 3064 sk_refcnt_debug_release(sk); 3065 3066 sock_put(sk); 3067 return 0; 3068 } 3069 3070 /* 3071 * Attach a packet hook. 
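 *	Re-points the socket's prot_hook at the requested device and protocol,
 *	unregistering the old hook and, where the new device is up, registering
 *	the new one whenever either of them actually changed.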
3072 */ 3073 3074 static int packet_do_bind(struct sock *sk, const char *name, int ifindex, 3075 __be16 proto) 3076 { 3077 struct packet_sock *po = pkt_sk(sk); 3078 struct net_device *dev_curr; 3079 __be16 proto_curr; 3080 bool need_rehook; 3081 struct net_device *dev = NULL; 3082 int ret = 0; 3083 bool unlisted = false; 3084 3085 lock_sock(sk); 3086 spin_lock(&po->bind_lock); 3087 rcu_read_lock(); 3088 3089 if (po->fanout) { 3090 ret = -EINVAL; 3091 goto out_unlock; 3092 } 3093 3094 if (name) { 3095 dev = dev_get_by_name_rcu(sock_net(sk), name); 3096 if (!dev) { 3097 ret = -ENODEV; 3098 goto out_unlock; 3099 } 3100 } else if (ifindex) { 3101 dev = dev_get_by_index_rcu(sock_net(sk), ifindex); 3102 if (!dev) { 3103 ret = -ENODEV; 3104 goto out_unlock; 3105 } 3106 } 3107 3108 if (dev) 3109 dev_hold(dev); 3110 3111 proto_curr = po->prot_hook.type; 3112 dev_curr = po->prot_hook.dev; 3113 3114 need_rehook = proto_curr != proto || dev_curr != dev; 3115 3116 if (need_rehook) { 3117 if (po->running) { 3118 rcu_read_unlock(); 3119 /* prevents packet_notifier() from calling 3120 * register_prot_hook() 3121 */ 3122 po->num = 0; 3123 __unregister_prot_hook(sk, true); 3124 rcu_read_lock(); 3125 dev_curr = po->prot_hook.dev; 3126 if (dev) 3127 unlisted = !dev_get_by_index_rcu(sock_net(sk), 3128 dev->ifindex); 3129 } 3130 3131 BUG_ON(po->running); 3132 po->num = proto; 3133 po->prot_hook.type = proto; 3134 3135 if (unlikely(unlisted)) { 3136 dev_put(dev); 3137 po->prot_hook.dev = NULL; 3138 po->ifindex = -1; 3139 packet_cached_dev_reset(po); 3140 } else { 3141 po->prot_hook.dev = dev; 3142 po->ifindex = dev ? dev->ifindex : 0; 3143 packet_cached_dev_assign(po, dev); 3144 } 3145 } 3146 if (dev_curr) 3147 dev_put(dev_curr); 3148 3149 if (proto == 0 || !need_rehook) 3150 goto out_unlock; 3151 3152 if (!unlisted && (!dev || (dev->flags & IFF_UP))) { 3153 register_prot_hook(sk); 3154 } else { 3155 sk->sk_err = ENETDOWN; 3156 if (!sock_flag(sk, SOCK_DEAD)) 3157 sk->sk_error_report(sk); 3158 } 3159 3160 out_unlock: 3161 rcu_read_unlock(); 3162 spin_unlock(&po->bind_lock); 3163 release_sock(sk); 3164 return ret; 3165 } 3166 3167 /* 3168 * Bind a packet socket to a device 3169 */ 3170 3171 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, 3172 int addr_len) 3173 { 3174 struct sock *sk = sock->sk; 3175 char name[sizeof(uaddr->sa_data) + 1]; 3176 3177 /* 3178 * Check legality 3179 */ 3180 3181 if (addr_len != sizeof(struct sockaddr)) 3182 return -EINVAL; 3183 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be 3184 * zero-terminated. 3185 */ 3186 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); 3187 name[sizeof(uaddr->sa_data)] = 0; 3188 3189 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); 3190 } 3191 3192 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 3193 { 3194 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; 3195 struct sock *sk = sock->sk; 3196 3197 /* 3198 * Check legality 3199 */ 3200 3201 if (addr_len < sizeof(struct sockaddr_ll)) 3202 return -EINVAL; 3203 if (sll->sll_family != AF_PACKET) 3204 return -EINVAL; 3205 3206 return packet_do_bind(sk, NULL, sll->sll_ifindex, 3207 sll->sll_protocol ? : pkt_sk(sk)->num); 3208 } 3209 3210 static struct proto packet_proto = { 3211 .name = "PACKET", 3212 .owner = THIS_MODULE, 3213 .obj_size = sizeof(struct packet_sock), 3214 }; 3215 3216 /* 3217 * Create a packet of type SOCK_PACKET. 
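 *	(SOCK_RAW and SOCK_DGRAM packet sockets are created here as well.)
 *	Purely as an illustration, userspace typically lands here via
 *
 *		fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *	which requires CAP_NET_RAW in the owning user namespace.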
3218 */ 3219 3220 static int packet_create(struct net *net, struct socket *sock, int protocol, 3221 int kern) 3222 { 3223 struct sock *sk; 3224 struct packet_sock *po; 3225 __be16 proto = (__force __be16)protocol; /* weird, but documented */ 3226 int err; 3227 3228 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 3229 return -EPERM; 3230 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && 3231 sock->type != SOCK_PACKET) 3232 return -ESOCKTNOSUPPORT; 3233 3234 sock->state = SS_UNCONNECTED; 3235 3236 err = -ENOBUFS; 3237 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); 3238 if (sk == NULL) 3239 goto out; 3240 3241 sock->ops = &packet_ops; 3242 if (sock->type == SOCK_PACKET) 3243 sock->ops = &packet_ops_spkt; 3244 3245 sock_init_data(sock, sk); 3246 3247 po = pkt_sk(sk); 3248 init_completion(&po->skb_completion); 3249 sk->sk_family = PF_PACKET; 3250 po->num = proto; 3251 po->xmit = dev_queue_xmit; 3252 3253 err = packet_alloc_pending(po); 3254 if (err) 3255 goto out2; 3256 3257 packet_cached_dev_reset(po); 3258 3259 sk->sk_destruct = packet_sock_destruct; 3260 sk_refcnt_debug_inc(sk); 3261 3262 /* 3263 * Attach a protocol block 3264 */ 3265 3266 spin_lock_init(&po->bind_lock); 3267 mutex_init(&po->pg_vec_lock); 3268 po->rollover = NULL; 3269 po->prot_hook.func = packet_rcv; 3270 3271 if (sock->type == SOCK_PACKET) 3272 po->prot_hook.func = packet_rcv_spkt; 3273 3274 po->prot_hook.af_packet_priv = sk; 3275 3276 if (proto) { 3277 po->prot_hook.type = proto; 3278 __register_prot_hook(sk); 3279 } 3280 3281 mutex_lock(&net->packet.sklist_lock); 3282 sk_add_node_tail_rcu(sk, &net->packet.sklist); 3283 mutex_unlock(&net->packet.sklist_lock); 3284 3285 preempt_disable(); 3286 sock_prot_inuse_add(net, &packet_proto, 1); 3287 preempt_enable(); 3288 3289 return 0; 3290 out2: 3291 sk_free(sk); 3292 out: 3293 return err; 3294 } 3295 3296 /* 3297 * Pull a packet from our receive queue and hand it to the user. 3298 * If necessary we block. 3299 */ 3300 3301 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, 3302 int flags) 3303 { 3304 struct sock *sk = sock->sk; 3305 struct sk_buff *skb; 3306 int copied, err; 3307 int vnet_hdr_len = 0; 3308 unsigned int origlen = 0; 3309 3310 err = -EINVAL; 3311 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) 3312 goto out; 3313 3314 #if 0 3315 /* What error should we return now? EUNATTACH? */ 3316 if (pkt_sk(sk)->ifindex < 0) 3317 return -ENODEV; 3318 #endif 3319 3320 if (flags & MSG_ERRQUEUE) { 3321 err = sock_recv_errqueue(sk, msg, len, 3322 SOL_PACKET, PACKET_TX_TIMESTAMP); 3323 goto out; 3324 } 3325 3326 /* 3327 * Call the generic datagram receiver. This handles all sorts 3328 * of horrible races and re-entrancy so we can forget about it 3329 * in the protocol layers. 3330 * 3331 * Now it will return ENETDOWN, if device have just gone down, 3332 * but then it will block. 3333 */ 3334 3335 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); 3336 3337 /* 3338 * An error occurred so return it. Because skb_recv_datagram() 3339 * handles the blocking we don't see and worry about blocking 3340 * retries. 3341 */ 3342 3343 if (skb == NULL) 3344 goto out; 3345 3346 packet_rcv_try_clear_pressure(pkt_sk(sk)); 3347 3348 if (pkt_sk(sk)->has_vnet_hdr) { 3349 err = packet_rcv_vnet(msg, skb, &len); 3350 if (err) 3351 goto out_free; 3352 vnet_hdr_len = sizeof(struct virtio_net_hdr); 3353 } 3354 3355 /* You lose any data beyond the buffer you gave. 
If it worries 3356 * a user program they can ask the device for its MTU 3357 * anyway. 3358 */ 3359 copied = skb->len; 3360 if (copied > len) { 3361 copied = len; 3362 msg->msg_flags |= MSG_TRUNC; 3363 } 3364 3365 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3366 if (err) 3367 goto out_free; 3368 3369 if (sock->type != SOCK_PACKET) { 3370 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; 3371 3372 /* Original length was stored in sockaddr_ll fields */ 3373 origlen = PACKET_SKB_CB(skb)->sa.origlen; 3374 sll->sll_family = AF_PACKET; 3375 sll->sll_protocol = skb->protocol; 3376 } 3377 3378 sock_recv_ts_and_drops(msg, sk, skb); 3379 3380 if (msg->msg_name) { 3381 int copy_len; 3382 3383 /* If the address length field is there to be filled 3384 * in, we fill it in now. 3385 */ 3386 if (sock->type == SOCK_PACKET) { 3387 __sockaddr_check_size(sizeof(struct sockaddr_pkt)); 3388 msg->msg_namelen = sizeof(struct sockaddr_pkt); 3389 copy_len = msg->msg_namelen; 3390 } else { 3391 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; 3392 3393 msg->msg_namelen = sll->sll_halen + 3394 offsetof(struct sockaddr_ll, sll_addr); 3395 copy_len = msg->msg_namelen; 3396 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { 3397 memset(msg->msg_name + 3398 offsetof(struct sockaddr_ll, sll_addr), 3399 0, sizeof(sll->sll_addr)); 3400 msg->msg_namelen = sizeof(struct sockaddr_ll); 3401 } 3402 } 3403 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); 3404 } 3405 3406 if (pkt_sk(sk)->auxdata) { 3407 struct tpacket_auxdata aux; 3408 3409 aux.tp_status = TP_STATUS_USER; 3410 if (skb->ip_summed == CHECKSUM_PARTIAL) 3411 aux.tp_status |= TP_STATUS_CSUMNOTREADY; 3412 else if (skb->pkt_type != PACKET_OUTGOING && 3413 (skb->ip_summed == CHECKSUM_COMPLETE || 3414 skb_csum_unnecessary(skb))) 3415 aux.tp_status |= TP_STATUS_CSUM_VALID; 3416 3417 aux.tp_len = origlen; 3418 aux.tp_snaplen = skb->len; 3419 aux.tp_mac = 0; 3420 aux.tp_net = skb_network_offset(skb); 3421 if (skb_vlan_tag_present(skb)) { 3422 aux.tp_vlan_tci = skb_vlan_tag_get(skb); 3423 aux.tp_vlan_tpid = ntohs(skb->vlan_proto); 3424 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; 3425 } else { 3426 aux.tp_vlan_tci = 0; 3427 aux.tp_vlan_tpid = 0; 3428 } 3429 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); 3430 } 3431 3432 /* 3433 * Free or return the buffer as appropriate. Again this 3434 * hides all the races and re-entrancy issues from us. 3435 */ 3436 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? 
skb->len : copied); 3437 3438 out_free: 3439 skb_free_datagram(sk, skb); 3440 out: 3441 return err; 3442 } 3443 3444 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, 3445 int peer) 3446 { 3447 struct net_device *dev; 3448 struct sock *sk = sock->sk; 3449 3450 if (peer) 3451 return -EOPNOTSUPP; 3452 3453 uaddr->sa_family = AF_PACKET; 3454 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); 3455 rcu_read_lock(); 3456 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); 3457 if (dev) 3458 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); 3459 rcu_read_unlock(); 3460 3461 return sizeof(*uaddr); 3462 } 3463 3464 static int packet_getname(struct socket *sock, struct sockaddr *uaddr, 3465 int peer) 3466 { 3467 struct net_device *dev; 3468 struct sock *sk = sock->sk; 3469 struct packet_sock *po = pkt_sk(sk); 3470 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); 3471 3472 if (peer) 3473 return -EOPNOTSUPP; 3474 3475 sll->sll_family = AF_PACKET; 3476 sll->sll_ifindex = po->ifindex; 3477 sll->sll_protocol = po->num; 3478 sll->sll_pkttype = 0; 3479 rcu_read_lock(); 3480 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); 3481 if (dev) { 3482 sll->sll_hatype = dev->type; 3483 sll->sll_halen = dev->addr_len; 3484 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); 3485 } else { 3486 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ 3487 sll->sll_halen = 0; 3488 } 3489 rcu_read_unlock(); 3490 3491 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; 3492 } 3493 3494 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, 3495 int what) 3496 { 3497 switch (i->type) { 3498 case PACKET_MR_MULTICAST: 3499 if (i->alen != dev->addr_len) 3500 return -EINVAL; 3501 if (what > 0) 3502 return dev_mc_add(dev, i->addr); 3503 else 3504 return dev_mc_del(dev, i->addr); 3505 break; 3506 case PACKET_MR_PROMISC: 3507 return dev_set_promiscuity(dev, what); 3508 case PACKET_MR_ALLMULTI: 3509 return dev_set_allmulti(dev, what); 3510 case PACKET_MR_UNICAST: 3511 if (i->alen != dev->addr_len) 3512 return -EINVAL; 3513 if (what > 0) 3514 return dev_uc_add(dev, i->addr); 3515 else 3516 return dev_uc_del(dev, i->addr); 3517 break; 3518 default: 3519 break; 3520 } 3521 return 0; 3522 } 3523 3524 static void packet_dev_mclist_delete(struct net_device *dev, 3525 struct packet_mclist **mlp) 3526 { 3527 struct packet_mclist *ml; 3528 3529 while ((ml = *mlp) != NULL) { 3530 if (ml->ifindex == dev->ifindex) { 3531 packet_dev_mc(dev, ml, -1); 3532 *mlp = ml->next; 3533 kfree(ml); 3534 } else 3535 mlp = &ml->next; 3536 } 3537 } 3538 3539 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) 3540 { 3541 struct packet_sock *po = pkt_sk(sk); 3542 struct packet_mclist *ml, *i; 3543 struct net_device *dev; 3544 int err; 3545 3546 rtnl_lock(); 3547 3548 err = -ENODEV; 3549 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); 3550 if (!dev) 3551 goto done; 3552 3553 err = -EINVAL; 3554 if (mreq->mr_alen > dev->addr_len) 3555 goto done; 3556 3557 err = -ENOBUFS; 3558 i = kmalloc(sizeof(*i), GFP_KERNEL); 3559 if (i == NULL) 3560 goto done; 3561 3562 err = 0; 3563 for (ml = po->mclist; ml; ml = ml->next) { 3564 if (ml->ifindex == mreq->mr_ifindex && 3565 ml->type == mreq->mr_type && 3566 ml->alen == mreq->mr_alen && 3567 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { 3568 ml->count++; 3569 /* Free the new element ... 
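				 * an identical entry already exists, so the
				 * refcount bump above is all that is needed.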
*/ 3570 kfree(i); 3571 goto done; 3572 } 3573 } 3574 3575 i->type = mreq->mr_type; 3576 i->ifindex = mreq->mr_ifindex; 3577 i->alen = mreq->mr_alen; 3578 memcpy(i->addr, mreq->mr_address, i->alen); 3579 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); 3580 i->count = 1; 3581 i->next = po->mclist; 3582 po->mclist = i; 3583 err = packet_dev_mc(dev, i, 1); 3584 if (err) { 3585 po->mclist = i->next; 3586 kfree(i); 3587 } 3588 3589 done: 3590 rtnl_unlock(); 3591 return err; 3592 } 3593 3594 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) 3595 { 3596 struct packet_mclist *ml, **mlp; 3597 3598 rtnl_lock(); 3599 3600 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { 3601 if (ml->ifindex == mreq->mr_ifindex && 3602 ml->type == mreq->mr_type && 3603 ml->alen == mreq->mr_alen && 3604 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { 3605 if (--ml->count == 0) { 3606 struct net_device *dev; 3607 *mlp = ml->next; 3608 dev = __dev_get_by_index(sock_net(sk), ml->ifindex); 3609 if (dev) 3610 packet_dev_mc(dev, ml, -1); 3611 kfree(ml); 3612 } 3613 break; 3614 } 3615 } 3616 rtnl_unlock(); 3617 return 0; 3618 } 3619 3620 static void packet_flush_mclist(struct sock *sk) 3621 { 3622 struct packet_sock *po = pkt_sk(sk); 3623 struct packet_mclist *ml; 3624 3625 if (!po->mclist) 3626 return; 3627 3628 rtnl_lock(); 3629 while ((ml = po->mclist) != NULL) { 3630 struct net_device *dev; 3631 3632 po->mclist = ml->next; 3633 dev = __dev_get_by_index(sock_net(sk), ml->ifindex); 3634 if (dev != NULL) 3635 packet_dev_mc(dev, ml, -1); 3636 kfree(ml); 3637 } 3638 rtnl_unlock(); 3639 } 3640 3641 static int 3642 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) 3643 { 3644 struct sock *sk = sock->sk; 3645 struct packet_sock *po = pkt_sk(sk); 3646 int ret; 3647 3648 if (level != SOL_PACKET) 3649 return -ENOPROTOOPT; 3650 3651 switch (optname) { 3652 case PACKET_ADD_MEMBERSHIP: 3653 case PACKET_DROP_MEMBERSHIP: 3654 { 3655 struct packet_mreq_max mreq; 3656 int len = optlen; 3657 memset(&mreq, 0, sizeof(mreq)); 3658 if (len < sizeof(struct packet_mreq)) 3659 return -EINVAL; 3660 if (len > sizeof(mreq)) 3661 len = sizeof(mreq); 3662 if (copy_from_user(&mreq, optval, len)) 3663 return -EFAULT; 3664 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) 3665 return -EINVAL; 3666 if (optname == PACKET_ADD_MEMBERSHIP) 3667 ret = packet_mc_add(sk, &mreq); 3668 else 3669 ret = packet_mc_drop(sk, &mreq); 3670 return ret; 3671 } 3672 3673 case PACKET_RX_RING: 3674 case PACKET_TX_RING: 3675 { 3676 union tpacket_req_u req_u; 3677 int len; 3678 3679 lock_sock(sk); 3680 switch (po->tp_version) { 3681 case TPACKET_V1: 3682 case TPACKET_V2: 3683 len = sizeof(req_u.req); 3684 break; 3685 case TPACKET_V3: 3686 default: 3687 len = sizeof(req_u.req3); 3688 break; 3689 } 3690 if (optlen < len) { 3691 ret = -EINVAL; 3692 } else { 3693 if (copy_from_user(&req_u.req, optval, len)) 3694 ret = -EFAULT; 3695 else 3696 ret = packet_set_ring(sk, &req_u, 0, 3697 optname == PACKET_TX_RING); 3698 } 3699 release_sock(sk); 3700 return ret; 3701 } 3702 case PACKET_COPY_THRESH: 3703 { 3704 int val; 3705 3706 if (optlen != sizeof(val)) 3707 return -EINVAL; 3708 if (copy_from_user(&val, optval, sizeof(val))) 3709 return -EFAULT; 3710 3711 pkt_sk(sk)->copy_thresh = val; 3712 return 0; 3713 } 3714 case PACKET_VERSION: 3715 { 3716 int val; 3717 3718 if (optlen != sizeof(val)) 3719 return -EINVAL; 3720 if (copy_from_user(&val, optval, 
sizeof(val))) 3721 return -EFAULT; 3722 switch (val) { 3723 case TPACKET_V1: 3724 case TPACKET_V2: 3725 case TPACKET_V3: 3726 break; 3727 default: 3728 return -EINVAL; 3729 } 3730 lock_sock(sk); 3731 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { 3732 ret = -EBUSY; 3733 } else { 3734 po->tp_version = val; 3735 ret = 0; 3736 } 3737 release_sock(sk); 3738 return ret; 3739 } 3740 case PACKET_RESERVE: 3741 { 3742 unsigned int val; 3743 3744 if (optlen != sizeof(val)) 3745 return -EINVAL; 3746 if (copy_from_user(&val, optval, sizeof(val))) 3747 return -EFAULT; 3748 if (val > INT_MAX) 3749 return -EINVAL; 3750 lock_sock(sk); 3751 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { 3752 ret = -EBUSY; 3753 } else { 3754 po->tp_reserve = val; 3755 ret = 0; 3756 } 3757 release_sock(sk); 3758 return ret; 3759 } 3760 case PACKET_LOSS: 3761 { 3762 unsigned int val; 3763 3764 if (optlen != sizeof(val)) 3765 return -EINVAL; 3766 if (copy_from_user(&val, optval, sizeof(val))) 3767 return -EFAULT; 3768 3769 lock_sock(sk); 3770 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { 3771 ret = -EBUSY; 3772 } else { 3773 po->tp_loss = !!val; 3774 ret = 0; 3775 } 3776 release_sock(sk); 3777 return ret; 3778 } 3779 case PACKET_AUXDATA: 3780 { 3781 int val; 3782 3783 if (optlen < sizeof(val)) 3784 return -EINVAL; 3785 if (copy_from_user(&val, optval, sizeof(val))) 3786 return -EFAULT; 3787 3788 lock_sock(sk); 3789 po->auxdata = !!val; 3790 release_sock(sk); 3791 return 0; 3792 } 3793 case PACKET_ORIGDEV: 3794 { 3795 int val; 3796 3797 if (optlen < sizeof(val)) 3798 return -EINVAL; 3799 if (copy_from_user(&val, optval, sizeof(val))) 3800 return -EFAULT; 3801 3802 lock_sock(sk); 3803 po->origdev = !!val; 3804 release_sock(sk); 3805 return 0; 3806 } 3807 case PACKET_VNET_HDR: 3808 { 3809 int val; 3810 3811 if (sock->type != SOCK_RAW) 3812 return -EINVAL; 3813 if (optlen < sizeof(val)) 3814 return -EINVAL; 3815 if (copy_from_user(&val, optval, sizeof(val))) 3816 return -EFAULT; 3817 3818 lock_sock(sk); 3819 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { 3820 ret = -EBUSY; 3821 } else { 3822 po->has_vnet_hdr = !!val; 3823 ret = 0; 3824 } 3825 release_sock(sk); 3826 return ret; 3827 } 3828 case PACKET_TIMESTAMP: 3829 { 3830 int val; 3831 3832 if (optlen != sizeof(val)) 3833 return -EINVAL; 3834 if (copy_from_user(&val, optval, sizeof(val))) 3835 return -EFAULT; 3836 3837 po->tp_tstamp = val; 3838 return 0; 3839 } 3840 case PACKET_FANOUT: 3841 { 3842 int val; 3843 3844 if (optlen != sizeof(val)) 3845 return -EINVAL; 3846 if (copy_from_user(&val, optval, sizeof(val))) 3847 return -EFAULT; 3848 3849 return fanout_add(sk, val & 0xffff, val >> 16); 3850 } 3851 case PACKET_FANOUT_DATA: 3852 { 3853 if (!po->fanout) 3854 return -EINVAL; 3855 3856 return fanout_set_data(po, optval, optlen); 3857 } 3858 case PACKET_IGNORE_OUTGOING: 3859 { 3860 int val; 3861 3862 if (optlen != sizeof(val)) 3863 return -EINVAL; 3864 if (copy_from_user(&val, optval, sizeof(val))) 3865 return -EFAULT; 3866 if (val < 0 || val > 1) 3867 return -EINVAL; 3868 3869 po->prot_hook.ignore_outgoing = !!val; 3870 return 0; 3871 } 3872 case PACKET_TX_HAS_OFF: 3873 { 3874 unsigned int val; 3875 3876 if (optlen != sizeof(val)) 3877 return -EINVAL; 3878 if (copy_from_user(&val, optval, sizeof(val))) 3879 return -EFAULT; 3880 3881 lock_sock(sk); 3882 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { 3883 ret = -EBUSY; 3884 } else { 3885 po->tp_tx_has_off = !!val; 3886 ret = 0; 3887 } 3888 release_sock(sk); 3889 return 0; 3890 } 3891 case PACKET_QDISC_BYPASS: 3892 { 3893 int 
val; 3894 3895 if (optlen != sizeof(val)) 3896 return -EINVAL; 3897 if (copy_from_user(&val, optval, sizeof(val))) 3898 return -EFAULT; 3899 3900 po->xmit = val ? packet_direct_xmit : dev_queue_xmit; 3901 return 0; 3902 } 3903 default: 3904 return -ENOPROTOOPT; 3905 } 3906 } 3907 3908 static int packet_getsockopt(struct socket *sock, int level, int optname, 3909 char __user *optval, int __user *optlen) 3910 { 3911 int len; 3912 int val, lv = sizeof(val); 3913 struct sock *sk = sock->sk; 3914 struct packet_sock *po = pkt_sk(sk); 3915 void *data = &val; 3916 union tpacket_stats_u st; 3917 struct tpacket_rollover_stats rstats; 3918 int drops; 3919 3920 if (level != SOL_PACKET) 3921 return -ENOPROTOOPT; 3922 3923 if (get_user(len, optlen)) 3924 return -EFAULT; 3925 3926 if (len < 0) 3927 return -EINVAL; 3928 3929 switch (optname) { 3930 case PACKET_STATISTICS: 3931 spin_lock_bh(&sk->sk_receive_queue.lock); 3932 memcpy(&st, &po->stats, sizeof(st)); 3933 memset(&po->stats, 0, sizeof(po->stats)); 3934 spin_unlock_bh(&sk->sk_receive_queue.lock); 3935 drops = atomic_xchg(&po->tp_drops, 0); 3936 3937 if (po->tp_version == TPACKET_V3) { 3938 lv = sizeof(struct tpacket_stats_v3); 3939 st.stats3.tp_drops = drops; 3940 st.stats3.tp_packets += drops; 3941 data = &st.stats3; 3942 } else { 3943 lv = sizeof(struct tpacket_stats); 3944 st.stats1.tp_drops = drops; 3945 st.stats1.tp_packets += drops; 3946 data = &st.stats1; 3947 } 3948 3949 break; 3950 case PACKET_AUXDATA: 3951 val = po->auxdata; 3952 break; 3953 case PACKET_ORIGDEV: 3954 val = po->origdev; 3955 break; 3956 case PACKET_VNET_HDR: 3957 val = po->has_vnet_hdr; 3958 break; 3959 case PACKET_VERSION: 3960 val = po->tp_version; 3961 break; 3962 case PACKET_HDRLEN: 3963 if (len > sizeof(int)) 3964 len = sizeof(int); 3965 if (len < sizeof(int)) 3966 return -EINVAL; 3967 if (copy_from_user(&val, optval, len)) 3968 return -EFAULT; 3969 switch (val) { 3970 case TPACKET_V1: 3971 val = sizeof(struct tpacket_hdr); 3972 break; 3973 case TPACKET_V2: 3974 val = sizeof(struct tpacket2_hdr); 3975 break; 3976 case TPACKET_V3: 3977 val = sizeof(struct tpacket3_hdr); 3978 break; 3979 default: 3980 return -EINVAL; 3981 } 3982 break; 3983 case PACKET_RESERVE: 3984 val = po->tp_reserve; 3985 break; 3986 case PACKET_LOSS: 3987 val = po->tp_loss; 3988 break; 3989 case PACKET_TIMESTAMP: 3990 val = po->tp_tstamp; 3991 break; 3992 case PACKET_FANOUT: 3993 val = (po->fanout ? 
3994 ((u32)po->fanout->id | 3995 ((u32)po->fanout->type << 16) | 3996 ((u32)po->fanout->flags << 24)) : 3997 0); 3998 break; 3999 case PACKET_IGNORE_OUTGOING: 4000 val = po->prot_hook.ignore_outgoing; 4001 break; 4002 case PACKET_ROLLOVER_STATS: 4003 if (!po->rollover) 4004 return -EINVAL; 4005 rstats.tp_all = atomic_long_read(&po->rollover->num); 4006 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); 4007 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); 4008 data = &rstats; 4009 lv = sizeof(rstats); 4010 break; 4011 case PACKET_TX_HAS_OFF: 4012 val = po->tp_tx_has_off; 4013 break; 4014 case PACKET_QDISC_BYPASS: 4015 val = packet_use_direct_xmit(po); 4016 break; 4017 default: 4018 return -ENOPROTOOPT; 4019 } 4020 4021 if (len > lv) 4022 len = lv; 4023 if (put_user(len, optlen)) 4024 return -EFAULT; 4025 if (copy_to_user(optval, data, len)) 4026 return -EFAULT; 4027 return 0; 4028 } 4029 4030 4031 #ifdef CONFIG_COMPAT 4032 static int compat_packet_setsockopt(struct socket *sock, int level, int optname, 4033 char __user *optval, unsigned int optlen) 4034 { 4035 struct packet_sock *po = pkt_sk(sock->sk); 4036 4037 if (level != SOL_PACKET) 4038 return -ENOPROTOOPT; 4039 4040 if (optname == PACKET_FANOUT_DATA && 4041 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) { 4042 optval = (char __user *)get_compat_bpf_fprog(optval); 4043 if (!optval) 4044 return -EFAULT; 4045 optlen = sizeof(struct sock_fprog); 4046 } 4047 4048 return packet_setsockopt(sock, level, optname, optval, optlen); 4049 } 4050 #endif 4051 4052 static int packet_notifier(struct notifier_block *this, 4053 unsigned long msg, void *ptr) 4054 { 4055 struct sock *sk; 4056 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4057 struct net *net = dev_net(dev); 4058 4059 rcu_read_lock(); 4060 sk_for_each_rcu(sk, &net->packet.sklist) { 4061 struct packet_sock *po = pkt_sk(sk); 4062 4063 switch (msg) { 4064 case NETDEV_UNREGISTER: 4065 if (po->mclist) 4066 packet_dev_mclist_delete(dev, &po->mclist); 4067 /* fallthrough */ 4068 4069 case NETDEV_DOWN: 4070 if (dev->ifindex == po->ifindex) { 4071 spin_lock(&po->bind_lock); 4072 if (po->running) { 4073 __unregister_prot_hook(sk, false); 4074 sk->sk_err = ENETDOWN; 4075 if (!sock_flag(sk, SOCK_DEAD)) 4076 sk->sk_error_report(sk); 4077 } 4078 if (msg == NETDEV_UNREGISTER) { 4079 packet_cached_dev_reset(po); 4080 po->ifindex = -1; 4081 if (po->prot_hook.dev) 4082 dev_put(po->prot_hook.dev); 4083 po->prot_hook.dev = NULL; 4084 } 4085 spin_unlock(&po->bind_lock); 4086 } 4087 break; 4088 case NETDEV_UP: 4089 if (dev->ifindex == po->ifindex) { 4090 spin_lock(&po->bind_lock); 4091 if (po->num) 4092 register_prot_hook(sk); 4093 spin_unlock(&po->bind_lock); 4094 } 4095 break; 4096 } 4097 } 4098 rcu_read_unlock(); 4099 return NOTIFY_DONE; 4100 } 4101 4102 4103 static int packet_ioctl(struct socket *sock, unsigned int cmd, 4104 unsigned long arg) 4105 { 4106 struct sock *sk = sock->sk; 4107 4108 switch (cmd) { 4109 case SIOCOUTQ: 4110 { 4111 int amount = sk_wmem_alloc_get(sk); 4112 4113 return put_user(amount, (int __user *)arg); 4114 } 4115 case SIOCINQ: 4116 { 4117 struct sk_buff *skb; 4118 int amount = 0; 4119 4120 spin_lock_bh(&sk->sk_receive_queue.lock); 4121 skb = skb_peek(&sk->sk_receive_queue); 4122 if (skb) 4123 amount = skb->len; 4124 spin_unlock_bh(&sk->sk_receive_queue.lock); 4125 return put_user(amount, (int __user *)arg); 4126 } 4127 #ifdef CONFIG_INET 4128 case SIOCADDRT: 4129 case SIOCDELRT: 4130 case SIOCDARP: 4131 case SIOCGARP: 4132 case SIOCSARP: 
4133 case SIOCGIFADDR: 4134 case SIOCSIFADDR: 4135 case SIOCGIFBRDADDR: 4136 case SIOCSIFBRDADDR: 4137 case SIOCGIFNETMASK: 4138 case SIOCSIFNETMASK: 4139 case SIOCGIFDSTADDR: 4140 case SIOCSIFDSTADDR: 4141 case SIOCSIFFLAGS: 4142 return inet_dgram_ops.ioctl(sock, cmd, arg); 4143 #endif 4144 4145 default: 4146 return -ENOIOCTLCMD; 4147 } 4148 return 0; 4149 } 4150 4151 static __poll_t packet_poll(struct file *file, struct socket *sock, 4152 poll_table *wait) 4153 { 4154 struct sock *sk = sock->sk; 4155 struct packet_sock *po = pkt_sk(sk); 4156 __poll_t mask = datagram_poll(file, sock, wait); 4157 4158 spin_lock_bh(&sk->sk_receive_queue.lock); 4159 if (po->rx_ring.pg_vec) { 4160 if (!packet_previous_rx_frame(po, &po->rx_ring, 4161 TP_STATUS_KERNEL)) 4162 mask |= EPOLLIN | EPOLLRDNORM; 4163 } 4164 packet_rcv_try_clear_pressure(po); 4165 spin_unlock_bh(&sk->sk_receive_queue.lock); 4166 spin_lock_bh(&sk->sk_write_queue.lock); 4167 if (po->tx_ring.pg_vec) { 4168 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) 4169 mask |= EPOLLOUT | EPOLLWRNORM; 4170 } 4171 spin_unlock_bh(&sk->sk_write_queue.lock); 4172 return mask; 4173 } 4174 4175 4176 /* Dirty? Well, I still did not learn better way to account 4177 * for user mmaps. 4178 */ 4179 4180 static void packet_mm_open(struct vm_area_struct *vma) 4181 { 4182 struct file *file = vma->vm_file; 4183 struct socket *sock = file->private_data; 4184 struct sock *sk = sock->sk; 4185 4186 if (sk) 4187 atomic_inc(&pkt_sk(sk)->mapped); 4188 } 4189 4190 static void packet_mm_close(struct vm_area_struct *vma) 4191 { 4192 struct file *file = vma->vm_file; 4193 struct socket *sock = file->private_data; 4194 struct sock *sk = sock->sk; 4195 4196 if (sk) 4197 atomic_dec(&pkt_sk(sk)->mapped); 4198 } 4199 4200 static const struct vm_operations_struct packet_mmap_ops = { 4201 .open = packet_mm_open, 4202 .close = packet_mm_close, 4203 }; 4204 4205 static void free_pg_vec(struct pgv *pg_vec, unsigned int order, 4206 unsigned int len) 4207 { 4208 int i; 4209 4210 for (i = 0; i < len; i++) { 4211 if (likely(pg_vec[i].buffer)) { 4212 if (is_vmalloc_addr(pg_vec[i].buffer)) 4213 vfree(pg_vec[i].buffer); 4214 else 4215 free_pages((unsigned long)pg_vec[i].buffer, 4216 order); 4217 pg_vec[i].buffer = NULL; 4218 } 4219 } 4220 kfree(pg_vec); 4221 } 4222 4223 static char *alloc_one_pg_vec_page(unsigned long order) 4224 { 4225 char *buffer; 4226 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | 4227 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; 4228 4229 buffer = (char *) __get_free_pages(gfp_flags, order); 4230 if (buffer) 4231 return buffer; 4232 4233 /* __get_free_pages failed, fall back to vmalloc */ 4234 buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); 4235 if (buffer) 4236 return buffer; 4237 4238 /* vmalloc failed, lets dig into swap here */ 4239 gfp_flags &= ~__GFP_NORETRY; 4240 buffer = (char *) __get_free_pages(gfp_flags, order); 4241 if (buffer) 4242 return buffer; 4243 4244 /* complete and utter failure */ 4245 return NULL; 4246 } 4247 4248 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) 4249 { 4250 unsigned int block_nr = req->tp_block_nr; 4251 struct pgv *pg_vec; 4252 int i; 4253 4254 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); 4255 if (unlikely(!pg_vec)) 4256 goto out; 4257 4258 for (i = 0; i < block_nr; i++) { 4259 pg_vec[i].buffer = alloc_one_pg_vec_page(order); 4260 if (unlikely(!pg_vec[i].buffer)) 4261 goto out_free_pgvec; 4262 } 4263 4264 out: 4265 return pg_vec; 4266 4267 out_free_pgvec: 4268 
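	/* release whatever pages were allocated before the failure */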
free_pg_vec(pg_vec, order, block_nr); 4269 pg_vec = NULL; 4270 goto out; 4271 } 4272 4273 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, 4274 int closing, int tx_ring) 4275 { 4276 struct pgv *pg_vec = NULL; 4277 struct packet_sock *po = pkt_sk(sk); 4278 int was_running, order = 0; 4279 struct packet_ring_buffer *rb; 4280 struct sk_buff_head *rb_queue; 4281 __be16 num; 4282 int err = -EINVAL; 4283 /* Added to avoid minimal code churn */ 4284 struct tpacket_req *req = &req_u->req; 4285 4286 rb = tx_ring ? &po->tx_ring : &po->rx_ring; 4287 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; 4288 4289 err = -EBUSY; 4290 if (!closing) { 4291 if (atomic_read(&po->mapped)) 4292 goto out; 4293 if (packet_read_pending(rb)) 4294 goto out; 4295 } 4296 4297 if (req->tp_block_nr) { 4298 unsigned int min_frame_size; 4299 4300 /* Sanity tests and some calculations */ 4301 err = -EBUSY; 4302 if (unlikely(rb->pg_vec)) 4303 goto out; 4304 4305 switch (po->tp_version) { 4306 case TPACKET_V1: 4307 po->tp_hdrlen = TPACKET_HDRLEN; 4308 break; 4309 case TPACKET_V2: 4310 po->tp_hdrlen = TPACKET2_HDRLEN; 4311 break; 4312 case TPACKET_V3: 4313 po->tp_hdrlen = TPACKET3_HDRLEN; 4314 break; 4315 } 4316 4317 err = -EINVAL; 4318 if (unlikely((int)req->tp_block_size <= 0)) 4319 goto out; 4320 if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) 4321 goto out; 4322 min_frame_size = po->tp_hdrlen + po->tp_reserve; 4323 if (po->tp_version >= TPACKET_V3 && 4324 req->tp_block_size < 4325 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) 4326 goto out; 4327 if (unlikely(req->tp_frame_size < min_frame_size)) 4328 goto out; 4329 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) 4330 goto out; 4331 4332 rb->frames_per_block = req->tp_block_size / req->tp_frame_size; 4333 if (unlikely(rb->frames_per_block == 0)) 4334 goto out; 4335 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) 4336 goto out; 4337 if (unlikely((rb->frames_per_block * req->tp_block_nr) != 4338 req->tp_frame_nr)) 4339 goto out; 4340 4341 err = -ENOMEM; 4342 order = get_order(req->tp_block_size); 4343 pg_vec = alloc_pg_vec(req, order); 4344 if (unlikely(!pg_vec)) 4345 goto out; 4346 switch (po->tp_version) { 4347 case TPACKET_V3: 4348 /* Block transmit is not supported yet */ 4349 if (!tx_ring) { 4350 init_prb_bdqc(po, rb, pg_vec, req_u); 4351 } else { 4352 struct tpacket_req3 *req3 = &req_u->req3; 4353 4354 if (req3->tp_retire_blk_tov || 4355 req3->tp_sizeof_priv || 4356 req3->tp_feature_req_word) { 4357 err = -EINVAL; 4358 goto out_free_pg_vec; 4359 } 4360 } 4361 break; 4362 default: 4363 break; 4364 } 4365 } 4366 /* Done */ 4367 else { 4368 err = -EINVAL; 4369 if (unlikely(req->tp_frame_nr)) 4370 goto out; 4371 } 4372 4373 4374 /* Detach socket from network */ 4375 spin_lock(&po->bind_lock); 4376 was_running = po->running; 4377 num = po->num; 4378 if (was_running) { 4379 po->num = 0; 4380 __unregister_prot_hook(sk, false); 4381 } 4382 spin_unlock(&po->bind_lock); 4383 4384 synchronize_net(); 4385 4386 err = -EBUSY; 4387 mutex_lock(&po->pg_vec_lock); 4388 if (closing || atomic_read(&po->mapped) == 0) { 4389 err = 0; 4390 spin_lock_bh(&rb_queue->lock); 4391 swap(rb->pg_vec, pg_vec); 4392 rb->frame_max = (req->tp_frame_nr - 1); 4393 rb->head = 0; 4394 rb->frame_size = req->tp_frame_size; 4395 spin_unlock_bh(&rb_queue->lock); 4396 4397 swap(rb->pg_vec_order, order); 4398 swap(rb->pg_vec_len, req->tp_block_nr); 4399 4400 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; 4401 
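		/* switch the receive handler: ring-based tpacket_rcv() while an
		 * rx ring is (still) configured, plain packet_rcv() otherwise
		 */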
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
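
/* Illustrative sketch (not part of this file): packet_mmap() above only
 * accepts a single mapping at offset 0 whose length equals the combined
 * size of every configured ring, and it inserts the RX ring's pages before
 * the TX ring's.  A hedged userspace helper reflecting that layout, assuming
 * both rings were configured with the tpacket_req values shown ("rx_req" and
 * "tx_req" are hypothetical caller-provided structures):
 *
 *	#include <sys/mman.h>
 *	#include <linux/if_packet.h>
 *
 *	struct pkt_rings {
 *		char *rx;	// first in the mapping
 *		char *tx;	// immediately after the RX blocks
 *	};
 *
 *	static int map_rings(int fd, const struct tpacket_req *rx_req,
 *			     const struct tpacket_req *tx_req,
 *			     struct pkt_rings *out)
 *	{
 *		size_t rx_len = (size_t)rx_req->tp_block_size * rx_req->tp_block_nr;
 *		size_t tx_len = (size_t)tx_req->tp_block_size * tx_req->tp_block_nr;
 *		char *base = mmap(NULL, rx_len + tx_len, PROT_READ | PROT_WRITE,
 *				  MAP_SHARED, fd, 0);
 *
 *		if (base == MAP_FAILED)
 *			return -1;
 *		out->rx = base;
 *		out->tx = base + rx_len;
 *		return 0;
 *	}
 */
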
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_packet_setsockopt,
#endif
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
			 "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   refcount_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
			     sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc;

	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;
	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out_sock;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out_sock:
	sock_unregister(PF_PACKET);
out_proto:
	proto_unregister(&packet_proto);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);
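
/* Illustrative sketch (not part of this file): once packet_init() has
 * registered packet_family_ops, userspace reaches this implementation
 * through an ordinary socket() call, and each live socket appears as one
 * row of /proc/net/packet via packet_seq_show() above.  A minimal, hedged
 * example of opening a raw packet socket bound to one interface (requires
 * CAP_NET_RAW; "eth0" style names are placeholders supplied by the caller):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <net/if.h>
 *	#include <arpa/inet.h>
 *	#include <unistd.h>
 *
 *	static int open_packet_socket(const char *ifname)
 *	{
 *		struct sockaddr_ll sll = {
 *			.sll_family   = AF_PACKET,
 *			.sll_protocol = htons(ETH_P_ALL),
 *			.sll_ifindex  = (int)if_nametoindex(ifname),
 *		};
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *		if (fd < 0)
 *			return -1;
 *		if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;	// shows up as one row in /proc/net/packet
 *	}
 */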