1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/kernel.h> 3 #include <linux/module.h> 4 #include <linux/init.h> 5 #include <linux/netlink.h> 6 #include <linux/netfilter.h> 7 #include <linux/workqueue.h> 8 #include <linux/spinlock.h> 9 #include <linux/netfilter/nf_conntrack_common.h> 10 #include <linux/netfilter/nf_tables.h> 11 #include <net/ip.h> /* for ipv4 options. */ 12 #include <net/netfilter/nf_tables.h> 13 #include <net/netfilter/nf_tables_core.h> 14 #include <net/netfilter/nf_conntrack_core.h> 15 #include <net/netfilter/nf_conntrack_extend.h> 16 #include <net/netfilter/nf_flow_table.h> 17 18 struct nft_flow_offload { 19 struct nft_flowtable *flowtable; 20 }; 21 22 static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) 23 { 24 if (dst_xfrm(dst)) 25 return FLOW_OFFLOAD_XMIT_XFRM; 26 27 return FLOW_OFFLOAD_XMIT_NEIGH; 28 } 29 30 static void nft_default_forward_path(struct nf_flow_route *route, 31 struct dst_entry *dst_cache, 32 enum ip_conntrack_dir dir) 33 { 34 route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; 35 route->tuple[dir].dst = dst_cache; 36 route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); 37 } 38 39 static int nft_dev_fill_forward_path(const struct nf_flow_route *route, 40 const struct dst_entry *dst_cache, 41 const struct nf_conn *ct, 42 enum ip_conntrack_dir dir, u8 *ha, 43 struct net_device_path_stack *stack) 44 { 45 const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; 46 struct net_device *dev = dst_cache->dev; 47 struct neighbour *n; 48 u8 nud_state; 49 50 n = dst_neigh_lookup(dst_cache, daddr); 51 if (!n) 52 return -1; 53 54 read_lock_bh(&n->lock); 55 nud_state = n->nud_state; 56 ether_addr_copy(ha, n->ha); 57 read_unlock_bh(&n->lock); 58 neigh_release(n); 59 60 if (!(nud_state & NUD_VALID)) 61 return -1; 62 63 return dev_fill_forward_path(dev, ha, stack); 64 } 65 66 struct nft_forward_info { 67 const struct net_device *indev; 68 const struct net_device *outdev; 69 const struct net_device *hw_outdev; 70 struct id { 71 __u16 id; 72 __be16 proto; 73 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 74 u8 num_encaps; 75 u8 ingress_vlans; 76 u8 h_source[ETH_ALEN]; 77 u8 h_dest[ETH_ALEN]; 78 enum flow_offload_xmit_type xmit_type; 79 }; 80 81 static bool nft_is_valid_ether_device(const struct net_device *dev) 82 { 83 if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || 84 dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) 85 return false; 86 87 return true; 88 } 89 90 static void nft_dev_path_info(const struct net_device_path_stack *stack, 91 struct nft_forward_info *info, 92 unsigned char *ha, struct nf_flowtable *flowtable) 93 { 94 const struct net_device_path *path; 95 int i; 96 97 memcpy(info->h_dest, ha, ETH_ALEN); 98 99 for (i = 0; i < stack->num_paths; i++) { 100 path = &stack->path[i]; 101 switch (path->type) { 102 case DEV_PATH_ETHERNET: 103 case DEV_PATH_DSA: 104 case DEV_PATH_VLAN: 105 case DEV_PATH_PPPOE: 106 info->indev = path->dev; 107 if (is_zero_ether_addr(info->h_source)) 108 memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 109 110 if (path->type == DEV_PATH_ETHERNET) 111 break; 112 if (path->type == DEV_PATH_DSA) { 113 i = stack->num_paths; 114 break; 115 } 116 117 /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ 118 if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 119 info->indev = NULL; 120 break; 121 } 122 info->outdev = path->dev; 123 info->encap[info->num_encaps].id = path->encap.id; 124 info->encap[info->num_encaps].proto = path->encap.proto; 125 info->num_encaps++; 126 if (path->type == DEV_PATH_PPPOE) 127 memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); 128 break; 129 case DEV_PATH_BRIDGE: 130 if (is_zero_ether_addr(info->h_source)) 131 memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 132 133 switch (path->bridge.vlan_mode) { 134 case DEV_PATH_BR_VLAN_UNTAG_HW: 135 info->ingress_vlans |= BIT(info->num_encaps - 1); 136 break; 137 case DEV_PATH_BR_VLAN_TAG: 138 info->encap[info->num_encaps].id = path->bridge.vlan_id; 139 info->encap[info->num_encaps].proto = path->bridge.vlan_proto; 140 info->num_encaps++; 141 break; 142 case DEV_PATH_BR_VLAN_UNTAG: 143 info->num_encaps--; 144 break; 145 case DEV_PATH_BR_VLAN_KEEP: 146 break; 147 } 148 info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 149 break; 150 default: 151 info->indev = NULL; 152 break; 153 } 154 } 155 if (!info->outdev) 156 info->outdev = info->indev; 157 158 info->hw_outdev = info->indev; 159 160 if (nf_flowtable_hw_offload(flowtable) && 161 nft_is_valid_ether_device(info->indev)) 162 info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 163 } 164 165 static bool nft_flowtable_find_dev(const struct net_device *dev, 166 struct nft_flowtable *ft) 167 { 168 struct nft_hook *hook; 169 bool found = false; 170 171 list_for_each_entry_rcu(hook, &ft->hook_list, list) { 172 if (hook->ops.dev != dev) 173 continue; 174 175 found = true; 176 break; 177 } 178 179 return found; 180 } 181 182 static void nft_dev_forward_path(struct nf_flow_route *route, 183 const struct nf_conn *ct, 184 enum ip_conntrack_dir dir, 185 struct nft_flowtable *ft) 186 { 187 const struct dst_entry *dst = route->tuple[dir].dst; 188 struct net_device_path_stack stack; 189 struct nft_forward_info info = {}; 190 unsigned char ha[ETH_ALEN]; 191 int i; 192 193 if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) 194 nft_dev_path_info(&stack, &info, ha, &ft->data); 195 196 if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) 197 return; 198 199 route->tuple[!dir].in.ifindex = info.indev->ifindex; 200 for (i = 0; i < info.num_encaps; i++) { 201 route->tuple[!dir].in.encap[i].id = info.encap[i].id; 202 route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; 203 } 204 route->tuple[!dir].in.num_encaps = info.num_encaps; 205 route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; 206 207 if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { 208 memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); 209 memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); 210 route->tuple[dir].out.ifindex = info.outdev->ifindex; 211 route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; 212 route->tuple[dir].xmit_type = info.xmit_type; 213 } 214 } 215 216 static int nft_flow_route(const struct nft_pktinfo *pkt, 217 const struct nf_conn *ct, 218 struct nf_flow_route *route, 219 enum ip_conntrack_dir dir, 220 struct nft_flowtable *ft) 221 { 222 struct dst_entry *this_dst = skb_dst(pkt->skb); 223 struct dst_entry *other_dst = NULL; 224 struct flowi fl; 225 226 memset(&fl, 0, sizeof(fl)); 227 switch (nft_pf(pkt)) { 228 case NFPROTO_IPV4: 229 fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; 230 fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; 231 break; 232 case NFPROTO_IPV6: 233 fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; 234 fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; 235 break; 236 } 237 238 nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); 239 if (!other_dst) 240 return -ENOENT; 241 242 nft_default_forward_path(route, this_dst, dir); 243 nft_default_forward_path(route, other_dst, !dir); 244 245 if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && 246 route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { 247 nft_dev_forward_path(route, ct, dir, ft); 248 nft_dev_forward_path(route, ct, !dir, ft); 249 } 250 251 return 0; 252 } 253 254 static bool nft_flow_offload_skip(struct sk_buff *skb, int family) 255 { 256 if (skb_sec_path(skb)) 257 return true; 258 259 if (family == NFPROTO_IPV4) { 260 const struct ip_options *opt; 261 262 opt = &(IPCB(skb)->opt); 263 264 if (unlikely(opt->optlen)) 265 return true; 266 } 267 268 return false; 269 } 270 271 static void nft_flow_offload_eval(const struct nft_expr *expr, 272 struct nft_regs *regs, 273 const struct nft_pktinfo *pkt) 274 { 275 struct nft_flow_offload *priv = nft_expr_priv(expr); 276 struct nf_flowtable *flowtable = &priv->flowtable->data; 277 struct tcphdr _tcph, *tcph = NULL; 278 struct nf_flow_route route = {}; 279 enum ip_conntrack_info ctinfo; 280 struct flow_offload *flow; 281 enum ip_conntrack_dir dir; 282 struct nf_conn *ct; 283 int ret; 284 285 if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) 286 goto out; 287 288 ct = nf_ct_get(pkt->skb, &ctinfo); 289 if (!ct) 290 goto out; 291 292 switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { 293 case IPPROTO_TCP: 294 tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), 295 sizeof(_tcph), &_tcph); 296 if (unlikely(!tcph || tcph->fin || tcph->rst)) 297 goto out; 298 break; 299 case IPPROTO_UDP: 300 break; 301 #ifdef CONFIG_NF_CT_PROTO_GRE 302 case IPPROTO_GRE: { 303 struct nf_conntrack_tuple *tuple; 304 305 if (ct->status & IPS_NAT_MASK) 306 goto out; 307 tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 308 /* No support for GRE v1 */ 309 if (tuple->src.u.gre.key || tuple->dst.u.gre.key) 310 goto out; 311 break; 312 } 313 #endif 314 default: 315 goto out; 316 } 317 318 if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || 319 ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) 320 goto out; 321 322 if (!nf_ct_is_confirmed(ct)) 323 goto out; 324 325 if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) 326 goto out; 327 328 dir = CTINFO2DIR(ctinfo); 329 if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) 330 goto err_flow_route; 331 332 flow = flow_offload_alloc(ct); 333 if (!flow) 334 goto err_flow_alloc; 335 336 if (flow_offload_route_init(flow, &route) < 0) 337 goto err_flow_add; 338 339 if (tcph) { 340 ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; 341 ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; 342 } 343 344 ret = flow_offload_add(flowtable, flow); 345 if (ret < 0) 346 goto err_flow_add; 347 348 dst_release(route.tuple[!dir].dst); 349 return; 350 351 err_flow_add: 352 flow_offload_free(flow); 353 err_flow_alloc: 354 dst_release(route.tuple[!dir].dst); 355 err_flow_route: 356 clear_bit(IPS_OFFLOAD_BIT, &ct->status); 357 out: 358 regs->verdict.code = NFT_BREAK; 359 } 360 361 static int nft_flow_offload_validate(const struct nft_ctx *ctx, 362 const struct nft_expr *expr, 363 const struct nft_data **data) 364 { 365 unsigned int hook_mask = (1 << NF_INET_FORWARD); 366 367 return nft_chain_validate_hooks(ctx->chain, hook_mask); 368 } 369 370 static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { 371 [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, 372 .len = NFT_NAME_MAXLEN - 1 }, 373 }; 374 375 static int nft_flow_offload_init(const struct nft_ctx *ctx, 376 const struct nft_expr *expr, 377 const struct nlattr * const tb[]) 378 { 379 struct nft_flow_offload *priv = nft_expr_priv(expr); 380 u8 genmask = nft_genmask_next(ctx->net); 381 struct nft_flowtable *flowtable; 382 383 if (!tb[NFTA_FLOW_TABLE_NAME]) 384 return -EINVAL; 385 386 flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], 387 genmask); 388 if (IS_ERR(flowtable)) 389 return PTR_ERR(flowtable); 390 391 priv->flowtable = flowtable; 392 flowtable->use++; 393 394 return nf_ct_netns_get(ctx->net, ctx->family); 395 } 396 397 static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, 398 const struct nft_expr *expr, 399 enum nft_trans_phase phase) 400 { 401 struct nft_flow_offload *priv = nft_expr_priv(expr); 402 403 nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); 404 } 405 406 static void nft_flow_offload_activate(const struct nft_ctx *ctx, 407 const struct nft_expr *expr) 408 { 409 struct nft_flow_offload *priv = nft_expr_priv(expr); 410 411 priv->flowtable->use++; 412 } 413 414 static void nft_flow_offload_destroy(const struct nft_ctx *ctx, 415 const struct nft_expr *expr) 416 { 417 nf_ct_netns_put(ctx->net, ctx->family); 418 } 419 420 static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) 421 { 422 struct nft_flow_offload *priv = nft_expr_priv(expr); 423 424 if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) 425 goto nla_put_failure; 426 427 return 0; 428 429 nla_put_failure: 430 return -1; 431 } 432 433 static struct nft_expr_type nft_flow_offload_type; 434 static const struct nft_expr_ops nft_flow_offload_ops = { 435 .type = &nft_flow_offload_type, 436 .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), 437 .eval = nft_flow_offload_eval, 438 .init = nft_flow_offload_init, 439 .activate = nft_flow_offload_activate, 440 .deactivate = nft_flow_offload_deactivate, 441 .destroy = nft_flow_offload_destroy, 442 .validate = nft_flow_offload_validate, 443 .dump = nft_flow_offload_dump, 444 .reduce = NFT_REDUCE_READONLY, 445 }; 446 447 static struct nft_expr_type nft_flow_offload_type __read_mostly = { 448 .name = "flow_offload", 449 .ops = &nft_flow_offload_ops, 450 .policy = nft_flow_offload_policy, 451 .maxattr = NFTA_FLOW_MAX, 452 .owner = THIS_MODULE, 453 }; 454 455 static int flow_offload_netdev_event(struct notifier_block *this, 456 unsigned long event, void *ptr) 457 { 458 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 459 460 if (event != NETDEV_DOWN) 461 return NOTIFY_DONE; 462 463 nf_flow_table_cleanup(dev); 464 465 return NOTIFY_DONE; 466 } 467 468 static struct notifier_block flow_offload_netdev_notifier = { 469 .notifier_call = flow_offload_netdev_event, 470 }; 471 472 static int __init nft_flow_offload_module_init(void) 473 { 474 int err; 475 476 err = register_netdevice_notifier(&flow_offload_netdev_notifier); 477 if (err) 478 goto err; 479 480 err = nft_register_expr(&nft_flow_offload_type); 481 if (err < 0) 482 goto register_expr; 483 484 return 0; 485 486 register_expr: 487 unregister_netdevice_notifier(&flow_offload_netdev_notifier); 488 err: 489 return err; 490 } 491 492 static void __exit nft_flow_offload_module_exit(void) 493 { 494 nft_unregister_expr(&nft_flow_offload_type); 495 unregister_netdevice_notifier(&flow_offload_netdev_notifier); 496 } 497 498 module_init(nft_flow_offload_module_init); 499 module_exit(nft_flow_offload_module_exit); 500 501 MODULE_LICENSE("GPL"); 502 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); 503 MODULE_ALIAS_NFT_EXPR("flow_offload"); 504 MODULE_DESCRIPTION("nftables hardware flow offload module"); 505