1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2021 Mellanox Technologies. */
3 
4 #include <net/fib_notifier.h>
5 #include <net/nexthop.h>
6 #include "tc_tun_encap.h"
7 #include "en_tc.h"
8 #include "tc_tun.h"
9 #include "rep/tc.h"
10 #include "diag/en_tc_tracepoint.h"
11 
12 enum {
13 	MLX5E_ROUTE_ENTRY_VALID     = BIT(0),
14 };
15 
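/* If the encap route device is an OVS master device, redirect egress
 * traffic to the OVS internal port by extending the flow actions;
 * otherwise leave the flow attributes unchanged and return 0.
 */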
16 static int mlx5e_set_int_port_tunnel(struct mlx5e_priv *priv,
17 				     struct mlx5_flow_attr *attr,
18 				     struct mlx5e_encap_entry *e,
19 				     int out_index)
20 {
21 	struct net_device *route_dev;
22 	int err = 0;
23 
24 	route_dev = dev_get_by_index(dev_net(e->out_dev), e->route_dev_ifindex);
25 
26 	if (!route_dev || !netif_is_ovs_master(route_dev))
27 		goto out;
28 
29 	err = mlx5e_set_fwd_to_int_port_actions(priv, attr, e->route_dev_ifindex,
30 						MLX5E_TC_INT_PORT_EGRESS,
31 						&attr->action, out_index);
32 
33 out:
34 	if (route_dev)
35 		dev_put(route_dev);
36 
37 	return err;
38 }
39 
40 struct mlx5e_route_key {
41 	int ip_version;
42 	union {
43 		__be32 v4;
44 		struct in6_addr v6;
45 	} endpoint_ip;
46 };
47 
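/* A route entry tracks a single tunnel endpoint IP address. It links
 * together the decap flows and encap entries that depend on routing to
 * that endpoint, so FIB events can update their offload state.
 */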
48 struct mlx5e_route_entry {
49 	struct mlx5e_route_key key;
50 	struct list_head encap_entries;
51 	struct list_head decap_flows;
52 	u32 flags;
53 	struct hlist_node hlist;
54 	refcount_t refcnt;
55 	int tunnel_dev_index;
56 	struct rcu_head rcu;
57 };
58 
59 struct mlx5e_tc_tun_encap {
60 	struct mlx5e_priv *priv;
61 	struct notifier_block fib_nb;
62 	spinlock_t route_lock; /* protects route_tbl */
63 	unsigned long route_tbl_last_update;
64 	DECLARE_HASHTABLE(route_tbl, 8);
65 };
66 
67 static bool mlx5e_route_entry_valid(struct mlx5e_route_entry *r)
68 {
69 	return r->flags & MLX5E_ROUTE_ENTRY_VALID;
70 }
71 
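/* Allocate rx_tun_attr for the flow and copy the outer source and
 * destination IP addresses from the match spec. TUN_RX is only set when
 * both addresses are present.
 */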
72 int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
73 			     struct mlx5_flow_spec *spec)
74 {
75 	struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr;
76 	struct mlx5_rx_tun_attr *tun_attr;
77 	void *daddr, *saddr;
78 	u8 ip_version;
79 
80 	tun_attr = kvzalloc(sizeof(*tun_attr), GFP_KERNEL);
81 	if (!tun_attr)
82 		return -ENOMEM;
83 
84 	esw_attr->rx_tun_attr = tun_attr;
85 	ip_version = mlx5e_tc_get_ip_version(spec, true);
86 
87 	if (ip_version == 4) {
88 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
89 				     outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
90 		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
91 				     outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4);
92 		tun_attr->dst_ip.v4 = *(__be32 *)daddr;
93 		tun_attr->src_ip.v4 = *(__be32 *)saddr;
94 		if (!tun_attr->dst_ip.v4 || !tun_attr->src_ip.v4)
95 			return 0;
96 	}
97 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
98 	else if (ip_version == 6) {
99 		int ipv6_size = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
100 		struct in6_addr zerov6 = {};
101 
102 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
103 				     outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
104 		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
105 				     outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6);
106 		memcpy(&tun_attr->dst_ip.v6, daddr, ipv6_size);
107 		memcpy(&tun_attr->src_ip.v6, saddr, ipv6_size);
108 		if (!memcmp(&tun_attr->dst_ip.v6, &zerov6, sizeof(zerov6)) ||
109 		    !memcmp(&tun_attr->src_ip.v6, &zerov6, sizeof(zerov6)))
110 			return 0;
111 	}
112 #endif
	/* Only set the flag if both src and dst IP addresses exist; both are
	 * required to establish routing.
	 */
116 	flow_flag_set(flow, TUN_RX);
117 	flow->attr->tun_ip_version = ip_version;
118 	return 0;
119 }
120 
121 static bool mlx5e_tc_flow_all_encaps_valid(struct mlx5_esw_flow_attr *esw_attr)
122 {
123 	bool all_flow_encaps_valid = true;
124 	int i;
125 
	/* A flow can be associated with multiple encap entries.
	 * Before offloading the flow, verify that all of them have
	 * a valid neighbour.
	 */
130 	for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) {
131 		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP))
132 			continue;
133 		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) {
134 			all_flow_encaps_valid = false;
135 			break;
136 		}
137 	}
138 
139 	return all_flow_encaps_valid;
140 }
141 
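/* Offload the cached encapsulation header for the encap entry, mark the
 * entry valid, and move every attached flow whose encap destinations
 * are all valid from the slow path to its encap rule.
 */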
142 void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
143 			      struct mlx5e_encap_entry *e,
144 			      struct list_head *flow_list)
145 {
146 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
147 	struct mlx5_pkt_reformat_params reformat_params;
148 	struct mlx5_esw_flow_attr *esw_attr;
149 	struct mlx5_flow_handle *rule;
150 	struct mlx5_flow_attr *attr;
151 	struct mlx5_flow_spec *spec;
152 	struct mlx5e_tc_flow *flow;
153 	int err;
154 
155 	if (e->flags & MLX5_ENCAP_ENTRY_NO_ROUTE)
156 		return;
157 
158 	memset(&reformat_params, 0, sizeof(reformat_params));
159 	reformat_params.type = e->reformat_type;
160 	reformat_params.size = e->encap_size;
161 	reformat_params.data = e->encap_header;
162 	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
163 						     &reformat_params,
164 						     MLX5_FLOW_NAMESPACE_FDB);
165 	if (IS_ERR(e->pkt_reformat)) {
		mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %ld\n",
			       PTR_ERR(e->pkt_reformat));
168 		return;
169 	}
170 	e->flags |= MLX5_ENCAP_ENTRY_VALID;
171 	mlx5e_rep_queue_neigh_stats_work(priv);
172 
173 	list_for_each_entry(flow, flow_list, tmp_list) {
174 		if (!mlx5e_is_offloaded_flow(flow) || !flow_flag_test(flow, SLOW))
175 			continue;
176 		attr = flow->attr;
177 		esw_attr = attr->esw_attr;
178 		spec = &attr->parse_attr->spec;
179 
180 		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
181 		esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
182 
183 		/* Do not offload flows with unresolved neighbors */
184 		if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
185 			continue;
186 		/* update from slow path rule to encap rule */
187 		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
188 		if (IS_ERR(rule)) {
189 			err = PTR_ERR(rule);
190 			mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
191 				       err);
192 			continue;
193 		}
194 
195 		mlx5e_tc_unoffload_from_slow_path(esw, flow);
196 		flow->rule[0] = rule;
197 		/* was unset when slow path rule removed */
198 		flow_flag_set(flow, OFFLOADED);
199 	}
200 }
201 
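/* Move every offloaded (non-slow-path) flow attached to the encap entry
 * back to the slow path, then clear the entry's valid flag and release
 * its packet reformat.
 */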
202 void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
203 			      struct mlx5e_encap_entry *e,
204 			      struct list_head *flow_list)
205 {
206 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
207 	struct mlx5_esw_flow_attr *esw_attr;
208 	struct mlx5_flow_handle *rule;
209 	struct mlx5_flow_attr *attr;
210 	struct mlx5_flow_spec *spec;
211 	struct mlx5e_tc_flow *flow;
212 	int err;
213 
214 	list_for_each_entry(flow, flow_list, tmp_list) {
215 		if (!mlx5e_is_offloaded_flow(flow) || flow_flag_test(flow, SLOW))
216 			continue;
217 		attr = flow->attr;
218 		esw_attr = attr->esw_attr;
219 		spec = &attr->parse_attr->spec;
220 
221 		/* update from encap rule to slow path rule */
222 		rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
		/* mark the flow's encap dest as invalid */
224 		esw_attr->dests[flow->tmp_entry_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID;
225 
226 		if (IS_ERR(rule)) {
227 			err = PTR_ERR(rule);
228 			mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
229 				       err);
230 			continue;
231 		}
232 
233 		mlx5e_tc_unoffload_fdb_rules(esw, flow, attr);
234 		flow->rule[0] = rule;
235 		/* was unset when fast path rule removed */
236 		flow_flag_set(flow, OFFLOADED);
237 	}
238 
	/* the encap entry was valid up to now; invalidate it and release
	 * its packet reformat
	 */
240 	e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
241 	mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
242 }
243 
244 static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow,
245 				struct list_head *flow_list,
246 				int index)
247 {
248 	if (IS_ERR(mlx5e_flow_get(flow))) {
249 		/* Flow is being deleted concurrently. Wait for it to be
250 		 * unoffloaded from hardware, otherwise deleting encap will
251 		 * fail.
252 		 */
253 		wait_for_completion(&flow->del_hw_done);
254 		return;
255 	}
256 	wait_for_completion(&flow->init_done);
257 
258 	flow->tmp_entry_index = index;
259 	list_add(&flow->tmp_list, flow_list);
260 }
261 
262 /* Takes reference to all flows attached to encap and adds the flows to
263  * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
264  */
265 void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list)
266 {
267 	struct encap_flow_item *efi;
268 	struct mlx5e_tc_flow *flow;
269 
270 	list_for_each_entry(efi, &e->flows, list) {
271 		flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]);
272 		mlx5e_take_tmp_flow(flow, flow_list, efi->index);
273 	}
274 }
275 
276 /* Takes reference to all flows attached to route and adds the flows to
277  * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
278  */
279 static void mlx5e_take_all_route_decap_flows(struct mlx5e_route_entry *r,
280 					     struct list_head *flow_list)
281 {
282 	struct mlx5e_tc_flow *flow;
283 
284 	list_for_each_entry(flow, &r->decap_flows, decap_routes)
285 		mlx5e_take_tmp_flow(flow, flow_list, 0);
286 }
287 
288 typedef bool (match_cb)(struct mlx5e_encap_entry *);
289 
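/* Walk the nhe encap list under RCU, taking a reference to each
 * candidate and waiting for its initialization to complete, until an
 * entry satisfying the match callback is found. The starting entry, if
 * any, is released before the next one is returned.
 */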
290 static struct mlx5e_encap_entry *
291 mlx5e_get_next_matching_encap(struct mlx5e_neigh_hash_entry *nhe,
292 			      struct mlx5e_encap_entry *e,
293 			      match_cb match)
294 {
295 	struct mlx5e_encap_entry *next = NULL;
296 
297 retry:
298 	rcu_read_lock();
299 
300 	/* find encap with non-zero reference counter value */
301 	for (next = e ?
302 		     list_next_or_null_rcu(&nhe->encap_list,
303 					   &e->encap_list,
304 					   struct mlx5e_encap_entry,
305 					   encap_list) :
306 		     list_first_or_null_rcu(&nhe->encap_list,
307 					    struct mlx5e_encap_entry,
308 					    encap_list);
309 	     next;
310 	     next = list_next_or_null_rcu(&nhe->encap_list,
311 					  &next->encap_list,
312 					  struct mlx5e_encap_entry,
313 					  encap_list))
314 		if (mlx5e_encap_take(next))
315 			break;
316 
317 	rcu_read_unlock();
318 
319 	/* release starting encap */
320 	if (e)
321 		mlx5e_encap_put(netdev_priv(e->out_dev), e);
322 	if (!next)
323 		return next;
324 
325 	/* wait for encap to be fully initialized */
326 	wait_for_completion(&next->res_ready);
	/* continue searching if the encap entry does not match after
	 * initialization completed
	 */
328 	if (!match(next)) {
329 		e = next;
330 		goto retry;
331 	}
332 
333 	return next;
334 }
335 
336 static bool mlx5e_encap_valid(struct mlx5e_encap_entry *e)
337 {
338 	return e->flags & MLX5_ENCAP_ENTRY_VALID;
339 }
340 
341 static struct mlx5e_encap_entry *
342 mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe,
343 			   struct mlx5e_encap_entry *e)
344 {
345 	return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_valid);
346 }
347 
348 static bool mlx5e_encap_initialized(struct mlx5e_encap_entry *e)
349 {
350 	return e->compl_result >= 0;
351 }
352 
353 struct mlx5e_encap_entry *
354 mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe,
355 			  struct mlx5e_encap_entry *e)
356 {
357 	return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_initialized);
358 }
359 
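/* Check whether any offloaded flow that uses one of the neigh's encap
 * entries has passed traffic since the last report. If so, update
 * reported_lastuse and send an event on the neighbour to mark it as
 * still in use.
 */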
360 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
361 {
362 	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
363 	struct mlx5e_encap_entry *e = NULL;
364 	struct mlx5e_tc_flow *flow;
365 	struct mlx5_fc *counter;
366 	struct neigh_table *tbl;
367 	bool neigh_used = false;
368 	struct neighbour *n;
369 	u64 lastuse;
370 
371 	if (m_neigh->family == AF_INET)
372 		tbl = &arp_tbl;
373 #if IS_ENABLED(CONFIG_IPV6)
374 	else if (m_neigh->family == AF_INET6)
375 		tbl = ipv6_stub->nd_tbl;
376 #endif
377 	else
378 		return;
379 
380 	/* mlx5e_get_next_valid_encap() releases previous encap before returning
381 	 * next one.
382 	 */
383 	while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) {
384 		struct mlx5e_priv *priv = netdev_priv(e->out_dev);
385 		struct encap_flow_item *efi, *tmp;
386 		struct mlx5_eswitch *esw;
387 		LIST_HEAD(flow_list);
388 
389 		esw = priv->mdev->priv.eswitch;
390 		mutex_lock(&esw->offloads.encap_tbl_lock);
391 		list_for_each_entry_safe(efi, tmp, &e->flows, list) {
392 			flow = container_of(efi, struct mlx5e_tc_flow,
393 					    encaps[efi->index]);
394 			if (IS_ERR(mlx5e_flow_get(flow)))
395 				continue;
396 			list_add(&flow->tmp_list, &flow_list);
397 
398 			if (mlx5e_is_offloaded_flow(flow)) {
399 				counter = mlx5e_tc_get_counter(flow);
400 				lastuse = mlx5_fc_query_lastuse(counter);
401 				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
402 					neigh_used = true;
403 					break;
404 				}
405 			}
406 		}
407 		mutex_unlock(&esw->offloads.encap_tbl_lock);
408 
409 		mlx5e_put_flow_list(priv, &flow_list);
410 		if (neigh_used) {
411 			/* release current encap before breaking the loop */
412 			mlx5e_encap_put(priv, e);
413 			break;
414 		}
415 	}
416 
417 	trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used);
418 
419 	if (neigh_used) {
420 		nhe->reported_lastuse = jiffies;
421 
422 		/* find the relevant neigh according to the cached device and
423 		 * dst ip pair
424 		 */
425 		n = neigh_lookup(tbl, &m_neigh->dst_ip, READ_ONCE(nhe->neigh_dev));
426 		if (!n)
427 			return;
428 
429 		neigh_event_send(n, NULL);
430 		neigh_release(n);
431 	}
432 }
433 
434 static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
435 {
436 	WARN_ON(!list_empty(&e->flows));
437 
438 	if (e->compl_result > 0) {
439 		mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
440 
441 		if (e->flags & MLX5_ENCAP_ENTRY_VALID)
442 			mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
443 	}
444 
445 	kfree(e->tun_info);
446 	kfree(e->encap_header);
447 	kfree_rcu(e, rcu);
448 }
449 
450 static void mlx5e_decap_dealloc(struct mlx5e_priv *priv,
451 				struct mlx5e_decap_entry *d)
452 {
453 	WARN_ON(!list_empty(&d->flows));
454 
455 	if (!d->compl_result)
456 		mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat);
457 
458 	kfree_rcu(d, rcu);
459 }
460 
461 void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
462 {
463 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
464 
465 	if (!refcount_dec_and_mutex_lock(&e->refcnt, &esw->offloads.encap_tbl_lock))
466 		return;
467 	list_del(&e->route_list);
468 	hash_del_rcu(&e->encap_hlist);
469 	mutex_unlock(&esw->offloads.encap_tbl_lock);
470 
471 	mlx5e_encap_dealloc(priv, e);
472 }
473 
474 static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d)
475 {
476 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
477 
478 	if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock))
479 		return;
480 	hash_del_rcu(&d->hlist);
481 	mutex_unlock(&esw->offloads.decap_tbl_lock);
482 
483 	mlx5e_decap_dealloc(priv, d);
484 }
485 
486 static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
487 				     struct mlx5e_tc_flow *flow,
488 				     int out_index);
489 
490 void mlx5e_detach_encap(struct mlx5e_priv *priv,
491 			struct mlx5e_tc_flow *flow, int out_index)
492 {
493 	struct mlx5e_encap_entry *e = flow->encaps[out_index].e;
494 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
495 
496 	if (flow->attr->esw_attr->dests[out_index].flags &
497 	    MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)
498 		mlx5e_detach_encap_route(priv, flow, out_index);
499 
500 	/* flow wasn't fully initialized */
501 	if (!e)
502 		return;
503 
504 	mutex_lock(&esw->offloads.encap_tbl_lock);
505 	list_del(&flow->encaps[out_index].list);
506 	flow->encaps[out_index].e = NULL;
507 	if (!refcount_dec_and_test(&e->refcnt)) {
508 		mutex_unlock(&esw->offloads.encap_tbl_lock);
509 		return;
510 	}
511 	list_del(&e->route_list);
512 	hash_del_rcu(&e->encap_hlist);
513 	mutex_unlock(&esw->offloads.encap_tbl_lock);
514 
515 	mlx5e_encap_dealloc(priv, e);
516 }
517 
518 void mlx5e_detach_decap(struct mlx5e_priv *priv,
519 			struct mlx5e_tc_flow *flow)
520 {
521 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
522 	struct mlx5e_decap_entry *d = flow->decap_reformat;
523 
524 	if (!d)
525 		return;
526 
527 	mutex_lock(&esw->offloads.decap_tbl_lock);
528 	list_del(&flow->l3_to_l2_reformat);
529 	flow->decap_reformat = NULL;
530 
531 	if (!refcount_dec_and_test(&d->refcnt)) {
532 		mutex_unlock(&esw->offloads.decap_tbl_lock);
533 		return;
534 	}
535 	hash_del_rcu(&d->hlist);
536 	mutex_unlock(&esw->offloads.decap_tbl_lock);
537 
538 	mlx5e_decap_dealloc(priv, d);
539 }
540 
541 bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a,
542 					   struct mlx5e_encap_key *b)
543 {
544 	return memcmp(a->ip_tun_key, b->ip_tun_key, sizeof(*a->ip_tun_key)) == 0 &&
545 		a->tc_tunnel->tunnel_type == b->tc_tunnel->tunnel_type;
546 }
547 
548 static int cmp_decap_info(struct mlx5e_decap_key *a,
549 			  struct mlx5e_decap_key *b)
550 {
551 	return memcmp(&a->key, &b->key, sizeof(b->key));
552 }
553 
554 static int hash_encap_info(struct mlx5e_encap_key *key)
555 {
556 	return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key),
557 		     key->tc_tunnel->tunnel_type);
558 }
559 
560 static int hash_decap_info(struct mlx5e_decap_key *key)
561 {
562 	return jhash(&key->key, sizeof(key->key), 0);
563 }
564 
565 bool mlx5e_encap_take(struct mlx5e_encap_entry *e)
566 {
567 	return refcount_inc_not_zero(&e->refcnt);
568 }
569 
570 static bool mlx5e_decap_take(struct mlx5e_decap_entry *e)
571 {
572 	return refcount_inc_not_zero(&e->refcnt);
573 }
574 
575 static struct mlx5e_encap_entry *
576 mlx5e_encap_get(struct mlx5e_priv *priv, struct mlx5e_encap_key *key,
577 		uintptr_t hash_key)
578 {
579 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
580 	struct mlx5e_encap_key e_key;
581 	struct mlx5e_encap_entry *e;
582 
583 	hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
584 				   encap_hlist, hash_key) {
585 		e_key.ip_tun_key = &e->tun_info->key;
586 		e_key.tc_tunnel = e->tunnel;
587 		if (e->tunnel->encap_info_equal(&e_key, key) &&
588 		    mlx5e_encap_take(e))
589 			return e;
590 	}
591 
592 	return NULL;
593 }
594 
595 static struct mlx5e_decap_entry *
596 mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key,
597 		uintptr_t hash_key)
598 {
599 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
600 	struct mlx5e_decap_key r_key;
601 	struct mlx5e_decap_entry *e;
602 
603 	hash_for_each_possible_rcu(esw->offloads.decap_tbl, e,
604 				   hlist, hash_key) {
605 		r_key = e->key;
606 		if (!cmp_decap_info(&r_key, key) &&
607 		    mlx5e_decap_take(e))
608 			return e;
609 	}
610 	return NULL;
611 }
612 
613 struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info)
614 {
615 	size_t tun_size = sizeof(*tun_info) + tun_info->options_len;
616 
617 	return kmemdup(tun_info, tun_size, GFP_KERNEL);
618 }
619 
620 static bool is_duplicated_encap_entry(struct mlx5e_priv *priv,
621 				      struct mlx5e_tc_flow *flow,
622 				      int out_index,
623 				      struct mlx5e_encap_entry *e,
624 				      struct netlink_ext_ack *extack)
625 {
626 	int i;
627 
628 	for (i = 0; i < out_index; i++) {
629 		if (flow->encaps[i].e != e)
630 			continue;
631 		NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action");
632 		netdev_err(priv->netdev, "can't duplicate encap action\n");
633 		return true;
634 	}
635 
636 	return false;
637 }
638 
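/* VF tunnel case: the tunnel route device is another mlx5 netdev. Add a
 * header rewrite storing the route vport metadata in VPORT_TO_REG, flag
 * the destination for source port rewrite and remember the modify
 * header action id so it can be updated when the route changes.
 */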
639 static int mlx5e_set_vf_tunnel(struct mlx5_eswitch *esw,
640 			       struct mlx5_flow_attr *attr,
641 			       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
642 			       struct net_device *out_dev,
643 			       int route_dev_ifindex,
644 			       int out_index)
645 {
646 	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
647 	struct net_device *route_dev;
648 	u16 vport_num;
649 	int err = 0;
650 	u32 data;
651 
652 	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);
653 
654 	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
655 	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev))
656 		goto out;
657 
658 	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
659 	if (err)
660 		goto out;
661 
662 	attr->dest_chain = 0;
663 	attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
664 	esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE;
665 	data = mlx5_eswitch_get_vport_metadata_for_set(esw_attr->in_mdev->priv.eswitch,
666 						       vport_num);
667 	err = mlx5e_tc_match_to_reg_set_and_get_id(esw->dev, mod_hdr_acts,
668 						   MLX5_FLOW_NAMESPACE_FDB,
669 						   VPORT_TO_REG, data);
670 	if (err >= 0) {
671 		esw_attr->dests[out_index].src_port_rewrite_act_id = err;
672 		err = 0;
673 	}
674 
675 out:
676 	if (route_dev)
677 		dev_put(route_dev);
678 	return err;
679 }
680 
681 static int mlx5e_update_vf_tunnel(struct mlx5_eswitch *esw,
682 				  struct mlx5_esw_flow_attr *attr,
683 				  struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
684 				  struct net_device *out_dev,
685 				  int route_dev_ifindex,
686 				  int out_index)
687 {
688 	int act_id = attr->dests[out_index].src_port_rewrite_act_id;
689 	struct net_device *route_dev;
690 	u16 vport_num;
691 	int err = 0;
692 	u32 data;
693 
694 	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);
695 
696 	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
697 	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev)) {
698 		err = -ENODEV;
699 		goto out;
700 	}
701 
702 	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
703 	if (err)
704 		goto out;
705 
706 	data = mlx5_eswitch_get_vport_metadata_for_set(attr->in_mdev->priv.eswitch,
707 						       vport_num);
708 	mlx5e_tc_match_to_reg_mod_hdr_change(esw->dev, mod_hdr_acts, VPORT_TO_REG, act_id, data);
709 
710 out:
711 	if (route_dev)
712 		dev_put(route_dev);
713 	return err;
714 }
715 
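/* route_tbl_last_update is refreshed whenever a FIB event is handled
 * (see mlx5e_route_lookup_for_update). Flow attach paths snapshot it
 * before and after the route lookup to detect routing changes that
 * raced with the attach.
 */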
716 static unsigned int mlx5e_route_tbl_get_last_update(struct mlx5e_priv *priv)
717 {
718 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
719 	struct mlx5_rep_uplink_priv *uplink_priv;
720 	struct mlx5e_rep_priv *uplink_rpriv;
721 	struct mlx5e_tc_tun_encap *encap;
722 	unsigned int ret;
723 
724 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
725 	uplink_priv = &uplink_rpriv->uplink_priv;
726 	encap = uplink_priv->encap;
727 
728 	spin_lock_bh(&encap->route_lock);
729 	ret = encap->route_tbl_last_update;
730 	spin_unlock_bh(&encap->route_lock);
731 	return ret;
732 }
733 
734 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
735 				    struct mlx5e_tc_flow *flow,
736 				    struct mlx5e_encap_entry *e,
737 				    bool new_encap_entry,
738 				    unsigned long tbl_time_before,
739 				    int out_index);
740 
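/* Look up or create an encap entry matching the flow's tunnel info,
 * build the encapsulation header for a newly created entry and attach
 * the flow to it. *encap_valid reports whether the entry already has a
 * valid packet reformat, i.e. whether the encap destination can be used
 * right away or the flow must start on the slow path.
 */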
741 int mlx5e_attach_encap(struct mlx5e_priv *priv,
742 		       struct mlx5e_tc_flow *flow,
743 		       struct net_device *mirred_dev,
744 		       int out_index,
745 		       struct netlink_ext_ack *extack,
746 		       struct net_device **encap_dev,
747 		       bool *encap_valid)
748 {
749 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
750 	struct mlx5e_tc_flow_parse_attr *parse_attr;
751 	struct mlx5_flow_attr *attr = flow->attr;
752 	const struct ip_tunnel_info *tun_info;
753 	const struct mlx5e_mpls_info *mpls_info;
754 	unsigned long tbl_time_before = 0;
755 	struct mlx5e_encap_entry *e;
756 	struct mlx5e_encap_key key;
757 	bool entry_created = false;
758 	unsigned short family;
759 	uintptr_t hash_key;
760 	int err = 0;
761 
762 	parse_attr = attr->parse_attr;
763 	tun_info = parse_attr->tun_info[out_index];
764 	mpls_info = &parse_attr->mpls_info[out_index];
765 	family = ip_tunnel_info_af(tun_info);
766 	key.ip_tun_key = &tun_info->key;
767 	key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev);
768 	if (!key.tc_tunnel) {
769 		NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel");
770 		return -EOPNOTSUPP;
771 	}
772 
773 	hash_key = hash_encap_info(&key);
774 
775 	mutex_lock(&esw->offloads.encap_tbl_lock);
776 	e = mlx5e_encap_get(priv, &key, hash_key);
777 
	/* an existing entry may still be initializing or may have failed
	 * initialization; verify it before reuse
	 */
779 	if (e) {
780 		/* Check that entry was not already attached to this flow */
781 		if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) {
782 			err = -EOPNOTSUPP;
783 			goto out_err;
784 		}
785 
786 		mutex_unlock(&esw->offloads.encap_tbl_lock);
787 		wait_for_completion(&e->res_ready);
788 
789 		/* Protect against concurrent neigh update. */
790 		mutex_lock(&esw->offloads.encap_tbl_lock);
791 		if (e->compl_result < 0) {
792 			err = -EREMOTEIO;
793 			goto out_err;
794 		}
795 		goto attach_flow;
796 	}
797 
798 	e = kzalloc(sizeof(*e), GFP_KERNEL);
799 	if (!e) {
800 		err = -ENOMEM;
801 		goto out_err;
802 	}
803 
804 	refcount_set(&e->refcnt, 1);
805 	init_completion(&e->res_ready);
806 	entry_created = true;
807 	INIT_LIST_HEAD(&e->route_list);
808 
809 	tun_info = mlx5e_dup_tun_info(tun_info);
810 	if (!tun_info) {
811 		err = -ENOMEM;
812 		goto out_err_init;
813 	}
814 	e->tun_info = tun_info;
815 	memcpy(&e->mpls_info, mpls_info, sizeof(*mpls_info));
816 	err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack);
817 	if (err)
818 		goto out_err_init;
819 
820 	INIT_LIST_HEAD(&e->flows);
821 	hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
822 	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
823 	mutex_unlock(&esw->offloads.encap_tbl_lock);
824 
825 	if (family == AF_INET)
826 		err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e);
827 	else if (family == AF_INET6)
828 		err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e);
829 
830 	/* Protect against concurrent neigh update. */
831 	mutex_lock(&esw->offloads.encap_tbl_lock);
832 	complete_all(&e->res_ready);
833 	if (err) {
834 		e->compl_result = err;
835 		goto out_err;
836 	}
837 	e->compl_result = 1;
838 
839 attach_flow:
840 	err = mlx5e_attach_encap_route(priv, flow, e, entry_created, tbl_time_before,
841 				       out_index);
842 	if (err)
843 		goto out_err;
844 
845 	err = mlx5e_set_int_port_tunnel(priv, attr, e, out_index);
846 	if (err == -EOPNOTSUPP) {
847 		/* If device doesn't support int port offload,
848 		 * redirect to uplink vport.
849 		 */
850 		mlx5_core_dbg(priv->mdev, "attaching int port as encap dev not supported, using uplink\n");
851 		err = 0;
852 	} else if (err) {
853 		goto out_err;
854 	}
855 
856 	flow->encaps[out_index].e = e;
857 	list_add(&flow->encaps[out_index].list, &e->flows);
858 	flow->encaps[out_index].index = out_index;
859 	*encap_dev = e->out_dev;
860 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
861 		attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat;
862 		attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
863 		*encap_valid = true;
864 	} else {
865 		*encap_valid = false;
866 	}
867 	mutex_unlock(&esw->offloads.encap_tbl_lock);
868 
869 	return err;
870 
871 out_err:
872 	mutex_unlock(&esw->offloads.encap_tbl_lock);
873 	if (e)
874 		mlx5e_encap_put(priv, e);
875 	return err;
876 
877 out_err_init:
878 	mutex_unlock(&esw->offloads.encap_tbl_lock);
879 	kfree(tun_info);
880 	kfree(e);
881 	return err;
882 }
883 
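/* Allocate, or reuse from decap_tbl, the L3-to-L2 packet reformat used
 * to restore the inner Ethernet header on decapsulation. Entries are
 * shared by flows with the same header key and freed when their
 * refcount drops to zero.
 */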
884 int mlx5e_attach_decap(struct mlx5e_priv *priv,
885 		       struct mlx5e_tc_flow *flow,
886 		       struct netlink_ext_ack *extack)
887 {
888 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
889 	struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr;
890 	struct mlx5_pkt_reformat_params reformat_params;
891 	struct mlx5e_tc_flow_parse_attr *parse_attr;
892 	struct mlx5e_decap_entry *d;
893 	struct mlx5e_decap_key key;
894 	uintptr_t hash_key;
895 	int err = 0;
896 
897 	parse_attr = flow->attr->parse_attr;
898 	if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) {
899 		NL_SET_ERR_MSG_MOD(extack,
900 				   "encap header larger than max supported");
901 		return -EOPNOTSUPP;
902 	}
903 
904 	key.key = parse_attr->eth;
905 	hash_key = hash_decap_info(&key);
906 	mutex_lock(&esw->offloads.decap_tbl_lock);
907 	d = mlx5e_decap_get(priv, &key, hash_key);
908 	if (d) {
909 		mutex_unlock(&esw->offloads.decap_tbl_lock);
910 		wait_for_completion(&d->res_ready);
911 		mutex_lock(&esw->offloads.decap_tbl_lock);
912 		if (d->compl_result) {
913 			err = -EREMOTEIO;
914 			goto out_free;
915 		}
916 		goto found;
917 	}
918 
919 	d = kzalloc(sizeof(*d), GFP_KERNEL);
920 	if (!d) {
921 		err = -ENOMEM;
922 		goto out_err;
923 	}
924 
925 	d->key = key;
926 	refcount_set(&d->refcnt, 1);
927 	init_completion(&d->res_ready);
928 	INIT_LIST_HEAD(&d->flows);
929 	hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key);
930 	mutex_unlock(&esw->offloads.decap_tbl_lock);
931 
932 	memset(&reformat_params, 0, sizeof(reformat_params));
933 	reformat_params.type = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
934 	reformat_params.size = sizeof(parse_attr->eth);
935 	reformat_params.data = &parse_attr->eth;
936 	d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
937 						     &reformat_params,
938 						     MLX5_FLOW_NAMESPACE_FDB);
939 	if (IS_ERR(d->pkt_reformat)) {
940 		err = PTR_ERR(d->pkt_reformat);
941 		d->compl_result = err;
942 	}
943 	mutex_lock(&esw->offloads.decap_tbl_lock);
944 	complete_all(&d->res_ready);
945 	if (err)
946 		goto out_free;
947 
948 found:
949 	flow->decap_reformat = d;
950 	attr->decap_pkt_reformat = d->pkt_reformat;
951 	list_add(&flow->l3_to_l2_reformat, &d->flows);
952 	mutex_unlock(&esw->offloads.decap_tbl_lock);
953 	return 0;
954 
955 out_free:
956 	mutex_unlock(&esw->offloads.decap_tbl_lock);
957 	mlx5e_decap_put(priv, d);
958 	return err;
959 
960 out_err:
961 	mutex_unlock(&esw->offloads.decap_tbl_lock);
962 	return err;
963 }
964 
965 static int cmp_route_info(struct mlx5e_route_key *a,
966 			  struct mlx5e_route_key *b)
967 {
968 	if (a->ip_version == 4 && b->ip_version == 4)
969 		return memcmp(&a->endpoint_ip.v4, &b->endpoint_ip.v4,
970 			      sizeof(a->endpoint_ip.v4));
971 	else if (a->ip_version == 6 && b->ip_version == 6)
972 		return memcmp(&a->endpoint_ip.v6, &b->endpoint_ip.v6,
973 			      sizeof(a->endpoint_ip.v6));
974 	return 1;
975 }
976 
977 static u32 hash_route_info(struct mlx5e_route_key *key)
978 {
979 	if (key->ip_version == 4)
980 		return jhash(&key->endpoint_ip.v4, sizeof(key->endpoint_ip.v4), 0);
981 	return jhash(&key->endpoint_ip.v6, sizeof(key->endpoint_ip.v6), 0);
982 }
983 
984 static void mlx5e_route_dealloc(struct mlx5e_priv *priv,
985 				struct mlx5e_route_entry *r)
986 {
987 	WARN_ON(!list_empty(&r->decap_flows));
988 	WARN_ON(!list_empty(&r->encap_entries));
989 
990 	kfree_rcu(r, rcu);
991 }
992 
993 static void mlx5e_route_put(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
994 {
995 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
996 
997 	if (!refcount_dec_and_mutex_lock(&r->refcnt, &esw->offloads.encap_tbl_lock))
998 		return;
999 
1000 	hash_del_rcu(&r->hlist);
1001 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1002 
1003 	mlx5e_route_dealloc(priv, r);
1004 }
1005 
1006 static void mlx5e_route_put_locked(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
1007 {
1008 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1009 
1010 	lockdep_assert_held(&esw->offloads.encap_tbl_lock);
1011 
1012 	if (!refcount_dec_and_test(&r->refcnt))
1013 		return;
1014 	hash_del_rcu(&r->hlist);
1015 	mlx5e_route_dealloc(priv, r);
1016 }
1017 
1018 static struct mlx5e_route_entry *
1019 mlx5e_route_get(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key,
1020 		u32 hash_key)
1021 {
1022 	struct mlx5e_route_key r_key;
1023 	struct mlx5e_route_entry *r;
1024 
1025 	hash_for_each_possible(encap->route_tbl, r, hlist, hash_key) {
1026 		r_key = r->key;
1027 		if (!cmp_route_info(&r_key, key) &&
1028 		    refcount_inc_not_zero(&r->refcnt))
1029 			return r;
1030 	}
1031 	return NULL;
1032 }
1033 
1034 static struct mlx5e_route_entry *
1035 mlx5e_route_get_create(struct mlx5e_priv *priv,
1036 		       struct mlx5e_route_key *key,
1037 		       int tunnel_dev_index,
1038 		       unsigned long *route_tbl_change_time)
1039 {
1040 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1041 	struct mlx5_rep_uplink_priv *uplink_priv;
1042 	struct mlx5e_rep_priv *uplink_rpriv;
1043 	struct mlx5e_tc_tun_encap *encap;
1044 	struct mlx5e_route_entry *r;
1045 	u32 hash_key;
1046 
1047 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
1048 	uplink_priv = &uplink_rpriv->uplink_priv;
1049 	encap = uplink_priv->encap;
1050 
1051 	hash_key = hash_route_info(key);
1052 	spin_lock_bh(&encap->route_lock);
1053 	r = mlx5e_route_get(encap, key, hash_key);
1054 	spin_unlock_bh(&encap->route_lock);
1055 	if (r) {
1056 		if (!mlx5e_route_entry_valid(r)) {
1057 			mlx5e_route_put_locked(priv, r);
1058 			return ERR_PTR(-EINVAL);
1059 		}
1060 		return r;
1061 	}
1062 
1063 	r = kzalloc(sizeof(*r), GFP_KERNEL);
1064 	if (!r)
1065 		return ERR_PTR(-ENOMEM);
1066 
1067 	r->key = *key;
1068 	r->flags |= MLX5E_ROUTE_ENTRY_VALID;
1069 	r->tunnel_dev_index = tunnel_dev_index;
1070 	refcount_set(&r->refcnt, 1);
1071 	INIT_LIST_HEAD(&r->decap_flows);
1072 	INIT_LIST_HEAD(&r->encap_entries);
1073 
1074 	spin_lock_bh(&encap->route_lock);
1075 	*route_tbl_change_time = encap->route_tbl_last_update;
1076 	hash_add(encap->route_tbl, &r->hlist, hash_key);
1077 	spin_unlock_bh(&encap->route_lock);
1078 
1079 	return r;
1080 }
1081 
1082 static struct mlx5e_route_entry *
1083 mlx5e_route_lookup_for_update(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key)
1084 {
1085 	u32 hash_key = hash_route_info(key);
1086 	struct mlx5e_route_entry *r;
1087 
1088 	spin_lock_bh(&encap->route_lock);
1089 	encap->route_tbl_last_update = jiffies;
1090 	r = mlx5e_route_get(encap, key, hash_key);
1091 	spin_unlock_bh(&encap->route_lock);
1092 
1093 	return r;
1094 }
1095 
1096 struct mlx5e_tc_fib_event_data {
1097 	struct work_struct work;
1098 	unsigned long event;
1099 	struct mlx5e_route_entry *r;
1100 	struct net_device *ul_dev;
1101 };
1102 
1103 static void mlx5e_tc_fib_event_work(struct work_struct *work);
1104 static struct mlx5e_tc_fib_event_data *
1105 mlx5e_tc_init_fib_work(unsigned long event, struct net_device *ul_dev, gfp_t flags)
1106 {
1107 	struct mlx5e_tc_fib_event_data *fib_work;
1108 
1109 	fib_work = kzalloc(sizeof(*fib_work), flags);
1110 	if (WARN_ON(!fib_work))
1111 		return NULL;
1112 
1113 	INIT_WORK(&fib_work->work, mlx5e_tc_fib_event_work);
1114 	fib_work->event = event;
1115 	fib_work->ul_dev = ul_dev;
1116 
1117 	return fib_work;
1118 }
1119 
1120 static int
1121 mlx5e_route_enqueue_update(struct mlx5e_priv *priv,
1122 			   struct mlx5e_route_entry *r,
1123 			   unsigned long event)
1124 {
1125 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1126 	struct mlx5e_tc_fib_event_data *fib_work;
1127 	struct mlx5e_rep_priv *uplink_rpriv;
1128 	struct net_device *ul_dev;
1129 
1130 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
1131 	ul_dev = uplink_rpriv->netdev;
1132 
1133 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_KERNEL);
1134 	if (!fib_work)
1135 		return -ENOMEM;
1136 
1137 	dev_hold(ul_dev);
1138 	refcount_inc(&r->refcnt);
1139 	fib_work->r = r;
1140 	queue_work(priv->wq, &fib_work->work);
1141 
1142 	return 0;
1143 }
1144 
1145 int mlx5e_attach_decap_route(struct mlx5e_priv *priv,
1146 			     struct mlx5e_tc_flow *flow)
1147 {
1148 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1149 	unsigned long tbl_time_before, tbl_time_after;
1150 	struct mlx5e_tc_flow_parse_attr *parse_attr;
1151 	struct mlx5_flow_attr *attr = flow->attr;
1152 	struct mlx5_esw_flow_attr *esw_attr;
1153 	struct mlx5e_route_entry *r;
1154 	struct mlx5e_route_key key;
1155 	int err = 0;
1156 
1157 	esw_attr = attr->esw_attr;
1158 	parse_attr = attr->parse_attr;
1159 	mutex_lock(&esw->offloads.encap_tbl_lock);
1160 	if (!esw_attr->rx_tun_attr)
1161 		goto out;
1162 
1163 	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
1164 	tbl_time_after = tbl_time_before;
1165 	err = mlx5e_tc_tun_route_lookup(priv, &parse_attr->spec, attr, parse_attr->filter_dev);
1166 	if (err || !esw_attr->rx_tun_attr->decap_vport)
1167 		goto out;
1168 
1169 	key.ip_version = attr->tun_ip_version;
1170 	if (key.ip_version == 4)
1171 		key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
1172 	else
1173 		key.endpoint_ip.v6 = esw_attr->rx_tun_attr->dst_ip.v6;
1174 
1175 	r = mlx5e_route_get_create(priv, &key, parse_attr->filter_dev->ifindex,
1176 				   &tbl_time_after);
1177 	if (IS_ERR(r)) {
1178 		err = PTR_ERR(r);
1179 		goto out;
1180 	}
	/* Routing changed concurrently. The FIB event handler might have
	 * missed the new entry; schedule an update.
	 */
1184 	if (tbl_time_before != tbl_time_after) {
1185 		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
1186 		if (err) {
1187 			mlx5e_route_put_locked(priv, r);
1188 			goto out;
1189 		}
1190 	}
1191 
1192 	flow->decap_route = r;
1193 	list_add(&flow->decap_routes, &r->decap_flows);
1194 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1195 	return 0;
1196 
1197 out:
1198 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1199 	return err;
1200 }
1201 
1202 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
1203 				    struct mlx5e_tc_flow *flow,
1204 				    struct mlx5e_encap_entry *e,
1205 				    bool new_encap_entry,
1206 				    unsigned long tbl_time_before,
1207 				    int out_index)
1208 {
1209 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1210 	unsigned long tbl_time_after = tbl_time_before;
1211 	struct mlx5e_tc_flow_parse_attr *parse_attr;
1212 	struct mlx5_flow_attr *attr = flow->attr;
1213 	const struct ip_tunnel_info *tun_info;
1214 	struct mlx5_esw_flow_attr *esw_attr;
1215 	struct mlx5e_route_entry *r;
1216 	struct mlx5e_route_key key;
1217 	unsigned short family;
1218 	int err = 0;
1219 
1220 	esw_attr = attr->esw_attr;
1221 	parse_attr = attr->parse_attr;
1222 	tun_info = parse_attr->tun_info[out_index];
1223 	family = ip_tunnel_info_af(tun_info);
1224 
1225 	if (family == AF_INET) {
1226 		key.endpoint_ip.v4 = tun_info->key.u.ipv4.src;
1227 		key.ip_version = 4;
1228 	} else if (family == AF_INET6) {
1229 		key.endpoint_ip.v6 = tun_info->key.u.ipv6.src;
1230 		key.ip_version = 6;
1231 	}
1232 
1233 	err = mlx5e_set_vf_tunnel(esw, attr, &parse_attr->mod_hdr_acts, e->out_dev,
1234 				  e->route_dev_ifindex, out_index);
1235 	if (err || !(esw_attr->dests[out_index].flags &
1236 		     MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE))
1237 		return err;
1238 
1239 	r = mlx5e_route_get_create(priv, &key, parse_attr->mirred_ifindex[out_index],
1240 				   &tbl_time_after);
1241 	if (IS_ERR(r))
1242 		return PTR_ERR(r);
	/* Routing changed concurrently. The FIB event handler might have
	 * missed the new entry; schedule an update.
	 */
1246 	if (tbl_time_before != tbl_time_after) {
1247 		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
1248 		if (err) {
1249 			mlx5e_route_put_locked(priv, r);
1250 			return err;
1251 		}
1252 	}
1253 
1254 	flow->encap_routes[out_index].r = r;
1255 	if (new_encap_entry)
1256 		list_add(&e->route_list, &r->encap_entries);
1257 	flow->encap_routes[out_index].index = out_index;
1258 	return 0;
1259 }
1260 
1261 void mlx5e_detach_decap_route(struct mlx5e_priv *priv,
1262 			      struct mlx5e_tc_flow *flow)
1263 {
1264 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1265 	struct mlx5e_route_entry *r = flow->decap_route;
1266 
1267 	if (!r)
1268 		return;
1269 
1270 	mutex_lock(&esw->offloads.encap_tbl_lock);
1271 	list_del(&flow->decap_routes);
1272 	flow->decap_route = NULL;
1273 
1274 	if (!refcount_dec_and_test(&r->refcnt)) {
1275 		mutex_unlock(&esw->offloads.encap_tbl_lock);
1276 		return;
1277 	}
1278 	hash_del_rcu(&r->hlist);
1279 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1280 
1281 	mlx5e_route_dealloc(priv, r);
1282 }
1283 
1284 static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
1285 				     struct mlx5e_tc_flow *flow,
1286 				     int out_index)
1287 {
1288 	struct mlx5e_route_entry *r = flow->encap_routes[out_index].r;
1289 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1290 	struct mlx5e_encap_entry *e, *tmp;
1291 
1292 	if (!r)
1293 		return;
1294 
1295 	mutex_lock(&esw->offloads.encap_tbl_lock);
1296 	flow->encap_routes[out_index].r = NULL;
1297 
1298 	if (!refcount_dec_and_test(&r->refcnt)) {
1299 		mutex_unlock(&esw->offloads.encap_tbl_lock);
1300 		return;
1301 	}
1302 	list_for_each_entry_safe(e, tmp, &r->encap_entries, route_list)
1303 		list_del_init(&e->route_list);
1304 	hash_del_rcu(&r->hlist);
1305 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1306 
1307 	mlx5e_route_dealloc(priv, r);
1308 }
1309 
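/* The route used by the encap entry went away: unoffload every flow in
 * encap_flows, clear their encap destinations, release the entry's
 * packet reformat and mark it MLX5_ENCAP_ENTRY_NO_ROUTE so it is not
 * offloaded again until a route shows up.
 */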
1310 static void mlx5e_invalidate_encap(struct mlx5e_priv *priv,
1311 				   struct mlx5e_encap_entry *e,
1312 				   struct list_head *encap_flows)
1313 {
1314 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1315 	struct mlx5e_tc_flow *flow;
1316 
1317 	list_for_each_entry(flow, encap_flows, tmp_list) {
1318 		struct mlx5_flow_attr *attr = flow->attr;
1319 		struct mlx5_esw_flow_attr *esw_attr;
1320 
1321 		if (!mlx5e_is_offloaded_flow(flow))
1322 			continue;
1323 		esw_attr = attr->esw_attr;
1324 
1325 		if (flow_flag_test(flow, SLOW))
1326 			mlx5e_tc_unoffload_from_slow_path(esw, flow);
1327 		else
1328 			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
1329 		mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr);
1330 		attr->modify_hdr = NULL;
1331 
1332 		esw_attr->dests[flow->tmp_entry_index].flags &=
1333 			~MLX5_ESW_DEST_ENCAP_VALID;
1334 		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL;
1335 	}
1336 
1337 	e->flags |= MLX5_ENCAP_ENTRY_NO_ROUTE;
1338 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
1339 		e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
1340 		mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
1341 		e->pkt_reformat = NULL;
1342 	}
1343 }
1344 
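/* A route towards the encap destination was installed or replaced:
 * rebuild the encapsulation header on the new route, refresh the VF
 * tunnel source port rewrite and modify header of each flow, then
 * re-offload the flows, falling back to the slow path when not all of a
 * flow's encap destinations are valid.
 */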
1345 static void mlx5e_reoffload_encap(struct mlx5e_priv *priv,
1346 				  struct net_device *tunnel_dev,
1347 				  struct mlx5e_encap_entry *e,
1348 				  struct list_head *encap_flows)
1349 {
1350 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1351 	struct mlx5e_tc_flow *flow;
1352 	int err;
1353 
1354 	err = ip_tunnel_info_af(e->tun_info) == AF_INET ?
1355 		mlx5e_tc_tun_update_header_ipv4(priv, tunnel_dev, e) :
1356 		mlx5e_tc_tun_update_header_ipv6(priv, tunnel_dev, e);
1357 	if (err)
1358 		mlx5_core_warn(priv->mdev, "Failed to update encap header, %d", err);
1359 	e->flags &= ~MLX5_ENCAP_ENTRY_NO_ROUTE;
1360 
1361 	list_for_each_entry(flow, encap_flows, tmp_list) {
1362 		struct mlx5e_tc_flow_parse_attr *parse_attr;
1363 		struct mlx5_flow_attr *attr = flow->attr;
1364 		struct mlx5_esw_flow_attr *esw_attr;
1365 		struct mlx5_flow_handle *rule;
1366 		struct mlx5_flow_spec *spec;
1367 
1368 		if (flow_flag_test(flow, FAILED))
1369 			continue;
1370 
1371 		esw_attr = attr->esw_attr;
1372 		parse_attr = attr->parse_attr;
1373 		spec = &parse_attr->spec;
1374 
1375 		err = mlx5e_update_vf_tunnel(esw, esw_attr, &parse_attr->mod_hdr_acts,
1376 					     e->out_dev, e->route_dev_ifindex,
1377 					     flow->tmp_entry_index);
1378 		if (err) {
1379 			mlx5_core_warn(priv->mdev, "Failed to update VF tunnel err=%d", err);
1380 			continue;
1381 		}
1382 
1383 		err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow);
1384 		if (err) {
1385 			mlx5_core_warn(priv->mdev, "Failed to update flow mod_hdr err=%d",
1386 				       err);
1387 			continue;
1388 		}
1389 
1390 		if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
1391 			esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
1392 			esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
1393 			if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
1394 				goto offload_to_slow_path;
1395 			/* update from slow path rule to encap rule */
1396 			rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
1397 			if (IS_ERR(rule)) {
1398 				err = PTR_ERR(rule);
1399 				mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
1400 					       err);
1401 			} else {
1402 				flow->rule[0] = rule;
1403 			}
1404 		} else {
1405 offload_to_slow_path:
1406 			rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
			/* mark the flow's encap dest as invalid */
1408 			esw_attr->dests[flow->tmp_entry_index].flags &=
1409 				~MLX5_ESW_DEST_ENCAP_VALID;
1410 
1411 			if (IS_ERR(rule)) {
1412 				err = PTR_ERR(rule);
1413 				mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
1414 					       err);
1415 			} else {
1416 				flow->rule[0] = rule;
1417 			}
1418 		}
1419 		flow_flag_set(flow, OFFLOADED);
1420 	}
1421 }
1422 
1423 static int mlx5e_update_route_encaps(struct mlx5e_priv *priv,
1424 				     struct mlx5e_route_entry *r,
1425 				     struct list_head *flow_list,
1426 				     bool replace)
1427 {
1428 	struct net_device *tunnel_dev;
1429 	struct mlx5e_encap_entry *e;
1430 
1431 	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
1432 	if (!tunnel_dev)
1433 		return -ENODEV;
1434 
1435 	list_for_each_entry(e, &r->encap_entries, route_list) {
1436 		LIST_HEAD(encap_flows);
1437 
1438 		mlx5e_take_all_encap_flows(e, &encap_flows);
1439 		if (list_empty(&encap_flows))
1440 			continue;
1441 
1442 		if (mlx5e_route_entry_valid(r))
1443 			mlx5e_invalidate_encap(priv, e, &encap_flows);
1444 
1445 		if (!replace) {
1446 			list_splice(&encap_flows, flow_list);
1447 			continue;
1448 		}
1449 
1450 		mlx5e_reoffload_encap(priv, tunnel_dev, e, &encap_flows);
1451 		list_splice(&encap_flows, flow_list);
1452 	}
1453 
1454 	return 0;
1455 }
1456 
1457 static void mlx5e_unoffload_flow_list(struct mlx5e_priv *priv,
1458 				      struct list_head *flow_list)
1459 {
1460 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1461 	struct mlx5e_tc_flow *flow;
1462 
1463 	list_for_each_entry(flow, flow_list, tmp_list)
1464 		if (mlx5e_is_offloaded_flow(flow))
1465 			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
1466 }
1467 
1468 static void mlx5e_reoffload_decap(struct mlx5e_priv *priv,
1469 				  struct list_head *decap_flows)
1470 {
1471 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1472 	struct mlx5e_tc_flow *flow;
1473 
1474 	list_for_each_entry(flow, decap_flows, tmp_list) {
1475 		struct mlx5e_tc_flow_parse_attr *parse_attr;
1476 		struct mlx5_flow_attr *attr = flow->attr;
1477 		struct mlx5_flow_handle *rule;
1478 		struct mlx5_flow_spec *spec;
1479 		int err;
1480 
1481 		if (flow_flag_test(flow, FAILED))
1482 			continue;
1483 
1484 		parse_attr = attr->parse_attr;
1485 		spec = &parse_attr->spec;
1486 		err = mlx5e_tc_tun_route_lookup(priv, spec, attr, parse_attr->filter_dev);
1487 		if (err) {
1488 			mlx5_core_warn(priv->mdev, "Failed to lookup route for flow, %d\n",
1489 				       err);
1490 			continue;
1491 		}
1492 
1493 		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
1494 		if (IS_ERR(rule)) {
1495 			err = PTR_ERR(rule);
1496 			mlx5_core_warn(priv->mdev, "Failed to update cached decap flow, %d\n",
1497 				       err);
1498 		} else {
1499 			flow->rule[0] = rule;
1500 			flow_flag_set(flow, OFFLOADED);
1501 		}
1502 	}
1503 }
1504 
1505 static int mlx5e_update_route_decap_flows(struct mlx5e_priv *priv,
1506 					  struct mlx5e_route_entry *r,
1507 					  struct list_head *flow_list,
1508 					  bool replace)
1509 {
1510 	struct net_device *tunnel_dev;
1511 	LIST_HEAD(decap_flows);
1512 
1513 	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
1514 	if (!tunnel_dev)
1515 		return -ENODEV;
1516 
1517 	mlx5e_take_all_route_decap_flows(r, &decap_flows);
1518 	if (mlx5e_route_entry_valid(r))
1519 		mlx5e_unoffload_flow_list(priv, &decap_flows);
1520 	if (replace)
1521 		mlx5e_reoffload_decap(priv, &decap_flows);
1522 
1523 	list_splice(&decap_flows, flow_list);
1524 
1525 	return 0;
1526 }
1527 
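/* Deferred handling of a FIB event for a route entry. With rtnl and the
 * encap table lock held, invalidate or re-offload the encap entries and
 * decap flows that depend on the route, and mark the route entry valid
 * again on a replace event.
 */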
1528 static void mlx5e_tc_fib_event_work(struct work_struct *work)
1529 {
1530 	struct mlx5e_tc_fib_event_data *event_data =
1531 		container_of(work, struct mlx5e_tc_fib_event_data, work);
1532 	struct net_device *ul_dev = event_data->ul_dev;
1533 	struct mlx5e_priv *priv = netdev_priv(ul_dev);
1534 	struct mlx5e_route_entry *r = event_data->r;
1535 	struct mlx5_eswitch *esw;
1536 	LIST_HEAD(flow_list);
1537 	bool replace;
1538 	int err;
1539 
1540 	/* sync with concurrent neigh updates */
1541 	rtnl_lock();
1542 	esw = priv->mdev->priv.eswitch;
1543 	mutex_lock(&esw->offloads.encap_tbl_lock);
1544 	replace = event_data->event == FIB_EVENT_ENTRY_REPLACE;
1545 
1546 	if (!mlx5e_route_entry_valid(r) && !replace)
1547 		goto out;
1548 
1549 	err = mlx5e_update_route_encaps(priv, r, &flow_list, replace);
1550 	if (err)
1551 		mlx5_core_warn(priv->mdev, "Failed to update route encaps, %d\n",
1552 			       err);
1553 
1554 	err = mlx5e_update_route_decap_flows(priv, r, &flow_list, replace);
1555 	if (err)
1556 		mlx5_core_warn(priv->mdev, "Failed to update route decap flows, %d\n",
1557 			       err);
1558 
1559 	if (replace)
1560 		r->flags |= MLX5E_ROUTE_ENTRY_VALID;
1561 out:
1562 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1563 	rtnl_unlock();
1564 
1565 	mlx5e_put_flow_list(priv, &flow_list);
1566 	mlx5e_route_put(priv, event_data->r);
1567 	dev_put(event_data->ul_dev);
1568 	kfree(event_data);
1569 }
1570 
1571 static struct mlx5e_tc_fib_event_data *
1572 mlx5e_init_fib_work_ipv4(struct mlx5e_priv *priv,
1573 			 struct net_device *ul_dev,
1574 			 struct mlx5e_tc_tun_encap *encap,
1575 			 unsigned long event,
1576 			 struct fib_notifier_info *info)
1577 {
1578 	struct fib_entry_notifier_info *fen_info;
1579 	struct mlx5e_tc_fib_event_data *fib_work;
1580 	struct mlx5e_route_entry *r;
1581 	struct mlx5e_route_key key;
1582 	struct net_device *fib_dev;
1583 
1584 	fen_info = container_of(info, struct fib_entry_notifier_info, info);
1585 	if (fen_info->fi->nh)
1586 		return NULL;
1587 	fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev;
1588 	if (!fib_dev || fib_dev->netdev_ops != &mlx5e_netdev_ops ||
1589 	    fen_info->dst_len != 32)
1590 		return NULL;
1591 
1592 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
1593 	if (!fib_work)
1594 		return ERR_PTR(-ENOMEM);
1595 
1596 	key.endpoint_ip.v4 = htonl(fen_info->dst);
1597 	key.ip_version = 4;
1598 
	/* Can't fail after this point: releasing the reference to r requires
	 * taking a sleeping mutex, which is not allowed in atomic context.
	 */
1603 	r = mlx5e_route_lookup_for_update(encap, &key);
1604 	if (!r)
1605 		goto out;
1606 	fib_work->r = r;
1607 	dev_hold(ul_dev);
1608 
1609 	return fib_work;
1610 
1611 out:
1612 	kfree(fib_work);
1613 	return NULL;
1614 }
1615 
1616 static struct mlx5e_tc_fib_event_data *
1617 mlx5e_init_fib_work_ipv6(struct mlx5e_priv *priv,
1618 			 struct net_device *ul_dev,
1619 			 struct mlx5e_tc_tun_encap *encap,
1620 			 unsigned long event,
1621 			 struct fib_notifier_info *info)
1622 {
1623 	struct fib6_entry_notifier_info *fen_info;
1624 	struct mlx5e_tc_fib_event_data *fib_work;
1625 	struct mlx5e_route_entry *r;
1626 	struct mlx5e_route_key key;
1627 	struct net_device *fib_dev;
1628 
1629 	fen_info = container_of(info, struct fib6_entry_notifier_info, info);
1630 	fib_dev = fib6_info_nh_dev(fen_info->rt);
1631 	if (fib_dev->netdev_ops != &mlx5e_netdev_ops ||
1632 	    fen_info->rt->fib6_dst.plen != 128)
1633 		return NULL;
1634 
1635 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
1636 	if (!fib_work)
1637 		return ERR_PTR(-ENOMEM);
1638 
1639 	memcpy(&key.endpoint_ip.v6, &fen_info->rt->fib6_dst.addr,
1640 	       sizeof(fen_info->rt->fib6_dst.addr));
1641 	key.ip_version = 6;
1642 
	/* Can't fail after this point: releasing the reference to r requires
	 * taking a sleeping mutex, which is not allowed in atomic context.
	 */
1647 	r = mlx5e_route_lookup_for_update(encap, &key);
1648 	if (!r)
1649 		goto out;
1650 	fib_work->r = r;
1651 	dev_hold(ul_dev);
1652 
1653 	return fib_work;
1654 
1655 out:
1656 	kfree(fib_work);
1657 	return NULL;
1658 }
1659 
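/* FIB notifier callback, running in atomic context: filter out events
 * that cannot affect offloaded tunnels, look up the affected route
 * entry and defer the actual processing to a work item on the driver
 * workqueue.
 */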
1660 static int mlx5e_tc_tun_fib_event(struct notifier_block *nb, unsigned long event, void *ptr)
1661 {
1662 	struct mlx5e_tc_fib_event_data *fib_work;
1663 	struct fib_notifier_info *info = ptr;
1664 	struct mlx5e_tc_tun_encap *encap;
1665 	struct net_device *ul_dev;
1666 	struct mlx5e_priv *priv;
1667 
1668 	encap = container_of(nb, struct mlx5e_tc_tun_encap, fib_nb);
1669 	priv = encap->priv;
1670 	ul_dev = priv->netdev;
1671 	priv = netdev_priv(ul_dev);
1672 
1673 	switch (event) {
1674 	case FIB_EVENT_ENTRY_REPLACE:
1675 	case FIB_EVENT_ENTRY_DEL:
1676 		if (info->family == AF_INET)
1677 			fib_work = mlx5e_init_fib_work_ipv4(priv, ul_dev, encap, event, info);
1678 		else if (info->family == AF_INET6)
1679 			fib_work = mlx5e_init_fib_work_ipv6(priv, ul_dev, encap, event, info);
1680 		else
1681 			return NOTIFY_DONE;
1682 
1683 		if (!IS_ERR_OR_NULL(fib_work)) {
1684 			queue_work(priv->wq, &fib_work->work);
1685 		} else if (IS_ERR(fib_work)) {
1686 			NL_SET_ERR_MSG_MOD(info->extack, "Failed to init fib work");
1687 			mlx5_core_warn(priv->mdev, "Failed to init fib work, %ld\n",
1688 				       PTR_ERR(fib_work));
1689 		}
1690 
1691 		break;
1692 	default:
1693 		return NOTIFY_DONE;
1694 	}
1695 
1696 	return NOTIFY_DONE;
1697 }
1698 
1699 struct mlx5e_tc_tun_encap *mlx5e_tc_tun_init(struct mlx5e_priv *priv)
1700 {
1701 	struct mlx5e_tc_tun_encap *encap;
1702 	int err;
1703 
1704 	encap = kvzalloc(sizeof(*encap), GFP_KERNEL);
1705 	if (!encap)
1706 		return ERR_PTR(-ENOMEM);
1707 
1708 	encap->priv = priv;
1709 	encap->fib_nb.notifier_call = mlx5e_tc_tun_fib_event;
1710 	spin_lock_init(&encap->route_lock);
1711 	hash_init(encap->route_tbl);
1712 	err = register_fib_notifier(dev_net(priv->netdev), &encap->fib_nb,
1713 				    NULL, NULL);
1714 	if (err) {
1715 		kvfree(encap);
1716 		return ERR_PTR(err);
1717 	}
1718 
1719 	return encap;
1720 }
1721 
1722 void mlx5e_tc_tun_cleanup(struct mlx5e_tc_tun_encap *encap)
1723 {
1724 	if (!encap)
1725 		return;
1726 
1727 	unregister_fib_notifier(dev_net(encap->priv->netdev), &encap->fib_nb);
	flush_workqueue(encap->priv->wq); /* flush pending fib event work */
1729 	kvfree(encap);
1730 }
1731