1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2021 Mellanox Technologies. */
3 
4 #include <net/fib_notifier.h>
5 #include "tc_tun_encap.h"
6 #include "en_tc.h"
7 #include "tc_tun.h"
8 #include "rep/tc.h"
9 #include "diag/en_tc_tracepoint.h"
10 
11 enum {
12 	MLX5E_ROUTE_ENTRY_VALID     = BIT(0),
13 };
14 
15 struct mlx5e_route_key {
16 	int ip_version;
17 	union {
18 		__be32 v4;
19 		struct in6_addr v6;
20 	} endpoint_ip;
21 };
22 
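/* Cache entry for a tunnel route, keyed by the tunnel endpoint IP address.
 * It tracks the encap entries and decap flows that depend on this route so
 * they can be revalidated when a FIB event changes the routing decision.
 */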
23 struct mlx5e_route_entry {
24 	struct mlx5e_route_key key;
25 	struct list_head encap_entries;
26 	struct list_head decap_flows;
27 	u32 flags;
28 	struct hlist_node hlist;
29 	refcount_t refcnt;
30 	int tunnel_dev_index;
31 	struct rcu_head rcu;
32 };
33 
34 struct mlx5e_tc_tun_encap {
35 	struct mlx5e_priv *priv;
36 	struct notifier_block fib_nb;
37 	spinlock_t route_lock; /* protects route_tbl */
38 	unsigned long route_tbl_last_update;
39 	DECLARE_HASHTABLE(route_tbl, 8);
40 };
41 
42 static bool mlx5e_route_entry_valid(struct mlx5e_route_entry *r)
43 {
44 	return r->flags & MLX5E_ROUTE_ENTRY_VALID;
45 }
46 
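/* Copy the outer source/destination IP addresses from the flow spec into the
 * attr's rx_tun_attr. The TUN_RX flag is only set when both addresses are
 * present, since both are needed for the decap route lookup.
 */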
47 int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
48 			     struct mlx5_flow_spec *spec)
49 {
50 	struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr;
51 	struct mlx5_rx_tun_attr *tun_attr;
52 	void *daddr, *saddr;
53 	u8 ip_version;
54 
55 	tun_attr = kvzalloc(sizeof(*tun_attr), GFP_KERNEL);
56 	if (!tun_attr)
57 		return -ENOMEM;
58 
59 	esw_attr->rx_tun_attr = tun_attr;
60 	ip_version = mlx5e_tc_get_ip_version(spec, true);
61 
62 	if (ip_version == 4) {
63 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
64 				     outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
65 		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
66 				     outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4);
67 		tun_attr->dst_ip.v4 = *(__be32 *)daddr;
68 		tun_attr->src_ip.v4 = *(__be32 *)saddr;
69 		if (!tun_attr->dst_ip.v4 || !tun_attr->src_ip.v4)
70 			return 0;
71 	}
72 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
73 	else if (ip_version == 6) {
74 		int ipv6_size = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
75 		struct in6_addr zerov6 = {};
76 
77 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
78 				     outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
79 		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
80 				     outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6);
81 		memcpy(&tun_attr->dst_ip.v6, daddr, ipv6_size);
82 		memcpy(&tun_attr->src_ip.v6, saddr, ipv6_size);
83 		if (!memcmp(&tun_attr->dst_ip.v6, &zerov6, sizeof(zerov6)) ||
84 		    !memcmp(&tun_attr->src_ip.v6, &zerov6, sizeof(zerov6)))
85 			return 0;
86 	}
87 #endif
88 	/* Only set the flag if both src and dst ip addresses exist. They are
89 	 * required to establish routing.
90 	 */
91 	flow_flag_set(flow, TUN_RX);
92 	return 0;
93 }
94 
95 static bool mlx5e_tc_flow_all_encaps_valid(struct mlx5_esw_flow_attr *esw_attr)
96 {
97 	bool all_flow_encaps_valid = true;
98 	int i;
99 
100 	/* Flow can be associated with multiple encap entries.
101 	 * Before offloading the flow verify that all of them have
102 	 * a valid neighbour.
103 	 */
104 	for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) {
105 		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP))
106 			continue;
107 		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) {
108 			all_flow_encaps_valid = false;
109 			break;
110 		}
111 	}
112 
113 	return all_flow_encaps_valid;
114 }
115 
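/* Offload the cached encap header and move the flows in @flow_list from their
 * slow path rules to encap (fast path) rules. Flows that still have other
 * unresolved encap destinations are left on the slow path.
 */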
116 void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
117 			      struct mlx5e_encap_entry *e,
118 			      struct list_head *flow_list)
119 {
120 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
121 	struct mlx5_esw_flow_attr *esw_attr;
122 	struct mlx5_flow_handle *rule;
123 	struct mlx5_flow_attr *attr;
124 	struct mlx5_flow_spec *spec;
125 	struct mlx5e_tc_flow *flow;
126 	int err;
127 
128 	if (e->flags & MLX5_ENCAP_ENTRY_NO_ROUTE)
129 		return;
130 
131 	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
132 						     e->reformat_type,
133 						     e->encap_size, e->encap_header,
134 						     MLX5_FLOW_NAMESPACE_FDB);
135 	if (IS_ERR(e->pkt_reformat)) {
		mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %ld\n",
137 			       PTR_ERR(e->pkt_reformat));
138 		return;
139 	}
140 	e->flags |= MLX5_ENCAP_ENTRY_VALID;
141 	mlx5e_rep_queue_neigh_stats_work(priv);
142 
143 	list_for_each_entry(flow, flow_list, tmp_list) {
144 		if (!mlx5e_is_offloaded_flow(flow))
145 			continue;
146 		attr = flow->attr;
147 		esw_attr = attr->esw_attr;
148 		spec = &attr->parse_attr->spec;
149 
150 		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
151 		esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
152 
153 		/* Do not offload flows with unresolved neighbors */
154 		if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
155 			continue;
156 		/* update from slow path rule to encap rule */
157 		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
158 		if (IS_ERR(rule)) {
159 			err = PTR_ERR(rule);
160 			mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
161 				       err);
162 			continue;
163 		}
164 
165 		mlx5e_tc_unoffload_from_slow_path(esw, flow);
166 		flow->rule[0] = rule;
167 		/* was unset when slow path rule removed */
168 		flow_flag_set(flow, OFFLOADED);
169 	}
170 }
171 
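/* Move the flows in @flow_list back to slow path rules and release the encap
 * entry's packet reformat object, e.g. when its neighbour becomes invalid.
 */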
172 void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
173 			      struct mlx5e_encap_entry *e,
174 			      struct list_head *flow_list)
175 {
176 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
177 	struct mlx5_esw_flow_attr *esw_attr;
178 	struct mlx5_flow_handle *rule;
179 	struct mlx5_flow_attr *attr;
180 	struct mlx5_flow_spec *spec;
181 	struct mlx5e_tc_flow *flow;
182 	int err;
183 
184 	list_for_each_entry(flow, flow_list, tmp_list) {
185 		if (!mlx5e_is_offloaded_flow(flow))
186 			continue;
187 		attr = flow->attr;
188 		esw_attr = attr->esw_attr;
189 		spec = &attr->parse_attr->spec;
190 
191 		/* update from encap rule to slow path rule */
192 		rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
		/* mark the flow's encap dest as invalid */
194 		esw_attr->dests[flow->tmp_entry_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID;
195 
196 		if (IS_ERR(rule)) {
197 			err = PTR_ERR(rule);
198 			mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
199 				       err);
200 			continue;
201 		}
202 
203 		mlx5e_tc_unoffload_fdb_rules(esw, flow, attr);
204 		flow->rule[0] = rule;
205 		/* was unset when fast path rule removed */
206 		flow_flag_set(flow, OFFLOADED);
207 	}
208 
	/* the encap entry is known to be valid here, so clear the flag and
	 * release its packet reformat object
	 */
210 	e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
211 	mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
212 }
213 
214 static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow,
215 				struct list_head *flow_list,
216 				int index)
217 {
218 	if (IS_ERR(mlx5e_flow_get(flow)))
219 		return;
220 	wait_for_completion(&flow->init_done);
221 
222 	flow->tmp_entry_index = index;
223 	list_add(&flow->tmp_list, flow_list);
224 }
225 
226 /* Takes reference to all flows attached to encap and adds the flows to
227  * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
228  */
229 void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list)
230 {
231 	struct encap_flow_item *efi;
232 	struct mlx5e_tc_flow *flow;
233 
234 	list_for_each_entry(efi, &e->flows, list) {
235 		flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]);
236 		mlx5e_take_tmp_flow(flow, flow_list, efi->index);
237 	}
238 }
239 
240 /* Takes reference to all flows attached to route and adds the flows to
241  * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
242  */
243 static void mlx5e_take_all_route_decap_flows(struct mlx5e_route_entry *r,
244 					     struct list_head *flow_list)
245 {
246 	struct mlx5e_tc_flow *flow;
247 
248 	list_for_each_entry(flow, &r->decap_flows, decap_routes)
249 		mlx5e_take_tmp_flow(flow, flow_list, 0);
250 }
251 
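/* Iterate nhe->encap_list under RCU: take a reference to the next fully
 * initialized and valid encap entry and release the reference to @e.
 * Returns NULL when the end of the list is reached.
 */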
252 static struct mlx5e_encap_entry *
253 mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe,
254 			   struct mlx5e_encap_entry *e)
255 {
256 	struct mlx5e_encap_entry *next = NULL;
257 
258 retry:
259 	rcu_read_lock();
260 
261 	/* find encap with non-zero reference counter value */
262 	for (next = e ?
263 		     list_next_or_null_rcu(&nhe->encap_list,
264 					   &e->encap_list,
265 					   struct mlx5e_encap_entry,
266 					   encap_list) :
267 		     list_first_or_null_rcu(&nhe->encap_list,
268 					    struct mlx5e_encap_entry,
269 					    encap_list);
270 	     next;
271 	     next = list_next_or_null_rcu(&nhe->encap_list,
272 					  &next->encap_list,
273 					  struct mlx5e_encap_entry,
274 					  encap_list))
275 		if (mlx5e_encap_take(next))
276 			break;
277 
278 	rcu_read_unlock();
279 
280 	/* release starting encap */
281 	if (e)
282 		mlx5e_encap_put(netdev_priv(e->out_dev), e);
283 	if (!next)
284 		return next;
285 
286 	/* wait for encap to be fully initialized */
287 	wait_for_completion(&next->res_ready);
288 	/* continue searching if encap entry is not in valid state after completion */
289 	if (!(next->flags & MLX5_ENCAP_ENTRY_VALID)) {
290 		e = next;
291 		goto retry;
292 	}
293 
294 	return next;
295 }
296 
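/* Check whether any offloaded flow using this neighbour has passed traffic
 * since the last check and, if so, poke the neighbour entry with
 * neigh_event_send() so it is kept valid while hardware keeps forwarding
 * through it.
 */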
297 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
298 {
299 	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
300 	struct mlx5e_encap_entry *e = NULL;
301 	struct mlx5e_tc_flow *flow;
302 	struct mlx5_fc *counter;
303 	struct neigh_table *tbl;
304 	bool neigh_used = false;
305 	struct neighbour *n;
306 	u64 lastuse;
307 
308 	if (m_neigh->family == AF_INET)
309 		tbl = &arp_tbl;
310 #if IS_ENABLED(CONFIG_IPV6)
311 	else if (m_neigh->family == AF_INET6)
312 		tbl = ipv6_stub->nd_tbl;
313 #endif
314 	else
315 		return;
316 
317 	/* mlx5e_get_next_valid_encap() releases previous encap before returning
318 	 * next one.
319 	 */
320 	while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) {
321 		struct mlx5e_priv *priv = netdev_priv(e->out_dev);
322 		struct encap_flow_item *efi, *tmp;
323 		struct mlx5_eswitch *esw;
324 		LIST_HEAD(flow_list);
325 
326 		esw = priv->mdev->priv.eswitch;
327 		mutex_lock(&esw->offloads.encap_tbl_lock);
328 		list_for_each_entry_safe(efi, tmp, &e->flows, list) {
329 			flow = container_of(efi, struct mlx5e_tc_flow,
330 					    encaps[efi->index]);
331 			if (IS_ERR(mlx5e_flow_get(flow)))
332 				continue;
333 			list_add(&flow->tmp_list, &flow_list);
334 
335 			if (mlx5e_is_offloaded_flow(flow)) {
336 				counter = mlx5e_tc_get_counter(flow);
337 				lastuse = mlx5_fc_query_lastuse(counter);
338 				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
339 					neigh_used = true;
340 					break;
341 				}
342 			}
343 		}
344 		mutex_unlock(&esw->offloads.encap_tbl_lock);
345 
346 		mlx5e_put_flow_list(priv, &flow_list);
347 		if (neigh_used) {
348 			/* release current encap before breaking the loop */
349 			mlx5e_encap_put(priv, e);
350 			break;
351 		}
352 	}
353 
354 	trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used);
355 
356 	if (neigh_used) {
357 		nhe->reported_lastuse = jiffies;
358 
359 		/* find the relevant neigh according to the cached device and
360 		 * dst ip pair
361 		 */
362 		n = neigh_lookup(tbl, &m_neigh->dst_ip, READ_ONCE(nhe->neigh_dev));
363 		if (!n)
364 			return;
365 
366 		neigh_event_send(n, NULL);
367 		neigh_release(n);
368 	}
369 }
370 
371 static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
372 {
373 	WARN_ON(!list_empty(&e->flows));
374 
375 	if (e->compl_result > 0) {
376 		mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
377 
378 		if (e->flags & MLX5_ENCAP_ENTRY_VALID)
379 			mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
380 	}
381 
382 	kfree(e->tun_info);
383 	kfree(e->encap_header);
384 	kfree_rcu(e, rcu);
385 }
386 
387 static void mlx5e_decap_dealloc(struct mlx5e_priv *priv,
388 				struct mlx5e_decap_entry *d)
389 {
390 	WARN_ON(!list_empty(&d->flows));
391 
392 	if (!d->compl_result)
393 		mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat);
394 
395 	kfree_rcu(d, rcu);
396 }
397 
398 void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
399 {
400 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
401 
402 	if (!refcount_dec_and_mutex_lock(&e->refcnt, &esw->offloads.encap_tbl_lock))
403 		return;
404 	list_del(&e->route_list);
405 	hash_del_rcu(&e->encap_hlist);
406 	mutex_unlock(&esw->offloads.encap_tbl_lock);
407 
408 	mlx5e_encap_dealloc(priv, e);
409 }
410 
411 static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d)
412 {
413 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
414 
415 	if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock))
416 		return;
417 	hash_del_rcu(&d->hlist);
418 	mutex_unlock(&esw->offloads.decap_tbl_lock);
419 
420 	mlx5e_decap_dealloc(priv, d);
421 }
422 
423 static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
424 				     struct mlx5e_tc_flow *flow,
425 				     int out_index);
426 
427 void mlx5e_detach_encap(struct mlx5e_priv *priv,
428 			struct mlx5e_tc_flow *flow, int out_index)
429 {
430 	struct mlx5e_encap_entry *e = flow->encaps[out_index].e;
431 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
432 
433 	if (flow->attr->esw_attr->dests[out_index].flags &
434 	    MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)
435 		mlx5e_detach_encap_route(priv, flow, out_index);
436 
437 	/* flow wasn't fully initialized */
438 	if (!e)
439 		return;
440 
441 	mutex_lock(&esw->offloads.encap_tbl_lock);
442 	list_del(&flow->encaps[out_index].list);
443 	flow->encaps[out_index].e = NULL;
444 	if (!refcount_dec_and_test(&e->refcnt)) {
445 		mutex_unlock(&esw->offloads.encap_tbl_lock);
446 		return;
447 	}
448 	list_del(&e->route_list);
449 	hash_del_rcu(&e->encap_hlist);
450 	mutex_unlock(&esw->offloads.encap_tbl_lock);
451 
452 	mlx5e_encap_dealloc(priv, e);
453 }
454 
455 void mlx5e_detach_decap(struct mlx5e_priv *priv,
456 			struct mlx5e_tc_flow *flow)
457 {
458 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
459 	struct mlx5e_decap_entry *d = flow->decap_reformat;
460 
461 	if (!d)
462 		return;
463 
464 	mutex_lock(&esw->offloads.decap_tbl_lock);
465 	list_del(&flow->l3_to_l2_reformat);
466 	flow->decap_reformat = NULL;
467 
468 	if (!refcount_dec_and_test(&d->refcnt)) {
469 		mutex_unlock(&esw->offloads.decap_tbl_lock);
470 		return;
471 	}
472 	hash_del_rcu(&d->hlist);
473 	mutex_unlock(&esw->offloads.decap_tbl_lock);
474 
475 	mlx5e_decap_dealloc(priv, d);
476 }
477 
478 struct encap_key {
479 	const struct ip_tunnel_key *ip_tun_key;
480 	struct mlx5e_tc_tunnel *tc_tunnel;
481 };
482 
483 static int cmp_encap_info(struct encap_key *a,
484 			  struct encap_key *b)
485 {
486 	return memcmp(a->ip_tun_key, b->ip_tun_key, sizeof(*a->ip_tun_key)) ||
487 		a->tc_tunnel->tunnel_type != b->tc_tunnel->tunnel_type;
488 }
489 
490 static int cmp_decap_info(struct mlx5e_decap_key *a,
491 			  struct mlx5e_decap_key *b)
492 {
493 	return memcmp(&a->key, &b->key, sizeof(b->key));
494 }
495 
496 static int hash_encap_info(struct encap_key *key)
497 {
498 	return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key),
499 		     key->tc_tunnel->tunnel_type);
500 }
501 
502 static int hash_decap_info(struct mlx5e_decap_key *key)
503 {
504 	return jhash(&key->key, sizeof(key->key), 0);
505 }
506 
507 bool mlx5e_encap_take(struct mlx5e_encap_entry *e)
508 {
509 	return refcount_inc_not_zero(&e->refcnt);
510 }
511 
512 static bool mlx5e_decap_take(struct mlx5e_decap_entry *e)
513 {
514 	return refcount_inc_not_zero(&e->refcnt);
515 }
516 
517 static struct mlx5e_encap_entry *
518 mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key,
519 		uintptr_t hash_key)
520 {
521 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
522 	struct mlx5e_encap_entry *e;
523 	struct encap_key e_key;
524 
525 	hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
526 				   encap_hlist, hash_key) {
527 		e_key.ip_tun_key = &e->tun_info->key;
528 		e_key.tc_tunnel = e->tunnel;
529 		if (!cmp_encap_info(&e_key, key) &&
530 		    mlx5e_encap_take(e))
531 			return e;
532 	}
533 
534 	return NULL;
535 }
536 
537 static struct mlx5e_decap_entry *
538 mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key,
539 		uintptr_t hash_key)
540 {
541 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
542 	struct mlx5e_decap_key r_key;
543 	struct mlx5e_decap_entry *e;
544 
545 	hash_for_each_possible_rcu(esw->offloads.decap_tbl, e,
546 				   hlist, hash_key) {
547 		r_key = e->key;
548 		if (!cmp_decap_info(&r_key, key) &&
549 		    mlx5e_decap_take(e))
550 			return e;
551 	}
552 	return NULL;
553 }
554 
555 struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info)
556 {
557 	size_t tun_size = sizeof(*tun_info) + tun_info->options_len;
558 
559 	return kmemdup(tun_info, tun_size, GFP_KERNEL);
560 }
561 
562 static bool is_duplicated_encap_entry(struct mlx5e_priv *priv,
563 				      struct mlx5e_tc_flow *flow,
564 				      int out_index,
565 				      struct mlx5e_encap_entry *e,
566 				      struct netlink_ext_ack *extack)
567 {
568 	int i;
569 
570 	for (i = 0; i < out_index; i++) {
571 		if (flow->encaps[i].e != e)
572 			continue;
573 		NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action");
574 		netdev_err(priv->netdev, "can't duplicate encap action\n");
575 		return true;
576 	}
577 
578 	return false;
579 }
580 
581 static int mlx5e_set_vf_tunnel(struct mlx5_eswitch *esw,
582 			       struct mlx5_flow_attr *attr,
583 			       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
584 			       struct net_device *out_dev,
585 			       int route_dev_ifindex,
586 			       int out_index)
587 {
588 	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
589 	struct net_device *route_dev;
590 	u16 vport_num;
591 	int err = 0;
592 	u32 data;
593 
594 	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);
595 
596 	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
597 	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev))
598 		goto out;
599 
600 	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
601 	if (err)
602 		goto out;
603 
604 	attr->dest_chain = 0;
605 	attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
606 	esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE;
607 	data = mlx5_eswitch_get_vport_metadata_for_set(esw_attr->in_mdev->priv.eswitch,
608 						       vport_num);
609 	err = mlx5e_tc_match_to_reg_set_and_get_id(esw->dev, mod_hdr_acts,
610 						   MLX5_FLOW_NAMESPACE_FDB,
611 						   VPORT_TO_REG, data);
612 	if (err >= 0) {
613 		esw_attr->dests[out_index].src_port_rewrite_act_id = err;
614 		err = 0;
615 	}
616 
617 out:
618 	if (route_dev)
619 		dev_put(route_dev);
620 	return err;
621 }
622 
623 static int mlx5e_update_vf_tunnel(struct mlx5_eswitch *esw,
624 				  struct mlx5_esw_flow_attr *attr,
625 				  struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
626 				  struct net_device *out_dev,
627 				  int route_dev_ifindex,
628 				  int out_index)
629 {
630 	int act_id = attr->dests[out_index].src_port_rewrite_act_id;
631 	struct net_device *route_dev;
632 	u16 vport_num;
633 	int err = 0;
634 	u32 data;
635 
636 	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);
637 
638 	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
639 	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev)) {
640 		err = -ENODEV;
641 		goto out;
642 	}
643 
644 	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
645 	if (err)
646 		goto out;
647 
648 	data = mlx5_eswitch_get_vport_metadata_for_set(attr->in_mdev->priv.eswitch,
649 						       vport_num);
650 	mlx5e_tc_match_to_reg_mod_hdr_change(esw->dev, mod_hdr_acts, VPORT_TO_REG, act_id, data);
651 
652 out:
653 	if (route_dev)
654 		dev_put(route_dev);
655 	return err;
656 }
657 
658 static unsigned int mlx5e_route_tbl_get_last_update(struct mlx5e_priv *priv)
659 {
660 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
661 	struct mlx5_rep_uplink_priv *uplink_priv;
662 	struct mlx5e_rep_priv *uplink_rpriv;
663 	struct mlx5e_tc_tun_encap *encap;
664 	unsigned int ret;
665 
666 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
667 	uplink_priv = &uplink_rpriv->uplink_priv;
668 	encap = uplink_priv->encap;
669 
670 	spin_lock_bh(&encap->route_lock);
671 	ret = encap->route_tbl_last_update;
672 	spin_unlock_bh(&encap->route_lock);
673 	return ret;
674 }
675 
676 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
677 				    struct mlx5e_tc_flow *flow,
678 				    struct mlx5e_encap_entry *e,
679 				    bool new_encap_entry,
680 				    unsigned long tbl_time_before,
681 				    int out_index);
682 
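/* Find or create an encap entry for the tunnel described by the flow's
 * tun_info. New entries build their encap header outside encap_tbl_lock and
 * signal completion through res_ready; concurrent users wait on it before
 * attaching. On success *encap_valid reports whether the entry already
 * carries a usable packet reformat (MLX5_ENCAP_ENTRY_VALID).
 */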
683 int mlx5e_attach_encap(struct mlx5e_priv *priv,
684 		       struct mlx5e_tc_flow *flow,
685 		       struct net_device *mirred_dev,
686 		       int out_index,
687 		       struct netlink_ext_ack *extack,
688 		       struct net_device **encap_dev,
689 		       bool *encap_valid)
690 {
691 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
692 	struct mlx5e_tc_flow_parse_attr *parse_attr;
693 	struct mlx5_flow_attr *attr = flow->attr;
694 	const struct ip_tunnel_info *tun_info;
695 	unsigned long tbl_time_before = 0;
696 	struct encap_key key;
697 	struct mlx5e_encap_entry *e;
698 	bool entry_created = false;
699 	unsigned short family;
700 	uintptr_t hash_key;
701 	int err = 0;
702 
703 	parse_attr = attr->parse_attr;
704 	tun_info = parse_attr->tun_info[out_index];
705 	family = ip_tunnel_info_af(tun_info);
706 	key.ip_tun_key = &tun_info->key;
707 	key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev);
708 	if (!key.tc_tunnel) {
709 		NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel");
710 		return -EOPNOTSUPP;
711 	}
712 
713 	hash_key = hash_encap_info(&key);
714 
715 	mutex_lock(&esw->offloads.encap_tbl_lock);
716 	e = mlx5e_encap_get(priv, &key, hash_key);
717 
	/* if a cached entry exists, make sure it finished initializing
	 * successfully before reusing it
	 */
719 	if (e) {
720 		/* Check that entry was not already attached to this flow */
721 		if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) {
722 			err = -EOPNOTSUPP;
723 			goto out_err;
724 		}
725 
726 		mutex_unlock(&esw->offloads.encap_tbl_lock);
727 		wait_for_completion(&e->res_ready);
728 
729 		/* Protect against concurrent neigh update. */
730 		mutex_lock(&esw->offloads.encap_tbl_lock);
731 		if (e->compl_result < 0) {
732 			err = -EREMOTEIO;
733 			goto out_err;
734 		}
735 		goto attach_flow;
736 	}
737 
738 	e = kzalloc(sizeof(*e), GFP_KERNEL);
739 	if (!e) {
740 		err = -ENOMEM;
741 		goto out_err;
742 	}
743 
744 	refcount_set(&e->refcnt, 1);
745 	init_completion(&e->res_ready);
746 	entry_created = true;
747 	INIT_LIST_HEAD(&e->route_list);
748 
749 	tun_info = mlx5e_dup_tun_info(tun_info);
750 	if (!tun_info) {
751 		err = -ENOMEM;
752 		goto out_err_init;
753 	}
754 	e->tun_info = tun_info;
755 	err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack);
756 	if (err)
757 		goto out_err_init;
758 
759 	INIT_LIST_HEAD(&e->flows);
760 	hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
761 	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
762 	mutex_unlock(&esw->offloads.encap_tbl_lock);
763 
764 	if (family == AF_INET)
765 		err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e);
766 	else if (family == AF_INET6)
767 		err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e);
768 
769 	/* Protect against concurrent neigh update. */
770 	mutex_lock(&esw->offloads.encap_tbl_lock);
771 	complete_all(&e->res_ready);
772 	if (err) {
773 		e->compl_result = err;
774 		goto out_err;
775 	}
776 	e->compl_result = 1;
777 
778 attach_flow:
779 	err = mlx5e_attach_encap_route(priv, flow, e, entry_created, tbl_time_before,
780 				       out_index);
781 	if (err)
782 		goto out_err;
783 
784 	flow->encaps[out_index].e = e;
785 	list_add(&flow->encaps[out_index].list, &e->flows);
786 	flow->encaps[out_index].index = out_index;
787 	*encap_dev = e->out_dev;
788 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
789 		attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat;
790 		attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
791 		*encap_valid = true;
792 	} else {
793 		*encap_valid = false;
794 	}
795 	mutex_unlock(&esw->offloads.encap_tbl_lock);
796 
797 	return err;
798 
799 out_err:
800 	mutex_unlock(&esw->offloads.encap_tbl_lock);
801 	if (e)
802 		mlx5e_encap_put(priv, e);
803 	return err;
804 
805 out_err_init:
806 	mutex_unlock(&esw->offloads.encap_tbl_lock);
807 	kfree(tun_info);
808 	kfree(e);
809 	return err;
810 }
811 
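/* Find or create a decap (L3-to-L2 reformat) entry matching the rewrite
 * Ethernet header parsed for this flow, allocating the packet reformat
 * object on first use, and attach the flow to it.
 */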
812 int mlx5e_attach_decap(struct mlx5e_priv *priv,
813 		       struct mlx5e_tc_flow *flow,
814 		       struct netlink_ext_ack *extack)
815 {
816 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
817 	struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr;
818 	struct mlx5e_tc_flow_parse_attr *parse_attr;
819 	struct mlx5e_decap_entry *d;
820 	struct mlx5e_decap_key key;
821 	uintptr_t hash_key;
822 	int err = 0;
823 
824 	parse_attr = flow->attr->parse_attr;
825 	if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) {
826 		NL_SET_ERR_MSG_MOD(extack,
827 				   "encap header larger than max supported");
828 		return -EOPNOTSUPP;
829 	}
830 
831 	key.key = parse_attr->eth;
832 	hash_key = hash_decap_info(&key);
833 	mutex_lock(&esw->offloads.decap_tbl_lock);
834 	d = mlx5e_decap_get(priv, &key, hash_key);
835 	if (d) {
836 		mutex_unlock(&esw->offloads.decap_tbl_lock);
837 		wait_for_completion(&d->res_ready);
838 		mutex_lock(&esw->offloads.decap_tbl_lock);
839 		if (d->compl_result) {
840 			err = -EREMOTEIO;
841 			goto out_free;
842 		}
843 		goto found;
844 	}
845 
846 	d = kzalloc(sizeof(*d), GFP_KERNEL);
847 	if (!d) {
848 		err = -ENOMEM;
849 		goto out_err;
850 	}
851 
852 	d->key = key;
853 	refcount_set(&d->refcnt, 1);
854 	init_completion(&d->res_ready);
855 	INIT_LIST_HEAD(&d->flows);
856 	hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key);
857 	mutex_unlock(&esw->offloads.decap_tbl_lock);
858 
859 	d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
860 						     MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2,
861 						     sizeof(parse_attr->eth),
862 						     &parse_attr->eth,
863 						     MLX5_FLOW_NAMESPACE_FDB);
864 	if (IS_ERR(d->pkt_reformat)) {
865 		err = PTR_ERR(d->pkt_reformat);
866 		d->compl_result = err;
867 	}
868 	mutex_lock(&esw->offloads.decap_tbl_lock);
869 	complete_all(&d->res_ready);
870 	if (err)
871 		goto out_free;
872 
873 found:
874 	flow->decap_reformat = d;
875 	attr->decap_pkt_reformat = d->pkt_reformat;
876 	list_add(&flow->l3_to_l2_reformat, &d->flows);
877 	mutex_unlock(&esw->offloads.decap_tbl_lock);
878 	return 0;
879 
880 out_free:
881 	mutex_unlock(&esw->offloads.decap_tbl_lock);
882 	mlx5e_decap_put(priv, d);
883 	return err;
884 
885 out_err:
886 	mutex_unlock(&esw->offloads.decap_tbl_lock);
887 	return err;
888 }
889 
890 static int cmp_route_info(struct mlx5e_route_key *a,
891 			  struct mlx5e_route_key *b)
892 {
893 	if (a->ip_version == 4 && b->ip_version == 4)
894 		return memcmp(&a->endpoint_ip.v4, &b->endpoint_ip.v4,
895 			      sizeof(a->endpoint_ip.v4));
896 	else if (a->ip_version == 6 && b->ip_version == 6)
897 		return memcmp(&a->endpoint_ip.v6, &b->endpoint_ip.v6,
898 			      sizeof(a->endpoint_ip.v6));
899 	return 1;
900 }
901 
902 static u32 hash_route_info(struct mlx5e_route_key *key)
903 {
904 	if (key->ip_version == 4)
905 		return jhash(&key->endpoint_ip.v4, sizeof(key->endpoint_ip.v4), 0);
906 	return jhash(&key->endpoint_ip.v6, sizeof(key->endpoint_ip.v6), 0);
907 }
908 
909 static void mlx5e_route_dealloc(struct mlx5e_priv *priv,
910 				struct mlx5e_route_entry *r)
911 {
912 	WARN_ON(!list_empty(&r->decap_flows));
913 	WARN_ON(!list_empty(&r->encap_entries));
914 
915 	kfree_rcu(r, rcu);
916 }
917 
918 static void mlx5e_route_put(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
919 {
920 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
921 
922 	if (!refcount_dec_and_mutex_lock(&r->refcnt, &esw->offloads.encap_tbl_lock))
923 		return;
924 
925 	hash_del_rcu(&r->hlist);
926 	mutex_unlock(&esw->offloads.encap_tbl_lock);
927 
928 	mlx5e_route_dealloc(priv, r);
929 }
930 
931 static void mlx5e_route_put_locked(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
932 {
933 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
934 
935 	lockdep_assert_held(&esw->offloads.encap_tbl_lock);
936 
937 	if (!refcount_dec_and_test(&r->refcnt))
938 		return;
939 	hash_del_rcu(&r->hlist);
940 	mlx5e_route_dealloc(priv, r);
941 }
942 
943 static struct mlx5e_route_entry *
944 mlx5e_route_get(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key,
945 		u32 hash_key)
946 {
947 	struct mlx5e_route_key r_key;
948 	struct mlx5e_route_entry *r;
949 
950 	hash_for_each_possible(encap->route_tbl, r, hlist, hash_key) {
951 		r_key = r->key;
952 		if (!cmp_route_info(&r_key, key) &&
953 		    refcount_inc_not_zero(&r->refcnt))
954 			return r;
955 	}
956 	return NULL;
957 }
958 
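/* Look up a route cache entry for @key, creating one if it does not exist.
 * When a new entry is inserted, *route_tbl_change_time is set to the last
 * FIB update stamp observed under route_lock, so the caller can detect FIB
 * events that raced with the creation.
 */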
959 static struct mlx5e_route_entry *
960 mlx5e_route_get_create(struct mlx5e_priv *priv,
961 		       struct mlx5e_route_key *key,
962 		       int tunnel_dev_index,
963 		       unsigned long *route_tbl_change_time)
964 {
965 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
966 	struct mlx5_rep_uplink_priv *uplink_priv;
967 	struct mlx5e_rep_priv *uplink_rpriv;
968 	struct mlx5e_tc_tun_encap *encap;
969 	struct mlx5e_route_entry *r;
970 	u32 hash_key;
971 
972 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
973 	uplink_priv = &uplink_rpriv->uplink_priv;
974 	encap = uplink_priv->encap;
975 
976 	hash_key = hash_route_info(key);
977 	spin_lock_bh(&encap->route_lock);
978 	r = mlx5e_route_get(encap, key, hash_key);
979 	spin_unlock_bh(&encap->route_lock);
980 	if (r) {
981 		if (!mlx5e_route_entry_valid(r)) {
982 			mlx5e_route_put_locked(priv, r);
983 			return ERR_PTR(-EINVAL);
984 		}
985 		return r;
986 	}
987 
988 	r = kzalloc(sizeof(*r), GFP_KERNEL);
989 	if (!r)
990 		return ERR_PTR(-ENOMEM);
991 
992 	r->key = *key;
993 	r->flags |= MLX5E_ROUTE_ENTRY_VALID;
994 	r->tunnel_dev_index = tunnel_dev_index;
995 	refcount_set(&r->refcnt, 1);
996 	INIT_LIST_HEAD(&r->decap_flows);
997 	INIT_LIST_HEAD(&r->encap_entries);
998 
999 	spin_lock_bh(&encap->route_lock);
1000 	*route_tbl_change_time = encap->route_tbl_last_update;
1001 	hash_add(encap->route_tbl, &r->hlist, hash_key);
1002 	spin_unlock_bh(&encap->route_lock);
1003 
1004 	return r;
1005 }
1006 
1007 static struct mlx5e_route_entry *
1008 mlx5e_route_lookup_for_update(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key)
1009 {
1010 	u32 hash_key = hash_route_info(key);
1011 	struct mlx5e_route_entry *r;
1012 
1013 	spin_lock_bh(&encap->route_lock);
1014 	encap->route_tbl_last_update = jiffies;
1015 	r = mlx5e_route_get(encap, key, hash_key);
1016 	spin_unlock_bh(&encap->route_lock);
1017 
1018 	return r;
1019 }
1020 
1021 struct mlx5e_tc_fib_event_data {
1022 	struct work_struct work;
1023 	unsigned long event;
1024 	struct mlx5e_route_entry *r;
1025 	struct net_device *ul_dev;
1026 };
1027 
1028 static void mlx5e_tc_fib_event_work(struct work_struct *work);
1029 static struct mlx5e_tc_fib_event_data *
1030 mlx5e_tc_init_fib_work(unsigned long event, struct net_device *ul_dev, gfp_t flags)
1031 {
1032 	struct mlx5e_tc_fib_event_data *fib_work;
1033 
1034 	fib_work = kzalloc(sizeof(*fib_work), flags);
1035 	if (WARN_ON(!fib_work))
1036 		return NULL;
1037 
1038 	INIT_WORK(&fib_work->work, mlx5e_tc_fib_event_work);
1039 	fib_work->event = event;
1040 	fib_work->ul_dev = ul_dev;
1041 
1042 	return fib_work;
1043 }
1044 
1045 static int
1046 mlx5e_route_enqueue_update(struct mlx5e_priv *priv,
1047 			   struct mlx5e_route_entry *r,
1048 			   unsigned long event)
1049 {
1050 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1051 	struct mlx5e_tc_fib_event_data *fib_work;
1052 	struct mlx5e_rep_priv *uplink_rpriv;
1053 	struct net_device *ul_dev;
1054 
1055 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
1056 	ul_dev = uplink_rpriv->netdev;
1057 
1058 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_KERNEL);
1059 	if (!fib_work)
1060 		return -ENOMEM;
1061 
1062 	dev_hold(ul_dev);
1063 	refcount_inc(&r->refcnt);
1064 	fib_work->r = r;
1065 	queue_work(priv->wq, &fib_work->work);
1066 
1067 	return 0;
1068 }
1069 
1070 int mlx5e_attach_decap_route(struct mlx5e_priv *priv,
1071 			     struct mlx5e_tc_flow *flow)
1072 {
1073 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1074 	unsigned long tbl_time_before, tbl_time_after;
1075 	struct mlx5e_tc_flow_parse_attr *parse_attr;
1076 	struct mlx5_flow_attr *attr = flow->attr;
1077 	struct mlx5_esw_flow_attr *esw_attr;
1078 	struct mlx5e_route_entry *r;
1079 	struct mlx5e_route_key key;
1080 	int err = 0;
1081 
1082 	esw_attr = attr->esw_attr;
1083 	parse_attr = attr->parse_attr;
1084 	mutex_lock(&esw->offloads.encap_tbl_lock);
1085 	if (!esw_attr->rx_tun_attr)
1086 		goto out;
1087 
1088 	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
1089 	tbl_time_after = tbl_time_before;
1090 	err = mlx5e_tc_tun_route_lookup(priv, &parse_attr->spec, attr);
1091 	if (err || !esw_attr->rx_tun_attr->decap_vport)
1092 		goto out;
1093 
1094 	key.ip_version = attr->ip_version;
1095 	if (key.ip_version == 4)
1096 		key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
1097 	else
1098 		key.endpoint_ip.v6 = esw_attr->rx_tun_attr->dst_ip.v6;
1099 
1100 	r = mlx5e_route_get_create(priv, &key, parse_attr->filter_dev->ifindex,
1101 				   &tbl_time_after);
1102 	if (IS_ERR(r)) {
1103 		err = PTR_ERR(r);
1104 		goto out;
1105 	}
1106 	/* Routing changed concurrently. FIB event handler might have missed new
1107 	 * entry, schedule update.
1108 	 */
1109 	if (tbl_time_before != tbl_time_after) {
1110 		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
1111 		if (err) {
1112 			mlx5e_route_put_locked(priv, r);
1113 			goto out;
1114 		}
1115 	}
1116 
1117 	flow->decap_route = r;
1118 	list_add(&flow->decap_routes, &r->decap_flows);
1119 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1120 	return 0;
1121 
1122 out:
1123 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1124 	return err;
1125 }
1126 
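/* Attach the flow's encap destination to a route cache entry so the source
 * port rewrite can be revalidated on FIB changes. If mlx5e_set_vf_tunnel()
 * did not mark the destination with MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE,
 * no route entry is created.
 */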
1127 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
1128 				    struct mlx5e_tc_flow *flow,
1129 				    struct mlx5e_encap_entry *e,
1130 				    bool new_encap_entry,
1131 				    unsigned long tbl_time_before,
1132 				    int out_index)
1133 {
1134 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1135 	unsigned long tbl_time_after = tbl_time_before;
1136 	struct mlx5e_tc_flow_parse_attr *parse_attr;
1137 	struct mlx5_flow_attr *attr = flow->attr;
1138 	const struct ip_tunnel_info *tun_info;
1139 	struct mlx5_esw_flow_attr *esw_attr;
1140 	struct mlx5e_route_entry *r;
1141 	struct mlx5e_route_key key;
1142 	unsigned short family;
1143 	int err = 0;
1144 
1145 	esw_attr = attr->esw_attr;
1146 	parse_attr = attr->parse_attr;
1147 	tun_info = parse_attr->tun_info[out_index];
1148 	family = ip_tunnel_info_af(tun_info);
1149 
1150 	if (family == AF_INET) {
1151 		key.endpoint_ip.v4 = tun_info->key.u.ipv4.src;
1152 		key.ip_version = 4;
1153 	} else if (family == AF_INET6) {
1154 		key.endpoint_ip.v6 = tun_info->key.u.ipv6.src;
1155 		key.ip_version = 6;
1156 	}
1157 
1158 	err = mlx5e_set_vf_tunnel(esw, attr, &parse_attr->mod_hdr_acts, e->out_dev,
1159 				  e->route_dev_ifindex, out_index);
1160 	if (err || !(esw_attr->dests[out_index].flags &
1161 		     MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE))
1162 		return err;
1163 
1164 	r = mlx5e_route_get_create(priv, &key, parse_attr->mirred_ifindex[out_index],
1165 				   &tbl_time_after);
1166 	if (IS_ERR(r))
1167 		return PTR_ERR(r);
1168 	/* Routing changed concurrently. FIB event handler might have missed new
1169 	 * entry, schedule update.
1170 	 */
1171 	if (tbl_time_before != tbl_time_after) {
1172 		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
1173 		if (err) {
1174 			mlx5e_route_put_locked(priv, r);
1175 			return err;
1176 		}
1177 	}
1178 
1179 	flow->encap_routes[out_index].r = r;
1180 	if (new_encap_entry)
1181 		list_add(&e->route_list, &r->encap_entries);
1182 	flow->encap_routes[out_index].index = out_index;
1183 	return 0;
1184 }
1185 
1186 void mlx5e_detach_decap_route(struct mlx5e_priv *priv,
1187 			      struct mlx5e_tc_flow *flow)
1188 {
1189 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1190 	struct mlx5e_route_entry *r = flow->decap_route;
1191 
1192 	if (!r)
1193 		return;
1194 
1195 	mutex_lock(&esw->offloads.encap_tbl_lock);
1196 	list_del(&flow->decap_routes);
1197 	flow->decap_route = NULL;
1198 
1199 	if (!refcount_dec_and_test(&r->refcnt)) {
1200 		mutex_unlock(&esw->offloads.encap_tbl_lock);
1201 		return;
1202 	}
1203 	hash_del_rcu(&r->hlist);
1204 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1205 
1206 	mlx5e_route_dealloc(priv, r);
1207 }
1208 
1209 static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
1210 				     struct mlx5e_tc_flow *flow,
1211 				     int out_index)
1212 {
1213 	struct mlx5e_route_entry *r = flow->encap_routes[out_index].r;
1214 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1215 	struct mlx5e_encap_entry *e, *tmp;
1216 
1217 	if (!r)
1218 		return;
1219 
1220 	mutex_lock(&esw->offloads.encap_tbl_lock);
1221 	flow->encap_routes[out_index].r = NULL;
1222 
1223 	if (!refcount_dec_and_test(&r->refcnt)) {
1224 		mutex_unlock(&esw->offloads.encap_tbl_lock);
1225 		return;
1226 	}
1227 	list_for_each_entry_safe(e, tmp, &r->encap_entries, route_list)
1228 		list_del_init(&e->route_list);
1229 	hash_del_rcu(&r->hlist);
1230 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1231 
1232 	mlx5e_route_dealloc(priv, r);
1233 }
1234 
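/* Tear down the hardware state of every flow using this encap entry and mark
 * the entry as having no route, releasing its packet reformat object. Used
 * when a FIB event removes or replaces the route the encap depends on.
 */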
1235 static void mlx5e_invalidate_encap(struct mlx5e_priv *priv,
1236 				   struct mlx5e_encap_entry *e,
1237 				   struct list_head *encap_flows)
1238 {
1239 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1240 	struct mlx5e_tc_flow *flow;
1241 
1242 	list_for_each_entry(flow, encap_flows, tmp_list) {
1243 		struct mlx5_flow_attr *attr = flow->attr;
1244 		struct mlx5_esw_flow_attr *esw_attr;
1245 
1246 		if (!mlx5e_is_offloaded_flow(flow))
1247 			continue;
1248 		esw_attr = attr->esw_attr;
1249 
1250 		if (flow_flag_test(flow, SLOW))
1251 			mlx5e_tc_unoffload_from_slow_path(esw, flow);
1252 		else
1253 			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
1254 		mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr);
1255 		attr->modify_hdr = NULL;
1256 
1257 		esw_attr->dests[flow->tmp_entry_index].flags &=
1258 			~MLX5_ESW_DEST_ENCAP_VALID;
1259 		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL;
1260 	}
1261 
1262 	e->flags |= MLX5_ENCAP_ENTRY_NO_ROUTE;
1263 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
1264 		e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
1265 		mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
1266 		e->pkt_reformat = NULL;
1267 	}
1268 }
1269 
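/* Rebuild the encap header for the new route and try to re-offload every
 * flow in @encap_flows: update the VF tunnel source port rewrite, then
 * install either an encap rule (when all encap destinations are valid) or a
 * slow path rule.
 */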
1270 static void mlx5e_reoffload_encap(struct mlx5e_priv *priv,
1271 				  struct net_device *tunnel_dev,
1272 				  struct mlx5e_encap_entry *e,
1273 				  struct list_head *encap_flows)
1274 {
1275 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1276 	struct mlx5e_tc_flow *flow;
1277 	int err;
1278 
1279 	err = ip_tunnel_info_af(e->tun_info) == AF_INET ?
1280 		mlx5e_tc_tun_update_header_ipv4(priv, tunnel_dev, e) :
1281 		mlx5e_tc_tun_update_header_ipv6(priv, tunnel_dev, e);
1282 	if (err)
1283 		mlx5_core_warn(priv->mdev, "Failed to update encap header, %d", err);
1284 	e->flags &= ~MLX5_ENCAP_ENTRY_NO_ROUTE;
1285 
1286 	list_for_each_entry(flow, encap_flows, tmp_list) {
1287 		struct mlx5e_tc_flow_parse_attr *parse_attr;
1288 		struct mlx5_flow_attr *attr = flow->attr;
1289 		struct mlx5_esw_flow_attr *esw_attr;
1290 		struct mlx5_flow_handle *rule;
1291 		struct mlx5_flow_spec *spec;
1292 
1293 		if (flow_flag_test(flow, FAILED))
1294 			continue;
1295 
1296 		esw_attr = attr->esw_attr;
1297 		parse_attr = attr->parse_attr;
1298 		spec = &parse_attr->spec;
1299 
1300 		err = mlx5e_update_vf_tunnel(esw, esw_attr, &parse_attr->mod_hdr_acts,
1301 					     e->out_dev, e->route_dev_ifindex,
1302 					     flow->tmp_entry_index);
1303 		if (err) {
1304 			mlx5_core_warn(priv->mdev, "Failed to update VF tunnel err=%d", err);
1305 			continue;
1306 		}
1307 
1308 		err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow);
1309 		if (err) {
1310 			mlx5_core_warn(priv->mdev, "Failed to update flow mod_hdr err=%d",
1311 				       err);
1312 			continue;
1313 		}
1314 
1315 		if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
1316 			esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
1317 			esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
1318 			if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
1319 				goto offload_to_slow_path;
1320 			/* update from slow path rule to encap rule */
1321 			rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
1322 			if (IS_ERR(rule)) {
1323 				err = PTR_ERR(rule);
1324 				mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
1325 					       err);
1326 			} else {
1327 				flow->rule[0] = rule;
1328 			}
1329 		} else {
1330 offload_to_slow_path:
1331 			rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
			/* mark the flow's encap dest as invalid */
1333 			esw_attr->dests[flow->tmp_entry_index].flags &=
1334 				~MLX5_ESW_DEST_ENCAP_VALID;
1335 
1336 			if (IS_ERR(rule)) {
1337 				err = PTR_ERR(rule);
1338 				mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
1339 					       err);
1340 			} else {
1341 				flow->rule[0] = rule;
1342 			}
1343 		}
1344 		flow_flag_set(flow, OFFLOADED);
1345 	}
1346 }
1347 
1348 static int mlx5e_update_route_encaps(struct mlx5e_priv *priv,
1349 				     struct mlx5e_route_entry *r,
1350 				     struct list_head *flow_list,
1351 				     bool replace)
1352 {
1353 	struct net_device *tunnel_dev;
1354 	struct mlx5e_encap_entry *e;
1355 
1356 	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
1357 	if (!tunnel_dev)
1358 		return -ENODEV;
1359 
1360 	list_for_each_entry(e, &r->encap_entries, route_list) {
1361 		LIST_HEAD(encap_flows);
1362 
1363 		mlx5e_take_all_encap_flows(e, &encap_flows);
1364 		if (list_empty(&encap_flows))
1365 			continue;
1366 
1367 		if (mlx5e_route_entry_valid(r))
1368 			mlx5e_invalidate_encap(priv, e, &encap_flows);
1369 
1370 		if (!replace) {
1371 			list_splice(&encap_flows, flow_list);
1372 			continue;
1373 		}
1374 
1375 		mlx5e_reoffload_encap(priv, tunnel_dev, e, &encap_flows);
1376 		list_splice(&encap_flows, flow_list);
1377 	}
1378 
1379 	return 0;
1380 }
1381 
1382 static void mlx5e_unoffload_flow_list(struct mlx5e_priv *priv,
1383 				      struct list_head *flow_list)
1384 {
1385 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1386 	struct mlx5e_tc_flow *flow;
1387 
1388 	list_for_each_entry(flow, flow_list, tmp_list)
1389 		if (mlx5e_is_offloaded_flow(flow))
1390 			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
1391 }
1392 
1393 static void mlx5e_reoffload_decap(struct mlx5e_priv *priv,
1394 				  struct list_head *decap_flows)
1395 {
1396 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1397 	struct mlx5e_tc_flow *flow;
1398 
1399 	list_for_each_entry(flow, decap_flows, tmp_list) {
1400 		struct mlx5e_tc_flow_parse_attr *parse_attr;
1401 		struct mlx5_flow_attr *attr = flow->attr;
1402 		struct mlx5_flow_handle *rule;
1403 		struct mlx5_flow_spec *spec;
1404 		int err;
1405 
1406 		if (flow_flag_test(flow, FAILED))
1407 			continue;
1408 
1409 		parse_attr = attr->parse_attr;
1410 		spec = &parse_attr->spec;
1411 		err = mlx5e_tc_tun_route_lookup(priv, spec, attr);
1412 		if (err) {
1413 			mlx5_core_warn(priv->mdev, "Failed to lookup route for flow, %d\n",
1414 				       err);
1415 			continue;
1416 		}
1417 
1418 		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
1419 		if (IS_ERR(rule)) {
1420 			err = PTR_ERR(rule);
1421 			mlx5_core_warn(priv->mdev, "Failed to update cached decap flow, %d\n",
1422 				       err);
1423 		} else {
1424 			flow->rule[0] = rule;
1425 			flow_flag_set(flow, OFFLOADED);
1426 		}
1427 	}
1428 }
1429 
1430 static int mlx5e_update_route_decap_flows(struct mlx5e_priv *priv,
1431 					  struct mlx5e_route_entry *r,
1432 					  struct list_head *flow_list,
1433 					  bool replace)
1434 {
1435 	struct net_device *tunnel_dev;
1436 	LIST_HEAD(decap_flows);
1437 
1438 	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
1439 	if (!tunnel_dev)
1440 		return -ENODEV;
1441 
1442 	mlx5e_take_all_route_decap_flows(r, &decap_flows);
1443 	if (mlx5e_route_entry_valid(r))
1444 		mlx5e_unoffload_flow_list(priv, &decap_flows);
1445 	if (replace)
1446 		mlx5e_reoffload_decap(priv, &decap_flows);
1447 
1448 	list_splice(&decap_flows, flow_list);
1449 
1450 	return 0;
1451 }
1452 
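/* Deferred handler for FIB events: with rtnl and encap_tbl_lock held,
 * re-offload or tear down all encap entries and decap flows that use the
 * affected route, and mark the route entry valid again on replace events.
 */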
1453 static void mlx5e_tc_fib_event_work(struct work_struct *work)
1454 {
1455 	struct mlx5e_tc_fib_event_data *event_data =
1456 		container_of(work, struct mlx5e_tc_fib_event_data, work);
1457 	struct net_device *ul_dev = event_data->ul_dev;
1458 	struct mlx5e_priv *priv = netdev_priv(ul_dev);
1459 	struct mlx5e_route_entry *r = event_data->r;
1460 	struct mlx5_eswitch *esw;
1461 	LIST_HEAD(flow_list);
1462 	bool replace;
1463 	int err;
1464 
1465 	/* sync with concurrent neigh updates */
1466 	rtnl_lock();
1467 	esw = priv->mdev->priv.eswitch;
1468 	mutex_lock(&esw->offloads.encap_tbl_lock);
1469 	replace = event_data->event == FIB_EVENT_ENTRY_REPLACE;
1470 
1471 	if (!mlx5e_route_entry_valid(r) && !replace)
1472 		goto out;
1473 
1474 	err = mlx5e_update_route_encaps(priv, r, &flow_list, replace);
1475 	if (err)
1476 		mlx5_core_warn(priv->mdev, "Failed to update route encaps, %d\n",
1477 			       err);
1478 
1479 	err = mlx5e_update_route_decap_flows(priv, r, &flow_list, replace);
1480 	if (err)
1481 		mlx5_core_warn(priv->mdev, "Failed to update route decap flows, %d\n",
1482 			       err);
1483 
1484 	if (replace)
1485 		r->flags |= MLX5E_ROUTE_ENTRY_VALID;
1486 out:
1487 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1488 	rtnl_unlock();
1489 
1490 	mlx5e_put_flow_list(priv, &flow_list);
1491 	mlx5e_route_put(priv, event_data->r);
1492 	dev_put(event_data->ul_dev);
1493 	kfree(event_data);
1494 }
1495 
1496 static struct mlx5e_tc_fib_event_data *
1497 mlx5e_init_fib_work_ipv4(struct mlx5e_priv *priv,
1498 			 struct net_device *ul_dev,
1499 			 struct mlx5e_tc_tun_encap *encap,
1500 			 unsigned long event,
1501 			 struct fib_notifier_info *info)
1502 {
1503 	struct fib_entry_notifier_info *fen_info;
1504 	struct mlx5e_tc_fib_event_data *fib_work;
1505 	struct mlx5e_route_entry *r;
1506 	struct mlx5e_route_key key;
1507 	struct net_device *fib_dev;
1508 
1509 	fen_info = container_of(info, struct fib_entry_notifier_info, info);
1510 	fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev;
1511 	if (fib_dev->netdev_ops != &mlx5e_netdev_ops ||
1512 	    fen_info->dst_len != 32)
1513 		return NULL;
1514 
1515 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
1516 	if (!fib_work)
1517 		return ERR_PTR(-ENOMEM);
1518 
1519 	key.endpoint_ip.v4 = htonl(fen_info->dst);
1520 	key.ip_version = 4;
1521 
	/* Can't fail after this point: releasing the reference to r would
	 * require taking a sleeping mutex, which we can't do in atomic
	 * context.
	 */
1526 	r = mlx5e_route_lookup_for_update(encap, &key);
1527 	if (!r)
1528 		goto out;
1529 	fib_work->r = r;
1530 	dev_hold(ul_dev);
1531 
1532 	return fib_work;
1533 
1534 out:
1535 	kfree(fib_work);
1536 	return NULL;
1537 }
1538 
1539 static struct mlx5e_tc_fib_event_data *
1540 mlx5e_init_fib_work_ipv6(struct mlx5e_priv *priv,
1541 			 struct net_device *ul_dev,
1542 			 struct mlx5e_tc_tun_encap *encap,
1543 			 unsigned long event,
1544 			 struct fib_notifier_info *info)
1545 {
1546 	struct fib6_entry_notifier_info *fen_info;
1547 	struct mlx5e_tc_fib_event_data *fib_work;
1548 	struct mlx5e_route_entry *r;
1549 	struct mlx5e_route_key key;
1550 	struct net_device *fib_dev;
1551 
1552 	fen_info = container_of(info, struct fib6_entry_notifier_info, info);
1553 	fib_dev = fib6_info_nh_dev(fen_info->rt);
1554 	if (fib_dev->netdev_ops != &mlx5e_netdev_ops ||
1555 	    fen_info->rt->fib6_dst.plen != 128)
1556 		return NULL;
1557 
1558 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
1559 	if (!fib_work)
1560 		return ERR_PTR(-ENOMEM);
1561 
1562 	memcpy(&key.endpoint_ip.v6, &fen_info->rt->fib6_dst.addr,
1563 	       sizeof(fen_info->rt->fib6_dst.addr));
1564 	key.ip_version = 6;
1565 
	/* Can't fail after this point: releasing the reference to r would
	 * require taking a sleeping mutex, which we can't do in atomic
	 * context.
	 */
1570 	r = mlx5e_route_lookup_for_update(encap, &key);
1571 	if (!r)
1572 		goto out;
1573 	fib_work->r = r;
1574 	dev_hold(ul_dev);
1575 
1576 	return fib_work;
1577 
1578 out:
1579 	kfree(fib_work);
1580 	return NULL;
1581 }
1582 
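/* FIB notifier callback. Runs in atomic context, so it only looks up the
 * affected route entry (GFP_ATOMIC allocation, no sleeping locks) and defers
 * the actual update to mlx5e_tc_fib_event_work() on the priv workqueue.
 */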
1583 static int mlx5e_tc_tun_fib_event(struct notifier_block *nb, unsigned long event, void *ptr)
1584 {
1585 	struct mlx5e_tc_fib_event_data *fib_work;
1586 	struct fib_notifier_info *info = ptr;
1587 	struct mlx5e_tc_tun_encap *encap;
1588 	struct net_device *ul_dev;
1589 	struct mlx5e_priv *priv;
1590 
1591 	encap = container_of(nb, struct mlx5e_tc_tun_encap, fib_nb);
1592 	priv = encap->priv;
1593 	ul_dev = priv->netdev;
1594 	priv = netdev_priv(ul_dev);
1595 
1596 	switch (event) {
1597 	case FIB_EVENT_ENTRY_REPLACE:
1598 	case FIB_EVENT_ENTRY_DEL:
1599 		if (info->family == AF_INET)
1600 			fib_work = mlx5e_init_fib_work_ipv4(priv, ul_dev, encap, event, info);
1601 		else if (info->family == AF_INET6)
1602 			fib_work = mlx5e_init_fib_work_ipv6(priv, ul_dev, encap, event, info);
1603 		else
1604 			return NOTIFY_DONE;
1605 
1606 		if (!IS_ERR_OR_NULL(fib_work)) {
1607 			queue_work(priv->wq, &fib_work->work);
1608 		} else if (IS_ERR(fib_work)) {
1609 			NL_SET_ERR_MSG_MOD(info->extack, "Failed to init fib work");
1610 			mlx5_core_warn(priv->mdev, "Failed to init fib work, %ld\n",
1611 				       PTR_ERR(fib_work));
1612 		}
1613 
1614 		break;
1615 	default:
1616 		return NOTIFY_DONE;
1617 	}
1618 
1619 	return NOTIFY_DONE;
1620 }
1621 
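/* Allocate the per-uplink tunnel encap context and register its FIB
 * notifier. A minimal usage sketch (assuming an uplink representor's
 * uplink_priv, as used elsewhere in this file):
 *
 *	uplink_priv->encap = mlx5e_tc_tun_init(priv);
 *	if (IS_ERR(uplink_priv->encap))
 *		return PTR_ERR(uplink_priv->encap);
 *	...
 *	mlx5e_tc_tun_cleanup(uplink_priv->encap);
 */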
1622 struct mlx5e_tc_tun_encap *mlx5e_tc_tun_init(struct mlx5e_priv *priv)
1623 {
1624 	struct mlx5e_tc_tun_encap *encap;
1625 	int err;
1626 
1627 	encap = kvzalloc(sizeof(*encap), GFP_KERNEL);
1628 	if (!encap)
1629 		return ERR_PTR(-ENOMEM);
1630 
1631 	encap->priv = priv;
1632 	encap->fib_nb.notifier_call = mlx5e_tc_tun_fib_event;
1633 	spin_lock_init(&encap->route_lock);
1634 	hash_init(encap->route_tbl);
1635 	err = register_fib_notifier(dev_net(priv->netdev), &encap->fib_nb,
1636 				    NULL, NULL);
1637 	if (err) {
1638 		kvfree(encap);
1639 		return ERR_PTR(err);
1640 	}
1641 
1642 	return encap;
1643 }
1644 
1645 void mlx5e_tc_tun_cleanup(struct mlx5e_tc_tun_encap *encap)
1646 {
1647 	if (!encap)
1648 		return;
1649 
1650 	unregister_fib_notifier(dev_net(encap->priv->netdev), &encap->fib_nb);
	flush_workqueue(encap->priv->wq); /* flush pending fib event work */
1652 	kvfree(encap);
1653 }
1654