// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2021 Mellanox Technologies. */

#include <net/fib_notifier.h>
#include <net/nexthop.h>
#include "tc_tun_encap.h"
#include "en_tc.h"
#include "tc_tun.h"
#include "rep/tc.h"
#include "diag/en_tc_tracepoint.h"

enum {
	MLX5E_ROUTE_ENTRY_VALID     = BIT(0),
};

struct mlx5e_route_key {
	int ip_version;
	union {
		__be32 v4;
		struct in6_addr v6;
	} endpoint_ip;
};

struct mlx5e_route_entry {
	struct mlx5e_route_key key;
	struct list_head encap_entries;
	struct list_head decap_flows;
	u32 flags;
	struct hlist_node hlist;
	refcount_t refcnt;
	int tunnel_dev_index;
	struct rcu_head rcu;
};

struct mlx5e_tc_tun_encap {
	struct mlx5e_priv *priv;
	struct notifier_block fib_nb;
	spinlock_t route_lock; /* protects route_tbl */
	unsigned long route_tbl_last_update;
	DECLARE_HASHTABLE(route_tbl, 8);
};

static bool mlx5e_route_entry_valid(struct mlx5e_route_entry *r)
{
	return r->flags & MLX5E_ROUTE_ENTRY_VALID;
}

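/* Allocate esw_attr->rx_tun_attr for the flow and copy the outer source and
 * destination IP addresses from the match spec into it. The TUN_RX flag is
 * set only when both addresses are present, as both are needed for the
 * decap route lookup.
 */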
int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
			     struct mlx5_flow_spec *spec)
{
	struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr;
	struct mlx5_rx_tun_attr *tun_attr;
	void *daddr, *saddr;
	u8 ip_version;

	tun_attr = kvzalloc(sizeof(*tun_attr), GFP_KERNEL);
	if (!tun_attr)
		return -ENOMEM;

	esw_attr->rx_tun_attr = tun_attr;
	ip_version = mlx5e_tc_get_ip_version(spec, true);

	if (ip_version == 4) {
		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
				     outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
				     outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4);
		tun_attr->dst_ip.v4 = *(__be32 *)daddr;
		tun_attr->src_ip.v4 = *(__be32 *)saddr;
		if (!tun_attr->dst_ip.v4 || !tun_attr->src_ip.v4)
			return 0;
	}
#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
	else if (ip_version == 6) {
		int ipv6_size = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
		struct in6_addr zerov6 = {};

		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
				     outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
				     outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6);
		memcpy(&tun_attr->dst_ip.v6, daddr, ipv6_size);
		memcpy(&tun_attr->src_ip.v6, saddr, ipv6_size);
		if (!memcmp(&tun_attr->dst_ip.v6, &zerov6, sizeof(zerov6)) ||
		    !memcmp(&tun_attr->src_ip.v6, &zerov6, sizeof(zerov6)))
			return 0;
	}
#endif
	/* Only set the flag if both src and dst ip addresses exist. They are
	 * required to establish routing.
	 */
	flow_flag_set(flow, TUN_RX);
	flow->attr->tun_ip_version = ip_version;
	return 0;
}

static bool mlx5e_tc_flow_all_encaps_valid(struct mlx5_esw_flow_attr *esw_attr)
{
	bool all_flow_encaps_valid = true;
	int i;

	/* Flow can be associated with multiple encap entries.
	 * Before offloading the flow verify that all of them have
	 * a valid neighbour.
	 */
	for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) {
		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP))
			continue;
		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) {
			all_flow_encaps_valid = false;
			break;
		}
	}

	return all_flow_encaps_valid;
}

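/* Offload the cached encapsulation header of 'e' as a packet reformat
 * context and try to move each flow in 'flow_list' from its slow path rule
 * back to the full encap rule, skipping flows that still have unresolved
 * neighbours.
 */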
void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
			      struct mlx5e_encap_entry *e,
			      struct list_head *flow_list)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5_pkt_reformat_params reformat_params;
	struct mlx5_esw_flow_attr *esw_attr;
	struct mlx5_flow_handle *rule;
	struct mlx5_flow_attr *attr;
	struct mlx5_flow_spec *spec;
	struct mlx5e_tc_flow *flow;
	int err;

	if (e->flags & MLX5_ENCAP_ENTRY_NO_ROUTE)
		return;

	memset(&reformat_params, 0, sizeof(reformat_params));
	reformat_params.type = e->reformat_type;
	reformat_params.size = e->encap_size;
	reformat_params.data = e->encap_header;
	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
						     &reformat_params,
						     MLX5_FLOW_NAMESPACE_FDB);
	if (IS_ERR(e->pkt_reformat)) {
		mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %lu\n",
			       PTR_ERR(e->pkt_reformat));
		return;
	}
	e->flags |= MLX5_ENCAP_ENTRY_VALID;
	mlx5e_rep_queue_neigh_stats_work(priv);

	list_for_each_entry(flow, flow_list, tmp_list) {
		if (!mlx5e_is_offloaded_flow(flow) || !flow_flag_test(flow, SLOW))
			continue;
		attr = flow->attr;
		esw_attr = attr->esw_attr;
		spec = &attr->parse_attr->spec;

		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
		esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;

		/* Do not offload flows with unresolved neighbors */
		if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
			continue;
		/* update from slow path rule to encap rule */
		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
		if (IS_ERR(rule)) {
			err = PTR_ERR(rule);
			mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
				       err);
			continue;
		}

		mlx5e_tc_unoffload_from_slow_path(esw, flow);
		flow->rule[0] = rule;
		/* was unset when slow path rule removed */
		flow_flag_set(flow, OFFLOADED);
	}
}

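/* Reverse of mlx5e_tc_encap_flows_add(): move offloaded flows in 'flow_list'
 * back to the slow path rule, clear the encap entry's VALID flag and release
 * its packet reformat context.
 */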
void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
			      struct mlx5e_encap_entry *e,
			      struct list_head *flow_list)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5_esw_flow_attr *esw_attr;
	struct mlx5_flow_handle *rule;
	struct mlx5_flow_attr *attr;
	struct mlx5_flow_spec *spec;
	struct mlx5e_tc_flow *flow;
	int err;

	list_for_each_entry(flow, flow_list, tmp_list) {
		if (!mlx5e_is_offloaded_flow(flow) || flow_flag_test(flow, SLOW))
			continue;
		attr = flow->attr;
		esw_attr = attr->esw_attr;
		spec = &attr->parse_attr->spec;

		/* update from encap rule to slow path rule */
		rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
		/* mark the flow's encap dest as non-valid */
		esw_attr->dests[flow->tmp_entry_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID;

		if (IS_ERR(rule)) {
			err = PTR_ERR(rule);
			mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
				       err);
			continue;
		}

		mlx5e_tc_unoffload_fdb_rules(esw, flow, attr);
		flow->rule[0] = rule;
		/* was unset when fast path rule removed */
		flow_flag_set(flow, OFFLOADED);
	}

	/* we know that the encap is valid */
	e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
	mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
}

static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow,
				struct list_head *flow_list,
				int index)
{
	if (IS_ERR(mlx5e_flow_get(flow)))
		return;
	wait_for_completion(&flow->init_done);

	flow->tmp_entry_index = index;
	list_add(&flow->tmp_list, flow_list);
}

/* Takes reference to all flows attached to encap and adds the flows to
 * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
 */
void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list)
{
	struct encap_flow_item *efi;
	struct mlx5e_tc_flow *flow;

	list_for_each_entry(efi, &e->flows, list) {
		flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]);
		mlx5e_take_tmp_flow(flow, flow_list, efi->index);
	}
}

/* Takes reference to all flows attached to route and adds the flows to
 * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
 */
static void mlx5e_take_all_route_decap_flows(struct mlx5e_route_entry *r,
					     struct list_head *flow_list)
{
	struct mlx5e_tc_flow *flow;

	list_for_each_entry(flow, &r->decap_flows, decap_routes)
		mlx5e_take_tmp_flow(flow, flow_list, 0);
}

typedef bool (match_cb)(struct mlx5e_encap_entry *);

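/* Iterate nhe->encap_list under RCU, starting after 'e' (or from the head
 * when 'e' is NULL). Releases the reference on 'e' and returns a reference
 * on the next fully initialized entry that satisfies 'match', or NULL.
 */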
static struct mlx5e_encap_entry *
mlx5e_get_next_matching_encap(struct mlx5e_neigh_hash_entry *nhe,
			      struct mlx5e_encap_entry *e,
			      match_cb match)
{
	struct mlx5e_encap_entry *next = NULL;

retry:
	rcu_read_lock();

	/* find encap with non-zero reference counter value */
	for (next = e ?
		     list_next_or_null_rcu(&nhe->encap_list,
					   &e->encap_list,
					   struct mlx5e_encap_entry,
					   encap_list) :
		     list_first_or_null_rcu(&nhe->encap_list,
					    struct mlx5e_encap_entry,
					    encap_list);
	     next;
	     next = list_next_or_null_rcu(&nhe->encap_list,
					  &next->encap_list,
					  struct mlx5e_encap_entry,
					  encap_list))
		if (mlx5e_encap_take(next))
			break;

	rcu_read_unlock();

	/* release starting encap */
	if (e)
		mlx5e_encap_put(netdev_priv(e->out_dev), e);
	if (!next)
		return next;

	/* wait for encap to be fully initialized */
	wait_for_completion(&next->res_ready);
	/* continue searching if encap entry is not in valid state after completion */
	if (!match(next)) {
		e = next;
		goto retry;
	}

	return next;
}

static bool mlx5e_encap_valid(struct mlx5e_encap_entry *e)
{
	return e->flags & MLX5_ENCAP_ENTRY_VALID;
}

static struct mlx5e_encap_entry *
mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe,
			   struct mlx5e_encap_entry *e)
{
	return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_valid);
}

static bool mlx5e_encap_initialized(struct mlx5e_encap_entry *e)
{
	return e->compl_result >= 0;
}

struct mlx5e_encap_entry *
mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe,
			  struct mlx5e_encap_entry *e)
{
	return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_initialized);
}

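/* Check whether any offloaded flow encapsulating towards this neighbour has
 * passed traffic since the last report; if so, update reported_lastuse and
 * notify the neighbour subsystem so the entry is kept alive.
 */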
void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
{
	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
	struct mlx5e_encap_entry *e = NULL;
	struct mlx5e_tc_flow *flow;
	struct mlx5_fc *counter;
	struct neigh_table *tbl;
	bool neigh_used = false;
	struct neighbour *n;
	u64 lastuse;

	if (m_neigh->family == AF_INET)
		tbl = &arp_tbl;
#if IS_ENABLED(CONFIG_IPV6)
	else if (m_neigh->family == AF_INET6)
		tbl = ipv6_stub->nd_tbl;
#endif
	else
		return;

	/* mlx5e_get_next_valid_encap() releases previous encap before returning
	 * next one.
	 */
	while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) {
		struct mlx5e_priv *priv = netdev_priv(e->out_dev);
		struct encap_flow_item *efi, *tmp;
		struct mlx5_eswitch *esw;
		LIST_HEAD(flow_list);

		esw = priv->mdev->priv.eswitch;
		mutex_lock(&esw->offloads.encap_tbl_lock);
		list_for_each_entry_safe(efi, tmp, &e->flows, list) {
			flow = container_of(efi, struct mlx5e_tc_flow,
					    encaps[efi->index]);
			if (IS_ERR(mlx5e_flow_get(flow)))
				continue;
			list_add(&flow->tmp_list, &flow_list);

			if (mlx5e_is_offloaded_flow(flow)) {
				counter = mlx5e_tc_get_counter(flow);
				lastuse = mlx5_fc_query_lastuse(counter);
				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
					neigh_used = true;
					break;
				}
			}
		}
		mutex_unlock(&esw->offloads.encap_tbl_lock);

		mlx5e_put_flow_list(priv, &flow_list);
		if (neigh_used) {
			/* release current encap before breaking the loop */
			mlx5e_encap_put(priv, e);
			break;
		}
	}

	trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used);

	if (neigh_used) {
		nhe->reported_lastuse = jiffies;

		/* find the relevant neigh according to the cached device and
		 * dst ip pair
		 */
		n = neigh_lookup(tbl, &m_neigh->dst_ip, READ_ONCE(nhe->neigh_dev));
		if (!n)
			return;

		neigh_event_send(n, NULL);
		neigh_release(n);
	}
}

static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
{
	WARN_ON(!list_empty(&e->flows));

	if (e->compl_result > 0) {
		mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);

		if (e->flags & MLX5_ENCAP_ENTRY_VALID)
			mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
	}

	kfree(e->tun_info);
	kfree(e->encap_header);
	kfree_rcu(e, rcu);
}

static void mlx5e_decap_dealloc(struct mlx5e_priv *priv,
				struct mlx5e_decap_entry *d)
{
	WARN_ON(!list_empty(&d->flows));

	if (!d->compl_result)
		mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat);

	kfree_rcu(d, rcu);
}

void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;

	if (!refcount_dec_and_mutex_lock(&e->refcnt, &esw->offloads.encap_tbl_lock))
		return;
	list_del(&e->route_list);
	hash_del_rcu(&e->encap_hlist);
	mutex_unlock(&esw->offloads.encap_tbl_lock);

	mlx5e_encap_dealloc(priv, e);
}

static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;

	if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock))
		return;
	hash_del_rcu(&d->hlist);
	mutex_unlock(&esw->offloads.decap_tbl_lock);

	mlx5e_decap_dealloc(priv, d);
}

static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
				     struct mlx5e_tc_flow *flow,
				     int out_index);

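/* Detach the flow from its encap entry at 'out_index'; if this drops the
 * last reference, remove the entry from the encap table and free it.
 */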
void mlx5e_detach_encap(struct mlx5e_priv *priv,
			struct mlx5e_tc_flow *flow, int out_index)
{
	struct mlx5e_encap_entry *e = flow->encaps[out_index].e;
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;

	if (flow->attr->esw_attr->dests[out_index].flags &
	    MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)
		mlx5e_detach_encap_route(priv, flow, out_index);

	/* flow wasn't fully initialized */
	if (!e)
		return;

	mutex_lock(&esw->offloads.encap_tbl_lock);
	list_del(&flow->encaps[out_index].list);
	flow->encaps[out_index].e = NULL;
	if (!refcount_dec_and_test(&e->refcnt)) {
		mutex_unlock(&esw->offloads.encap_tbl_lock);
		return;
	}
	list_del(&e->route_list);
	hash_del_rcu(&e->encap_hlist);
	mutex_unlock(&esw->offloads.encap_tbl_lock);

	mlx5e_encap_dealloc(priv, e);
}

void mlx5e_detach_decap(struct mlx5e_priv *priv,
			struct mlx5e_tc_flow *flow)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_decap_entry *d = flow->decap_reformat;

	if (!d)
		return;

	mutex_lock(&esw->offloads.decap_tbl_lock);
	list_del(&flow->l3_to_l2_reformat);
	flow->decap_reformat = NULL;

	if (!refcount_dec_and_test(&d->refcnt)) {
		mutex_unlock(&esw->offloads.decap_tbl_lock);
		return;
	}
	hash_del_rcu(&d->hlist);
	mutex_unlock(&esw->offloads.decap_tbl_lock);

	mlx5e_decap_dealloc(priv, d);
}

bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a,
					   struct mlx5e_encap_key *b)
{
	return memcmp(a->ip_tun_key, b->ip_tun_key, sizeof(*a->ip_tun_key)) == 0 &&
		a->tc_tunnel->tunnel_type == b->tc_tunnel->tunnel_type;
}

static int cmp_decap_info(struct mlx5e_decap_key *a,
			  struct mlx5e_decap_key *b)
{
	return memcmp(&a->key, &b->key, sizeof(b->key));
}

static int hash_encap_info(struct mlx5e_encap_key *key)
{
	return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key),
		     key->tc_tunnel->tunnel_type);
}

static int hash_decap_info(struct mlx5e_decap_key *key)
{
	return jhash(&key->key, sizeof(key->key), 0);
}

bool mlx5e_encap_take(struct mlx5e_encap_entry *e)
{
	return refcount_inc_not_zero(&e->refcnt);
}

static bool mlx5e_decap_take(struct mlx5e_decap_entry *e)
{
	return refcount_inc_not_zero(&e->refcnt);
}

static struct mlx5e_encap_entry *
mlx5e_encap_get(struct mlx5e_priv *priv, struct mlx5e_encap_key *key,
		uintptr_t hash_key)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_encap_key e_key;
	struct mlx5e_encap_entry *e;

	hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
				   encap_hlist, hash_key) {
		e_key.ip_tun_key = &e->tun_info->key;
		e_key.tc_tunnel = e->tunnel;
		if (e->tunnel->encap_info_equal(&e_key, key) &&
		    mlx5e_encap_take(e))
			return e;
	}

	return NULL;
}

static struct mlx5e_decap_entry *
mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key,
		uintptr_t hash_key)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_decap_key r_key;
	struct mlx5e_decap_entry *e;

	hash_for_each_possible_rcu(esw->offloads.decap_tbl, e,
				   hlist, hash_key) {
		r_key = e->key;
		if (!cmp_decap_info(&r_key, key) &&
		    mlx5e_decap_take(e))
			return e;
	}
	return NULL;
}

struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info)
{
	size_t tun_size = sizeof(*tun_info) + tun_info->options_len;

	return kmemdup(tun_info, tun_size, GFP_KERNEL);
}

static bool is_duplicated_encap_entry(struct mlx5e_priv *priv,
				      struct mlx5e_tc_flow *flow,
				      int out_index,
				      struct mlx5e_encap_entry *e,
				      struct netlink_ext_ack *extack)
{
	int i;

	for (i = 0; i < out_index; i++) {
		if (flow->encaps[i].e != e)
			continue;
		NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action");
		netdev_err(priv->netdev, "can't duplicate encap action\n");
		return true;
	}

	return false;
}

static int mlx5e_set_vf_tunnel(struct mlx5_eswitch *esw,
			       struct mlx5_flow_attr *attr,
			       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
			       struct net_device *out_dev,
			       int route_dev_ifindex,
			       int out_index)
{
	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
	struct net_device *route_dev;
	u16 vport_num;
	int err = 0;
	u32 data;

	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);

	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev))
		goto out;

	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
	if (err)
		goto out;

	attr->dest_chain = 0;
	attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
	esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE;
	data = mlx5_eswitch_get_vport_metadata_for_set(esw_attr->in_mdev->priv.eswitch,
						       vport_num);
	err = mlx5e_tc_match_to_reg_set_and_get_id(esw->dev, mod_hdr_acts,
						   MLX5_FLOW_NAMESPACE_FDB,
						   VPORT_TO_REG, data);
	if (err >= 0) {
		esw_attr->dests[out_index].src_port_rewrite_act_id = err;
		err = 0;
	}

out:
	if (route_dev)
		dev_put(route_dev);
	return err;
}

static int mlx5e_update_vf_tunnel(struct mlx5_eswitch *esw,
				  struct mlx5_esw_flow_attr *attr,
				  struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
				  struct net_device *out_dev,
				  int route_dev_ifindex,
				  int out_index)
{
	int act_id = attr->dests[out_index].src_port_rewrite_act_id;
	struct net_device *route_dev;
	u16 vport_num;
	int err = 0;
	u32 data;

	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);

	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev)) {
		err = -ENODEV;
		goto out;
	}

	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
	if (err)
		goto out;

	data = mlx5_eswitch_get_vport_metadata_for_set(attr->in_mdev->priv.eswitch,
						       vport_num);
	mlx5e_tc_match_to_reg_mod_hdr_change(esw->dev, mod_hdr_acts, VPORT_TO_REG, act_id, data);

out:
	if (route_dev)
		dev_put(route_dev);
	return err;
}

static unsigned int mlx5e_route_tbl_get_last_update(struct mlx5e_priv *priv)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5_rep_uplink_priv *uplink_priv;
	struct mlx5e_rep_priv *uplink_rpriv;
	struct mlx5e_tc_tun_encap *encap;
	unsigned int ret;

	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
	uplink_priv = &uplink_rpriv->uplink_priv;
	encap = uplink_priv->encap;

	spin_lock_bh(&encap->route_lock);
	ret = encap->route_tbl_last_update;
	spin_unlock_bh(&encap->route_lock);
	return ret;
}

static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
				    struct mlx5e_tc_flow *flow,
				    struct mlx5e_encap_entry *e,
				    bool new_encap_entry,
				    unsigned long tbl_time_before,
				    int out_index);

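/* Look up an encap entry matching the flow's tunnel info or create a new
 * one, build the encapsulation header, and attach the flow to the entry.
 * '*encap_valid' reports whether the neighbour is already resolved, i.e.
 * whether the flow can be offloaded with the encap rule right away.
 */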
int mlx5e_attach_encap(struct mlx5e_priv *priv,
		       struct mlx5e_tc_flow *flow,
		       struct net_device *mirred_dev,
		       int out_index,
		       struct netlink_ext_ack *extack,
		       struct net_device **encap_dev,
		       bool *encap_valid)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_tc_flow_parse_attr *parse_attr;
	struct mlx5_flow_attr *attr = flow->attr;
	const struct ip_tunnel_info *tun_info;
	unsigned long tbl_time_before = 0;
	struct mlx5e_encap_entry *e;
	struct mlx5e_encap_key key;
	bool entry_created = false;
	unsigned short family;
	uintptr_t hash_key;
	int err = 0;

	parse_attr = attr->parse_attr;
	tun_info = parse_attr->tun_info[out_index];
	family = ip_tunnel_info_af(tun_info);
	key.ip_tun_key = &tun_info->key;
	key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev);
	if (!key.tc_tunnel) {
		NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel");
		return -EOPNOTSUPP;
	}

	hash_key = hash_encap_info(&key);

	mutex_lock(&esw->offloads.encap_tbl_lock);
	e = mlx5e_encap_get(priv, &key, hash_key);

	/* must verify if encap is valid or not */
	if (e) {
		/* Check that entry was not already attached to this flow */
		if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) {
			err = -EOPNOTSUPP;
			goto out_err;
		}

		mutex_unlock(&esw->offloads.encap_tbl_lock);
		wait_for_completion(&e->res_ready);

		/* Protect against concurrent neigh update. */
		mutex_lock(&esw->offloads.encap_tbl_lock);
		if (e->compl_result < 0) {
			err = -EREMOTEIO;
			goto out_err;
		}
		goto attach_flow;
	}

	e = kzalloc(sizeof(*e), GFP_KERNEL);
	if (!e) {
		err = -ENOMEM;
		goto out_err;
	}

	refcount_set(&e->refcnt, 1);
	init_completion(&e->res_ready);
	entry_created = true;
	INIT_LIST_HEAD(&e->route_list);

	tun_info = mlx5e_dup_tun_info(tun_info);
	if (!tun_info) {
		err = -ENOMEM;
		goto out_err_init;
	}
	e->tun_info = tun_info;
	err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack);
	if (err)
		goto out_err_init;

	INIT_LIST_HEAD(&e->flows);
	hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
	mutex_unlock(&esw->offloads.encap_tbl_lock);

	if (family == AF_INET)
		err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e);
	else if (family == AF_INET6)
		err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e);

	/* Protect against concurrent neigh update. */
	mutex_lock(&esw->offloads.encap_tbl_lock);
	complete_all(&e->res_ready);
	if (err) {
		e->compl_result = err;
		goto out_err;
	}
	e->compl_result = 1;

attach_flow:
	err = mlx5e_attach_encap_route(priv, flow, e, entry_created, tbl_time_before,
				       out_index);
	if (err)
		goto out_err;

	flow->encaps[out_index].e = e;
	list_add(&flow->encaps[out_index].list, &e->flows);
	flow->encaps[out_index].index = out_index;
	*encap_dev = e->out_dev;
	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
		attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat;
		attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
		*encap_valid = true;
	} else {
		*encap_valid = false;
	}
	mutex_unlock(&esw->offloads.encap_tbl_lock);

	return err;

out_err:
	mutex_unlock(&esw->offloads.encap_tbl_lock);
	if (e)
		mlx5e_encap_put(priv, e);
	return err;

out_err_init:
	mutex_unlock(&esw->offloads.encap_tbl_lock);
	kfree(tun_info);
	kfree(e);
	return err;
}

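/* Attach the flow to a shared L3-to-L2 reformat entry keyed on
 * parse_attr->eth; the packet reformat context is allocated on first use
 * and reused by later flows with the same key.
 */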
int mlx5e_attach_decap(struct mlx5e_priv *priv,
		       struct mlx5e_tc_flow *flow,
		       struct netlink_ext_ack *extack)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr;
	struct mlx5_pkt_reformat_params reformat_params;
	struct mlx5e_tc_flow_parse_attr *parse_attr;
	struct mlx5e_decap_entry *d;
	struct mlx5e_decap_key key;
	uintptr_t hash_key;
	int err = 0;

	parse_attr = flow->attr->parse_attr;
	if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) {
		NL_SET_ERR_MSG_MOD(extack,
				   "encap header larger than max supported");
		return -EOPNOTSUPP;
	}

	key.key = parse_attr->eth;
	hash_key = hash_decap_info(&key);
	mutex_lock(&esw->offloads.decap_tbl_lock);
	d = mlx5e_decap_get(priv, &key, hash_key);
	if (d) {
		mutex_unlock(&esw->offloads.decap_tbl_lock);
		wait_for_completion(&d->res_ready);
		mutex_lock(&esw->offloads.decap_tbl_lock);
		if (d->compl_result) {
			err = -EREMOTEIO;
			goto out_free;
		}
		goto found;
	}

	d = kzalloc(sizeof(*d), GFP_KERNEL);
	if (!d) {
		err = -ENOMEM;
		goto out_err;
	}

	d->key = key;
	refcount_set(&d->refcnt, 1);
	init_completion(&d->res_ready);
	INIT_LIST_HEAD(&d->flows);
	hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key);
	mutex_unlock(&esw->offloads.decap_tbl_lock);

	memset(&reformat_params, 0, sizeof(reformat_params));
	reformat_params.type = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
	reformat_params.size = sizeof(parse_attr->eth);
	reformat_params.data = &parse_attr->eth;
	d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
						     &reformat_params,
						     MLX5_FLOW_NAMESPACE_FDB);
	if (IS_ERR(d->pkt_reformat)) {
		err = PTR_ERR(d->pkt_reformat);
		d->compl_result = err;
	}
	mutex_lock(&esw->offloads.decap_tbl_lock);
	complete_all(&d->res_ready);
	if (err)
		goto out_free;

found:
	flow->decap_reformat = d;
	attr->decap_pkt_reformat = d->pkt_reformat;
	list_add(&flow->l3_to_l2_reformat, &d->flows);
	mutex_unlock(&esw->offloads.decap_tbl_lock);
	return 0;

out_free:
	mutex_unlock(&esw->offloads.decap_tbl_lock);
	mlx5e_decap_put(priv, d);
	return err;

out_err:
	mutex_unlock(&esw->offloads.decap_tbl_lock);
	return err;
}

static int cmp_route_info(struct mlx5e_route_key *a,
			  struct mlx5e_route_key *b)
{
	if (a->ip_version == 4 && b->ip_version == 4)
		return memcmp(&a->endpoint_ip.v4, &b->endpoint_ip.v4,
			      sizeof(a->endpoint_ip.v4));
	else if (a->ip_version == 6 && b->ip_version == 6)
		return memcmp(&a->endpoint_ip.v6, &b->endpoint_ip.v6,
			      sizeof(a->endpoint_ip.v6));
	return 1;
}

static u32 hash_route_info(struct mlx5e_route_key *key)
{
	if (key->ip_version == 4)
		return jhash(&key->endpoint_ip.v4, sizeof(key->endpoint_ip.v4), 0);
	return jhash(&key->endpoint_ip.v6, sizeof(key->endpoint_ip.v6), 0);
}

static void mlx5e_route_dealloc(struct mlx5e_priv *priv,
				struct mlx5e_route_entry *r)
{
	WARN_ON(!list_empty(&r->decap_flows));
	WARN_ON(!list_empty(&r->encap_entries));

	kfree_rcu(r, rcu);
}

static void mlx5e_route_put(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;

	if (!refcount_dec_and_mutex_lock(&r->refcnt, &esw->offloads.encap_tbl_lock))
		return;

	hash_del_rcu(&r->hlist);
	mutex_unlock(&esw->offloads.encap_tbl_lock);

	mlx5e_route_dealloc(priv, r);
}

static void mlx5e_route_put_locked(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;

	lockdep_assert_held(&esw->offloads.encap_tbl_lock);

	if (!refcount_dec_and_test(&r->refcnt))
		return;
	hash_del_rcu(&r->hlist);
	mlx5e_route_dealloc(priv, r);
}

static struct mlx5e_route_entry *
mlx5e_route_get(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key,
		u32 hash_key)
{
	struct mlx5e_route_key r_key;
	struct mlx5e_route_entry *r;

	hash_for_each_possible(encap->route_tbl, r, hlist, hash_key) {
		r_key = r->key;
		if (!cmp_route_info(&r_key, key) &&
		    refcount_inc_not_zero(&r->refcnt))
			return r;
	}
	return NULL;
}

static struct mlx5e_route_entry *
mlx5e_route_get_create(struct mlx5e_priv *priv,
		       struct mlx5e_route_key *key,
		       int tunnel_dev_index,
		       unsigned long *route_tbl_change_time)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5_rep_uplink_priv *uplink_priv;
	struct mlx5e_rep_priv *uplink_rpriv;
	struct mlx5e_tc_tun_encap *encap;
	struct mlx5e_route_entry *r;
	u32 hash_key;

	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
	uplink_priv = &uplink_rpriv->uplink_priv;
	encap = uplink_priv->encap;

	hash_key = hash_route_info(key);
	spin_lock_bh(&encap->route_lock);
	r = mlx5e_route_get(encap, key, hash_key);
	spin_unlock_bh(&encap->route_lock);
	if (r) {
		if (!mlx5e_route_entry_valid(r)) {
			mlx5e_route_put_locked(priv, r);
			return ERR_PTR(-EINVAL);
		}
		return r;
	}

	r = kzalloc(sizeof(*r), GFP_KERNEL);
	if (!r)
		return ERR_PTR(-ENOMEM);

	r->key = *key;
	r->flags |= MLX5E_ROUTE_ENTRY_VALID;
	r->tunnel_dev_index = tunnel_dev_index;
	refcount_set(&r->refcnt, 1);
	INIT_LIST_HEAD(&r->decap_flows);
	INIT_LIST_HEAD(&r->encap_entries);

	spin_lock_bh(&encap->route_lock);
	*route_tbl_change_time = encap->route_tbl_last_update;
	hash_add(encap->route_tbl, &r->hlist, hash_key);
	spin_unlock_bh(&encap->route_lock);

	return r;
}

static struct mlx5e_route_entry *
mlx5e_route_lookup_for_update(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key)
{
	u32 hash_key = hash_route_info(key);
	struct mlx5e_route_entry *r;

	spin_lock_bh(&encap->route_lock);
	encap->route_tbl_last_update = jiffies;
	r = mlx5e_route_get(encap, key, hash_key);
	spin_unlock_bh(&encap->route_lock);

	return r;
}

struct mlx5e_tc_fib_event_data {
	struct work_struct work;
	unsigned long event;
	struct mlx5e_route_entry *r;
	struct net_device *ul_dev;
};

static void mlx5e_tc_fib_event_work(struct work_struct *work);
static struct mlx5e_tc_fib_event_data *
mlx5e_tc_init_fib_work(unsigned long event, struct net_device *ul_dev, gfp_t flags)
{
	struct mlx5e_tc_fib_event_data *fib_work;

	fib_work = kzalloc(sizeof(*fib_work), flags);
	if (WARN_ON(!fib_work))
		return NULL;

	INIT_WORK(&fib_work->work, mlx5e_tc_fib_event_work);
	fib_work->event = event;
	fib_work->ul_dev = ul_dev;

	return fib_work;
}

static int
mlx5e_route_enqueue_update(struct mlx5e_priv *priv,
			   struct mlx5e_route_entry *r,
			   unsigned long event)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_tc_fib_event_data *fib_work;
	struct mlx5e_rep_priv *uplink_rpriv;
	struct net_device *ul_dev;

	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
	ul_dev = uplink_rpriv->netdev;

	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_KERNEL);
	if (!fib_work)
		return -ENOMEM;

	dev_hold(ul_dev);
	refcount_inc(&r->refcnt);
	fib_work->r = r;
	queue_work(priv->wq, &fib_work->work);

	return 0;
}

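/* Resolve the tunnel route for a decap flow and register the flow on the
 * matching route entry so FIB events can update it later. If the routing
 * table changed while the entry was being created, schedule an update.
 */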
int mlx5e_attach_decap_route(struct mlx5e_priv *priv,
			     struct mlx5e_tc_flow *flow)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	unsigned long tbl_time_before, tbl_time_after;
	struct mlx5e_tc_flow_parse_attr *parse_attr;
	struct mlx5_flow_attr *attr = flow->attr;
	struct mlx5_esw_flow_attr *esw_attr;
	struct mlx5e_route_entry *r;
	struct mlx5e_route_key key;
	int err = 0;

	esw_attr = attr->esw_attr;
	parse_attr = attr->parse_attr;
	mutex_lock(&esw->offloads.encap_tbl_lock);
	if (!esw_attr->rx_tun_attr)
		goto out;

	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
	tbl_time_after = tbl_time_before;
	err = mlx5e_tc_tun_route_lookup(priv, &parse_attr->spec, attr);
	if (err || !esw_attr->rx_tun_attr->decap_vport)
		goto out;

	key.ip_version = attr->tun_ip_version;
	if (key.ip_version == 4)
		key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
	else
		key.endpoint_ip.v6 = esw_attr->rx_tun_attr->dst_ip.v6;

	r = mlx5e_route_get_create(priv, &key, parse_attr->filter_dev->ifindex,
				   &tbl_time_after);
	if (IS_ERR(r)) {
		err = PTR_ERR(r);
		goto out;
	}
	/* Routing changed concurrently. FIB event handler might have missed new
	 * entry, schedule update.
	 */
	if (tbl_time_before != tbl_time_after) {
		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
		if (err) {
			mlx5e_route_put_locked(priv, r);
			goto out;
		}
	}

	flow->decap_route = r;
	list_add(&flow->decap_routes, &r->decap_flows);
	mutex_unlock(&esw->offloads.encap_tbl_lock);
	return 0;

out:
	mutex_unlock(&esw->offloads.encap_tbl_lock);
	return err;
}

static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
				    struct mlx5e_tc_flow *flow,
				    struct mlx5e_encap_entry *e,
				    bool new_encap_entry,
				    unsigned long tbl_time_before,
				    int out_index)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	unsigned long tbl_time_after = tbl_time_before;
	struct mlx5e_tc_flow_parse_attr *parse_attr;
	struct mlx5_flow_attr *attr = flow->attr;
	const struct ip_tunnel_info *tun_info;
	struct mlx5_esw_flow_attr *esw_attr;
	struct mlx5e_route_entry *r;
	struct mlx5e_route_key key;
	unsigned short family;
	int err = 0;

	esw_attr = attr->esw_attr;
	parse_attr = attr->parse_attr;
	tun_info = parse_attr->tun_info[out_index];
	family = ip_tunnel_info_af(tun_info);

	if (family == AF_INET) {
		key.endpoint_ip.v4 = tun_info->key.u.ipv4.src;
		key.ip_version = 4;
	} else if (family == AF_INET6) {
		key.endpoint_ip.v6 = tun_info->key.u.ipv6.src;
		key.ip_version = 6;
	}

	err = mlx5e_set_vf_tunnel(esw, attr, &parse_attr->mod_hdr_acts, e->out_dev,
				  e->route_dev_ifindex, out_index);
	if (err || !(esw_attr->dests[out_index].flags &
		     MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE))
		return err;

	r = mlx5e_route_get_create(priv, &key, parse_attr->mirred_ifindex[out_index],
				   &tbl_time_after);
	if (IS_ERR(r))
		return PTR_ERR(r);
	/* Routing changed concurrently. FIB event handler might have missed new
	 * entry, schedule update.
	 */
	if (tbl_time_before != tbl_time_after) {
		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
		if (err) {
			mlx5e_route_put_locked(priv, r);
			return err;
		}
	}

	flow->encap_routes[out_index].r = r;
	if (new_encap_entry)
		list_add(&e->route_list, &r->encap_entries);
	flow->encap_routes[out_index].index = out_index;
	return 0;
}

void mlx5e_detach_decap_route(struct mlx5e_priv *priv,
			      struct mlx5e_tc_flow *flow)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_route_entry *r = flow->decap_route;

	if (!r)
		return;

	mutex_lock(&esw->offloads.encap_tbl_lock);
	list_del(&flow->decap_routes);
	flow->decap_route = NULL;

	if (!refcount_dec_and_test(&r->refcnt)) {
		mutex_unlock(&esw->offloads.encap_tbl_lock);
		return;
	}
	hash_del_rcu(&r->hlist);
	mutex_unlock(&esw->offloads.encap_tbl_lock);

	mlx5e_route_dealloc(priv, r);
}

static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
				     struct mlx5e_tc_flow *flow,
				     int out_index)
{
	struct mlx5e_route_entry *r = flow->encap_routes[out_index].r;
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_encap_entry *e, *tmp;

	if (!r)
		return;

	mutex_lock(&esw->offloads.encap_tbl_lock);
	flow->encap_routes[out_index].r = NULL;

	if (!refcount_dec_and_test(&r->refcnt)) {
		mutex_unlock(&esw->offloads.encap_tbl_lock);
		return;
	}
	list_for_each_entry_safe(e, tmp, &r->encap_entries, route_list)
		list_del_init(&e->route_list);
	hash_del_rcu(&r->hlist);
	mutex_unlock(&esw->offloads.encap_tbl_lock);

	mlx5e_route_dealloc(priv, r);
}

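/* Tear down the offloaded rules of all flows in 'encap_flows' and drop the
 * encap entry's packet reformat context; called from the FIB event path
 * when the route towards the tunnel endpoint changes or is removed.
 */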
static void mlx5e_invalidate_encap(struct mlx5e_priv *priv,
				   struct mlx5e_encap_entry *e,
				   struct list_head *encap_flows)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_tc_flow *flow;

	list_for_each_entry(flow, encap_flows, tmp_list) {
		struct mlx5_flow_attr *attr = flow->attr;
		struct mlx5_esw_flow_attr *esw_attr;

		if (!mlx5e_is_offloaded_flow(flow))
			continue;
		esw_attr = attr->esw_attr;

		if (flow_flag_test(flow, SLOW))
			mlx5e_tc_unoffload_from_slow_path(esw, flow);
		else
			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
		mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr);
		attr->modify_hdr = NULL;

		esw_attr->dests[flow->tmp_entry_index].flags &=
			~MLX5_ESW_DEST_ENCAP_VALID;
		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL;
	}

	e->flags |= MLX5_ENCAP_ENTRY_NO_ROUTE;
	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
		e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
		mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
		e->pkt_reformat = NULL;
	}
}

static void mlx5e_reoffload_encap(struct mlx5e_priv *priv,
				  struct net_device *tunnel_dev,
				  struct mlx5e_encap_entry *e,
				  struct list_head *encap_flows)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_tc_flow *flow;
	int err;

	err = ip_tunnel_info_af(e->tun_info) == AF_INET ?
		mlx5e_tc_tun_update_header_ipv4(priv, tunnel_dev, e) :
		mlx5e_tc_tun_update_header_ipv6(priv, tunnel_dev, e);
	if (err)
		mlx5_core_warn(priv->mdev, "Failed to update encap header, %d", err);
	e->flags &= ~MLX5_ENCAP_ENTRY_NO_ROUTE;

	list_for_each_entry(flow, encap_flows, tmp_list) {
		struct mlx5e_tc_flow_parse_attr *parse_attr;
		struct mlx5_flow_attr *attr = flow->attr;
		struct mlx5_esw_flow_attr *esw_attr;
		struct mlx5_flow_handle *rule;
		struct mlx5_flow_spec *spec;

		if (flow_flag_test(flow, FAILED))
			continue;

		esw_attr = attr->esw_attr;
		parse_attr = attr->parse_attr;
		spec = &parse_attr->spec;

		err = mlx5e_update_vf_tunnel(esw, esw_attr, &parse_attr->mod_hdr_acts,
					     e->out_dev, e->route_dev_ifindex,
					     flow->tmp_entry_index);
		if (err) {
			mlx5_core_warn(priv->mdev, "Failed to update VF tunnel err=%d", err);
			continue;
		}

		err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow);
		if (err) {
			mlx5_core_warn(priv->mdev, "Failed to update flow mod_hdr err=%d",
				       err);
			continue;
		}

		if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
			esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
			esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
			if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
				goto offload_to_slow_path;
			/* update from slow path rule to encap rule */
			rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
			if (IS_ERR(rule)) {
				err = PTR_ERR(rule);
				mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
					       err);
			} else {
				flow->rule[0] = rule;
			}
		} else {
offload_to_slow_path:
			rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
			/* mark the flow's encap dest as non-valid */
			esw_attr->dests[flow->tmp_entry_index].flags &=
				~MLX5_ESW_DEST_ENCAP_VALID;

			if (IS_ERR(rule)) {
				err = PTR_ERR(rule);
				mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
					       err);
			} else {
				flow->rule[0] = rule;
			}
		}
		flow_flag_set(flow, OFFLOADED);
	}
}

static int mlx5e_update_route_encaps(struct mlx5e_priv *priv,
				     struct mlx5e_route_entry *r,
				     struct list_head *flow_list,
				     bool replace)
{
	struct net_device *tunnel_dev;
	struct mlx5e_encap_entry *e;

	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
	if (!tunnel_dev)
		return -ENODEV;

	list_for_each_entry(e, &r->encap_entries, route_list) {
		LIST_HEAD(encap_flows);

		mlx5e_take_all_encap_flows(e, &encap_flows);
		if (list_empty(&encap_flows))
			continue;

		if (mlx5e_route_entry_valid(r))
			mlx5e_invalidate_encap(priv, e, &encap_flows);

		if (!replace) {
			list_splice(&encap_flows, flow_list);
			continue;
		}

		mlx5e_reoffload_encap(priv, tunnel_dev, e, &encap_flows);
		list_splice(&encap_flows, flow_list);
	}

	return 0;
}

static void mlx5e_unoffload_flow_list(struct mlx5e_priv *priv,
				      struct list_head *flow_list)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_tc_flow *flow;

	list_for_each_entry(flow, flow_list, tmp_list)
		if (mlx5e_is_offloaded_flow(flow))
			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
}

static void mlx5e_reoffload_decap(struct mlx5e_priv *priv,
				  struct list_head *decap_flows)
{
	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
	struct mlx5e_tc_flow *flow;

	list_for_each_entry(flow, decap_flows, tmp_list) {
		struct mlx5e_tc_flow_parse_attr *parse_attr;
		struct mlx5_flow_attr *attr = flow->attr;
		struct mlx5_flow_handle *rule;
		struct mlx5_flow_spec *spec;
		int err;

		if (flow_flag_test(flow, FAILED))
			continue;

		parse_attr = attr->parse_attr;
		spec = &parse_attr->spec;
		err = mlx5e_tc_tun_route_lookup(priv, spec, attr);
		if (err) {
			mlx5_core_warn(priv->mdev, "Failed to lookup route for flow, %d\n",
				       err);
			continue;
		}

		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
		if (IS_ERR(rule)) {
			err = PTR_ERR(rule);
			mlx5_core_warn(priv->mdev, "Failed to update cached decap flow, %d\n",
				       err);
		} else {
			flow->rule[0] = rule;
			flow_flag_set(flow, OFFLOADED);
		}
	}
}

static int mlx5e_update_route_decap_flows(struct mlx5e_priv *priv,
					  struct mlx5e_route_entry *r,
					  struct list_head *flow_list,
					  bool replace)
{
	struct net_device *tunnel_dev;
	LIST_HEAD(decap_flows);

	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
	if (!tunnel_dev)
		return -ENODEV;

	mlx5e_take_all_route_decap_flows(r, &decap_flows);
	if (mlx5e_route_entry_valid(r))
		mlx5e_unoffload_flow_list(priv, &decap_flows);
	if (replace)
		mlx5e_reoffload_decap(priv, &decap_flows);

	list_splice(&decap_flows, flow_list);

	return 0;
}

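/* Work item handler for FIB events: under RTNL and the encap table lock,
 * re-offload (route replace) or un-offload (route delete) all encap and
 * decap flows that depend on the affected route entry.
 */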
static void mlx5e_tc_fib_event_work(struct work_struct *work)
{
	struct mlx5e_tc_fib_event_data *event_data =
		container_of(work, struct mlx5e_tc_fib_event_data, work);
	struct net_device *ul_dev = event_data->ul_dev;
	struct mlx5e_priv *priv = netdev_priv(ul_dev);
	struct mlx5e_route_entry *r = event_data->r;
	struct mlx5_eswitch *esw;
	LIST_HEAD(flow_list);
	bool replace;
	int err;

	/* sync with concurrent neigh updates */
	rtnl_lock();
	esw = priv->mdev->priv.eswitch;
	mutex_lock(&esw->offloads.encap_tbl_lock);
	replace = event_data->event == FIB_EVENT_ENTRY_REPLACE;

	if (!mlx5e_route_entry_valid(r) && !replace)
		goto out;

	err = mlx5e_update_route_encaps(priv, r, &flow_list, replace);
	if (err)
		mlx5_core_warn(priv->mdev, "Failed to update route encaps, %d\n",
			       err);

	err = mlx5e_update_route_decap_flows(priv, r, &flow_list, replace);
	if (err)
		mlx5_core_warn(priv->mdev, "Failed to update route decap flows, %d\n",
			       err);

	if (replace)
		r->flags |= MLX5E_ROUTE_ENTRY_VALID;
out:
	mutex_unlock(&esw->offloads.encap_tbl_lock);
	rtnl_unlock();

	mlx5e_put_flow_list(priv, &flow_list);
	mlx5e_route_put(priv, event_data->r);
	dev_put(event_data->ul_dev);
	kfree(event_data);
}

static struct mlx5e_tc_fib_event_data *
mlx5e_init_fib_work_ipv4(struct mlx5e_priv *priv,
			 struct net_device *ul_dev,
			 struct mlx5e_tc_tun_encap *encap,
			 unsigned long event,
			 struct fib_notifier_info *info)
{
	struct fib_entry_notifier_info *fen_info;
	struct mlx5e_tc_fib_event_data *fib_work;
	struct mlx5e_route_entry *r;
	struct mlx5e_route_key key;
	struct net_device *fib_dev;

	fen_info = container_of(info, struct fib_entry_notifier_info, info);
	fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev;
	if (!fib_dev || fib_dev->netdev_ops != &mlx5e_netdev_ops ||
	    fen_info->dst_len != 32)
		return NULL;

	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
	if (!fib_work)
		return ERR_PTR(-ENOMEM);

	key.endpoint_ip.v4 = htonl(fen_info->dst);
	key.ip_version = 4;

	/* Can't fail after this point because releasing reference to r
	 * requires obtaining sleeping mutex which we can't do in atomic
	 * context.
	 */
	r = mlx5e_route_lookup_for_update(encap, &key);
	if (!r)
		goto out;
	fib_work->r = r;
	dev_hold(ul_dev);

	return fib_work;

out:
	kfree(fib_work);
	return NULL;
}

static struct mlx5e_tc_fib_event_data *
mlx5e_init_fib_work_ipv6(struct mlx5e_priv *priv,
			 struct net_device *ul_dev,
			 struct mlx5e_tc_tun_encap *encap,
			 unsigned long event,
			 struct fib_notifier_info *info)
{
	struct fib6_entry_notifier_info *fen_info;
	struct mlx5e_tc_fib_event_data *fib_work;
	struct mlx5e_route_entry *r;
	struct mlx5e_route_key key;
	struct net_device *fib_dev;

	fen_info = container_of(info, struct fib6_entry_notifier_info, info);
	fib_dev = fib6_info_nh_dev(fen_info->rt);
	if (fib_dev->netdev_ops != &mlx5e_netdev_ops ||
	    fen_info->rt->fib6_dst.plen != 128)
		return NULL;

	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
	if (!fib_work)
		return ERR_PTR(-ENOMEM);

	memcpy(&key.endpoint_ip.v6, &fen_info->rt->fib6_dst.addr,
	       sizeof(fen_info->rt->fib6_dst.addr));
	key.ip_version = 6;

	/* Can't fail after this point because releasing reference to r
	 * requires obtaining sleeping mutex which we can't do in atomic
	 * context.
	 */
	r = mlx5e_route_lookup_for_update(encap, &key);
	if (!r)
		goto out;
	fib_work->r = r;
	dev_hold(ul_dev);

	return fib_work;

out:
	kfree(fib_work);
	return NULL;
}

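/* FIB notifier callback. Runs in atomic context, so it only looks up the
 * affected route entry and queues mlx5e_tc_fib_event_work() to handle the
 * event; routes the driver does not track are ignored.
 */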
static int mlx5e_tc_tun_fib_event(struct notifier_block *nb, unsigned long event, void *ptr)
{
	struct mlx5e_tc_fib_event_data *fib_work;
	struct fib_notifier_info *info = ptr;
	struct mlx5e_tc_tun_encap *encap;
	struct net_device *ul_dev;
	struct mlx5e_priv *priv;

	encap = container_of(nb, struct mlx5e_tc_tun_encap, fib_nb);
	priv = encap->priv;
	ul_dev = priv->netdev;
	priv = netdev_priv(ul_dev);

	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE:
	case FIB_EVENT_ENTRY_DEL:
		if (info->family == AF_INET)
			fib_work = mlx5e_init_fib_work_ipv4(priv, ul_dev, encap, event, info);
		else if (info->family == AF_INET6)
			fib_work = mlx5e_init_fib_work_ipv6(priv, ul_dev, encap, event, info);
		else
			return NOTIFY_DONE;

		if (!IS_ERR_OR_NULL(fib_work)) {
			queue_work(priv->wq, &fib_work->work);
		} else if (IS_ERR(fib_work)) {
			NL_SET_ERR_MSG_MOD(info->extack, "Failed to init fib work");
			mlx5_core_warn(priv->mdev, "Failed to init fib work, %ld\n",
				       PTR_ERR(fib_work));
		}

		break;
	default:
		return NOTIFY_DONE;
	}

	return NOTIFY_DONE;
}

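/* Allocate the tunnel encap offload context and register the FIB notifier
 * used to track route changes for offloaded tunnel flows.
 */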
struct mlx5e_tc_tun_encap *mlx5e_tc_tun_init(struct mlx5e_priv *priv)
{
	struct mlx5e_tc_tun_encap *encap;
	int err;

	encap = kvzalloc(sizeof(*encap), GFP_KERNEL);
	if (!encap)
		return ERR_PTR(-ENOMEM);

	encap->priv = priv;
	encap->fib_nb.notifier_call = mlx5e_tc_tun_fib_event;
	spin_lock_init(&encap->route_lock);
	hash_init(encap->route_tbl);
	err = register_fib_notifier(dev_net(priv->netdev), &encap->fib_nb,
				    NULL, NULL);
	if (err) {
		kvfree(encap);
		return ERR_PTR(err);
	}

	return encap;
}

void mlx5e_tc_tun_cleanup(struct mlx5e_tc_tun_encap *encap)
{
	if (!encap)
		return;

	unregister_fib_notifier(dev_net(encap->priv->netdev), &encap->fib_nb);
	flush_workqueue(encap->priv->wq); /* flush fib event works */
	kvfree(encap);
}