1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2021 Mellanox Technologies. */
3 
4 #include <net/fib_notifier.h>
5 #include <net/nexthop.h>
6 #include "tc_tun_encap.h"
7 #include "en_tc.h"
8 #include "tc_tun.h"
9 #include "rep/tc.h"
10 #include "diag/en_tc_tracepoint.h"
11 
12 enum {
13 	MLX5E_ROUTE_ENTRY_VALID     = BIT(0),
14 };
15 
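/* If the tunnel route device of encap entry 'e' is an OVS master device, set
 * up the actions that forward the encapsulated traffic to the matching
 * internal port on egress; otherwise leave the flow attr untouched.
 */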
16 static int mlx5e_set_int_port_tunnel(struct mlx5e_priv *priv,
17 				     struct mlx5_flow_attr *attr,
18 				     struct mlx5e_encap_entry *e,
19 				     int out_index)
20 {
21 	struct net_device *route_dev;
22 	int err = 0;
23 
24 	route_dev = dev_get_by_index(dev_net(e->out_dev), e->route_dev_ifindex);
25 
26 	if (!route_dev || !netif_is_ovs_master(route_dev))
27 		goto out;
28 
29 	err = mlx5e_set_fwd_to_int_port_actions(priv, attr, e->route_dev_ifindex,
30 						MLX5E_TC_INT_PORT_EGRESS,
31 						&attr->action, out_index);
32 
33 out:
34 	if (route_dev)
35 		dev_put(route_dev);
36 
37 	return err;
38 }
39 
40 struct mlx5e_route_key {
41 	int ip_version;
42 	union {
43 		__be32 v4;
44 		struct in6_addr v6;
45 	} endpoint_ip;
46 };
47 
48 struct mlx5e_route_entry {
49 	struct mlx5e_route_key key;
50 	struct list_head encap_entries;
51 	struct list_head decap_flows;
52 	u32 flags;
53 	struct hlist_node hlist;
54 	refcount_t refcnt;
55 	int tunnel_dev_index;
56 	struct rcu_head rcu;
57 };
58 
59 struct mlx5e_tc_tun_encap {
60 	struct mlx5e_priv *priv;
61 	struct notifier_block fib_nb;
62 	spinlock_t route_lock; /* protects route_tbl */
63 	unsigned long route_tbl_last_update;
64 	DECLARE_HASHTABLE(route_tbl, 8);
65 };
66 
67 static bool mlx5e_route_entry_valid(struct mlx5e_route_entry *r)
68 {
69 	return r->flags & MLX5E_ROUTE_ENTRY_VALID;
70 }
71 
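/* Allocate rx_tun_attr for the flow and copy the outer source and destination
 * IP addresses from the match spec. TUN_RX is only flagged when both
 * addresses are present, as they are required to establish routing.
 */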
72 int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
73 			     struct mlx5_flow_spec *spec)
74 {
75 	struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr;
76 	struct mlx5_rx_tun_attr *tun_attr;
77 	void *daddr, *saddr;
78 	u8 ip_version;
79 
80 	tun_attr = kvzalloc(sizeof(*tun_attr), GFP_KERNEL);
81 	if (!tun_attr)
82 		return -ENOMEM;
83 
84 	esw_attr->rx_tun_attr = tun_attr;
85 	ip_version = mlx5e_tc_get_ip_version(spec, true);
86 
87 	if (ip_version == 4) {
88 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
89 				     outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
90 		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
91 				     outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4);
92 		tun_attr->dst_ip.v4 = *(__be32 *)daddr;
93 		tun_attr->src_ip.v4 = *(__be32 *)saddr;
94 		if (!tun_attr->dst_ip.v4 || !tun_attr->src_ip.v4)
95 			return 0;
96 	}
97 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
98 	else if (ip_version == 6) {
99 		int ipv6_size = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
100 		struct in6_addr zerov6 = {};
101 
102 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
103 				     outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
104 		saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
105 				     outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6);
106 		memcpy(&tun_attr->dst_ip.v6, daddr, ipv6_size);
107 		memcpy(&tun_attr->src_ip.v6, saddr, ipv6_size);
108 		if (!memcmp(&tun_attr->dst_ip.v6, &zerov6, sizeof(zerov6)) ||
109 		    !memcmp(&tun_attr->src_ip.v6, &zerov6, sizeof(zerov6)))
110 			return 0;
111 	}
112 #endif
113 	/* Only set the flag if both src and dst ip addresses exist. They are
114 	 * required to establish routing.
115 	 */
116 	flow_flag_set(flow, TUN_RX);
117 	flow->attr->tun_ip_version = ip_version;
118 	return 0;
119 }
120 
121 static bool mlx5e_tc_flow_all_encaps_valid(struct mlx5_esw_flow_attr *esw_attr)
122 {
123 	bool all_flow_encaps_valid = true;
124 	int i;
125 
126 	/* Flow can be associated with multiple encap entries.
127 	 * Before offloading the flow verify that all of them have
128 	 * a valid neighbour.
129 	 */
130 	for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) {
131 		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP))
132 			continue;
133 		if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) {
134 			all_flow_encaps_valid = false;
135 			break;
136 		}
137 	}
138 
139 	return all_flow_encaps_valid;
140 }
141 
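/* Offload the cached encapsulation header of encap entry 'e' and switch all
 * flows on flow_list that are currently offloaded through the slow path to
 * the encap (fast path) FDB rule.
 */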
142 void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
143 			      struct mlx5e_encap_entry *e,
144 			      struct list_head *flow_list)
145 {
146 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
147 	struct mlx5_pkt_reformat_params reformat_params;
148 	struct mlx5_esw_flow_attr *esw_attr;
149 	struct mlx5_flow_handle *rule;
150 	struct mlx5_flow_attr *attr;
151 	struct mlx5_flow_spec *spec;
152 	struct mlx5e_tc_flow *flow;
153 	int err;
154 
155 	if (e->flags & MLX5_ENCAP_ENTRY_NO_ROUTE)
156 		return;
157 
158 	memset(&reformat_params, 0, sizeof(reformat_params));
159 	reformat_params.type = e->reformat_type;
160 	reformat_params.size = e->encap_size;
161 	reformat_params.data = e->encap_header;
162 	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
163 						     &reformat_params,
164 						     MLX5_FLOW_NAMESPACE_FDB);
165 	if (IS_ERR(e->pkt_reformat)) {
166 		mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %lu\n",
167 			       PTR_ERR(e->pkt_reformat));
168 		return;
169 	}
170 	e->flags |= MLX5_ENCAP_ENTRY_VALID;
171 	mlx5e_rep_queue_neigh_stats_work(priv);
172 
173 	list_for_each_entry(flow, flow_list, tmp_list) {
174 		if (!mlx5e_is_offloaded_flow(flow) || !flow_flag_test(flow, SLOW))
175 			continue;
176 		attr = flow->attr;
177 		esw_attr = attr->esw_attr;
178 		spec = &attr->parse_attr->spec;
179 
180 		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
181 		esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
182 
183 		/* Do not offload flows with unresolved neighbors */
184 		if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
185 			continue;
186 		/* update from slow path rule to encap rule */
187 		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
188 		if (IS_ERR(rule)) {
189 			err = PTR_ERR(rule);
190 			mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
191 				       err);
192 			continue;
193 		}
194 
195 		mlx5e_tc_unoffload_from_slow_path(esw, flow);
196 		flow->rule[0] = rule;
197 		/* was unset when slow path rule removed */
198 		flow_flag_set(flow, OFFLOADED);
199 	}
200 }
201 
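/* Counterpart of mlx5e_tc_encap_flows_add(): move the offloaded flows on
 * flow_list back to slow path rules, clear the entry's VALID flag and release
 * its packet reformat.
 */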
202 void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
203 			      struct mlx5e_encap_entry *e,
204 			      struct list_head *flow_list)
205 {
206 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
207 	struct mlx5_esw_flow_attr *esw_attr;
208 	struct mlx5_flow_handle *rule;
209 	struct mlx5_flow_attr *attr;
210 	struct mlx5_flow_spec *spec;
211 	struct mlx5e_tc_flow *flow;
212 	int err;
213 
214 	list_for_each_entry(flow, flow_list, tmp_list) {
215 		if (!mlx5e_is_offloaded_flow(flow) || flow_flag_test(flow, SLOW))
216 			continue;
217 		attr = flow->attr;
218 		esw_attr = attr->esw_attr;
219 		spec = &attr->parse_attr->spec;
220 
221 		/* update from encap rule to slow path rule */
222 		rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
223 		/* mark the flow's encap dest as non-valid */
224 		esw_attr->dests[flow->tmp_entry_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID;
225 
226 		if (IS_ERR(rule)) {
227 			err = PTR_ERR(rule);
228 			mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
229 				       err);
230 			continue;
231 		}
232 
233 		mlx5e_tc_unoffload_fdb_rules(esw, flow, attr);
234 		flow->rule[0] = rule;
235 		/* was unset when fast path rule removed */
236 		flow_flag_set(flow, OFFLOADED);
237 	}
238 
239 	/* the encap was valid, so a pkt_reformat was allocated; release it */
240 	e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
241 	mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
242 }
243 
244 static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow,
245 				struct list_head *flow_list,
246 				int index)
247 {
248 	if (IS_ERR(mlx5e_flow_get(flow)))
249 		return;
250 	wait_for_completion(&flow->init_done);
251 
252 	flow->tmp_entry_index = index;
253 	list_add(&flow->tmp_list, flow_list);
254 }
255 
256 /* Takes reference to all flows attached to encap and adds the flows to
257  * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
258  */
259 void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list)
260 {
261 	struct encap_flow_item *efi;
262 	struct mlx5e_tc_flow *flow;
263 
264 	list_for_each_entry(efi, &e->flows, list) {
265 		flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]);
266 		mlx5e_take_tmp_flow(flow, flow_list, efi->index);
267 	}
268 }
269 
270 /* Takes reference to all flows attached to route and adds the flows to
271  * flow_list using 'tmp_list' list_head in mlx5e_tc_flow.
272  */
273 static void mlx5e_take_all_route_decap_flows(struct mlx5e_route_entry *r,
274 					     struct list_head *flow_list)
275 {
276 	struct mlx5e_tc_flow *flow;
277 
278 	list_for_each_entry(flow, &r->decap_flows, decap_routes)
279 		mlx5e_take_tmp_flow(flow, flow_list, 0);
280 }
281 
282 typedef bool (match_cb)(struct mlx5e_encap_entry *);
283 
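/* Walk nhe->encap_list under RCU, starting after 'e' (or from the list head
 * when 'e' is NULL), and return the next entry whose reference could be taken
 * and that satisfies 'match' once its initialization has completed. The
 * reference on the starting entry 'e' is released before returning.
 */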
284 static struct mlx5e_encap_entry *
285 mlx5e_get_next_matching_encap(struct mlx5e_neigh_hash_entry *nhe,
286 			      struct mlx5e_encap_entry *e,
287 			      match_cb match)
288 {
289 	struct mlx5e_encap_entry *next = NULL;
290 
291 retry:
292 	rcu_read_lock();
293 
294 	/* find encap with non-zero reference counter value */
295 	for (next = e ?
296 		     list_next_or_null_rcu(&nhe->encap_list,
297 					   &e->encap_list,
298 					   struct mlx5e_encap_entry,
299 					   encap_list) :
300 		     list_first_or_null_rcu(&nhe->encap_list,
301 					    struct mlx5e_encap_entry,
302 					    encap_list);
303 	     next;
304 	     next = list_next_or_null_rcu(&nhe->encap_list,
305 					  &next->encap_list,
306 					  struct mlx5e_encap_entry,
307 					  encap_list))
308 		if (mlx5e_encap_take(next))
309 			break;
310 
311 	rcu_read_unlock();
312 
313 	/* release starting encap */
314 	if (e)
315 		mlx5e_encap_put(netdev_priv(e->out_dev), e);
316 	if (!next)
317 		return next;
318 
319 	/* wait for encap to be fully initialized */
320 	wait_for_completion(&next->res_ready);
321 	/* continue searching if encap entry is not in valid state after completion */
322 	if (!match(next)) {
323 		e = next;
324 		goto retry;
325 	}
326 
327 	return next;
328 }
329 
330 static bool mlx5e_encap_valid(struct mlx5e_encap_entry *e)
331 {
332 	return e->flags & MLX5_ENCAP_ENTRY_VALID;
333 }
334 
335 static struct mlx5e_encap_entry *
336 mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe,
337 			   struct mlx5e_encap_entry *e)
338 {
339 	return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_valid);
340 }
341 
342 static bool mlx5e_encap_initialized(struct mlx5e_encap_entry *e)
343 {
344 	return e->compl_result >= 0;
345 }
346 
347 struct mlx5e_encap_entry *
348 mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe,
349 			  struct mlx5e_encap_entry *e)
350 {
351 	return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_initialized);
352 }
353 
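/* Check whether any offloaded flow attached to the encap entries of 'nhe' has
 * seen traffic since reported_lastuse; if so, refresh reported_lastuse and
 * send an event for the corresponding neighbour entry.
 */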
354 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
355 {
356 	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
357 	struct mlx5e_encap_entry *e = NULL;
358 	struct mlx5e_tc_flow *flow;
359 	struct mlx5_fc *counter;
360 	struct neigh_table *tbl;
361 	bool neigh_used = false;
362 	struct neighbour *n;
363 	u64 lastuse;
364 
365 	if (m_neigh->family == AF_INET)
366 		tbl = &arp_tbl;
367 #if IS_ENABLED(CONFIG_IPV6)
368 	else if (m_neigh->family == AF_INET6)
369 		tbl = ipv6_stub->nd_tbl;
370 #endif
371 	else
372 		return;
373 
374 	/* mlx5e_get_next_valid_encap() releases previous encap before returning
375 	 * next one.
376 	 */
377 	while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) {
378 		struct mlx5e_priv *priv = netdev_priv(e->out_dev);
379 		struct encap_flow_item *efi, *tmp;
380 		struct mlx5_eswitch *esw;
381 		LIST_HEAD(flow_list);
382 
383 		esw = priv->mdev->priv.eswitch;
384 		mutex_lock(&esw->offloads.encap_tbl_lock);
385 		list_for_each_entry_safe(efi, tmp, &e->flows, list) {
386 			flow = container_of(efi, struct mlx5e_tc_flow,
387 					    encaps[efi->index]);
388 			if (IS_ERR(mlx5e_flow_get(flow)))
389 				continue;
390 			list_add(&flow->tmp_list, &flow_list);
391 
392 			if (mlx5e_is_offloaded_flow(flow)) {
393 				counter = mlx5e_tc_get_counter(flow);
394 				lastuse = mlx5_fc_query_lastuse(counter);
395 				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
396 					neigh_used = true;
397 					break;
398 				}
399 			}
400 		}
401 		mutex_unlock(&esw->offloads.encap_tbl_lock);
402 
403 		mlx5e_put_flow_list(priv, &flow_list);
404 		if (neigh_used) {
405 			/* release current encap before breaking the loop */
406 			mlx5e_encap_put(priv, e);
407 			break;
408 		}
409 	}
410 
411 	trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used);
412 
413 	if (neigh_used) {
414 		nhe->reported_lastuse = jiffies;
415 
416 		/* find the relevant neigh according to the cached device and
417 		 * dst ip pair
418 		 */
419 		n = neigh_lookup(tbl, &m_neigh->dst_ip, READ_ONCE(nhe->neigh_dev));
420 		if (!n)
421 			return;
422 
423 		neigh_event_send(n, NULL);
424 		neigh_release(n);
425 	}
426 }
427 
428 static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
429 {
430 	WARN_ON(!list_empty(&e->flows));
431 
432 	if (e->compl_result > 0) {
433 		mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
434 
435 		if (e->flags & MLX5_ENCAP_ENTRY_VALID)
436 			mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
437 	}
438 
439 	kfree(e->tun_info);
440 	kfree(e->encap_header);
441 	kfree_rcu(e, rcu);
442 }
443 
444 static void mlx5e_decap_dealloc(struct mlx5e_priv *priv,
445 				struct mlx5e_decap_entry *d)
446 {
447 	WARN_ON(!list_empty(&d->flows));
448 
449 	if (!d->compl_result)
450 		mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat);
451 
452 	kfree_rcu(d, rcu);
453 }
454 
455 void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
456 {
457 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
458 
459 	if (!refcount_dec_and_mutex_lock(&e->refcnt, &esw->offloads.encap_tbl_lock))
460 		return;
461 	list_del(&e->route_list);
462 	hash_del_rcu(&e->encap_hlist);
463 	mutex_unlock(&esw->offloads.encap_tbl_lock);
464 
465 	mlx5e_encap_dealloc(priv, e);
466 }
467 
468 static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d)
469 {
470 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
471 
472 	if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock))
473 		return;
474 	hash_del_rcu(&d->hlist);
475 	mutex_unlock(&esw->offloads.decap_tbl_lock);
476 
477 	mlx5e_decap_dealloc(priv, d);
478 }
479 
480 static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
481 				     struct mlx5e_tc_flow *flow,
482 				     int out_index);
483 
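/* Detach the flow from its encap entry at 'out_index' and free the entry on
 * the last reference. If the destination was set up with a source port
 * rewrite, the matching encap route is released first.
 */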
484 void mlx5e_detach_encap(struct mlx5e_priv *priv,
485 			struct mlx5e_tc_flow *flow, int out_index)
486 {
487 	struct mlx5e_encap_entry *e = flow->encaps[out_index].e;
488 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
489 
490 	if (flow->attr->esw_attr->dests[out_index].flags &
491 	    MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)
492 		mlx5e_detach_encap_route(priv, flow, out_index);
493 
494 	/* flow wasn't fully initialized */
495 	if (!e)
496 		return;
497 
498 	mutex_lock(&esw->offloads.encap_tbl_lock);
499 	list_del(&flow->encaps[out_index].list);
500 	flow->encaps[out_index].e = NULL;
501 	if (!refcount_dec_and_test(&e->refcnt)) {
502 		mutex_unlock(&esw->offloads.encap_tbl_lock);
503 		return;
504 	}
505 	list_del(&e->route_list);
506 	hash_del_rcu(&e->encap_hlist);
507 	mutex_unlock(&esw->offloads.encap_tbl_lock);
508 
509 	mlx5e_encap_dealloc(priv, e);
510 }
511 
512 void mlx5e_detach_decap(struct mlx5e_priv *priv,
513 			struct mlx5e_tc_flow *flow)
514 {
515 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
516 	struct mlx5e_decap_entry *d = flow->decap_reformat;
517 
518 	if (!d)
519 		return;
520 
521 	mutex_lock(&esw->offloads.decap_tbl_lock);
522 	list_del(&flow->l3_to_l2_reformat);
523 	flow->decap_reformat = NULL;
524 
525 	if (!refcount_dec_and_test(&d->refcnt)) {
526 		mutex_unlock(&esw->offloads.decap_tbl_lock);
527 		return;
528 	}
529 	hash_del_rcu(&d->hlist);
530 	mutex_unlock(&esw->offloads.decap_tbl_lock);
531 
532 	mlx5e_decap_dealloc(priv, d);
533 }
534 
535 bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a,
536 					   struct mlx5e_encap_key *b)
537 {
538 	return memcmp(a->ip_tun_key, b->ip_tun_key, sizeof(*a->ip_tun_key)) == 0 &&
539 		a->tc_tunnel->tunnel_type == b->tc_tunnel->tunnel_type;
540 }
541 
542 static int cmp_decap_info(struct mlx5e_decap_key *a,
543 			  struct mlx5e_decap_key *b)
544 {
545 	return memcmp(&a->key, &b->key, sizeof(b->key));
546 }
547 
548 static int hash_encap_info(struct mlx5e_encap_key *key)
549 {
550 	return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key),
551 		     key->tc_tunnel->tunnel_type);
552 }
553 
554 static int hash_decap_info(struct mlx5e_decap_key *key)
555 {
556 	return jhash(&key->key, sizeof(key->key), 0);
557 }
558 
559 bool mlx5e_encap_take(struct mlx5e_encap_entry *e)
560 {
561 	return refcount_inc_not_zero(&e->refcnt);
562 }
563 
564 static bool mlx5e_decap_take(struct mlx5e_decap_entry *e)
565 {
566 	return refcount_inc_not_zero(&e->refcnt);
567 }
568 
569 static struct mlx5e_encap_entry *
570 mlx5e_encap_get(struct mlx5e_priv *priv, struct mlx5e_encap_key *key,
571 		uintptr_t hash_key)
572 {
573 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
574 	struct mlx5e_encap_key e_key;
575 	struct mlx5e_encap_entry *e;
576 
577 	hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
578 				   encap_hlist, hash_key) {
579 		e_key.ip_tun_key = &e->tun_info->key;
580 		e_key.tc_tunnel = e->tunnel;
581 		if (e->tunnel->encap_info_equal(&e_key, key) &&
582 		    mlx5e_encap_take(e))
583 			return e;
584 	}
585 
586 	return NULL;
587 }
588 
589 static struct mlx5e_decap_entry *
590 mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key,
591 		uintptr_t hash_key)
592 {
593 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
594 	struct mlx5e_decap_key r_key;
595 	struct mlx5e_decap_entry *e;
596 
597 	hash_for_each_possible_rcu(esw->offloads.decap_tbl, e,
598 				   hlist, hash_key) {
599 		r_key = e->key;
600 		if (!cmp_decap_info(&r_key, key) &&
601 		    mlx5e_decap_take(e))
602 			return e;
603 	}
604 	return NULL;
605 }
606 
607 struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info)
608 {
609 	size_t tun_size = sizeof(*tun_info) + tun_info->options_len;
610 
611 	return kmemdup(tun_info, tun_size, GFP_KERNEL);
612 }
613 
614 static bool is_duplicated_encap_entry(struct mlx5e_priv *priv,
615 				      struct mlx5e_tc_flow *flow,
616 				      int out_index,
617 				      struct mlx5e_encap_entry *e,
618 				      struct netlink_ext_ack *extack)
619 {
620 	int i;
621 
622 	for (i = 0; i < out_index; i++) {
623 		if (flow->encaps[i].e != e)
624 			continue;
625 		NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action");
626 		netdev_err(priv->netdev, "can't duplicate encap action\n");
627 		return true;
628 	}
629 
630 	return false;
631 }
632 
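/* When the tunnel route device is another mlx5e netdev (VF tunnel), add a
 * modify header action that writes the route device vport to the VPORT_TO_REG
 * metadata register and mark the destination as requiring a source port
 * change. For other route devices the attr is left unchanged.
 */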
633 static int mlx5e_set_vf_tunnel(struct mlx5_eswitch *esw,
634 			       struct mlx5_flow_attr *attr,
635 			       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
636 			       struct net_device *out_dev,
637 			       int route_dev_ifindex,
638 			       int out_index)
639 {
640 	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
641 	struct net_device *route_dev;
642 	u16 vport_num;
643 	int err = 0;
644 	u32 data;
645 
646 	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);
647 
648 	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
649 	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev))
650 		goto out;
651 
652 	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
653 	if (err)
654 		goto out;
655 
656 	attr->dest_chain = 0;
657 	attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
658 	esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE;
659 	data = mlx5_eswitch_get_vport_metadata_for_set(esw_attr->in_mdev->priv.eswitch,
660 						       vport_num);
661 	err = mlx5e_tc_match_to_reg_set_and_get_id(esw->dev, mod_hdr_acts,
662 						   MLX5_FLOW_NAMESPACE_FDB,
663 						   VPORT_TO_REG, data);
664 	if (err >= 0) {
665 		esw_attr->dests[out_index].src_port_rewrite_act_id = err;
666 		err = 0;
667 	}
668 
669 out:
670 	if (route_dev)
671 		dev_put(route_dev);
672 	return err;
673 }
674 
675 static int mlx5e_update_vf_tunnel(struct mlx5_eswitch *esw,
676 				  struct mlx5_esw_flow_attr *attr,
677 				  struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
678 				  struct net_device *out_dev,
679 				  int route_dev_ifindex,
680 				  int out_index)
681 {
682 	int act_id = attr->dests[out_index].src_port_rewrite_act_id;
683 	struct net_device *route_dev;
684 	u16 vport_num;
685 	int err = 0;
686 	u32 data;
687 
688 	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);
689 
690 	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
691 	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev)) {
692 		err = -ENODEV;
693 		goto out;
694 	}
695 
696 	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
697 	if (err)
698 		goto out;
699 
700 	data = mlx5_eswitch_get_vport_metadata_for_set(attr->in_mdev->priv.eswitch,
701 						       vport_num);
702 	mlx5e_tc_match_to_reg_mod_hdr_change(esw->dev, mod_hdr_acts, VPORT_TO_REG, act_id, data);
703 
704 out:
705 	if (route_dev)
706 		dev_put(route_dev);
707 	return err;
708 }
709 
710 static unsigned int mlx5e_route_tbl_get_last_update(struct mlx5e_priv *priv)
711 {
712 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
713 	struct mlx5_rep_uplink_priv *uplink_priv;
714 	struct mlx5e_rep_priv *uplink_rpriv;
715 	struct mlx5e_tc_tun_encap *encap;
716 	unsigned int ret;
717 
718 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
719 	uplink_priv = &uplink_rpriv->uplink_priv;
720 	encap = uplink_priv->encap;
721 
722 	spin_lock_bh(&encap->route_lock);
723 	ret = encap->route_tbl_last_update;
724 	spin_unlock_bh(&encap->route_lock);
725 	return ret;
726 }
727 
728 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
729 				    struct mlx5e_tc_flow *flow,
730 				    struct mlx5e_encap_entry *e,
731 				    bool new_encap_entry,
732 				    unsigned long tbl_time_before,
733 				    int out_index);
734 
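/* Look up (or create) the encap entry matching the flow's tunnel info and
 * attach the flow to it at 'out_index'. For a newly created entry the
 * encapsulation header is built by mlx5e_tc_tun_create_header_ipv4/6();
 * *encap_valid tells the caller whether the header is already offloaded or
 * the flow must start on the slow path.
 */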
735 int mlx5e_attach_encap(struct mlx5e_priv *priv,
736 		       struct mlx5e_tc_flow *flow,
737 		       struct net_device *mirred_dev,
738 		       int out_index,
739 		       struct netlink_ext_ack *extack,
740 		       struct net_device **encap_dev,
741 		       bool *encap_valid)
742 {
743 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
744 	struct mlx5e_tc_flow_parse_attr *parse_attr;
745 	struct mlx5_flow_attr *attr = flow->attr;
746 	const struct ip_tunnel_info *tun_info;
747 	unsigned long tbl_time_before = 0;
748 	struct mlx5e_encap_entry *e;
749 	struct mlx5e_encap_key key;
750 	bool entry_created = false;
751 	unsigned short family;
752 	uintptr_t hash_key;
753 	int err = 0;
754 
755 	parse_attr = attr->parse_attr;
756 	tun_info = parse_attr->tun_info[out_index];
757 	family = ip_tunnel_info_af(tun_info);
758 	key.ip_tun_key = &tun_info->key;
759 	key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev);
760 	if (!key.tc_tunnel) {
761 		NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel");
762 		return -EOPNOTSUPP;
763 	}
764 
765 	hash_key = hash_encap_info(&key);
766 
767 	mutex_lock(&esw->offloads.encap_tbl_lock);
768 	e = mlx5e_encap_get(priv, &key, hash_key);
769 
770 	/* must verify if encap is valid or not */
771 	if (e) {
772 		/* Check that entry was not already attached to this flow */
773 		if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) {
774 			err = -EOPNOTSUPP;
775 			goto out_err;
776 		}
777 
778 		mutex_unlock(&esw->offloads.encap_tbl_lock);
779 		wait_for_completion(&e->res_ready);
780 
781 		/* Protect against concurrent neigh update. */
782 		mutex_lock(&esw->offloads.encap_tbl_lock);
783 		if (e->compl_result < 0) {
784 			err = -EREMOTEIO;
785 			goto out_err;
786 		}
787 		goto attach_flow;
788 	}
789 
790 	e = kzalloc(sizeof(*e), GFP_KERNEL);
791 	if (!e) {
792 		err = -ENOMEM;
793 		goto out_err;
794 	}
795 
796 	refcount_set(&e->refcnt, 1);
797 	init_completion(&e->res_ready);
798 	entry_created = true;
799 	INIT_LIST_HEAD(&e->route_list);
800 
801 	tun_info = mlx5e_dup_tun_info(tun_info);
802 	if (!tun_info) {
803 		err = -ENOMEM;
804 		goto out_err_init;
805 	}
806 	e->tun_info = tun_info;
807 	err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack);
808 	if (err)
809 		goto out_err_init;
810 
811 	INIT_LIST_HEAD(&e->flows);
812 	hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
813 	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
814 	mutex_unlock(&esw->offloads.encap_tbl_lock);
815 
816 	if (family == AF_INET)
817 		err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e);
818 	else if (family == AF_INET6)
819 		err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e);
820 
821 	/* Protect against concurrent neigh update. */
822 	mutex_lock(&esw->offloads.encap_tbl_lock);
823 	complete_all(&e->res_ready);
824 	if (err) {
825 		e->compl_result = err;
826 		goto out_err;
827 	}
828 	e->compl_result = 1;
829 
830 attach_flow:
831 	err = mlx5e_attach_encap_route(priv, flow, e, entry_created, tbl_time_before,
832 				       out_index);
833 	if (err)
834 		goto out_err;
835 
836 	err = mlx5e_set_int_port_tunnel(priv, attr, e, out_index);
837 	if (err == -EOPNOTSUPP) {
838 		/* If device doesn't support int port offload,
839 		 * redirect to uplink vport.
840 		 */
841 		mlx5_core_dbg(priv->mdev, "attaching int port as encap dev not supported, using uplink\n");
842 		err = 0;
843 	} else if (err) {
844 		goto out_err;
845 	}
846 
847 	flow->encaps[out_index].e = e;
848 	list_add(&flow->encaps[out_index].list, &e->flows);
849 	flow->encaps[out_index].index = out_index;
850 	*encap_dev = e->out_dev;
851 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
852 		attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat;
853 		attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
854 		*encap_valid = true;
855 	} else {
856 		*encap_valid = false;
857 	}
858 	mutex_unlock(&esw->offloads.encap_tbl_lock);
859 
860 	return err;
861 
862 out_err:
863 	mutex_unlock(&esw->offloads.encap_tbl_lock);
864 	if (e)
865 		mlx5e_encap_put(priv, e);
866 	return err;
867 
868 out_err_init:
869 	mutex_unlock(&esw->offloads.encap_tbl_lock);
870 	kfree(tun_info);
871 	kfree(e);
872 	return err;
873 }
874 
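/* Allocate (or reuse) the L3-tunnel-to-L2 packet reformat that restores the
 * inner ethernet header after decap and attach the flow to the shared decap
 * entry.
 */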
875 int mlx5e_attach_decap(struct mlx5e_priv *priv,
876 		       struct mlx5e_tc_flow *flow,
877 		       struct netlink_ext_ack *extack)
878 {
879 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
880 	struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr;
881 	struct mlx5_pkt_reformat_params reformat_params;
882 	struct mlx5e_tc_flow_parse_attr *parse_attr;
883 	struct mlx5e_decap_entry *d;
884 	struct mlx5e_decap_key key;
885 	uintptr_t hash_key;
886 	int err = 0;
887 
888 	parse_attr = flow->attr->parse_attr;
889 	if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) {
890 		NL_SET_ERR_MSG_MOD(extack,
891 				   "encap header larger than max supported");
892 		return -EOPNOTSUPP;
893 	}
894 
895 	key.key = parse_attr->eth;
896 	hash_key = hash_decap_info(&key);
897 	mutex_lock(&esw->offloads.decap_tbl_lock);
898 	d = mlx5e_decap_get(priv, &key, hash_key);
899 	if (d) {
900 		mutex_unlock(&esw->offloads.decap_tbl_lock);
901 		wait_for_completion(&d->res_ready);
902 		mutex_lock(&esw->offloads.decap_tbl_lock);
903 		if (d->compl_result) {
904 			err = -EREMOTEIO;
905 			goto out_free;
906 		}
907 		goto found;
908 	}
909 
910 	d = kzalloc(sizeof(*d), GFP_KERNEL);
911 	if (!d) {
912 		err = -ENOMEM;
913 		goto out_err;
914 	}
915 
916 	d->key = key;
917 	refcount_set(&d->refcnt, 1);
918 	init_completion(&d->res_ready);
919 	INIT_LIST_HEAD(&d->flows);
920 	hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key);
921 	mutex_unlock(&esw->offloads.decap_tbl_lock);
922 
923 	memset(&reformat_params, 0, sizeof(reformat_params));
924 	reformat_params.type = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
925 	reformat_params.size = sizeof(parse_attr->eth);
926 	reformat_params.data = &parse_attr->eth;
927 	d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
928 						     &reformat_params,
929 						     MLX5_FLOW_NAMESPACE_FDB);
930 	if (IS_ERR(d->pkt_reformat)) {
931 		err = PTR_ERR(d->pkt_reformat);
932 		d->compl_result = err;
933 	}
934 	mutex_lock(&esw->offloads.decap_tbl_lock);
935 	complete_all(&d->res_ready);
936 	if (err)
937 		goto out_free;
938 
939 found:
940 	flow->decap_reformat = d;
941 	attr->decap_pkt_reformat = d->pkt_reformat;
942 	list_add(&flow->l3_to_l2_reformat, &d->flows);
943 	mutex_unlock(&esw->offloads.decap_tbl_lock);
944 	return 0;
945 
946 out_free:
947 	mutex_unlock(&esw->offloads.decap_tbl_lock);
948 	mlx5e_decap_put(priv, d);
949 	return err;
950 
951 out_err:
952 	mutex_unlock(&esw->offloads.decap_tbl_lock);
953 	return err;
954 }
955 
956 static int cmp_route_info(struct mlx5e_route_key *a,
957 			  struct mlx5e_route_key *b)
958 {
959 	if (a->ip_version == 4 && b->ip_version == 4)
960 		return memcmp(&a->endpoint_ip.v4, &b->endpoint_ip.v4,
961 			      sizeof(a->endpoint_ip.v4));
962 	else if (a->ip_version == 6 && b->ip_version == 6)
963 		return memcmp(&a->endpoint_ip.v6, &b->endpoint_ip.v6,
964 			      sizeof(a->endpoint_ip.v6));
965 	return 1;
966 }
967 
968 static u32 hash_route_info(struct mlx5e_route_key *key)
969 {
970 	if (key->ip_version == 4)
971 		return jhash(&key->endpoint_ip.v4, sizeof(key->endpoint_ip.v4), 0);
972 	return jhash(&key->endpoint_ip.v6, sizeof(key->endpoint_ip.v6), 0);
973 }
974 
975 static void mlx5e_route_dealloc(struct mlx5e_priv *priv,
976 				struct mlx5e_route_entry *r)
977 {
978 	WARN_ON(!list_empty(&r->decap_flows));
979 	WARN_ON(!list_empty(&r->encap_entries));
980 
981 	kfree_rcu(r, rcu);
982 }
983 
984 static void mlx5e_route_put(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
985 {
986 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
987 
988 	if (!refcount_dec_and_mutex_lock(&r->refcnt, &esw->offloads.encap_tbl_lock))
989 		return;
990 
991 	hash_del_rcu(&r->hlist);
992 	mutex_unlock(&esw->offloads.encap_tbl_lock);
993 
994 	mlx5e_route_dealloc(priv, r);
995 }
996 
997 static void mlx5e_route_put_locked(struct mlx5e_priv *priv, struct mlx5e_route_entry *r)
998 {
999 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1000 
1001 	lockdep_assert_held(&esw->offloads.encap_tbl_lock);
1002 
1003 	if (!refcount_dec_and_test(&r->refcnt))
1004 		return;
1005 	hash_del_rcu(&r->hlist);
1006 	mlx5e_route_dealloc(priv, r);
1007 }
1008 
1009 static struct mlx5e_route_entry *
1010 mlx5e_route_get(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key,
1011 		u32 hash_key)
1012 {
1013 	struct mlx5e_route_key r_key;
1014 	struct mlx5e_route_entry *r;
1015 
1016 	hash_for_each_possible(encap->route_tbl, r, hlist, hash_key) {
1017 		r_key = r->key;
1018 		if (!cmp_route_info(&r_key, key) &&
1019 		    refcount_inc_not_zero(&r->refcnt))
1020 			return r;
1021 	}
1022 	return NULL;
1023 }
1024 
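/* Return the route entry for the tunnel endpoint IP in 'key', or allocate and
 * hash a new one bound to tunnel_dev_index. Table accesses are done under
 * encap->route_lock; *route_tbl_change_time is set to the last route table
 * update time when a new entry is inserted.
 */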
1025 static struct mlx5e_route_entry *
1026 mlx5e_route_get_create(struct mlx5e_priv *priv,
1027 		       struct mlx5e_route_key *key,
1028 		       int tunnel_dev_index,
1029 		       unsigned long *route_tbl_change_time)
1030 {
1031 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1032 	struct mlx5_rep_uplink_priv *uplink_priv;
1033 	struct mlx5e_rep_priv *uplink_rpriv;
1034 	struct mlx5e_tc_tun_encap *encap;
1035 	struct mlx5e_route_entry *r;
1036 	u32 hash_key;
1037 
1038 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
1039 	uplink_priv = &uplink_rpriv->uplink_priv;
1040 	encap = uplink_priv->encap;
1041 
1042 	hash_key = hash_route_info(key);
1043 	spin_lock_bh(&encap->route_lock);
1044 	r = mlx5e_route_get(encap, key, hash_key);
1045 	spin_unlock_bh(&encap->route_lock);
1046 	if (r) {
1047 		if (!mlx5e_route_entry_valid(r)) {
1048 			mlx5e_route_put_locked(priv, r);
1049 			return ERR_PTR(-EINVAL);
1050 		}
1051 		return r;
1052 	}
1053 
1054 	r = kzalloc(sizeof(*r), GFP_KERNEL);
1055 	if (!r)
1056 		return ERR_PTR(-ENOMEM);
1057 
1058 	r->key = *key;
1059 	r->flags |= MLX5E_ROUTE_ENTRY_VALID;
1060 	r->tunnel_dev_index = tunnel_dev_index;
1061 	refcount_set(&r->refcnt, 1);
1062 	INIT_LIST_HEAD(&r->decap_flows);
1063 	INIT_LIST_HEAD(&r->encap_entries);
1064 
1065 	spin_lock_bh(&encap->route_lock);
1066 	*route_tbl_change_time = encap->route_tbl_last_update;
1067 	hash_add(encap->route_tbl, &r->hlist, hash_key);
1068 	spin_unlock_bh(&encap->route_lock);
1069 
1070 	return r;
1071 }
1072 
1073 static struct mlx5e_route_entry *
1074 mlx5e_route_lookup_for_update(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key)
1075 {
1076 	u32 hash_key = hash_route_info(key);
1077 	struct mlx5e_route_entry *r;
1078 
1079 	spin_lock_bh(&encap->route_lock);
1080 	encap->route_tbl_last_update = jiffies;
1081 	r = mlx5e_route_get(encap, key, hash_key);
1082 	spin_unlock_bh(&encap->route_lock);
1083 
1084 	return r;
1085 }
1086 
1087 struct mlx5e_tc_fib_event_data {
1088 	struct work_struct work;
1089 	unsigned long event;
1090 	struct mlx5e_route_entry *r;
1091 	struct net_device *ul_dev;
1092 };
1093 
1094 static void mlx5e_tc_fib_event_work(struct work_struct *work);
1095 static struct mlx5e_tc_fib_event_data *
1096 mlx5e_tc_init_fib_work(unsigned long event, struct net_device *ul_dev, gfp_t flags)
1097 {
1098 	struct mlx5e_tc_fib_event_data *fib_work;
1099 
1100 	fib_work = kzalloc(sizeof(*fib_work), flags);
1101 	if (WARN_ON(!fib_work))
1102 		return NULL;
1103 
1104 	INIT_WORK(&fib_work->work, mlx5e_tc_fib_event_work);
1105 	fib_work->event = event;
1106 	fib_work->ul_dev = ul_dev;
1107 
1108 	return fib_work;
1109 }
1110 
1111 static int
1112 mlx5e_route_enqueue_update(struct mlx5e_priv *priv,
1113 			   struct mlx5e_route_entry *r,
1114 			   unsigned long event)
1115 {
1116 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1117 	struct mlx5e_tc_fib_event_data *fib_work;
1118 	struct mlx5e_rep_priv *uplink_rpriv;
1119 	struct net_device *ul_dev;
1120 
1121 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
1122 	ul_dev = uplink_rpriv->netdev;
1123 
1124 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_KERNEL);
1125 	if (!fib_work)
1126 		return -ENOMEM;
1127 
1128 	dev_hold(ul_dev);
1129 	refcount_inc(&r->refcnt);
1130 	fib_work->r = r;
1131 	queue_work(priv->wq, &fib_work->work);
1132 
1133 	return 0;
1134 }
1135 
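/* Attach a decap flow to the route entry of its tunnel destination IP so it
 * can be updated on routing changes. If the route table changed while the
 * entry was being created, a FIB update work item is scheduled to avoid
 * missing the event.
 */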
1136 int mlx5e_attach_decap_route(struct mlx5e_priv *priv,
1137 			     struct mlx5e_tc_flow *flow)
1138 {
1139 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1140 	unsigned long tbl_time_before, tbl_time_after;
1141 	struct mlx5e_tc_flow_parse_attr *parse_attr;
1142 	struct mlx5_flow_attr *attr = flow->attr;
1143 	struct mlx5_esw_flow_attr *esw_attr;
1144 	struct mlx5e_route_entry *r;
1145 	struct mlx5e_route_key key;
1146 	int err = 0;
1147 
1148 	esw_attr = attr->esw_attr;
1149 	parse_attr = attr->parse_attr;
1150 	mutex_lock(&esw->offloads.encap_tbl_lock);
1151 	if (!esw_attr->rx_tun_attr)
1152 		goto out;
1153 
1154 	tbl_time_before = mlx5e_route_tbl_get_last_update(priv);
1155 	tbl_time_after = tbl_time_before;
1156 	err = mlx5e_tc_tun_route_lookup(priv, &parse_attr->spec, attr, parse_attr->filter_dev);
1157 	if (err || !esw_attr->rx_tun_attr->decap_vport)
1158 		goto out;
1159 
1160 	key.ip_version = attr->tun_ip_version;
1161 	if (key.ip_version == 4)
1162 		key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
1163 	else
1164 		key.endpoint_ip.v6 = esw_attr->rx_tun_attr->dst_ip.v6;
1165 
1166 	r = mlx5e_route_get_create(priv, &key, parse_attr->filter_dev->ifindex,
1167 				   &tbl_time_after);
1168 	if (IS_ERR(r)) {
1169 		err = PTR_ERR(r);
1170 		goto out;
1171 	}
1172 	/* Routing changed concurrently. FIB event handler might have missed new
1173 	 * entry, schedule update.
1174 	 */
1175 	if (tbl_time_before != tbl_time_after) {
1176 		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
1177 		if (err) {
1178 			mlx5e_route_put_locked(priv, r);
1179 			goto out;
1180 		}
1181 	}
1182 
1183 	flow->decap_route = r;
1184 	list_add(&flow->decap_routes, &r->decap_flows);
1185 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1186 	return 0;
1187 
1188 out:
1189 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1190 	return err;
1191 }
1192 
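/* Attach an encap destination to the route entry of its tunnel source IP.
 * Only destinations that use a source port rewrite (VF tunnel) are tracked;
 * for others the function returns without creating a route entry.
 */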
1193 static int mlx5e_attach_encap_route(struct mlx5e_priv *priv,
1194 				    struct mlx5e_tc_flow *flow,
1195 				    struct mlx5e_encap_entry *e,
1196 				    bool new_encap_entry,
1197 				    unsigned long tbl_time_before,
1198 				    int out_index)
1199 {
1200 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1201 	unsigned long tbl_time_after = tbl_time_before;
1202 	struct mlx5e_tc_flow_parse_attr *parse_attr;
1203 	struct mlx5_flow_attr *attr = flow->attr;
1204 	const struct ip_tunnel_info *tun_info;
1205 	struct mlx5_esw_flow_attr *esw_attr;
1206 	struct mlx5e_route_entry *r;
1207 	struct mlx5e_route_key key;
1208 	unsigned short family;
1209 	int err = 0;
1210 
1211 	esw_attr = attr->esw_attr;
1212 	parse_attr = attr->parse_attr;
1213 	tun_info = parse_attr->tun_info[out_index];
1214 	family = ip_tunnel_info_af(tun_info);
1215 
1216 	if (family == AF_INET) {
1217 		key.endpoint_ip.v4 = tun_info->key.u.ipv4.src;
1218 		key.ip_version = 4;
1219 	} else if (family == AF_INET6) {
1220 		key.endpoint_ip.v6 = tun_info->key.u.ipv6.src;
1221 		key.ip_version = 6;
1222 	}
1223 
1224 	err = mlx5e_set_vf_tunnel(esw, attr, &parse_attr->mod_hdr_acts, e->out_dev,
1225 				  e->route_dev_ifindex, out_index);
1226 	if (err || !(esw_attr->dests[out_index].flags &
1227 		     MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE))
1228 		return err;
1229 
1230 	r = mlx5e_route_get_create(priv, &key, parse_attr->mirred_ifindex[out_index],
1231 				   &tbl_time_after);
1232 	if (IS_ERR(r))
1233 		return PTR_ERR(r);
1234 	/* Routing changed concurrently. FIB event handler might have missed new
1235 	 * entry, schedule update.
1236 	 */
1237 	if (tbl_time_before != tbl_time_after) {
1238 		err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE);
1239 		if (err) {
1240 			mlx5e_route_put_locked(priv, r);
1241 			return err;
1242 		}
1243 	}
1244 
1245 	flow->encap_routes[out_index].r = r;
1246 	if (new_encap_entry)
1247 		list_add(&e->route_list, &r->encap_entries);
1248 	flow->encap_routes[out_index].index = out_index;
1249 	return 0;
1250 }
1251 
1252 void mlx5e_detach_decap_route(struct mlx5e_priv *priv,
1253 			      struct mlx5e_tc_flow *flow)
1254 {
1255 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1256 	struct mlx5e_route_entry *r = flow->decap_route;
1257 
1258 	if (!r)
1259 		return;
1260 
1261 	mutex_lock(&esw->offloads.encap_tbl_lock);
1262 	list_del(&flow->decap_routes);
1263 	flow->decap_route = NULL;
1264 
1265 	if (!refcount_dec_and_test(&r->refcnt)) {
1266 		mutex_unlock(&esw->offloads.encap_tbl_lock);
1267 		return;
1268 	}
1269 	hash_del_rcu(&r->hlist);
1270 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1271 
1272 	mlx5e_route_dealloc(priv, r);
1273 }
1274 
1275 static void mlx5e_detach_encap_route(struct mlx5e_priv *priv,
1276 				     struct mlx5e_tc_flow *flow,
1277 				     int out_index)
1278 {
1279 	struct mlx5e_route_entry *r = flow->encap_routes[out_index].r;
1280 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1281 	struct mlx5e_encap_entry *e, *tmp;
1282 
1283 	if (!r)
1284 		return;
1285 
1286 	mutex_lock(&esw->offloads.encap_tbl_lock);
1287 	flow->encap_routes[out_index].r = NULL;
1288 
1289 	if (!refcount_dec_and_test(&r->refcnt)) {
1290 		mutex_unlock(&esw->offloads.encap_tbl_lock);
1291 		return;
1292 	}
1293 	list_for_each_entry_safe(e, tmp, &r->encap_entries, route_list)
1294 		list_del_init(&e->route_list);
1295 	hash_del_rcu(&r->hlist);
1296 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1297 
1298 	mlx5e_route_dealloc(priv, r);
1299 }
1300 
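/* The route used by encap entry 'e' was removed: unoffload the flows on
 * encap_flows, release their modify header and clear their encap destination
 * state, then mark the entry with MLX5_ENCAP_ENTRY_NO_ROUTE and release its
 * packet reformat.
 */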
1301 static void mlx5e_invalidate_encap(struct mlx5e_priv *priv,
1302 				   struct mlx5e_encap_entry *e,
1303 				   struct list_head *encap_flows)
1304 {
1305 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1306 	struct mlx5e_tc_flow *flow;
1307 
1308 	list_for_each_entry(flow, encap_flows, tmp_list) {
1309 		struct mlx5_flow_attr *attr = flow->attr;
1310 		struct mlx5_esw_flow_attr *esw_attr;
1311 
1312 		if (!mlx5e_is_offloaded_flow(flow))
1313 			continue;
1314 		esw_attr = attr->esw_attr;
1315 
1316 		if (flow_flag_test(flow, SLOW))
1317 			mlx5e_tc_unoffload_from_slow_path(esw, flow);
1318 		else
1319 			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
1320 		mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr);
1321 		attr->modify_hdr = NULL;
1322 
1323 		esw_attr->dests[flow->tmp_entry_index].flags &=
1324 			~MLX5_ESW_DEST_ENCAP_VALID;
1325 		esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL;
1326 	}
1327 
1328 	e->flags |= MLX5_ENCAP_ENTRY_NO_ROUTE;
1329 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
1330 		e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
1331 		mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat);
1332 		e->pkt_reformat = NULL;
1333 	}
1334 }
1335 
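/* The route used by encap entry 'e' was replaced: rebuild the encapsulation
 * header on the new route, refresh the source vport rewrite of every flow and
 * re-offload each flow either as an encap rule or through the slow path if
 * some of its encap destinations are still invalid.
 */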
1336 static void mlx5e_reoffload_encap(struct mlx5e_priv *priv,
1337 				  struct net_device *tunnel_dev,
1338 				  struct mlx5e_encap_entry *e,
1339 				  struct list_head *encap_flows)
1340 {
1341 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1342 	struct mlx5e_tc_flow *flow;
1343 	int err;
1344 
1345 	err = ip_tunnel_info_af(e->tun_info) == AF_INET ?
1346 		mlx5e_tc_tun_update_header_ipv4(priv, tunnel_dev, e) :
1347 		mlx5e_tc_tun_update_header_ipv6(priv, tunnel_dev, e);
1348 	if (err)
1349 		mlx5_core_warn(priv->mdev, "Failed to update encap header, %d", err);
1350 	e->flags &= ~MLX5_ENCAP_ENTRY_NO_ROUTE;
1351 
1352 	list_for_each_entry(flow, encap_flows, tmp_list) {
1353 		struct mlx5e_tc_flow_parse_attr *parse_attr;
1354 		struct mlx5_flow_attr *attr = flow->attr;
1355 		struct mlx5_esw_flow_attr *esw_attr;
1356 		struct mlx5_flow_handle *rule;
1357 		struct mlx5_flow_spec *spec;
1358 
1359 		if (flow_flag_test(flow, FAILED))
1360 			continue;
1361 
1362 		esw_attr = attr->esw_attr;
1363 		parse_attr = attr->parse_attr;
1364 		spec = &parse_attr->spec;
1365 
1366 		err = mlx5e_update_vf_tunnel(esw, esw_attr, &parse_attr->mod_hdr_acts,
1367 					     e->out_dev, e->route_dev_ifindex,
1368 					     flow->tmp_entry_index);
1369 		if (err) {
1370 			mlx5_core_warn(priv->mdev, "Failed to update VF tunnel err=%d", err);
1371 			continue;
1372 		}
1373 
1374 		err = mlx5e_tc_add_flow_mod_hdr(priv, parse_attr, flow);
1375 		if (err) {
1376 			mlx5_core_warn(priv->mdev, "Failed to update flow mod_hdr err=%d",
1377 				       err);
1378 			continue;
1379 		}
1380 
1381 		if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
1382 			esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat;
1383 			esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
1384 			if (!mlx5e_tc_flow_all_encaps_valid(esw_attr))
1385 				goto offload_to_slow_path;
1386 			/* update from slow path rule to encap rule */
1387 			rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
1388 			if (IS_ERR(rule)) {
1389 				err = PTR_ERR(rule);
1390 				mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
1391 					       err);
1392 			} else {
1393 				flow->rule[0] = rule;
1394 			}
1395 		} else {
1396 offload_to_slow_path:
1397 			rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec);
1398 			/* mark the flow's encap dest as non-valid */
1399 			esw_attr->dests[flow->tmp_entry_index].flags &=
1400 				~MLX5_ESW_DEST_ENCAP_VALID;
1401 
1402 			if (IS_ERR(rule)) {
1403 				err = PTR_ERR(rule);
1404 				mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
1405 					       err);
1406 			} else {
1407 				flow->rule[0] = rule;
1408 			}
1409 		}
1410 		flow_flag_set(flow, OFFLOADED);
1411 	}
1412 }
1413 
1414 static int mlx5e_update_route_encaps(struct mlx5e_priv *priv,
1415 				     struct mlx5e_route_entry *r,
1416 				     struct list_head *flow_list,
1417 				     bool replace)
1418 {
1419 	struct net_device *tunnel_dev;
1420 	struct mlx5e_encap_entry *e;
1421 
1422 	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
1423 	if (!tunnel_dev)
1424 		return -ENODEV;
1425 
1426 	list_for_each_entry(e, &r->encap_entries, route_list) {
1427 		LIST_HEAD(encap_flows);
1428 
1429 		mlx5e_take_all_encap_flows(e, &encap_flows);
1430 		if (list_empty(&encap_flows))
1431 			continue;
1432 
1433 		if (mlx5e_route_entry_valid(r))
1434 			mlx5e_invalidate_encap(priv, e, &encap_flows);
1435 
1436 		if (!replace) {
1437 			list_splice(&encap_flows, flow_list);
1438 			continue;
1439 		}
1440 
1441 		mlx5e_reoffload_encap(priv, tunnel_dev, e, &encap_flows);
1442 		list_splice(&encap_flows, flow_list);
1443 	}
1444 
1445 	return 0;
1446 }
1447 
1448 static void mlx5e_unoffload_flow_list(struct mlx5e_priv *priv,
1449 				      struct list_head *flow_list)
1450 {
1451 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1452 	struct mlx5e_tc_flow *flow;
1453 
1454 	list_for_each_entry(flow, flow_list, tmp_list)
1455 		if (mlx5e_is_offloaded_flow(flow))
1456 			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
1457 }
1458 
1459 static void mlx5e_reoffload_decap(struct mlx5e_priv *priv,
1460 				  struct list_head *decap_flows)
1461 {
1462 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
1463 	struct mlx5e_tc_flow *flow;
1464 
1465 	list_for_each_entry(flow, decap_flows, tmp_list) {
1466 		struct mlx5e_tc_flow_parse_attr *parse_attr;
1467 		struct mlx5_flow_attr *attr = flow->attr;
1468 		struct mlx5_flow_handle *rule;
1469 		struct mlx5_flow_spec *spec;
1470 		int err;
1471 
1472 		if (flow_flag_test(flow, FAILED))
1473 			continue;
1474 
1475 		parse_attr = attr->parse_attr;
1476 		spec = &parse_attr->spec;
1477 		err = mlx5e_tc_tun_route_lookup(priv, spec, attr, parse_attr->filter_dev);
1478 		if (err) {
1479 			mlx5_core_warn(priv->mdev, "Failed to lookup route for flow, %d\n",
1480 				       err);
1481 			continue;
1482 		}
1483 
1484 		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr);
1485 		if (IS_ERR(rule)) {
1486 			err = PTR_ERR(rule);
1487 			mlx5_core_warn(priv->mdev, "Failed to update cached decap flow, %d\n",
1488 				       err);
1489 		} else {
1490 			flow->rule[0] = rule;
1491 			flow_flag_set(flow, OFFLOADED);
1492 		}
1493 	}
1494 }
1495 
1496 static int mlx5e_update_route_decap_flows(struct mlx5e_priv *priv,
1497 					  struct mlx5e_route_entry *r,
1498 					  struct list_head *flow_list,
1499 					  bool replace)
1500 {
1501 	struct net_device *tunnel_dev;
1502 	LIST_HEAD(decap_flows);
1503 
1504 	tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index);
1505 	if (!tunnel_dev)
1506 		return -ENODEV;
1507 
1508 	mlx5e_take_all_route_decap_flows(r, &decap_flows);
1509 	if (mlx5e_route_entry_valid(r))
1510 		mlx5e_unoffload_flow_list(priv, &decap_flows);
1511 	if (replace)
1512 		mlx5e_reoffload_decap(priv, &decap_flows);
1513 
1514 	list_splice(&decap_flows, flow_list);
1515 
1516 	return 0;
1517 }
1518 
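/* Work item handling a FIB replace/del event for route entry 'r': under rtnl
 * and encap_tbl_lock, unoffload all encap and decap flows attached to the
 * route and, for a replace event, re-offload them against the updated route
 * before marking the entry valid again.
 */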
1519 static void mlx5e_tc_fib_event_work(struct work_struct *work)
1520 {
1521 	struct mlx5e_tc_fib_event_data *event_data =
1522 		container_of(work, struct mlx5e_tc_fib_event_data, work);
1523 	struct net_device *ul_dev = event_data->ul_dev;
1524 	struct mlx5e_priv *priv = netdev_priv(ul_dev);
1525 	struct mlx5e_route_entry *r = event_data->r;
1526 	struct mlx5_eswitch *esw;
1527 	LIST_HEAD(flow_list);
1528 	bool replace;
1529 	int err;
1530 
1531 	/* sync with concurrent neigh updates */
1532 	rtnl_lock();
1533 	esw = priv->mdev->priv.eswitch;
1534 	mutex_lock(&esw->offloads.encap_tbl_lock);
1535 	replace = event_data->event == FIB_EVENT_ENTRY_REPLACE;
1536 
1537 	if (!mlx5e_route_entry_valid(r) && !replace)
1538 		goto out;
1539 
1540 	err = mlx5e_update_route_encaps(priv, r, &flow_list, replace);
1541 	if (err)
1542 		mlx5_core_warn(priv->mdev, "Failed to update route encaps, %d\n",
1543 			       err);
1544 
1545 	err = mlx5e_update_route_decap_flows(priv, r, &flow_list, replace);
1546 	if (err)
1547 		mlx5_core_warn(priv->mdev, "Failed to update route decap flows, %d\n",
1548 			       err);
1549 
1550 	if (replace)
1551 		r->flags |= MLX5E_ROUTE_ENTRY_VALID;
1552 out:
1553 	mutex_unlock(&esw->offloads.encap_tbl_lock);
1554 	rtnl_unlock();
1555 
1556 	mlx5e_put_flow_list(priv, &flow_list);
1557 	mlx5e_route_put(priv, event_data->r);
1558 	dev_put(event_data->ul_dev);
1559 	kfree(event_data);
1560 }
1561 
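/* Called from the FIB notifier in atomic context: allocate the work item and
 * look up the route entry for an exact-match (/32) IPv4 route whose nexthop
 * device is a mlx5e netdev. Returns NULL when the event is not relevant.
 */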
1562 static struct mlx5e_tc_fib_event_data *
1563 mlx5e_init_fib_work_ipv4(struct mlx5e_priv *priv,
1564 			 struct net_device *ul_dev,
1565 			 struct mlx5e_tc_tun_encap *encap,
1566 			 unsigned long event,
1567 			 struct fib_notifier_info *info)
1568 {
1569 	struct fib_entry_notifier_info *fen_info;
1570 	struct mlx5e_tc_fib_event_data *fib_work;
1571 	struct mlx5e_route_entry *r;
1572 	struct mlx5e_route_key key;
1573 	struct net_device *fib_dev;
1574 
1575 	fen_info = container_of(info, struct fib_entry_notifier_info, info);
1576 	fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev;
1577 	if (!fib_dev || fib_dev->netdev_ops != &mlx5e_netdev_ops ||
1578 	    fen_info->dst_len != 32)
1579 		return NULL;
1580 
1581 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
1582 	if (!fib_work)
1583 		return ERR_PTR(-ENOMEM);
1584 
1585 	key.endpoint_ip.v4 = htonl(fen_info->dst);
1586 	key.ip_version = 4;
1587 
1588 	/* Can't fail after this point because releasing reference to r
1589 	 * requires obtaining sleeping mutex which we can't do in atomic
1590 	 * context.
1591 	 */
1592 	r = mlx5e_route_lookup_for_update(encap, &key);
1593 	if (!r)
1594 		goto out;
1595 	fib_work->r = r;
1596 	dev_hold(ul_dev);
1597 
1598 	return fib_work;
1599 
1600 out:
1601 	kfree(fib_work);
1602 	return NULL;
1603 }
1604 
1605 static struct mlx5e_tc_fib_event_data *
1606 mlx5e_init_fib_work_ipv6(struct mlx5e_priv *priv,
1607 			 struct net_device *ul_dev,
1608 			 struct mlx5e_tc_tun_encap *encap,
1609 			 unsigned long event,
1610 			 struct fib_notifier_info *info)
1611 {
1612 	struct fib6_entry_notifier_info *fen_info;
1613 	struct mlx5e_tc_fib_event_data *fib_work;
1614 	struct mlx5e_route_entry *r;
1615 	struct mlx5e_route_key key;
1616 	struct net_device *fib_dev;
1617 
1618 	fen_info = container_of(info, struct fib6_entry_notifier_info, info);
1619 	fib_dev = fib6_info_nh_dev(fen_info->rt);
1620 	if (fib_dev->netdev_ops != &mlx5e_netdev_ops ||
1621 	    fen_info->rt->fib6_dst.plen != 128)
1622 		return NULL;
1623 
1624 	fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC);
1625 	if (!fib_work)
1626 		return ERR_PTR(-ENOMEM);
1627 
1628 	memcpy(&key.endpoint_ip.v6, &fen_info->rt->fib6_dst.addr,
1629 	       sizeof(fen_info->rt->fib6_dst.addr));
1630 	key.ip_version = 6;
1631 
1632 	/* Can't fail after this point because releasing reference to r
1633 	 * requires obtaining sleeping mutex which we can't do in atomic
1634 	 * context.
1635 	 */
1636 	r = mlx5e_route_lookup_for_update(encap, &key);
1637 	if (!r)
1638 		goto out;
1639 	fib_work->r = r;
1640 	dev_hold(ul_dev);
1641 
1642 	return fib_work;
1643 
1644 out:
1645 	kfree(fib_work);
1646 	return NULL;
1647 }
1648 
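/* FIB notifier callback. For IPv4/IPv6 route replace and delete events,
 * prepare a work item in atomic context and queue it on the driver workqueue;
 * all other events are ignored.
 */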
1649 static int mlx5e_tc_tun_fib_event(struct notifier_block *nb, unsigned long event, void *ptr)
1650 {
1651 	struct mlx5e_tc_fib_event_data *fib_work;
1652 	struct fib_notifier_info *info = ptr;
1653 	struct mlx5e_tc_tun_encap *encap;
1654 	struct net_device *ul_dev;
1655 	struct mlx5e_priv *priv;
1656 
1657 	encap = container_of(nb, struct mlx5e_tc_tun_encap, fib_nb);
1658 	priv = encap->priv;
1659 	ul_dev = priv->netdev;
1660 	priv = netdev_priv(ul_dev);
1661 
1662 	switch (event) {
1663 	case FIB_EVENT_ENTRY_REPLACE:
1664 	case FIB_EVENT_ENTRY_DEL:
1665 		if (info->family == AF_INET)
1666 			fib_work = mlx5e_init_fib_work_ipv4(priv, ul_dev, encap, event, info);
1667 		else if (info->family == AF_INET6)
1668 			fib_work = mlx5e_init_fib_work_ipv6(priv, ul_dev, encap, event, info);
1669 		else
1670 			return NOTIFY_DONE;
1671 
1672 		if (!IS_ERR_OR_NULL(fib_work)) {
1673 			queue_work(priv->wq, &fib_work->work);
1674 		} else if (IS_ERR(fib_work)) {
1675 			NL_SET_ERR_MSG_MOD(info->extack, "Failed to init fib work");
1676 			mlx5_core_warn(priv->mdev, "Failed to init fib work, %ld\n",
1677 				       PTR_ERR(fib_work));
1678 		}
1679 
1680 		break;
1681 	default:
1682 		return NOTIFY_DONE;
1683 	}
1684 
1685 	return NOTIFY_DONE;
1686 }
1687 
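/* Allocate the per-uplink tunnel encap context and register the FIB notifier
 * used to track routes towards tunnel endpoints.
 */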
1688 struct mlx5e_tc_tun_encap *mlx5e_tc_tun_init(struct mlx5e_priv *priv)
1689 {
1690 	struct mlx5e_tc_tun_encap *encap;
1691 	int err;
1692 
1693 	encap = kvzalloc(sizeof(*encap), GFP_KERNEL);
1694 	if (!encap)
1695 		return ERR_PTR(-ENOMEM);
1696 
1697 	encap->priv = priv;
1698 	encap->fib_nb.notifier_call = mlx5e_tc_tun_fib_event;
1699 	spin_lock_init(&encap->route_lock);
1700 	hash_init(encap->route_tbl);
1701 	err = register_fib_notifier(dev_net(priv->netdev), &encap->fib_nb,
1702 				    NULL, NULL);
1703 	if (err) {
1704 		kvfree(encap);
1705 		return ERR_PTR(err);
1706 	}
1707 
1708 	return encap;
1709 }
1710 
1711 void mlx5e_tc_tun_cleanup(struct mlx5e_tc_tun_encap *encap)
1712 {
1713 	if (!encap)
1714 		return;
1715 
1716 	unregister_fib_notifier(dev_net(encap->priv->netdev), &encap->fib_nb);
1717 	flush_workqueue(encap->priv->wq); /* flush fib event works */
1718 	kvfree(encap);
1719 }
1720