1 /*
2  * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <linux/netdevice.h>
34 #include <net/bonding.h>
35 #include <linux/mlx5/driver.h>
36 #include <linux/mlx5/eswitch.h>
37 #include <linux/mlx5/vport.h>
38 #include "lib/devcom.h"
39 #include "mlx5_core.h"
40 #include "eswitch.h"
41 #include "esw/acl/ofld.h"
42 #include "lag.h"
43 #include "mp.h"
44 #include "mpesw.h"
45 
46 enum {
47 	MLX5_LAG_EGRESS_PORT_1 = 1,
48 	MLX5_LAG_EGRESS_PORT_2,
49 };
50 
51 /* General purpose, use for short periods of time.
52  * Beware of lock dependencies (preferably, no locks should be acquired
53  * under it).
54  */
55 static DEFINE_SPINLOCK(lag_lock);
56 
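/* Translate the LAG mode and mode flags into the port selection mode
 * programmed in the LAG context: hash-based LAG uses the port selection
 * flow table, MPESW has its own mode, anything else uses queue affinity.
 */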
57 static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
58 {
59 	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
60 		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;
61 
62 	if (mode == MLX5_LAG_MODE_MPESW)
63 		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;
64 
65 	return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
66 }
67 
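/* Build a bitmask of the ports that are currently used for transmission,
 * for the active_port field of the LAG context.
 */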
68 static u8 lag_active_port_bits(struct mlx5_lag *ldev)
69 {
70 	u8 enabled_ports[MLX5_MAX_PORTS] = {};
71 	u8 active_port = 0;
72 	int num_enabled;
73 	int idx;
74 
75 	mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, enabled_ports,
76 			      &num_enabled);
77 	for (idx = 0; idx < num_enabled; idx++)
78 		active_port |= BIT_MASK(enabled_ports[idx]);
79 
80 	return active_port;
81 }
82 
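/* Issue the CREATE_LAG command. Queue affinity mode programs the two
 * tx remap affinities; hash mode programs the initially active ports
 * when the port selection flow table bypass capability is present.
 */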
83 static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode,
84 			       unsigned long flags)
85 {
86 	bool fdb_sel_mode = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE,
87 				     &flags);
88 	int port_sel_mode = get_port_sel_mode(mode, flags);
89 	u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {};
90 	void *lag_ctx;
91 
92 	lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
93 	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);
94 	MLX5_SET(lagc, lag_ctx, fdb_selection_mode, fdb_sel_mode);
95 
96 	switch (port_sel_mode) {
97 	case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY:
98 		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
99 		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
100 		break;
101 	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT:
102 		if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass))
103 			break;
104 
105 		MLX5_SET(lagc, lag_ctx, active_port,
106 			 lag_active_port_bits(mlx5_lag_dev(dev)));
107 		break;
108 	default:
109 		break;
110 	}
111 	MLX5_SET(lagc, lag_ctx, port_select_mode, port_sel_mode);
112 
113 	return mlx5_cmd_exec_in(dev, create_lag, in);
114 }
115 
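/* Update the tx remap affinities of an existing LAG object. */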
116 static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports,
117 			       u8 *ports)
118 {
119 	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
120 	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);
121 
122 	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
123 	MLX5_SET(modify_lag_in, in, field_select, 0x1);
124 
125 	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
126 	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
127 
128 	return mlx5_cmd_exec_in(dev, modify_lag, in);
129 }
130 
131 int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
132 {
133 	u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {};
134 
135 	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);
136 
137 	return mlx5_cmd_exec_in(dev, create_vport_lag, in);
138 }
139 EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);
140 
141 int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
142 {
143 	u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {};
144 
145 	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);
146 
147 	return mlx5_cmd_exec_in(dev, destroy_vport_lag, in);
148 }
149 EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
150 
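/* Collect the indexes of the ports that cannot currently transmit
 * (tx disabled or link down).
 */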
151 static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
152 				   u8 *ports, int *num_disabled)
153 {
154 	int i;
155 
156 	*num_disabled = 0;
157 	for (i = 0; i < num_ports; i++) {
158 		if (!tracker->netdev_state[i].tx_enabled ||
159 		    !tracker->netdev_state[i].link_up)
160 			ports[(*num_disabled)++] = i;
161 	}
162 }
163 
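/* Collect the indexes of the ports that can currently transmit. If no
 * port qualifies, fall back to the disabled ports so callers always get
 * a non-empty set.
 */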
164 void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
165 			   u8 *ports, int *num_enabled)
166 {
167 	int i;
168 
169 	*num_enabled = 0;
170 	for (i = 0; i < num_ports; i++) {
171 		if (tracker->netdev_state[i].tx_enabled &&
172 		    tracker->netdev_state[i].link_up)
173 			ports[(*num_enabled)++] = i;
174 	}
175 
176 	if (*num_enabled == 0)
177 		mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled);
178 }
179 
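/* Log the current lag mapping: the set of active ports for hash-based
 * LAG, or the full bucket-to-port map for queue affinity.
 */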
180 static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev,
181 				   struct mlx5_lag *ldev,
182 				   struct lag_tracker *tracker,
183 				   unsigned long flags)
184 {
185 	char buf[MLX5_MAX_PORTS * 10 + 1] = {};
186 	u8 enabled_ports[MLX5_MAX_PORTS] = {};
187 	int written = 0;
188 	int num_enabled;
189 	int idx;
190 	int err;
191 	int i;
192 	int j;
193 
194 	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
195 		mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports,
196 				      &num_enabled);
197 		for (i = 0; i < num_enabled; i++) {
198 			err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1);
199 			if (err != 3)
200 				return;
201 			written += err;
202 		}
203 		buf[written - 2] = 0;
204 		mlx5_core_info(dev, "lag map active ports: %s\n", buf);
205 	} else {
206 		for (i = 0; i < ldev->ports; i++) {
207 			for (j = 0; j < ldev->buckets; j++) {
208 				idx = i * ldev->buckets + j;
209 				err = scnprintf(buf + written, 10,
210 						" port %d:%d", i + 1, ldev->v2p_map[idx]);
211 				if (err != 9)
212 					return;
213 				written += err;
214 			}
215 		}
216 		mlx5_core_info(dev, "lag map:%s\n", buf);
217 	}
218 }
219 
220 static int mlx5_lag_netdev_event(struct notifier_block *this,
221 				 unsigned long event, void *ptr);
222 static void mlx5_do_bond_work(struct work_struct *work);
223 
224 static void mlx5_ldev_free(struct kref *ref)
225 {
226 	struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);
227 
228 	if (ldev->nb.notifier_call)
229 		unregister_netdevice_notifier_net(&init_net, &ldev->nb);
230 	mlx5_lag_mp_cleanup(ldev);
231 	destroy_workqueue(ldev->wq);
232 	mlx5_lag_mpesw_cleanup(ldev);
233 	mutex_destroy(&ldev->lock);
234 	kfree(ldev);
235 }
236 
237 static void mlx5_ldev_put(struct mlx5_lag *ldev)
238 {
239 	kref_put(&ldev->ref, mlx5_ldev_free);
240 }
241 
242 static void mlx5_ldev_get(struct mlx5_lag *ldev)
243 {
244 	kref_get(&ldev->ref);
245 }
246 
247 static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
248 {
249 	struct mlx5_lag *ldev;
250 	int err;
251 
252 	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
253 	if (!ldev)
254 		return NULL;
255 
256 	ldev->wq = create_singlethread_workqueue("mlx5_lag");
257 	if (!ldev->wq) {
258 		kfree(ldev);
259 		return NULL;
260 	}
261 
262 	kref_init(&ldev->ref);
263 	mutex_init(&ldev->lock);
264 	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
265 
266 	ldev->nb.notifier_call = mlx5_lag_netdev_event;
267 	if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
268 		ldev->nb.notifier_call = NULL;
269 		mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
270 	}
271 	ldev->mode = MLX5_LAG_MODE_NONE;
272 
273 	err = mlx5_lag_mp_init(ldev);
274 	if (err)
275 		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
276 			      err);
277 
278 	mlx5_lag_mpesw_init(ldev);
279 	ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
280 	ldev->buckets = 1;
281 
282 	return ldev;
283 }
284 
285 int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
286 				struct net_device *ndev)
287 {
288 	int i;
289 
290 	for (i = 0; i < ldev->ports; i++)
291 		if (ldev->pf[i].netdev == ndev)
292 			return i;
293 
294 	return -ENOENT;
295 }
296 
297 static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev)
298 {
299 	return ldev->mode == MLX5_LAG_MODE_ROCE;
300 }
301 
302 static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
303 {
304 	return ldev->mode == MLX5_LAG_MODE_SRIOV;
305 }
306 
307 /* Create a mapping between steering slots and active ports.
308  * As there are ldev->buckets slots per port, first assume the native
309  * mapping should be used.
310  * If any ports are disabled, fill their slots with a mapping that
311  * points to active ports instead.
312  */
313 static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
314 					   u8 num_ports,
315 					   u8 buckets,
316 					   u8 *ports)
317 {
318 	int disabled[MLX5_MAX_PORTS] = {};
319 	int enabled[MLX5_MAX_PORTS] = {};
320 	int disabled_ports_num = 0;
321 	int enabled_ports_num = 0;
322 	int idx;
323 	u32 rand;
324 	int i;
325 	int j;
326 
327 	for (i = 0; i < num_ports; i++) {
328 		if (tracker->netdev_state[i].tx_enabled &&
329 		    tracker->netdev_state[i].link_up)
330 			enabled[enabled_ports_num++] = i;
331 		else
332 			disabled[disabled_ports_num++] = i;
333 	}
334 
335 	/* Use the native mapping by default, where each port's buckets
336 	 * point to the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc.
337 	 */
338 	for (i = 0; i < num_ports; i++)
339 		for (j = 0; j < buckets; j++) {
340 			idx = i * buckets + j;
341 			ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i;
342 		}
343 
344 	/* If all ports are enabled, or all are disabled, keep the native mapping */
345 	if (enabled_ports_num == num_ports ||
346 	    disabled_ports_num == num_ports)
347 		return;
348 
349 	/* Go over the disabled ports and for each assign a random active port */
350 	for (i = 0; i < disabled_ports_num; i++) {
351 		for (j = 0; j < buckets; j++) {
352 			get_random_bytes(&rand, 4);
353 			ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1;
354 		}
355 	}
356 }
357 
358 static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev)
359 {
360 	int i;
361 
362 	for (i = 0; i < ldev->ports; i++)
363 		if (ldev->pf[i].has_drop)
364 			return true;
365 	return false;
366 }
367 
368 static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev)
369 {
370 	int i;
371 
372 	for (i = 0; i < ldev->ports; i++) {
373 		if (!ldev->pf[i].has_drop)
374 			continue;
375 
376 		mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch,
377 							     MLX5_VPORT_UPLINK);
378 		ldev->pf[i].has_drop = false;
379 	}
380 }
381 
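/* Refresh the uplink ingress drop rules: remove the existing rules and,
 * if the bond has inactive members, install a drop rule on each port
 * that is currently not transmitting.
 */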
382 static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev,
383 				     struct lag_tracker *tracker)
384 {
385 	u8 disabled_ports[MLX5_MAX_PORTS] = {};
386 	struct mlx5_core_dev *dev;
387 	int disabled_index;
388 	int num_disabled;
389 	int err;
390 	int i;
391 
392 	/* First delete the current drop rule so there won't be any dropped
393 	 * packets
394 	 */
395 	mlx5_lag_drop_rule_cleanup(ldev);
396 
397 	if (!ldev->tracker.has_inactive)
398 		return;
399 
400 	mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled);
401 
402 	for (i = 0; i < num_disabled; i++) {
403 		disabled_index = disabled_ports[i];
404 		dev = ldev->pf[disabled_index].dev;
405 		err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch,
406 								  MLX5_VPORT_UPLINK);
407 		if (!err)
408 			ldev->pf[disabled_index].has_drop = true;
409 		else
410 			mlx5_core_err(dev,
411 				      "Failed to create lag drop rule, error: %d", err);
412 	}
413 }
414 
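/* Update only the active_port field of an existing LAG object. */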
415 static int mlx5_cmd_modify_active_port(struct mlx5_core_dev *dev, u8 ports)
416 {
417 	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
418 	void *lag_ctx;
419 
420 	lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);
421 
422 	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
423 	MLX5_SET(modify_lag_in, in, field_select, 0x2);
424 
425 	MLX5_SET(lagc, lag_ctx, active_port, ports);
426 
427 	return mlx5_cmd_exec_in(dev, modify_lag, in);
428 }
429 
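/* Apply a new bucket-to-port mapping: through the port selection flow
 * table for hash-based LAG (also refreshing the active ports when the
 * bypass capability is present), or through MODIFY_LAG otherwise.
 */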
430 static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
431 {
432 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
433 	u8 active_ports;
434 	int ret;
435 
436 	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) {
437 		ret = mlx5_lag_port_sel_modify(ldev, ports);
438 		if (ret ||
439 		    !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table_bypass))
440 			return ret;
441 
442 		active_ports = lag_active_port_bits(ldev);
443 
444 		return mlx5_cmd_modify_active_port(dev0, active_ports);
445 	}
446 	return mlx5_cmd_modify_lag(dev0, ldev->ports, ports);
447 }
448 
449 void mlx5_modify_lag(struct mlx5_lag *ldev,
450 		     struct lag_tracker *tracker)
451 {
452 	u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {};
453 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
454 	int idx;
455 	int err;
456 	int i;
457 	int j;
458 
459 	mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports);
460 
461 	for (i = 0; i < ldev->ports; i++) {
462 		for (j = 0; j < ldev->buckets; j++) {
463 			idx = i * ldev->buckets + j;
464 			if (ports[idx] == ldev->v2p_map[idx])
465 				continue;
466 			err = _mlx5_modify_lag(ldev, ports);
467 			if (err) {
468 				mlx5_core_err(dev0,
469 					      "Failed to modify LAG (%d)\n",
470 					      err);
471 				return;
472 			}
473 			memcpy(ldev->v2p_map, ports, sizeof(ports));
474 
475 			mlx5_lag_print_mapping(dev0, ldev, tracker,
476 					       ldev->mode_flags);
477 			break;
478 		}
479 	}
480 
481 	if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
482 	    ldev->mode != MLX5_LAG_MODE_ROCE)
483 		mlx5_lag_drop_rule_setup(ldev, tracker);
484 }
485 
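/* Choose the port selection mode for RoCE LAG: hash-based when the port
 * selection flow table is supported (required for more than two ports),
 * queue affinity otherwise.
 */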
486 static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
487 					   unsigned long *flags)
488 {
489 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
490 
491 	if (!MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table)) {
492 		if (ldev->ports > 2)
493 			return -EINVAL;
494 		return 0;
495 	}
496 
497 	if (ldev->ports > 2)
498 		ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;
499 
500 	set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
501 
502 	return 0;
503 }
504 
505 static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
506 						struct lag_tracker *tracker,
507 						enum mlx5_lag_mode mode,
508 						unsigned long *flags)
509 {
510 	struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];
511 
512 	if (mode == MLX5_LAG_MODE_MPESW)
513 		return;
514 
515 	if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
516 	    tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
517 		set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
518 }
519 
520 static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
521 			      struct lag_tracker *tracker, bool shared_fdb,
522 			      unsigned long *flags)
523 {
524 	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
525 
526 	*flags = 0;
527 	if (shared_fdb) {
528 		set_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, flags);
529 		set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
530 	}
531 
532 	if (mode == MLX5_LAG_MODE_MPESW)
533 		set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
534 
535 	if (roce_lag)
536 		return mlx5_lag_set_port_sel_mode_roce(ldev, flags);
537 
538 	mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
539 	return 0;
540 }
541 
542 char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
543 {
544 	int port_sel_mode = get_port_sel_mode(mode, flags);
545 
546 	switch (port_sel_mode) {
547 	case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
548 	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
549 	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
550 	default: return "invalid";
551 	}
552 }
553 
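/* Create the LAG object in firmware and, for shared FDB, point both
 * eswitches at a single FDB. The LAG is destroyed again if the shared
 * FDB setup fails.
 */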
554 static int mlx5_create_lag(struct mlx5_lag *ldev,
555 			   struct lag_tracker *tracker,
556 			   enum mlx5_lag_mode mode,
557 			   unsigned long flags)
558 {
559 	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
560 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
561 	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
562 	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
563 	int err;
564 
565 	if (tracker)
566 		mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
567 	mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
568 		       shared_fdb, mlx5_get_str_port_sel_mode(mode, flags));
569 
570 	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
571 	if (err) {
572 		mlx5_core_err(dev0,
573 			      "Failed to create LAG (%d)\n",
574 			      err);
575 		return err;
576 	}
577 
578 	if (shared_fdb) {
579 		err = mlx5_eswitch_offloads_config_single_fdb(dev0->priv.eswitch,
580 							      dev1->priv.eswitch);
581 		if (err)
582 			mlx5_core_err(dev0, "Can't enable single FDB mode\n");
583 		else
584 			mlx5_core_info(dev0, "Operation mode is single FDB\n");
585 	}
586 
587 	if (err) {
588 		MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
589 		if (mlx5_cmd_exec_in(dev0, destroy_lag, in))
590 			mlx5_core_err(dev0,
591 				      "Failed to deactivate RoCE LAG; driver restart required\n");
592 	}
593 
594 	return err;
595 }
596 
597 int mlx5_activate_lag(struct mlx5_lag *ldev,
598 		      struct lag_tracker *tracker,
599 		      enum mlx5_lag_mode mode,
600 		      bool shared_fdb)
601 {
602 	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
603 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
604 	unsigned long flags = 0;
605 	int err;
606 
607 	err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
608 	if (err)
609 		return err;
610 
611 	if (mode != MLX5_LAG_MODE_MPESW) {
612 		mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
613 		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
614 			err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
615 						       ldev->v2p_map);
616 			if (err) {
617 				mlx5_core_err(dev0,
618 					      "Failed to create LAG port selection(%d)\n",
619 					      err);
620 				return err;
621 			}
622 		}
623 	}
624 
625 	err = mlx5_create_lag(ldev, tracker, mode, flags);
626 	if (err) {
627 		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
628 			mlx5_lag_port_sel_destroy(ldev);
629 		if (roce_lag)
630 			mlx5_core_err(dev0,
631 				      "Failed to activate RoCE LAG\n");
632 		else
633 			mlx5_core_err(dev0,
634 				      "Failed to activate VF LAG\n"
635 				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
636 		return err;
637 	}
638 
639 	if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
640 	    !roce_lag)
641 		mlx5_lag_drop_rule_setup(ldev, tracker);
642 
643 	ldev->mode = mode;
644 	ldev->mode_flags = flags;
645 	return 0;
646 }
647 
648 static int mlx5_deactivate_lag(struct mlx5_lag *ldev)
649 {
650 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
651 	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
652 	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
653 	bool roce_lag = __mlx5_lag_is_roce(ldev);
654 	unsigned long flags = ldev->mode_flags;
655 	int err;
656 
657 	ldev->mode = MLX5_LAG_MODE_NONE;
658 	ldev->mode_flags = 0;
659 	mlx5_lag_mp_reset(ldev);
660 
661 	if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
662 		mlx5_eswitch_offloads_destroy_single_fdb(dev0->priv.eswitch,
663 							 dev1->priv.eswitch);
664 		clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
665 	}
666 
667 	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
668 	err = mlx5_cmd_exec_in(dev0, destroy_lag, in);
669 	if (err) {
670 		if (roce_lag) {
671 			mlx5_core_err(dev0,
672 				      "Failed to deactivate RoCE LAG; driver restart required\n");
673 		} else {
674 			mlx5_core_err(dev0,
675 				      "Failed to deactivate VF LAG; driver restart required\n"
676 				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
677 		}
678 		return err;
679 	}
680 
681 	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
682 		mlx5_lag_port_sel_destroy(ldev);
683 	if (mlx5_lag_has_drop_rule(ldev))
684 		mlx5_lag_drop_rule_cleanup(ldev);
685 
686 	return 0;
687 }
688 
689 #define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2
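/* LAG prerequisites: all ports must be present, eswitch modes must
 * match, VFs are only allowed in switchdev mode, and offloads LAG is
 * limited to MLX5_LAG_OFFLOADS_SUPPORTED_PORTS ports.
 */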
690 static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
691 {
692 #ifdef CONFIG_MLX5_ESWITCH
693 	struct mlx5_core_dev *dev;
694 	u8 mode;
695 #endif
696 	int i;
697 
698 	for (i = 0; i < ldev->ports; i++)
699 		if (!ldev->pf[i].dev)
700 			return false;
701 
702 #ifdef CONFIG_MLX5_ESWITCH
703 	for (i = 0; i < ldev->ports; i++) {
704 		dev = ldev->pf[i].dev;
705 		if (mlx5_eswitch_num_vfs(dev->priv.eswitch) && !is_mdev_switchdev_mode(dev))
706 			return false;
707 	}
708 
709 	dev = ldev->pf[MLX5_LAG_P1].dev;
710 	mode = mlx5_eswitch_mode(dev);
711 	for (i = 0; i < ldev->ports; i++)
712 		if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
713 			return false;
714 
715 	if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports != MLX5_LAG_OFFLOADS_SUPPORTED_PORTS)
716 		return false;
717 #else
718 	for (i = 0; i < ldev->ports; i++)
719 		if (mlx5_sriov_is_enabled(ldev->pf[i].dev))
720 			return false;
721 #endif
722 	return true;
723 }
724 
725 static void mlx5_lag_add_devices(struct mlx5_lag *ldev)
726 {
727 	int i;
728 
729 	for (i = 0; i < ldev->ports; i++) {
730 		if (!ldev->pf[i].dev)
731 			continue;
732 
733 		if (ldev->pf[i].dev->priv.flags &
734 		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
735 			continue;
736 
737 		ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
738 		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
739 	}
740 }
741 
742 static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
743 {
744 	int i;
745 
746 	for (i = 0; i < ldev->ports; i++) {
747 		if (!ldev->pf[i].dev)
748 			continue;
749 
750 		if (ldev->pf[i].dev->priv.flags &
751 		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
752 			continue;
753 
754 		ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
755 		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
756 	}
757 }
758 
759 void mlx5_disable_lag(struct mlx5_lag *ldev)
760 {
761 	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
762 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
763 	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
764 	bool roce_lag;
765 	int err;
766 	int i;
767 
768 	roce_lag = __mlx5_lag_is_roce(ldev);
769 
770 	if (shared_fdb) {
771 		mlx5_lag_remove_devices(ldev);
772 	} else if (roce_lag) {
773 		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
774 			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
775 			mlx5_rescan_drivers_locked(dev0);
776 		}
777 		for (i = 1; i < ldev->ports; i++)
778 			mlx5_nic_vport_disable_roce(ldev->pf[i].dev);
779 	}
780 
781 	err = mlx5_deactivate_lag(ldev);
782 	if (err)
783 		return;
784 
785 	if (shared_fdb || roce_lag)
786 		mlx5_lag_add_devices(ldev);
787 
788 	if (shared_fdb) {
789 		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
790 			mlx5_eswitch_reload_reps(dev0->priv.eswitch);
791 		if (!(dev1->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
792 			mlx5_eswitch_reload_reps(dev1->priv.eswitch);
793 	}
794 }
795 
796 bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
797 {
798 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
799 	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
800 
801 	if (is_mdev_switchdev_mode(dev0) &&
802 	    is_mdev_switchdev_mode(dev1) &&
803 	    mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) &&
804 	    mlx5_eswitch_vport_match_metadata_enabled(dev1->priv.eswitch) &&
805 	    mlx5_devcom_is_paired(dev0->priv.devcom,
806 				  MLX5_DEVCOM_ESW_OFFLOADS) &&
807 	    MLX5_CAP_GEN(dev1, lag_native_fdb_selection) &&
808 	    MLX5_CAP_ESW(dev1, root_ft_on_other_esw) &&
809 	    MLX5_CAP_ESW(dev0, esw_shared_ingress_acl))
810 		return true;
811 
812 	return false;
813 }
814 
815 static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
816 {
817 	bool roce_lag = true;
818 	int i;
819 
820 	for (i = 0; i < ldev->ports; i++)
821 		roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev);
822 
823 #ifdef CONFIG_MLX5_ESWITCH
824 	for (i = 0; i < ldev->ports; i++)
825 		roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev);
826 #endif
827 
828 	return roce_lag;
829 }
830 
831 static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
832 {
833 	return do_bond && __mlx5_lag_is_active(ldev) &&
834 	       ldev->mode != MLX5_LAG_MODE_MPESW;
835 }
836 
837 static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
838 {
839 	return !do_bond && __mlx5_lag_is_active(ldev) &&
840 	       ldev->mode != MLX5_LAG_MODE_MPESW;
841 }
842 
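/* Re-evaluate the bonding state and activate, modify or deactivate the
 * hardware LAG accordingly.
 */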
843 static void mlx5_do_bond(struct mlx5_lag *ldev)
844 {
845 	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
846 	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
847 	struct lag_tracker tracker = { };
848 	bool do_bond, roce_lag;
849 	int err;
850 	int i;
851 
852 	if (!mlx5_lag_is_ready(ldev)) {
853 		do_bond = false;
854 	} else {
855 		/* VF LAG is in multipath mode, ignore bond change requests */
856 		if (mlx5_lag_is_multipath(dev0))
857 			return;
858 
859 		tracker = ldev->tracker;
860 
861 		do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
862 	}
863 
864 	if (do_bond && !__mlx5_lag_is_active(ldev)) {
865 		bool shared_fdb = mlx5_shared_fdb_supported(ldev);
866 
867 		roce_lag = mlx5_lag_is_roce_lag(ldev);
868 
869 		if (shared_fdb || roce_lag)
870 			mlx5_lag_remove_devices(ldev);
871 
872 		err = mlx5_activate_lag(ldev, &tracker,
873 					roce_lag ? MLX5_LAG_MODE_ROCE :
874 						   MLX5_LAG_MODE_SRIOV,
875 					shared_fdb);
876 		if (err) {
877 			if (shared_fdb || roce_lag)
878 				mlx5_lag_add_devices(ldev);
879 
880 			return;
881 		} else if (roce_lag) {
882 			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
883 			mlx5_rescan_drivers_locked(dev0);
884 			for (i = 1; i < ldev->ports; i++)
885 				mlx5_nic_vport_enable_roce(ldev->pf[i].dev);
886 		} else if (shared_fdb) {
887 			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
888 			mlx5_rescan_drivers_locked(dev0);
889 
890 			err = mlx5_eswitch_reload_reps(dev0->priv.eswitch);
891 			if (!err)
892 				err = mlx5_eswitch_reload_reps(dev1->priv.eswitch);
893 
894 			if (err) {
895 				dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
896 				mlx5_rescan_drivers_locked(dev0);
897 				mlx5_deactivate_lag(ldev);
898 				mlx5_lag_add_devices(ldev);
899 				mlx5_eswitch_reload_reps(dev0->priv.eswitch);
900 				mlx5_eswitch_reload_reps(dev1->priv.eswitch);
901 				mlx5_core_err(dev0, "Failed to enable lag\n");
902 				return;
903 			}
904 		}
905 	} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
906 		mlx5_modify_lag(ldev, &tracker);
907 	} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
908 		mlx5_disable_lag(ldev);
909 	}
910 }
911 
912 static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
913 {
914 	queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
915 }
916 
917 static void mlx5_do_bond_work(struct work_struct *work)
918 {
919 	struct delayed_work *delayed_work = to_delayed_work(work);
920 	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
921 					     bond_work);
922 	int status;
923 
924 	status = mlx5_dev_list_trylock();
925 	if (!status) {
926 		mlx5_queue_bond_work(ldev, HZ);
927 		return;
928 	}
929 
930 	mutex_lock(&ldev->lock);
931 	if (ldev->mode_changes_in_progress) {
932 		mutex_unlock(&ldev->lock);
933 		mlx5_dev_list_unlock();
934 		mlx5_queue_bond_work(ldev, HZ);
935 		return;
936 	}
937 
938 	mlx5_do_bond(ldev);
939 	mutex_unlock(&ldev->lock);
940 	mlx5_dev_list_unlock();
941 }
942 
943 static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
944 					 struct lag_tracker *tracker,
945 					 struct netdev_notifier_changeupper_info *info)
946 {
947 	struct net_device *upper = info->upper_dev, *ndev_tmp;
948 	struct netdev_lag_upper_info *lag_upper_info = NULL;
949 	bool is_bonded, is_in_lag, mode_supported;
950 	bool has_inactive = false;
951 	struct slave *slave;
952 	u8 bond_status = 0;
953 	int num_slaves = 0;
954 	int changed = 0;
955 	int idx;
956 
957 	if (!netif_is_lag_master(upper))
958 		return 0;
959 
960 	if (info->linking)
961 		lag_upper_info = info->upper_info;
962 
963 	/* The event may still be of interest if the slave does not belong to
964 	 * us, but is enslaved to a master which has one or more of our netdevs
965 	 * as slaves (e.g., if a new slave is added to a master that bonds two
966 	 * of our netdevs, we should unbond).
967 	 */
968 	rcu_read_lock();
969 	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
970 		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
971 		if (idx >= 0) {
972 			slave = bond_slave_get_rcu(ndev_tmp);
973 			if (slave)
974 				has_inactive |= bond_is_slave_inactive(slave);
975 			bond_status |= (1 << idx);
976 		}
977 
978 		num_slaves++;
979 	}
980 	rcu_read_unlock();
981 
982 	/* None of this lagdev's netdevs are slaves of this master. */
983 	if (!(bond_status & GENMASK(ldev->ports - 1, 0)))
984 		return 0;
985 
986 	if (lag_upper_info) {
987 		tracker->tx_type = lag_upper_info->tx_type;
988 		tracker->hash_type = lag_upper_info->hash_type;
989 	}
990 
991 	tracker->has_inactive = has_inactive;
992 	/* Determine bonding status:
993 	 * A device is considered bonded if all of its physical ports are
994 	 * slaves of the same lag master, and only them.
995 	 */
996 	is_in_lag = num_slaves == ldev->ports &&
997 		bond_status == GENMASK(ldev->ports - 1, 0);
998 
999 	/* Lag mode must be activebackup or hash. */
1000 	mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP ||
1001 			 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH;
1002 
1003 	is_bonded = is_in_lag && mode_supported;
1004 	if (tracker->is_bonded != is_bonded) {
1005 		tracker->is_bonded = is_bonded;
1006 		changed = 1;
1007 	}
1008 
1009 	if (!is_in_lag)
1010 		return changed;
1011 
1012 	if (!mlx5_lag_is_ready(ldev))
1013 		NL_SET_ERR_MSG_MOD(info->info.extack,
1014 				   "Can't activate LAG offload, PF is configured with more than 64 VFs");
1015 	else if (!mode_supported)
1016 		NL_SET_ERR_MSG_MOD(info->info.extack,
1017 				   "Can't activate LAG offload, TX type isn't supported");
1018 
1019 	return changed;
1020 }
1021 
1022 static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
1023 					      struct lag_tracker *tracker,
1024 					      struct net_device *ndev,
1025 					      struct netdev_notifier_changelowerstate_info *info)
1026 {
1027 	struct netdev_lag_lower_state_info *lag_lower_info;
1028 	int idx;
1029 
1030 	if (!netif_is_lag_port(ndev))
1031 		return 0;
1032 
1033 	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
1034 	if (idx < 0)
1035 		return 0;
1036 
1037 	/* This information is used to determine virtual to physical
1038 	 * port mapping.
1039 	 */
1040 	lag_lower_info = info->lower_state_info;
1041 	if (!lag_lower_info)
1042 		return 0;
1043 
1044 	tracker->netdev_state[idx] = *lag_lower_info;
1045 
1046 	return 1;
1047 }
1048 
1049 static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
1050 					    struct lag_tracker *tracker,
1051 					    struct net_device *ndev)
1052 {
1053 	struct net_device *ndev_tmp;
1054 	struct slave *slave;
1055 	bool has_inactive = false;
1056 	int idx;
1057 
1058 	if (!netif_is_lag_master(ndev))
1059 		return 0;
1060 
1061 	rcu_read_lock();
1062 	for_each_netdev_in_bond_rcu(ndev, ndev_tmp) {
1063 		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
1064 		if (idx < 0)
1065 			continue;
1066 
1067 		slave = bond_slave_get_rcu(ndev_tmp);
1068 		if (slave)
1069 			has_inactive |= bond_is_slave_inactive(slave);
1070 	}
1071 	rcu_read_unlock();
1072 
1073 	if (tracker->has_inactive == has_inactive)
1074 		return 0;
1075 
1076 	tracker->has_inactive = has_inactive;
1077 
1078 	return 1;
1079 }
1080 
1081 /* this handler is always registered to netdev events */
1082 static int mlx5_lag_netdev_event(struct notifier_block *this,
1083 				 unsigned long event, void *ptr)
1084 {
1085 	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
1086 	struct lag_tracker tracker;
1087 	struct mlx5_lag *ldev;
1088 	int changed = 0;
1089 
1090 	if (event != NETDEV_CHANGEUPPER &&
1091 	    event != NETDEV_CHANGELOWERSTATE &&
1092 	    event != NETDEV_CHANGEINFODATA)
1093 		return NOTIFY_DONE;
1094 
1095 	ldev    = container_of(this, struct mlx5_lag, nb);
1096 
1097 	tracker = ldev->tracker;
1098 
1099 	switch (event) {
1100 	case NETDEV_CHANGEUPPER:
1101 		changed = mlx5_handle_changeupper_event(ldev, &tracker, ptr);
1102 		break;
1103 	case NETDEV_CHANGELOWERSTATE:
1104 		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
1105 							     ndev, ptr);
1106 		break;
1107 	case NETDEV_CHANGEINFODATA:
1108 		changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev);
1109 		break;
1110 	}
1111 
1112 	ldev->tracker = tracker;
1113 
1114 	if (changed)
1115 		mlx5_queue_bond_work(ldev, 0);
1116 
1117 	return NOTIFY_DONE;
1118 }
1119 
1120 static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
1121 				 struct mlx5_core_dev *dev,
1122 				 struct net_device *netdev)
1123 {
1124 	unsigned int fn = mlx5_get_dev_index(dev);
1125 	unsigned long flags;
1126 
1127 	if (fn >= ldev->ports)
1128 		return;
1129 
1130 	spin_lock_irqsave(&lag_lock, flags);
1131 	ldev->pf[fn].netdev = netdev;
1132 	ldev->tracker.netdev_state[fn].link_up = 0;
1133 	ldev->tracker.netdev_state[fn].tx_enabled = 0;
1134 	spin_unlock_irqrestore(&lag_lock, flags);
1135 }
1136 
1137 static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
1138 				    struct net_device *netdev)
1139 {
1140 	unsigned long flags;
1141 	int i;
1142 
1143 	spin_lock_irqsave(&lag_lock, flags);
1144 	for (i = 0; i < ldev->ports; i++) {
1145 		if (ldev->pf[i].netdev == netdev) {
1146 			ldev->pf[i].netdev = NULL;
1147 			break;
1148 		}
1149 	}
1150 	spin_unlock_irqrestore(&lag_lock, flags);
1151 }
1152 
1153 static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
1154 			       struct mlx5_core_dev *dev)
1155 {
1156 	unsigned int fn = mlx5_get_dev_index(dev);
1157 
1158 	if (fn >= ldev->ports)
1159 		return;
1160 
1161 	ldev->pf[fn].dev = dev;
1162 	dev->priv.lag = ldev;
1163 }
1164 
1165 static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
1166 				  struct mlx5_core_dev *dev)
1167 {
1168 	int i;
1169 
1170 	for (i = 0; i < ldev->ports; i++)
1171 		if (ldev->pf[i].dev == dev)
1172 			break;
1173 
1174 	if (i == ldev->ports)
1175 		return;
1176 
1177 	ldev->pf[i].dev = NULL;
1178 	dev->priv.lag = NULL;
1179 }
1180 
1181 /* Must be called with intf_mutex held */
1182 static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
1183 {
1184 	struct mlx5_lag *ldev = NULL;
1185 	struct mlx5_core_dev *tmp_dev;
1186 
1187 	tmp_dev = mlx5_get_next_phys_dev_lag(dev);
1188 	if (tmp_dev)
1189 		ldev = tmp_dev->priv.lag;
1190 
1191 	if (!ldev) {
1192 		ldev = mlx5_lag_dev_alloc(dev);
1193 		if (!ldev) {
1194 			mlx5_core_err(dev, "Failed to alloc lag dev\n");
1195 			return 0;
1196 		}
1197 		mlx5_ldev_add_mdev(ldev, dev);
1198 		return 0;
1199 	}
1200 
1201 	mutex_lock(&ldev->lock);
1202 	if (ldev->mode_changes_in_progress) {
1203 		mutex_unlock(&ldev->lock);
1204 		return -EAGAIN;
1205 	}
1206 	mlx5_ldev_get(ldev);
1207 	mlx5_ldev_add_mdev(ldev, dev);
1208 	mutex_unlock(&ldev->lock);
1209 
1210 	return 0;
1211 }
1212 
1213 void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
1214 {
1215 	struct mlx5_lag *ldev;
1216 
1217 	ldev = mlx5_lag_dev(dev);
1218 	if (!ldev)
1219 		return;
1220 
1221 	/* mdev is being removed, might as well remove debugfs
1222 	 * as early as possible.
1223 	 */
1224 	mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs);
1225 recheck:
1226 	mutex_lock(&ldev->lock);
1227 	if (ldev->mode_changes_in_progress) {
1228 		mutex_unlock(&ldev->lock);
1229 		msleep(100);
1230 		goto recheck;
1231 	}
1232 	mlx5_ldev_remove_mdev(ldev, dev);
1233 	mutex_unlock(&ldev->lock);
1234 	mlx5_ldev_put(ldev);
1235 }
1236 
1237 void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
1238 {
1239 	int err;
1240 
1241 	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
1242 	    !MLX5_CAP_GEN(dev, lag_master) ||
1243 	    (MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS ||
1244 	     MLX5_CAP_GEN(dev, num_lag_ports) <= 1))
1245 		return;
1246 
1247 recheck:
1248 	mlx5_dev_list_lock();
1249 	err = __mlx5_lag_dev_add_mdev(dev);
1250 	mlx5_dev_list_unlock();
1251 
1252 	if (err) {
1253 		msleep(100);
1254 		goto recheck;
1255 	}
1256 	mlx5_ldev_add_debugfs(dev);
1257 }
1258 
1259 void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
1260 			    struct net_device *netdev)
1261 {
1262 	struct mlx5_lag *ldev;
1263 	bool lag_is_active;
1264 
1265 	ldev = mlx5_lag_dev(dev);
1266 	if (!ldev)
1267 		return;
1268 
1269 	mutex_lock(&ldev->lock);
1270 	mlx5_ldev_remove_netdev(ldev, netdev);
1271 	clear_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
1272 
1273 	lag_is_active = __mlx5_lag_is_active(ldev);
1274 	mutex_unlock(&ldev->lock);
1275 
1276 	if (lag_is_active)
1277 		mlx5_queue_bond_work(ldev, 0);
1278 }
1279 
1280 void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
1281 			 struct net_device *netdev)
1282 {
1283 	struct mlx5_lag *ldev;
1284 	int i;
1285 
1286 	ldev = mlx5_lag_dev(dev);
1287 	if (!ldev)
1288 		return;
1289 
1290 	mutex_lock(&ldev->lock);
1291 	mlx5_ldev_add_netdev(ldev, dev, netdev);
1292 
1293 	for (i = 0; i < ldev->ports; i++)
1294 		if (!ldev->pf[i].netdev)
1295 			break;
1296 
1297 	if (i >= ldev->ports)
1298 		set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
1299 	mutex_unlock(&ldev->lock);
1300 	mlx5_queue_bond_work(ldev, 0);
1301 }
1302 
1303 bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
1304 {
1305 	struct mlx5_lag *ldev;
1306 	unsigned long flags;
1307 	bool res;
1308 
1309 	spin_lock_irqsave(&lag_lock, flags);
1310 	ldev = mlx5_lag_dev(dev);
1311 	res  = ldev && __mlx5_lag_is_roce(ldev);
1312 	spin_unlock_irqrestore(&lag_lock, flags);
1313 
1314 	return res;
1315 }
1316 EXPORT_SYMBOL(mlx5_lag_is_roce);
1317 
1318 bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
1319 {
1320 	struct mlx5_lag *ldev;
1321 	unsigned long flags;
1322 	bool res;
1323 
1324 	spin_lock_irqsave(&lag_lock, flags);
1325 	ldev = mlx5_lag_dev(dev);
1326 	res  = ldev && __mlx5_lag_is_active(ldev);
1327 	spin_unlock_irqrestore(&lag_lock, flags);
1328 
1329 	return res;
1330 }
1331 EXPORT_SYMBOL(mlx5_lag_is_active);
1332 
1333 bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev)
1334 {
1335 	struct mlx5_lag *ldev;
1336 	unsigned long flags;
1337 	bool res = false;
1338 
1339 	spin_lock_irqsave(&lag_lock, flags);
1340 	ldev = mlx5_lag_dev(dev);
1341 	if (ldev)
1342 		res = test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags);
1343 	spin_unlock_irqrestore(&lag_lock, flags);
1344 
1345 	return res;
1346 }
1347 EXPORT_SYMBOL(mlx5_lag_mode_is_hash);
1348 
1349 bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
1350 {
1351 	struct mlx5_lag *ldev;
1352 	unsigned long flags;
1353 	bool res;
1354 
1355 	spin_lock_irqsave(&lag_lock, flags);
1356 	ldev = mlx5_lag_dev(dev);
1357 	res = ldev && __mlx5_lag_is_active(ldev) &&
1358 		dev == ldev->pf[MLX5_LAG_P1].dev;
1359 	spin_unlock_irqrestore(&lag_lock, flags);
1360 
1361 	return res;
1362 }
1363 EXPORT_SYMBOL(mlx5_lag_is_master);
1364 
1365 bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
1366 {
1367 	struct mlx5_lag *ldev;
1368 	unsigned long flags;
1369 	bool res;
1370 
1371 	spin_lock_irqsave(&lag_lock, flags);
1372 	ldev = mlx5_lag_dev(dev);
1373 	res  = ldev && __mlx5_lag_is_sriov(ldev);
1374 	spin_unlock_irqrestore(&lag_lock, flags);
1375 
1376 	return res;
1377 }
1378 EXPORT_SYMBOL(mlx5_lag_is_sriov);
1379 
1380 bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
1381 {
1382 	struct mlx5_lag *ldev;
1383 	unsigned long flags;
1384 	bool res;
1385 
1386 	spin_lock_irqsave(&lag_lock, flags);
1387 	ldev = mlx5_lag_dev(dev);
1388 	res = ldev && __mlx5_lag_is_sriov(ldev) &&
1389 	      test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
1390 	spin_unlock_irqrestore(&lag_lock, flags);
1391 
1392 	return res;
1393 }
1394 EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);
1395 
1396 void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
1397 {
1398 	struct mlx5_lag *ldev;
1399 
1400 	ldev = mlx5_lag_dev(dev);
1401 	if (!ldev)
1402 		return;
1403 
1404 	mlx5_dev_list_lock();
1405 	mutex_lock(&ldev->lock);
1406 
1407 	ldev->mode_changes_in_progress++;
1408 	if (__mlx5_lag_is_active(ldev))
1409 		mlx5_disable_lag(ldev);
1410 
1411 	mutex_unlock(&ldev->lock);
1412 	mlx5_dev_list_unlock();
1413 }
1414 
1415 void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
1416 {
1417 	struct mlx5_lag *ldev;
1418 
1419 	ldev = mlx5_lag_dev(dev);
1420 	if (!ldev)
1421 		return;
1422 
1423 	mutex_lock(&ldev->lock);
1424 	ldev->mode_changes_in_progress--;
1425 	mutex_unlock(&ldev->lock);
1426 	mlx5_queue_bond_work(ldev, 0);
1427 }
1428 
1429 struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
1430 {
1431 	struct net_device *ndev = NULL;
1432 	struct mlx5_lag *ldev;
1433 	unsigned long flags;
1434 	int i;
1435 
1436 	spin_lock_irqsave(&lag_lock, flags);
1437 	ldev = mlx5_lag_dev(dev);
1438 
1439 	if (!(ldev && __mlx5_lag_is_roce(ldev)))
1440 		goto unlock;
1441 
1442 	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
1443 		for (i = 0; i < ldev->ports; i++)
1444 			if (ldev->tracker.netdev_state[i].tx_enabled)
1445 				ndev = ldev->pf[i].netdev;
1446 		if (!ndev)
1447 			ndev = ldev->pf[ldev->ports - 1].netdev;
1448 	} else {
1449 		ndev = ldev->pf[MLX5_LAG_P1].netdev;
1450 	}
1451 	if (ndev)
1452 		dev_hold(ndev);
1453 
1454 unlock:
1455 	spin_unlock_irqrestore(&lag_lock, flags);
1456 
1457 	return ndev;
1458 }
1459 EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);
1460 
1461 u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
1462 			   struct net_device *slave)
1463 {
1464 	struct mlx5_lag *ldev;
1465 	unsigned long flags;
1466 	u8 port = 0;
1467 	int i;
1468 
1469 	spin_lock_irqsave(&lag_lock, flags);
1470 	ldev = mlx5_lag_dev(dev);
1471 	if (!(ldev && __mlx5_lag_is_roce(ldev)))
1472 		goto unlock;
1473 
1474 	for (i = 0; i < ldev->ports; i++) {
1475 		if (ldev->pf[i].netdev == slave) {
1476 			port = i;
1477 			break;
1478 		}
1479 	}
1480 
1481 	port = ldev->v2p_map[port * ldev->buckets];
1482 
1483 unlock:
1484 	spin_unlock_irqrestore(&lag_lock, flags);
1485 	return port;
1486 }
1487 EXPORT_SYMBOL(mlx5_lag_get_slave_port);
1488 
1489 u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev)
1490 {
1491 	struct mlx5_lag *ldev;
1492 
1493 	ldev = mlx5_lag_dev(dev);
1494 	if (!ldev)
1495 		return 0;
1496 
1497 	return ldev->ports;
1498 }
1499 EXPORT_SYMBOL(mlx5_lag_get_num_ports);
1500 
1501 struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev)
1502 {
1503 	struct mlx5_core_dev *peer_dev = NULL;
1504 	struct mlx5_lag *ldev;
1505 	unsigned long flags;
1506 
1507 	spin_lock_irqsave(&lag_lock, flags);
1508 	ldev = mlx5_lag_dev(dev);
1509 	if (!ldev)
1510 		goto unlock;
1511 
1512 	peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ?
1513 			   ldev->pf[MLX5_LAG_P2].dev :
1514 			   ldev->pf[MLX5_LAG_P1].dev;
1515 
1516 unlock:
1517 	spin_unlock_irqrestore(&lag_lock, flags);
1518 	return peer_dev;
1519 }
1520 EXPORT_SYMBOL(mlx5_lag_get_peer_mdev);
1521 
1522 int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
1523 				 u64 *values,
1524 				 int num_counters,
1525 				 size_t *offsets)
1526 {
1527 	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
1528 	struct mlx5_core_dev **mdev;
1529 	struct mlx5_lag *ldev;
1530 	unsigned long flags;
1531 	int num_ports;
1532 	int ret, i, j;
1533 	void *out;
1534 
1535 	out = kvzalloc(outlen, GFP_KERNEL);
1536 	if (!out)
1537 		return -ENOMEM;
1538 
1539 	mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL);
1540 	if (!mdev) {
1541 		ret = -ENOMEM;
1542 		goto free_out;
1543 	}
1544 
1545 	memset(values, 0, sizeof(*values) * num_counters);
1546 
1547 	spin_lock_irqsave(&lag_lock, flags);
1548 	ldev = mlx5_lag_dev(dev);
1549 	if (ldev && __mlx5_lag_is_active(ldev)) {
1550 		num_ports = ldev->ports;
1551 		for (i = 0; i < ldev->ports; i++)
1552 			mdev[i] = ldev->pf[i].dev;
1553 	} else {
1554 		num_ports = 1;
1555 		mdev[MLX5_LAG_P1] = dev;
1556 	}
1557 	spin_unlock_irqrestore(&lag_lock, flags);
1558 
1559 	for (i = 0; i < num_ports; ++i) {
1560 		u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {};
1561 
1562 		MLX5_SET(query_cong_statistics_in, in, opcode,
1563 			 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
1564 		ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in,
1565 					  out);
1566 		if (ret)
1567 			goto free_mdev;
1568 
1569 		for (j = 0; j < num_counters; ++j)
1570 			values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
1571 	}
1572 
1573 free_mdev:
1574 	kvfree(mdev);
1575 free_out:
1576 	kvfree(out);
1577 	return ret;
1578 }
1579 EXPORT_SYMBOL(mlx5_lag_query_cong_counters);
1580