/*
 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/netdevice.h>
#include <net/bonding.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/eswitch.h>
#include <linux/mlx5/vport.h>
#include "lib/devcom.h"
#include "mlx5_core.h"
#include "eswitch.h"
#include "esw/acl/ofld.h"
#include "lag.h"
#include "mp.h"
#include "mpesw.h"

enum {
	MLX5_LAG_EGRESS_PORT_1 = 1,
	MLX5_LAG_EGRESS_PORT_2,
};

/* General purpose, use for short periods of time.
 * Beware of lock dependencies (preferably, no locks should be acquired
 * under it).
 */
static DEFINE_SPINLOCK(lag_lock);

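/* Resolve the LAG port selection mode from the requested LAG mode and
 * flags: hash-based steering uses the port selection flow table, MPESW
 * has its own mode, and everything else falls back to queue affinity.
 */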
static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
{
	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;

	if (mode == MLX5_LAG_MODE_MPESW)
		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;

	return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
}

static u8 lag_active_port_bits(struct mlx5_lag *ldev)
{
	u8 enabled_ports[MLX5_MAX_PORTS] = {};
	u8 active_port = 0;
	int num_enabled;
	int idx;

	mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, enabled_ports,
			      &num_enabled);
	for (idx = 0; idx < num_enabled; idx++)
		active_port |= BIT_MASK(enabled_ports[idx]);

	return active_port;
}

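/* Issue the CREATE_LAG command. For queue-affinity LAG the tx remap
 * affinities are taken from the first two entries of @ports; for
 * hash-based LAG the active-port bitmap is programmed when the
 * port_select_flow_table_bypass capability is present.
 */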
static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode,
			       unsigned long flags)
{
	bool fdb_sel_mode = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE,
				     &flags);
	int port_sel_mode = get_port_sel_mode(mode, flags);
	u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {};
	void *lag_ctx;

	lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);
	MLX5_SET(lagc, lag_ctx, fdb_selection_mode, fdb_sel_mode);

	switch (port_sel_mode) {
	case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY:
		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
		break;
	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT:
		if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass))
			break;

		MLX5_SET(lagc, lag_ctx, active_port,
			 lag_active_port_bits(mlx5_lag_dev(dev)));
		break;
	default:
		break;
	}
	MLX5_SET(lagc, lag_ctx, port_select_mode, port_sel_mode);

	return mlx5_cmd_exec_in(dev, create_lag, in);
}

static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports,
			       u8 *ports)
{
	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);

	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
	MLX5_SET(modify_lag_in, in, field_select, 0x1);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);

	return mlx5_cmd_exec_in(dev, modify_lag, in);
}

int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {};

	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);

	return mlx5_cmd_exec_in(dev, create_vport_lag, in);
}
EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);

int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {};

	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);

	return mlx5_cmd_exec_in(dev, destroy_vport_lag, in);
}
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);

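/* Collect the indices of ports that are currently unusable for TX,
 * i.e. ports that are either TX-disabled by the bond or link-down.
 */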
static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
				   u8 *ports, int *num_disabled)
{
	int i;

	*num_disabled = 0;
	for (i = 0; i < num_ports; i++) {
		if (!tracker->netdev_state[i].tx_enabled ||
		    !tracker->netdev_state[i].link_up)
			ports[(*num_disabled)++] = i;
	}
}

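/* Collect the indices of ports that are usable for TX (TX-enabled and
 * link-up). If none qualify, fall back to reporting the disabled ports
 * so callers always get a non-empty set to work with.
 */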
void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
			   u8 *ports, int *num_enabled)
{
	int i;

	*num_enabled = 0;
	for (i = 0; i < num_ports; i++) {
		if (tracker->netdev_state[i].tx_enabled &&
		    tracker->netdev_state[i].link_up)
			ports[(*num_enabled)++] = i;
	}

	if (*num_enabled == 0)
		mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled);
}

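/* Log the current port mapping: the set of active ports for hash-based
 * LAG, or the per-bucket virtual-to-physical map otherwise.
 */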
static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev,
				   struct mlx5_lag *ldev,
				   struct lag_tracker *tracker,
				   unsigned long flags)
{
	char buf[MLX5_MAX_PORTS * 10 + 1] = {};
	u8 enabled_ports[MLX5_MAX_PORTS] = {};
	int written = 0;
	int num_enabled;
	int idx;
	int err;
	int i;
	int j;

	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
		mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports,
				      &num_enabled);
		for (i = 0; i < num_enabled; i++) {
			err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1);
			if (err != 3)
				return;
			written += err;
		}
		buf[written - 2] = 0;
		mlx5_core_info(dev, "lag map active ports: %s\n", buf);
	} else {
		for (i = 0; i < ldev->ports; i++) {
			for (j = 0; j < ldev->buckets; j++) {
				idx = i * ldev->buckets + j;
				err = scnprintf(buf + written, 10,
						" port %d:%d", i + 1, ldev->v2p_map[idx]);
				if (err != 9)
					return;
				written += err;
			}
		}
		mlx5_core_info(dev, "lag map:%s\n", buf);
	}
}

static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr);
static void mlx5_do_bond_work(struct work_struct *work);

static void mlx5_ldev_free(struct kref *ref)
{
	struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);

	if (ldev->nb.notifier_call)
		unregister_netdevice_notifier_net(&init_net, &ldev->nb);
	mlx5_lag_mp_cleanup(ldev);
	cancel_delayed_work_sync(&ldev->bond_work);
	destroy_workqueue(ldev->wq);
	mlx5_lag_mpesw_cleanup(ldev);
	mutex_destroy(&ldev->lock);
	kfree(ldev);
}

static void mlx5_ldev_put(struct mlx5_lag *ldev)
{
	kref_put(&ldev->ref, mlx5_ldev_free);
}

static void mlx5_ldev_get(struct mlx5_lag *ldev)
{
	kref_get(&ldev->ref);
}

static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	int err;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return NULL;

	ldev->wq = create_singlethread_workqueue("mlx5_lag");
	if (!ldev->wq) {
		kfree(ldev);
		return NULL;
	}

	kref_init(&ldev->ref);
	mutex_init(&ldev->lock);
	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);

	ldev->nb.notifier_call = mlx5_lag_netdev_event;
	if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
		ldev->nb.notifier_call = NULL;
		mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
	}
	ldev->mode = MLX5_LAG_MODE_NONE;

	err = mlx5_lag_mp_init(ldev);
	if (err)
		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
			      err);

	mlx5_lag_mpesw_init(ldev);
	ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
	ldev->buckets = 1;

	return ldev;
}

int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
				struct net_device *ndev)
{
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (ldev->pf[i].netdev == ndev)
			return i;

	return -ENOENT;
}

static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev)
{
	return ldev->mode == MLX5_LAG_MODE_ROCE;
}

static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
{
	return ldev->mode == MLX5_LAG_MODE_SRIOV;
}

/* Create a mapping between steering slots and active ports.
 * As we have ldev->buckets slots per port, first assume the native
 * mapping should be used.
 * If there are ports that are disabled, fill the relevant slots
 * with a mapping that points to active ports.
 */
static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
					   u8 num_ports,
					   u8 buckets,
					   u8 *ports)
{
	int disabled[MLX5_MAX_PORTS] = {};
	int enabled[MLX5_MAX_PORTS] = {};
	int disabled_ports_num = 0;
	int enabled_ports_num = 0;
	int idx;
	u32 rand;
	int i;
	int j;

	for (i = 0; i < num_ports; i++) {
		if (tracker->netdev_state[i].tx_enabled &&
		    tracker->netdev_state[i].link_up)
			enabled[enabled_ports_num++] = i;
		else
			disabled[disabled_ports_num++] = i;
	}

	/* Use native mapping by default where each port's buckets
	 * point to the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc
	 */
	for (i = 0; i < num_ports; i++)
		for (j = 0; j < buckets; j++) {
			idx = i * buckets + j;
			ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i;
		}

	/* If all ports are disabled/enabled keep native mapping */
	if (enabled_ports_num == num_ports ||
	    disabled_ports_num == num_ports)
		return;

	/* Go over the disabled ports and for each assign a random active port */
	for (i = 0; i < disabled_ports_num; i++) {
		for (j = 0; j < buckets; j++) {
			get_random_bytes(&rand, 4);
			ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1;
		}
	}
}

static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (ldev->pf[i].has_drop)
			return true;
	return false;
}

static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++) {
		if (!ldev->pf[i].has_drop)
			continue;

		mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch,
							     MLX5_VPORT_UPLINK);
		ldev->pf[i].has_drop = false;
	}
}

static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev,
				     struct lag_tracker *tracker)
{
	u8 disabled_ports[MLX5_MAX_PORTS] = {};
	struct mlx5_core_dev *dev;
	int disabled_index;
	int num_disabled;
	int err;
	int i;

	/* First delete the current drop rule so there won't be any dropped
	 * packets
	 */
	mlx5_lag_drop_rule_cleanup(ldev);

	if (!ldev->tracker.has_inactive)
		return;

	mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled);

	for (i = 0; i < num_disabled; i++) {
		disabled_index = disabled_ports[i];
		dev = ldev->pf[disabled_index].dev;
		err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch,
								  MLX5_VPORT_UPLINK);
		if (!err)
			ldev->pf[disabled_index].has_drop = true;
		else
			mlx5_core_err(dev,
				      "Failed to create lag drop rule, error: %d", err);
	}
}

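/* Update only the active_port field of an existing LAG object
 * (MODIFY_LAG with field_select 0x2); used by hash-based LAG when the
 * port_select_flow_table_bypass capability is present.
 */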
static int mlx5_cmd_modify_active_port(struct mlx5_core_dev *dev, u8 ports)
{
	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
	void *lag_ctx;

	lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);

	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
	MLX5_SET(modify_lag_in, in, field_select, 0x2);

	MLX5_SET(lagc, lag_ctx, active_port, ports);

	return mlx5_cmd_exec_in(dev, modify_lag, in);
}

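/* Apply a new port mapping to an already active LAG: either update the
 * port selection flow table (and, if the bypass capability exists, the
 * firmware active-port bitmap), or fall back to a MODIFY_LAG of the
 * queue-affinity mapping.
 */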
static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	u8 active_ports;
	int ret;

	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) {
		ret = mlx5_lag_port_sel_modify(ldev, ports);
		if (ret ||
		    !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table_bypass))
			return ret;

		active_ports = lag_active_port_bits(ldev);

		return mlx5_cmd_modify_active_port(dev0, active_ports);
	}
	return mlx5_cmd_modify_lag(dev0, ldev->ports, ports);
}

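/* Recompute the TX affinity mapping from the current bond state and
 * push it to hardware if anything changed. For active-backup bonds in
 * non-RoCE modes, also refresh the ingress drop rules on inactive ports.
 */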
void mlx5_modify_lag(struct mlx5_lag *ldev,
		     struct lag_tracker *tracker)
{
	u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {};
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	int idx;
	int err;
	int i;
	int j;

	mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports);

	for (i = 0; i < ldev->ports; i++) {
		for (j = 0; j < ldev->buckets; j++) {
			idx = i * ldev->buckets + j;
			if (ports[idx] == ldev->v2p_map[idx])
				continue;
			err = _mlx5_modify_lag(ldev, ports);
			if (err) {
				mlx5_core_err(dev0,
					      "Failed to modify LAG (%d)\n",
					      err);
				return;
			}
			memcpy(ldev->v2p_map, ports, sizeof(ports));

			mlx5_lag_print_mapping(dev0, ldev, tracker,
					       ldev->mode_flags);
			break;
		}
	}

	if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
	    !(ldev->mode == MLX5_LAG_MODE_ROCE))
		mlx5_lag_drop_rule_setup(ldev, tracker);
}

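/* Choose the port selection mode for RoCE LAG. Hash-based (flow table)
 * selection is used when the capability exists and is mandatory for
 * more than two ports; otherwise stay with queue affinity.
 */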
static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
					   unsigned long *flags)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;

	if (!MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table)) {
		if (ldev->ports > 2)
			return -EINVAL;
		return 0;
	}

	if (ldev->ports > 2)
		ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;

	set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);

	return 0;
}

static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
						struct lag_tracker *tracker,
						enum mlx5_lag_mode mode,
						unsigned long *flags)
{
	struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];

	if (mode == MLX5_LAG_MODE_MPESW)
		return;

	if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
	    tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
		set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
}

static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
			      struct lag_tracker *tracker, bool shared_fdb,
			      unsigned long *flags)
{
	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;

	*flags = 0;
	if (shared_fdb) {
		set_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, flags);
		set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
	}

	if (mode == MLX5_LAG_MODE_MPESW)
		set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);

	if (roce_lag)
		return mlx5_lag_set_port_sel_mode_roce(ldev, flags);

	mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
	return 0;
}

char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
{
	int port_sel_mode = get_port_sel_mode(mode, flags);

	switch (port_sel_mode) {
	case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
	default: return "invalid";
	}
}

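/* Create the LAG object in firmware and, for shared FDB, point both
 * eswitches at a single FDB. On shared-FDB failure the just-created
 * LAG is destroyed again so the caller sees a clean state.
 */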
static int mlx5_create_lag(struct mlx5_lag *ldev,
			   struct lag_tracker *tracker,
			   enum mlx5_lag_mode mode,
			   unsigned long flags)
{
	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
	int err;

	if (tracker)
		mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
	mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
		       shared_fdb, mlx5_get_str_port_sel_mode(mode, flags));

	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
	if (err) {
		mlx5_core_err(dev0,
			      "Failed to create LAG (%d)\n",
			      err);
		return err;
	}

	if (shared_fdb) {
		err = mlx5_eswitch_offloads_config_single_fdb(dev0->priv.eswitch,
							      dev1->priv.eswitch);
		if (err)
			mlx5_core_err(dev0, "Can't enable single FDB mode\n");
		else
			mlx5_core_info(dev0, "Operation mode is single FDB\n");
	}

	if (err) {
		MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
		if (mlx5_cmd_exec_in(dev0, destroy_lag, in))
			mlx5_core_err(dev0,
				      "Failed to deactivate RoCE LAG; driver restart required\n");
	}

	return err;
}

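/* Activate LAG in the requested mode: compute the mode flags, build the
 * TX affinity mapping (and the port selection flow table for hash-based
 * LAG), create the LAG object and, for active-backup non-RoCE bonds,
 * install drop rules on inactive ports. MPESW skips the mapping step.
 */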
int mlx5_activate_lag(struct mlx5_lag *ldev,
		      struct lag_tracker *tracker,
		      enum mlx5_lag_mode mode,
		      bool shared_fdb)
{
	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	unsigned long flags = 0;
	int err;

	err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
	if (err)
		return err;

	if (mode != MLX5_LAG_MODE_MPESW) {
		mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
			err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
						       ldev->v2p_map);
			if (err) {
				mlx5_core_err(dev0,
					      "Failed to create LAG port selection(%d)\n",
					      err);
				return err;
			}
		}
	}

	err = mlx5_create_lag(ldev, tracker, mode, flags);
	if (err) {
		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
			mlx5_lag_port_sel_destroy(ldev);
		if (roce_lag)
			mlx5_core_err(dev0,
				      "Failed to activate RoCE LAG\n");
		else
			mlx5_core_err(dev0,
				      "Failed to activate VF LAG\n"
				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
		return err;
	}

	if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
	    !roce_lag)
		mlx5_lag_drop_rule_setup(ldev, tracker);

	ldev->mode = mode;
	ldev->mode_flags = flags;
	return 0;
}

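/* Tear LAG down: reset the mode, undo shared-FDB pairing, destroy the
 * LAG object in firmware, and release the port selection flow table and
 * any drop rules that were installed.
 */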
static int mlx5_deactivate_lag(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
	bool roce_lag = __mlx5_lag_is_roce(ldev);
	unsigned long flags = ldev->mode_flags;
	int err;

	ldev->mode = MLX5_LAG_MODE_NONE;
	ldev->mode_flags = 0;
	mlx5_lag_mp_reset(ldev);

	if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
		mlx5_eswitch_offloads_destroy_single_fdb(dev0->priv.eswitch,
							 dev1->priv.eswitch);
		clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
	}

	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
	err = mlx5_cmd_exec_in(dev0, destroy_lag, in);
	if (err) {
		if (roce_lag) {
			mlx5_core_err(dev0,
				      "Failed to deactivate RoCE LAG; driver restart required\n");
		} else {
			mlx5_core_err(dev0,
				      "Failed to deactivate VF LAG; driver restart required\n"
				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
		}
		return err;
	}

	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
		mlx5_lag_port_sel_destroy(ldev);
	if (mlx5_lag_has_drop_rule(ldev))
		mlx5_lag_drop_rule_cleanup(ldev);

	return 0;
}

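/* Check whether LAG may be activated: all PFs must be present, their
 * eswitches must be in the same mode, PFs with VFs configured must be
 * in switchdev mode, and offloads (switchdev) LAG is limited to two
 * ports.
 */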
#define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2
static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
{
#ifdef CONFIG_MLX5_ESWITCH
	struct mlx5_core_dev *dev;
	u8 mode;
#endif
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (!ldev->pf[i].dev)
			return false;

#ifdef CONFIG_MLX5_ESWITCH
	for (i = 0; i < ldev->ports; i++) {
		dev = ldev->pf[i].dev;
		if (mlx5_eswitch_num_vfs(dev->priv.eswitch) && !is_mdev_switchdev_mode(dev))
			return false;
	}

	dev = ldev->pf[MLX5_LAG_P1].dev;
	mode = mlx5_eswitch_mode(dev);
	for (i = 0; i < ldev->ports; i++)
		if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
			return false;

	if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports != MLX5_LAG_OFFLOADS_SUPPORTED_PORTS)
		return false;
#else
	for (i = 0; i < ldev->ports; i++)
		if (mlx5_sriov_is_enabled(ldev->pf[i].dev))
			return false;
#endif
	return true;
}

static void mlx5_lag_add_devices(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++) {
		if (!ldev->pf[i].dev)
			continue;

		if (ldev->pf[i].dev->priv.flags &
		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
			continue;

		ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
	}
}

static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++) {
		if (!ldev->pf[i].dev)
			continue;

		if (ldev->pf[i].dev->priv.flags &
		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
			continue;

		ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
	}
}

void mlx5_disable_lag(struct mlx5_lag *ldev)
{
	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	bool roce_lag;
	int err;
	int i;

	roce_lag = __mlx5_lag_is_roce(ldev);

	if (shared_fdb) {
		mlx5_lag_remove_devices(ldev);
	} else if (roce_lag) {
		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);
		}
		for (i = 1; i < ldev->ports; i++)
			mlx5_nic_vport_disable_roce(ldev->pf[i].dev);
	}

	err = mlx5_deactivate_lag(ldev);
	if (err)
		return;

	if (shared_fdb || roce_lag)
		mlx5_lag_add_devices(ldev);

	if (shared_fdb) {
		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
			mlx5_eswitch_reload_reps(dev0->priv.eswitch);
		if (!(dev1->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
			mlx5_eswitch_reload_reps(dev1->priv.eswitch);
	}
}

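/* Shared FDB is possible only when both devices are in switchdev mode
 * with vport match metadata enabled, the eswitches are paired via
 * devcom, and the firmware exposes the native FDB selection and shared
 * ingress ACL capabilities.
 */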
bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;

	if (is_mdev_switchdev_mode(dev0) &&
	    is_mdev_switchdev_mode(dev1) &&
	    mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) &&
	    mlx5_eswitch_vport_match_metadata_enabled(dev1->priv.eswitch) &&
	    mlx5_devcom_is_paired(dev0->priv.devcom,
				  MLX5_DEVCOM_ESW_OFFLOADS) &&
	    MLX5_CAP_GEN(dev1, lag_native_fdb_selection) &&
	    MLX5_CAP_ESW(dev1, root_ft_on_other_esw) &&
	    MLX5_CAP_ESW(dev0, esw_shared_ingress_acl))
		return true;

	return false;
}

static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
{
	bool roce_lag = true;
	int i;

	for (i = 0; i < ldev->ports; i++)
		roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev);

#ifdef CONFIG_MLX5_ESWITCH
	for (i = 0; i < ldev->ports; i++)
		roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev);
#endif

	return roce_lag;
}

static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
{
	return do_bond && __mlx5_lag_is_active(ldev) &&
	       ldev->mode != MLX5_LAG_MODE_MPESW;
}

static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
{
	return !do_bond && __mlx5_lag_is_active(ldev) &&
	       ldev->mode != MLX5_LAG_MODE_MPESW;
}

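/* Central decision point, run from the bond workqueue: based on the
 * netdev tracker state either activate LAG (RoCE or SR-IOV flavor,
 * optionally with shared FDB), update the port mapping of an active
 * LAG, or disable it. Multipath mode is left untouched here.
 */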
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	struct lag_tracker tracker = { };
	bool do_bond, roce_lag;
	int err;
	int i;

	if (!mlx5_lag_is_ready(ldev)) {
		do_bond = false;
	} else {
		/* VF LAG is in multipath mode, ignore bond change requests */
		if (mlx5_lag_is_multipath(dev0))
			return;

		tracker = ldev->tracker;

		do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
	}

	if (do_bond && !__mlx5_lag_is_active(ldev)) {
		bool shared_fdb = mlx5_shared_fdb_supported(ldev);

		roce_lag = mlx5_lag_is_roce_lag(ldev);

		if (shared_fdb || roce_lag)
			mlx5_lag_remove_devices(ldev);

		err = mlx5_activate_lag(ldev, &tracker,
					roce_lag ? MLX5_LAG_MODE_ROCE :
						   MLX5_LAG_MODE_SRIOV,
					shared_fdb);
		if (err) {
			if (shared_fdb || roce_lag)
				mlx5_lag_add_devices(ldev);

			return;
		} else if (roce_lag) {
			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);
			for (i = 1; i < ldev->ports; i++)
				mlx5_nic_vport_enable_roce(ldev->pf[i].dev);
		} else if (shared_fdb) {
			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);

			err = mlx5_eswitch_reload_reps(dev0->priv.eswitch);
			if (!err)
				err = mlx5_eswitch_reload_reps(dev1->priv.eswitch);

			if (err) {
				dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
				mlx5_rescan_drivers_locked(dev0);
				mlx5_deactivate_lag(ldev);
				mlx5_lag_add_devices(ldev);
				mlx5_eswitch_reload_reps(dev0->priv.eswitch);
				mlx5_eswitch_reload_reps(dev1->priv.eswitch);
				mlx5_core_err(dev0, "Failed to enable lag\n");
				return;
			}
		}
	} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
		mlx5_modify_lag(ldev, &tracker);
	} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
		mlx5_disable_lag(ldev);
	}
}

static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
{
	queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
}

static void mlx5_do_bond_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
					     bond_work);
	int status;

	status = mlx5_dev_list_trylock();
	if (!status) {
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	mutex_lock(&ldev->lock);
	if (ldev->mode_changes_in_progress) {
		mutex_unlock(&ldev->lock);
		mlx5_dev_list_unlock();
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	mlx5_do_bond(ldev);
	mutex_unlock(&ldev->lock);
	mlx5_dev_list_unlock();
}

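/* Handle CHANGEUPPER: figure out whether all of the lag device's
 * netdevs (and only they) are now slaves of the same bond master with a
 * supported TX type, and update the tracker accordingly. Returns
 * nonzero when the bonded state changed and the bond work should run.
 */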
static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
					 struct lag_tracker *tracker,
					 struct netdev_notifier_changeupper_info *info)
{
	struct net_device *upper = info->upper_dev, *ndev_tmp;
	struct netdev_lag_upper_info *lag_upper_info = NULL;
	bool is_bonded, is_in_lag, mode_supported;
	bool has_inactive = 0;
	struct slave *slave;
	u8 bond_status = 0;
	int num_slaves = 0;
	int changed = 0;
	int idx;

	if (!netif_is_lag_master(upper))
		return 0;

	if (info->linking)
		lag_upper_info = info->upper_info;

	/* The event may still be of interest if the slave does not belong to
	 * us, but is enslaved to a master which has one or more of our netdevs
	 * as slaves (e.g., if a new slave is added to a master that bonds two
	 * of our netdevs, we should unbond).
	 */
	rcu_read_lock();
	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
		if (idx >= 0) {
			slave = bond_slave_get_rcu(ndev_tmp);
			if (slave)
				has_inactive |= bond_is_slave_inactive(slave);
			bond_status |= (1 << idx);
		}

		num_slaves++;
	}
	rcu_read_unlock();

	/* None of this lagdev's netdevs are slaves of this master. */
	if (!(bond_status & GENMASK(ldev->ports - 1, 0)))
		return 0;

	if (lag_upper_info) {
		tracker->tx_type = lag_upper_info->tx_type;
		tracker->hash_type = lag_upper_info->hash_type;
	}

	tracker->has_inactive = has_inactive;
	/* Determine bonding status:
	 * A device is considered bonded if both its physical ports are slaves
	 * of the same lag master, and only them.
	 */
	is_in_lag = num_slaves == ldev->ports &&
		bond_status == GENMASK(ldev->ports - 1, 0);

	/* Lag mode must be activebackup or hash. */
	mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP ||
			 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH;

	is_bonded = is_in_lag && mode_supported;
	if (tracker->is_bonded != is_bonded) {
		tracker->is_bonded = is_bonded;
		changed = 1;
	}

	if (!is_in_lag)
		return changed;

	if (!mlx5_lag_is_ready(ldev))
		NL_SET_ERR_MSG_MOD(info->info.extack,
				   "Can't activate LAG offload, PF is configured with more than 64 VFs");
	else if (!mode_supported)
		NL_SET_ERR_MSG_MOD(info->info.extack,
				   "Can't activate LAG offload, TX type isn't supported");

	return changed;
}

static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
					      struct lag_tracker *tracker,
					      struct net_device *ndev,
					      struct netdev_notifier_changelowerstate_info *info)
{
	struct netdev_lag_lower_state_info *lag_lower_info;
	int idx;

	if (!netif_is_lag_port(ndev))
		return 0;

	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
	if (idx < 0)
		return 0;

	/* This information is used to determine virtual to physical
	 * port mapping.
	 */
	lag_lower_info = info->lower_state_info;
	if (!lag_lower_info)
		return 0;

	tracker->netdev_state[idx] = *lag_lower_info;

	return 1;
}

static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
					    struct lag_tracker *tracker,
					    struct net_device *ndev)
{
	struct net_device *ndev_tmp;
	struct slave *slave;
	bool has_inactive = 0;
	int idx;

	if (!netif_is_lag_master(ndev))
		return 0;

	rcu_read_lock();
	for_each_netdev_in_bond_rcu(ndev, ndev_tmp) {
		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
		if (idx < 0)
			continue;

		slave = bond_slave_get_rcu(ndev_tmp);
		if (slave)
			has_inactive |= bond_is_slave_inactive(slave);
	}
	rcu_read_unlock();

	if (tracker->has_inactive == has_inactive)
		return 0;

	tracker->has_inactive = has_inactive;

	return 1;
}

/* this handler is always registered to netdev events */
static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct lag_tracker tracker;
	struct mlx5_lag *ldev;
	int changed = 0;

	if (event != NETDEV_CHANGEUPPER &&
	    event != NETDEV_CHANGELOWERSTATE &&
	    event != NETDEV_CHANGEINFODATA)
		return NOTIFY_DONE;

	ldev    = container_of(this, struct mlx5_lag, nb);

	tracker = ldev->tracker;

	switch (event) {
	case NETDEV_CHANGEUPPER:
		changed = mlx5_handle_changeupper_event(ldev, &tracker, ptr);
		break;
	case NETDEV_CHANGELOWERSTATE:
		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
							     ndev, ptr);
		break;
	case NETDEV_CHANGEINFODATA:
		changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev);
		break;
	}

	ldev->tracker = tracker;

	if (changed)
		mlx5_queue_bond_work(ldev, 0);

	return NOTIFY_DONE;
}

static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
				 struct mlx5_core_dev *dev,
				 struct net_device *netdev)
{
	unsigned int fn = mlx5_get_dev_index(dev);
	unsigned long flags;

	if (fn >= ldev->ports)
		return;

	spin_lock_irqsave(&lag_lock, flags);
	ldev->pf[fn].netdev = netdev;
	ldev->tracker.netdev_state[fn].link_up = 0;
	ldev->tracker.netdev_state[fn].tx_enabled = 0;
	spin_unlock_irqrestore(&lag_lock, flags);
}

static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
				    struct net_device *netdev)
{
	unsigned long flags;
	int i;

	spin_lock_irqsave(&lag_lock, flags);
	for (i = 0; i < ldev->ports; i++) {
		if (ldev->pf[i].netdev == netdev) {
			ldev->pf[i].netdev = NULL;
			break;
		}
	}
	spin_unlock_irqrestore(&lag_lock, flags);
}

static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
			       struct mlx5_core_dev *dev)
{
	unsigned int fn = mlx5_get_dev_index(dev);

	if (fn >= ldev->ports)
		return;

	ldev->pf[fn].dev = dev;
	dev->priv.lag = ldev;
}

static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
				  struct mlx5_core_dev *dev)
{
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (ldev->pf[i].dev == dev)
			break;

	if (i == ldev->ports)
		return;

	ldev->pf[i].dev = NULL;
	dev->priv.lag = NULL;
}

/* Must be called with intf_mutex held */
static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev = NULL;
	struct mlx5_core_dev *tmp_dev;

	tmp_dev = mlx5_get_next_phys_dev_lag(dev);
	if (tmp_dev)
		ldev = tmp_dev->priv.lag;

	if (!ldev) {
		ldev = mlx5_lag_dev_alloc(dev);
		if (!ldev) {
			mlx5_core_err(dev, "Failed to alloc lag dev\n");
			return 0;
		}
		mlx5_ldev_add_mdev(ldev, dev);
		return 0;
	}

	mutex_lock(&ldev->lock);
	if (ldev->mode_changes_in_progress) {
		mutex_unlock(&ldev->lock);
		return -EAGAIN;
	}
	mlx5_ldev_get(ldev);
	mlx5_ldev_add_mdev(ldev, dev);
	mutex_unlock(&ldev->lock);

	return 0;
}

void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	/* mdev is being removed, might as well remove debugfs
	 * as early as possible.
	 */
	mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs);
recheck:
	mutex_lock(&ldev->lock);
	if (ldev->mode_changes_in_progress) {
		mutex_unlock(&ldev->lock);
		msleep(100);
		goto recheck;
	}
	mlx5_ldev_remove_mdev(ldev, dev);
	mutex_unlock(&ldev->lock);
	mlx5_ldev_put(ldev);
}

void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
{
	int err;

	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
	    !MLX5_CAP_GEN(dev, lag_master) ||
	    (MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS ||
	     MLX5_CAP_GEN(dev, num_lag_ports) <= 1))
		return;

recheck:
	mlx5_dev_list_lock();
	err = __mlx5_lag_dev_add_mdev(dev);
	mlx5_dev_list_unlock();

	if (err) {
		msleep(100);
		goto recheck;
	}
	mlx5_ldev_add_debugfs(dev);
}

void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
			    struct net_device *netdev)
{
	struct mlx5_lag *ldev;
	bool lag_is_active;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mutex_lock(&ldev->lock);
	mlx5_ldev_remove_netdev(ldev, netdev);
	clear_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);

	lag_is_active = __mlx5_lag_is_active(ldev);
	mutex_unlock(&ldev->lock);

	if (lag_is_active)
		mlx5_queue_bond_work(ldev, 0);
}

void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
			 struct net_device *netdev)
{
	struct mlx5_lag *ldev;
	int i;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mutex_lock(&ldev->lock);
	mlx5_ldev_add_netdev(ldev, dev, netdev);

	for (i = 0; i < ldev->ports; i++)
		if (!ldev->pf[i].netdev)
			break;

	if (i >= ldev->ports)
		set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
	mutex_unlock(&ldev->lock);
	mlx5_queue_bond_work(ldev, 0);
}

bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_roce(ldev);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_roce);

bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_active(ldev);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_active);

bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res = 0;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (ldev)
		res = test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_mode_is_hash);

bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res = ldev && __mlx5_lag_is_active(ldev) &&
		dev == ldev->pf[MLX5_LAG_P1].dev;
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_master);

bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_sriov(ldev);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_sriov);

bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res = ldev && __mlx5_lag_is_sriov(ldev) &&
	      test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);

void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mlx5_dev_list_lock();
	mutex_lock(&ldev->lock);

	ldev->mode_changes_in_progress++;
	if (__mlx5_lag_is_active(ldev))
		mlx5_disable_lag(ldev);

	mutex_unlock(&ldev->lock);
	mlx5_dev_list_unlock();
}

void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mutex_lock(&ldev->lock);
	ldev->mode_changes_in_progress--;
	mutex_unlock(&ldev->lock);
	mlx5_queue_bond_work(ldev, 0);
}

struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
{
	struct net_device *ndev = NULL;
	struct mlx5_lag *ldev;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);

	if (!(ldev && __mlx5_lag_is_roce(ldev)))
		goto unlock;

	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
		for (i = 0; i < ldev->ports; i++)
			if (ldev->tracker.netdev_state[i].tx_enabled)
				ndev = ldev->pf[i].netdev;
		if (!ndev)
			ndev = ldev->pf[ldev->ports - 1].netdev;
	} else {
		ndev = ldev->pf[MLX5_LAG_P1].netdev;
	}
	if (ndev)
		dev_hold(ndev);

unlock:
	spin_unlock_irqrestore(&lag_lock, flags);

	return ndev;
}
EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);

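/* Map a slave netdev to the physical port it is currently steered to,
 * by looking the netdev up in the lag device and translating through
 * the first bucket of the v2p map.
 */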
u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
			   struct net_device *slave)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	u8 port = 0;
	int i;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (!(ldev && __mlx5_lag_is_roce(ldev)))
		goto unlock;

	for (i = 0; i < ldev->ports; i++) {
		if (ldev->pf[i].netdev == slave) {
			port = i;
			break;
		}
	}

	port = ldev->v2p_map[port * ldev->buckets];

unlock:
	spin_unlock_irqrestore(&lag_lock, flags);
	return port;
}
EXPORT_SYMBOL(mlx5_lag_get_slave_port);

u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return 0;

	return ldev->ports;
}
EXPORT_SYMBOL(mlx5_lag_get_num_ports);

struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_core_dev *peer_dev = NULL;
	struct mlx5_lag *ldev;
	unsigned long flags;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		goto unlock;

	peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ?
			   ldev->pf[MLX5_LAG_P2].dev :
			   ldev->pf[MLX5_LAG_P1].dev;

unlock:
	spin_unlock_irqrestore(&lag_lock, flags);
	return peer_dev;
}
EXPORT_SYMBOL(mlx5_lag_get_peer_mdev);

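/* Query congestion counters and accumulate them across all ports of an
 * active LAG (or just this device when LAG is inactive); @offsets gives
 * the byte offset of each requested counter in the query output.
 */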
int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
				 u64 *values,
				 int num_counters,
				 size_t *offsets)
{
	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
	struct mlx5_core_dev **mdev;
	struct mlx5_lag *ldev;
	unsigned long flags;
	int num_ports;
	int ret, i, j;
	void *out;

	out = kvzalloc(outlen, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL);
	if (!mdev) {
		ret = -ENOMEM;
		goto free_out;
	}

	memset(values, 0, sizeof(*values) * num_counters);

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (ldev && __mlx5_lag_is_active(ldev)) {
		num_ports = ldev->ports;
		for (i = 0; i < ldev->ports; i++)
			mdev[i] = ldev->pf[i].dev;
	} else {
		num_ports = 1;
		mdev[MLX5_LAG_P1] = dev;
	}
	spin_unlock_irqrestore(&lag_lock, flags);

	for (i = 0; i < num_ports; ++i) {
		u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {};

		MLX5_SET(query_cong_statistics_in, in, opcode,
			 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
		ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in,
					  out);
		if (ret)
			goto free_mdev;

		for (j = 0; j < num_counters; ++j)
			values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
	}

free_mdev:
	kvfree(mdev);
free_out:
	kvfree(out);
	return ret;
}
EXPORT_SYMBOL(mlx5_lag_query_cong_counters);