/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/io-mapping.h>
#include <linux/sched.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <linux/mlx5/port.h>
#include <linux/mlx5/vport.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <linux/in.h>
#include <linux/etherdevice.h>
#include <linux/mlx5/fs.h>
#include "user.h"
#include "mlx5_ib.h"

#define DRIVER_NAME "mlx5_ib"
#define DRIVER_VERSION "2.2-1"
#define DRIVER_RELDATE	"Feb 2014"

MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRIVER_VERSION);

static int deprecated_prof_sel = 2;
module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");

static char mlx5_version[] =
	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";

enum {
	MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
};

static enum rdma_link_layer
mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
{
	switch (port_type_cap) {
	case MLX5_CAP_PORT_TYPE_IB:
		return IB_LINK_LAYER_INFINIBAND;
	case MLX5_CAP_PORT_TYPE_ETH:
		return IB_LINK_LAYER_ETHERNET;
	default:
		return IB_LINK_LAYER_UNSPECIFIED;
	}
}

static enum rdma_link_layer
mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);

	return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
}

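/*
 * Netdev notifier handler: caches the net_device that shares a PCI
 * parent with this IB device so the RoCE code can look it up later.
 * The cached pointer is set on NETDEV_REGISTER, cleared on
 * NETDEV_UNREGISTER, and always updated under roce.netdev_lock.
 */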
static int mlx5_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
						 roce.nb);

	if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER))
		return NOTIFY_DONE;

	write_lock(&ibdev->roce.netdev_lock);
	if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
		ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev;
	write_unlock(&ibdev->roce.netdev_lock);

	return NOTIFY_DONE;
}

static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
					     u8 port_num)
{
	struct mlx5_ib_dev *ibdev = to_mdev(device);
	struct net_device *ndev;

	/* Ensure ndev does not disappear before we invoke dev_hold() */
	read_lock(&ibdev->roce.netdev_lock);
	ndev = ibdev->roce.netdev;
	if (ndev)
		dev_hold(ndev);
	read_unlock(&ibdev->roce.netdev_lock);

	return ndev;
}

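/*
 * Fill ib_port_attr for an Ethernet (RoCE) port.  Port state and active
 * MTU are derived from the cached netdev; active width and speed are
 * fixed placeholder values (see the TODOs below).
 */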
static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
				struct ib_port_attr *props)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct net_device *ndev;
	enum ib_mtu ndev_ib_mtu;
	u16 qkey_viol_cntr;

	memset(props, 0, sizeof(*props));

	props->port_cap_flags  |= IB_PORT_CM_SUP;
	props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;

	props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
						roce_address_table_size);
	props->max_mtu          = IB_MTU_4096;
	props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
	props->pkey_tbl_len     = 1;
	props->state            = IB_PORT_DOWN;
	props->phys_state       = 3;

	mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
	props->qkey_viol_cntr = qkey_viol_cntr;

	ndev = mlx5_ib_get_netdev(device, port_num);
	if (!ndev)
		return 0;

	if (netif_running(ndev) && netif_carrier_ok(ndev)) {
		props->state      = IB_PORT_ACTIVE;
		props->phys_state = 5;
	}

	ndev_ib_mtu = iboe_get_mtu(ndev->mtu);

	dev_put(ndev);

	props->active_mtu	= min(props->max_mtu, ndev_ib_mtu);

	props->active_width	= IB_WIDTH_4X;  /* TODO */
	props->active_speed	= IB_SPEED_QDR; /* TODO */

	return 0;
}

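/*
 * Translate an IB GID plus its attributes (source MAC, VLAN, GID type)
 * into the firmware roce_addr_layout.  RoCE v2 GIDs that are v4-mapped
 * are written as 4-byte IPv4 addresses; everything else is copied as a
 * full 16-byte address.
 */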
static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
				     const struct ib_gid_attr *attr,
				     void *mlx5_addr)
{
#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
	char *mlx5_addr_l3_addr	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
					       source_l3_address);
	void *mlx5_addr_mac	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
					       source_mac_47_32);

	if (!gid)
		return;

	ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);

	if (is_vlan_dev(attr->ndev)) {
		MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
		MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
	}

	switch (attr->gid_type) {
	case IB_GID_TYPE_IB:
		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
		break;
	case IB_GID_TYPE_ROCE_UDP_ENCAP:
		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
		break;
	default:
		WARN_ON(true);
	}

	if (attr->gid_type != IB_GID_TYPE_IB) {
		if (ipv6_addr_v4mapped((void *)gid))
			MLX5_SET_RA(mlx5_addr, roce_l3_type,
				    MLX5_ROCE_L3_TYPE_IPV4);
		else
			MLX5_SET_RA(mlx5_addr, roce_l3_type,
				    MLX5_ROCE_L3_TYPE_IPV6);
	}

	if ((attr->gid_type == IB_GID_TYPE_IB) ||
	    !ipv6_addr_v4mapped((void *)gid))
		memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
	else
		memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
}

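/*
 * Program one entry of the device's RoCE address table via the
 * SET_ROCE_ADDRESS command.  Passing a NULL gid (as del_gid does)
 * writes a zeroed entry, which effectively deletes it.
 */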
static int set_roce_addr(struct ib_device *device, u8 port_num,
			 unsigned int index,
			 const union ib_gid *gid,
			 const struct ib_gid_attr *attr)
{
	struct mlx5_ib_dev *dev	= to_mdev(device);
	u32  in[MLX5_ST_SZ_DW(set_roce_address_in)];
	u32 out[MLX5_ST_SZ_DW(set_roce_address_out)];
	void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
	enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);

	if (ll != IB_LINK_LAYER_ETHERNET)
		return -EINVAL;

	memset(in, 0, sizeof(in));

	ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);

	MLX5_SET(set_roce_address_in, in, roce_address_index, index);
	MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);

	memset(out, 0, sizeof(out));
	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
			   unsigned int index, const union ib_gid *gid,
			   const struct ib_gid_attr *attr,
			   __always_unused void **context)
{
	return set_roce_addr(device, port_num, index, gid, attr);
}

static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
			   unsigned int index, __always_unused void **context)
{
	return set_roce_addr(device, port_num, index, NULL, NULL);
}

__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
			       int index)
{
	struct ib_gid_attr attr;
	union ib_gid gid;

	if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
		return 0;

	if (!attr.ndev)
		return 0;

	dev_put(attr.ndev);

	if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
		return 0;

	return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
}

static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
	return !MLX5_CAP_GEN(dev->mdev, ib_virt);
}

enum {
	MLX5_VPORT_ACCESS_METHOD_MAD,
	MLX5_VPORT_ACCESS_METHOD_HCA,
	MLX5_VPORT_ACCESS_METHOD_NIC,
};

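/*
 * Pick how port/vport attributes are queried: through MADs when IB
 * virtualization forces it, through the NIC vport context on Ethernet
 * ports, and through the HCA vport context otherwise.
 */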
static int mlx5_get_vport_access_method(struct ib_device *ibdev)
{
	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
		return MLX5_VPORT_ACCESS_METHOD_MAD;

	if (mlx5_ib_port_link_layer(ibdev, 1) ==
	    IB_LINK_LAYER_ETHERNET)
		return MLX5_VPORT_ACCESS_METHOD_NIC;

	return MLX5_VPORT_ACCESS_METHOD_HCA;
}

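/* Derive the device's IB atomic capability from the ATOMIC caps page. */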
static void get_atomic_caps(struct mlx5_ib_dev *dev,
			    struct ib_device_attr *props)
{
	u8 tmp;
	u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
	u8 atomic_req_8B_endianness_mode =
		MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);

	/* Check if HW supports 8-byte standard atomic operations and is
	 * capable of responding in host endianness
	 */
	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
	if (((atomic_operations & tmp) == tmp) &&
	    (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
	    (atomic_req_8B_endianness_mode)) {
		props->atomic_cap = IB_ATOMIC_HCA;
	} else {
		props->atomic_cap = IB_ATOMIC_NONE;
	}
}

static int mlx5_query_system_image_guid(struct ib_device *ibdev,
					__be64 *sys_image_guid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_system_image_guid(ibdev,
							    sys_image_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
		break;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
		break;

	default:
		return -EINVAL;
	}

	if (!err)
		*sys_image_guid = cpu_to_be64(tmp);

	return err;
}

static int mlx5_query_max_pkeys(struct ib_device *ibdev,
				u16 *max_pkeys)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
						pkey_table_size));
		return 0;

	default:
		return -EINVAL;
	}
}

static int mlx5_query_vendor_id(struct ib_device *ibdev,
				u32 *vendor_id)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);

	default:
		return -EINVAL;
	}
}

static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
				__be64 *node_guid)
{
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_node_guid(dev, node_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
		break;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
		break;

	default:
		return -EINVAL;
	}

	if (!err)
		*node_guid = cpu_to_be64(tmp);

	return err;
}

struct mlx5_reg_node_desc {
	u8	desc[64];
};

static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
{
	struct mlx5_reg_node_desc in;

	if (mlx5_use_mad_ifc(dev))
		return mlx5_query_mad_ifc_node_desc(dev, node_desc);

	memset(&in, 0, sizeof(in));

	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
				    sizeof(struct mlx5_reg_node_desc),
				    MLX5_REG_NODE_DESC, 0, 0);
}

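/*
 * Build ib_device_attr from the general, Ethernet and atomic HCA
 * capability pages.  Most limits are reported by firmware as log2
 * values and expanded here.
 */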
static int mlx5_ib_query_device(struct ib_device *ibdev,
				struct ib_device_attr *props,
				struct ib_udata *uhw)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	int err = -ENOMEM;
	int max_rq_sg;
	int max_sq_sg;
	u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);

	if (uhw->inlen || uhw->outlen)
		return -EINVAL;

	memset(props, 0, sizeof(*props));
	err = mlx5_query_system_image_guid(ibdev,
					   &props->sys_image_guid);
	if (err)
		return err;

	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
	if (err)
		return err;

	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
	if (err)
		return err;

	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
		(fw_rev_min(dev->mdev) << 16) |
		fw_rev_sub(dev->mdev);
	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
		IB_DEVICE_PORT_ACTIVE_EVENT		|
		IB_DEVICE_SYS_IMAGE_GUID		|
		IB_DEVICE_RC_RNR_NAK_GEN;

	if (MLX5_CAP_GEN(mdev, pkv))
		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
	if (MLX5_CAP_GEN(mdev, qkv))
		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
	if (MLX5_CAP_GEN(mdev, apm))
		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
	if (MLX5_CAP_GEN(mdev, xrc))
		props->device_cap_flags |= IB_DEVICE_XRC;
	if (MLX5_CAP_GEN(mdev, imaicl)) {
		props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
					   IB_DEVICE_MEM_WINDOW_TYPE_2B;
		props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
		/* We support 'Gappy' memory registration too */
		props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
	}
	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
	if (MLX5_CAP_GEN(mdev, sho)) {
		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
		/* At this stage no support for signature handover */
		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
				      IB_PROT_T10DIF_TYPE_2 |
				      IB_PROT_T10DIF_TYPE_3;
		props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
				       IB_GUARD_T10DIF_CSUM;
	}
	if (MLX5_CAP_GEN(mdev, block_lb_mc))
		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;

	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
	    (MLX5_CAP_ETH(dev->mdev, csum_cap)))
		props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;

	if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
		props->device_cap_flags |= IB_DEVICE_UD_TSO;
	}

	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
	    MLX5_CAP_ETH(dev->mdev, scatter_fcs))
		props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;

	props->vendor_part_id	   = mdev->pdev->device;
	props->hw_ver		   = mdev->pdev->revision;

	props->max_mr_size	   = ~0ull;
	props->page_size_cap	   = ~(min_page_size - 1);
	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
		     sizeof(struct mlx5_wqe_data_seg);
	max_sq_sg = (MLX5_CAP_GEN(mdev, max_wqe_sz_sq) -
		     sizeof(struct mlx5_wqe_ctrl_seg)) /
		     sizeof(struct mlx5_wqe_data_seg);
	props->max_sge = min(max_rq_sg, max_sq_sg);
	props->max_sge_rd	   = MLX5_MAX_SGE_RD;
	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
	props->max_srq_sge	   = max_rq_sg - 1;
	props->max_fast_reg_page_list_len =
		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
	get_atomic_caps(dev, props);
	props->masked_atomic_cap   = IB_ATOMIC_NONE;
	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
					   props->max_mcast_grp;
	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
	props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
	props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	if (MLX5_CAP_GEN(mdev, pg))
		props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
	props->odp_caps = dev->odp_caps;
#endif

	if (MLX5_CAP_GEN(mdev, cd))
		props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;

	if (!mlx5_core_is_pf(mdev))
		props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;

	return 0;
}

enum mlx5_ib_width {
	MLX5_IB_WIDTH_1X	= 1 << 0,
	MLX5_IB_WIDTH_2X	= 1 << 1,
	MLX5_IB_WIDTH_4X	= 1 << 2,
	MLX5_IB_WIDTH_8X	= 1 << 3,
	MLX5_IB_WIDTH_12X	= 1 << 4
};

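/*
 * Map the firmware link-width bitmask onto enum ib_port_width.  2X is
 * rejected because the IB spec (as known to this driver) has no
 * encoding for it.
 */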
static int translate_active_width(struct ib_device *ibdev, u8 active_width,
				  u8 *ib_width)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	int err = 0;

	if (active_width & MLX5_IB_WIDTH_1X) {
		*ib_width = IB_WIDTH_1X;
	} else if (active_width & MLX5_IB_WIDTH_2X) {
		mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n",
			    (int)active_width);
		err = -EINVAL;
	} else if (active_width & MLX5_IB_WIDTH_4X) {
		*ib_width = IB_WIDTH_4X;
	} else if (active_width & MLX5_IB_WIDTH_8X) {
		*ib_width = IB_WIDTH_8X;
	} else if (active_width & MLX5_IB_WIDTH_12X) {
		*ib_width = IB_WIDTH_12X;
	} else {
		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
			    (int)active_width);
		err = -EINVAL;
	}

	return err;
}

static int mlx5_mtu_to_ib_mtu(int mtu)
{
	switch (mtu) {
	case 256: return 1;
	case 512: return 2;
	case 1024: return 3;
	case 2048: return 4;
	case 4096: return 5;
	default:
		pr_warn("invalid mtu\n");
		return -1;
	}
}

enum ib_max_vl_num {
	__IB_MAX_VL_0		= 1,
	__IB_MAX_VL_0_1		= 2,
	__IB_MAX_VL_0_3		= 3,
	__IB_MAX_VL_0_7		= 4,
	__IB_MAX_VL_0_14	= 5,
};

enum mlx5_vl_hw_cap {
	MLX5_VL_HW_0	= 1,
	MLX5_VL_HW_0_1	= 2,
	MLX5_VL_HW_0_2	= 3,
	MLX5_VL_HW_0_3	= 4,
	MLX5_VL_HW_0_4	= 5,
	MLX5_VL_HW_0_5	= 6,
	MLX5_VL_HW_0_6	= 7,
	MLX5_VL_HW_0_7	= 8,
	MLX5_VL_HW_0_14	= 15
};

static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
				u8 *max_vl_num)
{
	switch (vl_hw_cap) {
	case MLX5_VL_HW_0:
		*max_vl_num = __IB_MAX_VL_0;
		break;
	case MLX5_VL_HW_0_1:
		*max_vl_num = __IB_MAX_VL_0_1;
		break;
	case MLX5_VL_HW_0_3:
		*max_vl_num = __IB_MAX_VL_0_3;
		break;
	case MLX5_VL_HW_0_7:
		*max_vl_num = __IB_MAX_VL_0_7;
		break;
	case MLX5_VL_HW_0_14:
		*max_vl_num = __IB_MAX_VL_0_14;
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

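/*
 * Query an IB port through the HCA vport context: LID/SM info and
 * violation counters come from the vport context, while width, speed,
 * MTU and VL capabilities come from dedicated port queries.
 */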
static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
			       struct ib_port_attr *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_hca_vport_context *rep;
	u16 max_mtu;
	u16 oper_mtu;
	int err;
	u8 ib_link_width_oper;
	u8 vl_hw_cap;

	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (!rep) {
		err = -ENOMEM;
		goto out;
	}

	memset(props, 0, sizeof(*props));

	err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
	if (err)
		goto out;

	props->lid		= rep->lid;
	props->lmc		= rep->lmc;
	props->sm_lid		= rep->sm_lid;
	props->sm_sl		= rep->sm_sl;
	props->state		= rep->vport_state;
	props->phys_state	= rep->port_physical_state;
	props->port_cap_flags	= rep->cap_mask1;
	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
	props->bad_pkey_cntr	= rep->pkey_violation_counter;
	props->qkey_viol_cntr	= rep->qkey_violation_counter;
	props->subnet_timeout	= rep->subnet_timeout;
	props->init_type_reply	= rep->init_type_reply;
	props->grh_required	= rep->grh_required;

	err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
	if (err)
		goto out;

	err = translate_active_width(ibdev, ib_link_width_oper,
				     &props->active_width);
	if (err)
		goto out;
	err = mlx5_query_port_proto_oper(mdev, &props->active_speed, MLX5_PTYS_IB,
					 port);
	if (err)
		goto out;

	mlx5_query_port_max_mtu(mdev, &max_mtu, port);

	props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);

	mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);

	props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);

	err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
	if (err)
		goto out;

	err = translate_max_vl_num(ibdev, vl_hw_cap,
				   &props->max_vl_num);
out:
	kfree(rep);
	return err;
}

int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
		       struct ib_port_attr *props)
{
	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_port(ibdev, port, props);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		return mlx5_query_hca_port(ibdev, port, props);

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_query_port_roce(ibdev, port, props);

	default:
		return -EINVAL;
	}
}

static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
			     union ib_gid *gid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);

	default:
		return -EINVAL;
	}
}

static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
			      u16 *pkey)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index,
						 pkey);
	default:
		return -EINVAL;
	}
}

static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
				 struct ib_device_modify *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_reg_node_desc in;
	struct mlx5_reg_node_desc out;
	int err;

	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
		return -EOPNOTSUPP;

	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
		return 0;

	/*
	 * If possible, pass the node desc to FW, so it can generate
	 * a trap 144 (node description changed).  If the command
	 * fails, just ignore it.
	 */
	memcpy(&in, props->node_desc, 64);
	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
	if (err)
		return err;

	memcpy(ibdev->node_desc, props->node_desc, 64);

	return err;
}

static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
			       struct ib_port_modify *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct ib_port_attr attr;
	u32 tmp;
	int err;

	mutex_lock(&dev->cap_mask_mutex);

	err = mlx5_ib_query_port(ibdev, port, &attr);
	if (err)
		goto out;

	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
		~props->clr_port_cap_mask;

	err = mlx5_set_port_caps(dev->mdev, port, tmp);

out:
	mutex_unlock(&dev->cap_mask_mutex);
	return err;
}

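/*
 * Allocate a user context: validate the (v0 or v2) request, round the
 * requested number of UUARs up to UAR-page granularity, allocate the
 * backing UARs and, when supported, a transport domain, then report
 * the device limits back to user space.
 */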
static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
						  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
	struct mlx5_ib_alloc_ucontext_resp resp = {};
	struct mlx5_ib_ucontext *context;
	struct mlx5_uuar_info *uuari;
	struct mlx5_uar *uars;
	int gross_uuars;
	int num_uars;
	int ver;
	int uuarn;
	int err;
	int i;
	size_t reqlen;
	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
				     max_cqe_version);

	if (!dev->ib_active)
		return ERR_PTR(-EAGAIN);

	if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
		return ERR_PTR(-EINVAL);

	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
		ver = 0;
	else if (reqlen >= min_req_v2)
		ver = 2;
	else
		return ERR_PTR(-EINVAL);

	err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
	if (err)
		return ERR_PTR(err);

	if (req.flags)
		return ERR_PTR(-EINVAL);

	if (req.total_num_uuars > MLX5_MAX_UUARS)
		return ERR_PTR(-ENOMEM);

	if (req.total_num_uuars == 0)
		return ERR_PTR(-EINVAL);

	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
		return ERR_PTR(-EOPNOTSUPP);

	if (reqlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 reqlen - sizeof(req)))
		return ERR_PTR(-EOPNOTSUPP);

	req.total_num_uuars = ALIGN(req.total_num_uuars,
				    MLX5_NON_FP_BF_REGS_PER_PAGE);
	if (req.num_low_latency_uuars > req.total_num_uuars - 1)
		return ERR_PTR(-EINVAL);

	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
	resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
	resp.cache_line_size = L1_CACHE_BYTES;
	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
	resp.cqe_version = min_t(__u8,
				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
				 req.max_cqe_version);
	resp.response_length = min(offsetof(typeof(resp), response_length) +
				   sizeof(resp.response_length), udata->outlen);

	context = kzalloc(sizeof(*context), GFP_KERNEL);
	if (!context)
		return ERR_PTR(-ENOMEM);

	uuari = &context->uuari;
	mutex_init(&uuari->lock);
	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
	if (!uars) {
		err = -ENOMEM;
		goto out_ctx;
	}

	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
				sizeof(*uuari->bitmap),
				GFP_KERNEL);
	if (!uuari->bitmap) {
		err = -ENOMEM;
		goto out_uar_ctx;
	}
	/*
	 * clear all fast path uuars
	 */
	for (i = 0; i < gross_uuars; i++) {
		uuarn = i & 3;
		if (uuarn == 2 || uuarn == 3)
			set_bit(i, uuari->bitmap);
	}

	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
	if (!uuari->count) {
		err = -ENOMEM;
		goto out_bitmap;
	}

	for (i = 0; i < num_uars; i++) {
		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
		if (err)
			goto out_count;
	}

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
#endif

	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
		err = mlx5_core_alloc_transport_domain(dev->mdev,
						       &context->tdn);
		if (err)
			goto out_uars;
	}

	INIT_LIST_HEAD(&context->db_page_list);
	mutex_init(&context->db_page_mutex);

	resp.tot_uuars = req.total_num_uuars;
	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);

	if (field_avail(typeof(resp), cqe_version, udata->outlen))
		resp.response_length += sizeof(resp.cqe_version);

	if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
		resp.comp_mask |=
			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
		resp.hca_core_clock_offset =
			offsetof(struct mlx5_init_seg, internal_timer_h) %
			PAGE_SIZE;
		resp.response_length += sizeof(resp.hca_core_clock_offset) +
					sizeof(resp.reserved2) +
					sizeof(resp.reserved3);
	}

	err = ib_copy_to_udata(udata, &resp, resp.response_length);
	if (err)
		goto out_td;

	uuari->ver = ver;
	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
	uuari->uars = uars;
	uuari->num_uars = num_uars;
	context->cqe_version = resp.cqe_version;

	return &context->ibucontext;

out_td:
	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);

out_uars:
	for (i--; i >= 0; i--)
		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
out_count:
	kfree(uuari->count);

out_bitmap:
	kfree(uuari->bitmap);

out_uar_ctx:
	kfree(uars);

out_ctx:
	kfree(context);
	return ERR_PTR(err);
}

static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
	struct mlx5_uuar_info *uuari = &context->uuari;
	int i;

	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);

	for (i = 0; i < uuari->num_uars; i++) {
		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
	}

	kfree(uuari->count);
	kfree(uuari->bitmap);
	kfree(uuari->uars);
	kfree(context);

	return 0;
}

static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
{
	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
}

static int get_command(unsigned long offset)
{
	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
}

static int get_arg(unsigned long offset)
{
	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
}

static int get_index(unsigned long offset)
{
	return get_arg(offset);
}

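/*
 * mmap offsets encode a command in the high pgoff bits and an argument
 * (e.g. a UAR index) in the low bits; see get_command()/get_index()
 * above.  Only single-page UAR and core-clock mappings are supported.
 */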
static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
{
	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
	struct mlx5_uuar_info *uuari = &context->uuari;
	unsigned long command;
	unsigned long idx;
	phys_addr_t pfn;

	command = get_command(vma->vm_pgoff);
	switch (command) {
	case MLX5_IB_MMAP_REGULAR_PAGE:
		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EINVAL;

		idx = get_index(vma->vm_pgoff);
		if (idx >= uuari->num_uars)
			return -EINVAL;

		pfn = uar_index2pfn(dev, uuari->uars[idx].index);
		mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
			    (unsigned long long)pfn);

		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
				       PAGE_SIZE, vma->vm_page_prot))
			return -EAGAIN;

		mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n",
			    vma->vm_start,
			    (unsigned long long)pfn << PAGE_SHIFT);
		break;

	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
		return -ENOSYS;

	case MLX5_IB_MMAP_CORE_CLOCK:
		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EINVAL;

		if (vma->vm_flags & (VM_WRITE | VM_EXEC))
			return -EPERM;

		/* Don't expose to user-space information it shouldn't have */
		if (PAGE_SIZE > 4096)
			return -EOPNOTSUPP;

		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
		pfn = (dev->mdev->iseg_base +
		       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
			PAGE_SHIFT;
		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
				       PAGE_SIZE, vma->vm_page_prot))
			return -EAGAIN;

		mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
			    vma->vm_start,
			    (unsigned long long)pfn << PAGE_SHIFT);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
				      struct ib_ucontext *context,
				      struct ib_udata *udata)
{
	struct mlx5_ib_alloc_pd_resp resp;
	struct mlx5_ib_pd *pd;
	int err;

	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return ERR_PTR(-ENOMEM);

	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
	if (err) {
		kfree(pd);
		return ERR_PTR(err);
	}

	if (context) {
		resp.pdn = pd->pdn;
		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
			kfree(pd);
			return ERR_PTR(-EFAULT);
		}
	}

	return &pd->ibpd;
}

static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
{
	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
	struct mlx5_ib_pd *mpd = to_mpd(pd);

	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
	kfree(mpd);

	return 0;
}

static bool outer_header_zero(u32 *match_criteria)
{
	int size = MLX5_ST_SZ_BYTES(fte_match_param);
	char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria,
					     outer_headers);

	return outer_headers_c[0] == 0 && !memcmp(outer_headers_c,
						  outer_headers_c + 1,
						  size - 1);
}

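/*
 * Convert one ib_flow_spec into the corresponding fields of the
 * fte_match_param outer-headers block, writing the mask into match_c
 * and the value into match_v.
 */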
static int parse_flow_attr(u32 *match_c, u32 *match_v,
			   union ib_flow_spec *ib_spec)
{
	void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
					     outer_headers);
	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
					     outer_headers);

	switch (ib_spec->type) {
	case IB_FLOW_SPEC_ETH:
		if (ib_spec->size != sizeof(ib_spec->eth))
			return -EINVAL;

		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
					     dmac_47_16),
				ib_spec->eth.mask.dst_mac);
		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
					     dmac_47_16),
				ib_spec->eth.val.dst_mac);

		if (ib_spec->eth.mask.vlan_tag) {
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 vlan_tag, 1);
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 vlan_tag, 1);

			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));

			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 first_cfi,
				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 first_cfi,
				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);

			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 first_prio,
				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 first_prio,
				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
		}
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
			 ethertype, ntohs(ib_spec->eth.val.ether_type));
		break;
	case IB_FLOW_SPEC_IPV4:
		if (ib_spec->size != sizeof(ib_spec->ipv4))
			return -EINVAL;

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
			 ethertype, 0xffff);
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
			 ethertype, ETH_P_IP);

		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.mask.src_ip,
		       sizeof(ib_spec->ipv4.mask.src_ip));
		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.val.src_ip,
		       sizeof(ib_spec->ipv4.val.src_ip));
		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.mask.dst_ip,
		       sizeof(ib_spec->ipv4.mask.dst_ip));
		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.val.dst_ip,
		       sizeof(ib_spec->ipv4.val.dst_ip));
		break;
	case IB_FLOW_SPEC_TCP:
		if (ib_spec->size != sizeof(ib_spec->tcp_udp))
			return -EINVAL;

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
			 0xff);
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
			 IPPROTO_TCP);

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport,
			 ntohs(ib_spec->tcp_udp.mask.src_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport,
			 ntohs(ib_spec->tcp_udp.val.src_port));

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport,
			 ntohs(ib_spec->tcp_udp.mask.dst_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport,
			 ntohs(ib_spec->tcp_udp.val.dst_port));
		break;
	case IB_FLOW_SPEC_UDP:
		if (ib_spec->size != sizeof(ib_spec->tcp_udp))
			return -EINVAL;

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
			 0xff);
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
			 IPPROTO_UDP);

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport,
			 ntohs(ib_spec->tcp_udp.mask.src_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport,
			 ntohs(ib_spec->tcp_udp.val.src_port));

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport,
			 ntohs(ib_spec->tcp_udp.mask.dst_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport,
			 ntohs(ib_spec->tcp_udp.val.dst_port));
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/* If a flow could catch both multicast and unicast packets,
 * it won't fall into the multicast flow steering table and this rule
 * could steal other multicast packets.
 */
static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
{
	struct ib_flow_spec_eth *eth_spec;

	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
	    ib_attr->size < sizeof(struct ib_flow_attr) +
	    sizeof(struct ib_flow_spec_eth) ||
	    ib_attr->num_of_specs < 1)
		return false;

	eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
	if (eth_spec->type != IB_FLOW_SPEC_ETH ||
	    eth_spec->size != sizeof(*eth_spec))
		return false;

	return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
	       is_multicast_ether_addr(eth_spec->val.dst_mac);
}

static bool is_valid_attr(struct ib_flow_attr *flow_attr)
{
	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
	bool has_ipv4_spec = false;
	bool eth_type_ipv4 = true;
	unsigned int spec_index;

	/* Validate that ethertype is correct */
	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
		if (ib_spec->type == IB_FLOW_SPEC_ETH &&
		    ib_spec->eth.mask.ether_type) {
			if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
			      ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
				eth_type_ipv4 = false;
		} else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
			has_ipv4_spec = true;
		}
		ib_spec = (void *)ib_spec + ib_spec->size;
	}
	return !has_ipv4_spec || eth_type_ipv4;
}

static void put_flow_table(struct mlx5_ib_dev *dev,
			   struct mlx5_ib_flow_prio *prio, bool ft_added)
{
	prio->refcount -= !!ft_added;
	if (!prio->refcount) {
		mlx5_destroy_flow_table(prio->flow_table);
		prio->flow_table = NULL;
	}
}

static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
{
	struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
							  struct mlx5_ib_flow_handler,
							  ibflow);
	struct mlx5_ib_flow_handler *iter, *tmp;

	mutex_lock(&dev->flow_db.lock);

	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
		mlx5_del_flow_rule(iter->rule);
		list_del(&iter->list);
		kfree(iter);
	}

	mlx5_del_flow_rule(handler->rule);
	put_flow_table(dev, &dev->flow_db.prios[handler->prio], true);
	mutex_unlock(&dev->flow_db.lock);

	kfree(handler);

	return 0;
}

static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
{
	priority *= 2;
	if (!dont_trap)
		priority++;
	return priority;
}

#define MLX5_FS_MAX_TYPES	 10
#define MLX5_FS_MAX_ENTRIES	 32000UL
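/*
 * Find (or lazily create) the flow table that backs a flow-steering
 * priority: NORMAL rules land in the bypass namespace, while default
 * multicast/unicast catch-all rules go to the leftovers namespace.
 */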
static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
						struct ib_flow_attr *flow_attr)
{
	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
	struct mlx5_flow_namespace *ns = NULL;
	struct mlx5_ib_flow_prio *prio;
	struct mlx5_flow_table *ft;
	int num_entries;
	int num_groups;
	int priority;
	int err = 0;

	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
		if (flow_is_multicast_only(flow_attr) &&
		    !dont_trap)
			priority = MLX5_IB_FLOW_MCAST_PRIO;
		else
			priority = ib_prio_to_core_prio(flow_attr->priority,
							dont_trap);
		ns = mlx5_get_flow_namespace(dev->mdev,
					     MLX5_FLOW_NAMESPACE_BYPASS);
		num_entries = MLX5_FS_MAX_ENTRIES;
		num_groups = MLX5_FS_MAX_TYPES;
		prio = &dev->flow_db.prios[priority];
	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
		ns = mlx5_get_flow_namespace(dev->mdev,
					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
		build_leftovers_ft_param(&priority,
					 &num_entries,
					 &num_groups);
		prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
	}

	if (!ns)
		return ERR_PTR(-ENOTSUPP);

	ft = prio->flow_table;
	if (!ft) {
		ft = mlx5_create_auto_grouped_flow_table(ns, priority,
							 num_entries,
							 num_groups);

		if (!IS_ERR(ft)) {
			prio->refcount = 0;
			prio->flow_table = ft;
		} else {
			err = PTR_ERR(ft);
		}
	}

	return err ? ERR_PTR(err) : prio;
}

static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
						     struct mlx5_ib_flow_prio *ft_prio,
						     struct ib_flow_attr *flow_attr,
						     struct mlx5_flow_destination *dst)
{
	struct mlx5_flow_table	*ft = ft_prio->flow_table;
	struct mlx5_ib_flow_handler *handler;
	void *ib_flow = flow_attr + 1;
	u8 match_criteria_enable = 0;
	unsigned int spec_index;
	u32 *match_c;
	u32 *match_v;
	u32 action;
	int err = 0;

	if (!is_valid_attr(flow_attr))
		return ERR_PTR(-EINVAL);

	match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
	match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
	if (!handler || !match_c || !match_v) {
		err = -ENOMEM;
		goto free;
	}

	INIT_LIST_HEAD(&handler->list);

	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
		err = parse_flow_attr(match_c, match_v, ib_flow);
		if (err < 0)
			goto free;

		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
	}

	/* Outer header support only */
	match_criteria_enable = (!outer_header_zero(match_c)) << 0;
	action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
		MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
	handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable,
					   match_c, match_v,
					   action,
					   MLX5_FS_DEFAULT_FLOW_TAG,
					   dst);

	if (IS_ERR(handler->rule)) {
		err = PTR_ERR(handler->rule);
		goto free;
	}

	handler->prio = ft_prio - dev->flow_db.prios;

	ft_prio->flow_table = ft;
free:
	if (err)
		kfree(handler);
	kfree(match_c);
	kfree(match_v);
	return err ? ERR_PTR(err) : handler;
}

static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
							  struct mlx5_ib_flow_prio *ft_prio,
							  struct ib_flow_attr *flow_attr,
							  struct mlx5_flow_destination *dst)
{
	struct mlx5_ib_flow_handler *handler_dst = NULL;
	struct mlx5_ib_flow_handler *handler = NULL;

	handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
	if (!IS_ERR(handler)) {
		handler_dst = create_flow_rule(dev, ft_prio,
					       flow_attr, dst);
		if (IS_ERR(handler_dst)) {
			mlx5_del_flow_rule(handler->rule);
			kfree(handler);
			handler = handler_dst;
		} else {
			list_add(&handler_dst->list, &handler->list);
		}
	}

	return handler;
}

enum {
	LEFTOVERS_MC,
	LEFTOVERS_UC,
};

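/*
 * Install catch-all rules for leftover traffic: a multicast rule
 * always, plus a unicast rule when the attribute asks for ALL_DEFAULT.
 * The two static specs below match only on the multicast bit of the
 * destination MAC.
 */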
static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
							  struct mlx5_ib_flow_prio *ft_prio,
							  struct ib_flow_attr *flow_attr,
							  struct mlx5_flow_destination *dst)
{
	struct mlx5_ib_flow_handler *handler_ucast = NULL;
	struct mlx5_ib_flow_handler *handler = NULL;

	static struct {
		struct ib_flow_attr	flow_attr;
		struct ib_flow_spec_eth eth_flow;
	} leftovers_specs[] = {
		[LEFTOVERS_MC] = {
			.flow_attr = {
				.num_of_specs = 1,
				.size = sizeof(leftovers_specs[0])
			},
			.eth_flow = {
				.type = IB_FLOW_SPEC_ETH,
				.size = sizeof(struct ib_flow_spec_eth),
				.mask = {.dst_mac = {0x1} },
				.val =  {.dst_mac = {0x1} }
			}
		},
		[LEFTOVERS_UC] = {
			.flow_attr = {
				.num_of_specs = 1,
				.size = sizeof(leftovers_specs[0])
			},
			.eth_flow = {
				.type = IB_FLOW_SPEC_ETH,
				.size = sizeof(struct ib_flow_spec_eth),
				.mask = {.dst_mac = {0x1} },
				.val = {.dst_mac = {} }
			}
		}
	};

	handler = create_flow_rule(dev, ft_prio,
				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
				   dst);
	if (!IS_ERR(handler) &&
	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
		handler_ucast = create_flow_rule(dev, ft_prio,
						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
						 dst);
		if (IS_ERR(handler_ucast)) {
			kfree(handler);
			handler = handler_ucast;
		} else {
			list_add(&handler_ucast->list, &handler->list);
		}
	}

	return handler;
}

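/*
 * ib_create_flow entry point: validate the attribute, resolve the
 * priority's flow table under flow_db.lock, and attach one or more
 * rules that forward matching packets to the QP's TIR.
 */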
1605 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
1606 					   struct ib_flow_attr *flow_attr,
1607 					   int domain)
1608 {
1609 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
1610 	struct mlx5_ib_flow_handler *handler = NULL;
1611 	struct mlx5_flow_destination *dst = NULL;
1612 	struct mlx5_ib_flow_prio *ft_prio;
1613 	int err;
1614 
1615 	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
1616 		return ERR_PTR(-ENOSPC);
1617 
1618 	if (domain != IB_FLOW_DOMAIN_USER ||
1619 	    flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
1620 	    (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
1621 		return ERR_PTR(-EINVAL);
1622 
1623 	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
1624 	if (!dst)
1625 		return ERR_PTR(-ENOMEM);
1626 
1627 	mutex_lock(&dev->flow_db.lock);
1628 
1629 	ft_prio = get_flow_table(dev, flow_attr);
1630 	if (IS_ERR(ft_prio)) {
1631 		err = PTR_ERR(ft_prio);
1632 		goto unlock;
1633 	}
1634 
1635 	dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1636 	dst->tir_num = to_mqp(qp)->raw_packet_qp.rq.tirn;
1637 
1638 	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
1639 		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
1640 			handler = create_dont_trap_rule(dev, ft_prio,
1641 							flow_attr, dst);
1642 		} else {
1643 			handler = create_flow_rule(dev, ft_prio, flow_attr,
1644 						   dst);
1645 		}
1646 	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
1647 		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
1648 		handler = create_leftovers_rule(dev, ft_prio, flow_attr,
1649 						dst);
1650 	} else {
1651 		err = -EINVAL;
1652 		goto destroy_ft;
1653 	}
1654 
1655 	if (IS_ERR(handler)) {
1656 		err = PTR_ERR(handler);
1657 		handler = NULL;
1658 		goto destroy_ft;
1659 	}
1660 
1661 	ft_prio->refcount++;
1662 	mutex_unlock(&dev->flow_db.lock);
1663 	kfree(dst);
1664 
1665 	return &handler->ibflow;
1666 
1667 destroy_ft:
1668 	put_flow_table(dev, ft_prio, false);
1669 unlock:
1670 	mutex_unlock(&dev->flow_db.lock);
1671 	kfree(dst);
1672 	kfree(handler);
1673 	return ERR_PTR(err);
1674 }
1675 
1676 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1677 {
1678 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1679 	int err;
1680 
1681 	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
1682 	if (err)
1683 		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
1684 			     ibqp->qp_num, gid->raw);
1685 
1686 	return err;
1687 }
1688 
1689 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1690 {
1691 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1692 	int err;
1693 
1694 	err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
1695 	if (err)
1696 		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
1697 			     ibqp->qp_num, gid->raw);
1698 
1699 	return err;
1700 }
1701 
1702 static int init_node_data(struct mlx5_ib_dev *dev)
1703 {
1704 	int err;
1705 
1706 	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
1707 	if (err)
1708 		return err;
1709 
1710 	dev->mdev->rev_id = dev->mdev->pdev->revision;
1711 
1712 	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
1713 }
1714 
1715 static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
1716 			     char *buf)
1717 {
1718 	struct mlx5_ib_dev *dev =
1719 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1720 
1721 	return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
1722 }
1723 
1724 static ssize_t show_reg_pages(struct device *device,
1725 			      struct device_attribute *attr, char *buf)
1726 {
1727 	struct mlx5_ib_dev *dev =
1728 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1729 
1730 	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
1731 }
1732 
1733 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
1734 			char *buf)
1735 {
1736 	struct mlx5_ib_dev *dev =
1737 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1738 	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
1739 }
1740 
1741 static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
1742 			   char *buf)
1743 {
1744 	struct mlx5_ib_dev *dev =
1745 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1746 	return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev),
1747 		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
1748 }
1749 
1750 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
1751 			char *buf)
1752 {
1753 	struct mlx5_ib_dev *dev =
1754 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1755 	return sprintf(buf, "%x\n", dev->mdev->rev_id);
1756 }
1757 
1758 static ssize_t show_board(struct device *device, struct device_attribute *attr,
1759 			  char *buf)
1760 {
1761 	struct mlx5_ib_dev *dev =
1762 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1763 	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
1764 		       dev->mdev->board_id);
1765 }
1766 
1767 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
1768 static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
1769 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
1770 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
1771 static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
1772 static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
1773 
1774 static struct device_attribute *mlx5_class_attributes[] = {
1775 	&dev_attr_hw_rev,
1776 	&dev_attr_fw_ver,
1777 	&dev_attr_hca_type,
1778 	&dev_attr_board_id,
1779 	&dev_attr_fw_pages,
1780 	&dev_attr_reg_pages,
1781 };
1782 
1783 static void pkey_change_handler(struct work_struct *work)
1784 {
1785 	struct mlx5_ib_port_resources *ports =
1786 		container_of(work, struct mlx5_ib_port_resources,
1787 			     pkey_change_work);
1788 
1789 	mutex_lock(&ports->devr->mutex);
1790 	mlx5_ib_gsi_pkey_change(ports->gsi);
1791 	mutex_unlock(&ports->devr->mutex);
1792 }
1793 
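/*
 * Translate mlx5_core asynchronous events into IB events and dispatch
 * them to registered IB clients.  Fatal errors are dispatched before the
 * device is marked inactive so that consumers see the notification.
 */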
1794 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
1795 			  enum mlx5_dev_event event, unsigned long param)
1796 {
1797 	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
1798 	struct ib_event ibev;
	bool fatal = false;
	u8 port = 0;
1801 
1802 	switch (event) {
	case MLX5_DEV_EVENT_SYS_ERROR:
		ibev.event = IB_EVENT_DEVICE_FATAL;
		fatal = true;
		break;
1807 
1808 	case MLX5_DEV_EVENT_PORT_UP:
1809 		ibev.event = IB_EVENT_PORT_ACTIVE;
1810 		port = (u8)param;
1811 		break;
1812 
1813 	case MLX5_DEV_EVENT_PORT_DOWN:
1814 		ibev.event = IB_EVENT_PORT_ERR;
1815 		port = (u8)param;
1816 		break;
1817 
1818 	case MLX5_DEV_EVENT_PORT_INITIALIZED:
1819 		/* not used by ULPs */
1820 		return;
1821 
1822 	case MLX5_DEV_EVENT_LID_CHANGE:
1823 		ibev.event = IB_EVENT_LID_CHANGE;
1824 		port = (u8)param;
1825 		break;
1826 
	case MLX5_DEV_EVENT_PKEY_CHANGE:
		ibev.event = IB_EVENT_PKEY_CHANGE;
		port = (u8)param;

		/* Validate the port before indexing the per-port work array. */
		if (port >= 1 && port <= ibdev->num_ports)
			schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
		break;
1833 
1834 	case MLX5_DEV_EVENT_GUID_CHANGE:
1835 		ibev.event = IB_EVENT_GID_CHANGE;
1836 		port = (u8)param;
1837 		break;
1838 
1839 	case MLX5_DEV_EVENT_CLIENT_REREG:
1840 		ibev.event = IB_EVENT_CLIENT_REREGISTER;
1841 		port = (u8)param;
1842 		break;
	default:
		/* Unknown event; ibev is not initialized, so do not dispatch. */
		return;
	}
1844 
1845 	ibev.device	      = &ibdev->ib_dev;
1846 	ibev.element.port_num = port;
1847 
1848 	if (port < 1 || port > ibdev->num_ports) {
	/* Fatal events are device-wide and do not carry a valid port number. */
	if (!fatal && (port < 1 || port > ibdev->num_ports)) {
		mlx5_ib_warn(ibdev, "event on invalid port %d\n", port);
		return;
	}

	if (ibdev->ib_active)
		ib_dispatch_event(&ibev);

	/* Mark the device inactive only after consumers have seen the event. */
	if (fatal)
		ibdev->ib_active = false;
}
1857 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
1858 {
1859 	int port;
1860 
1861 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
1862 		mlx5_query_ext_port_caps(dev, port);
1863 }
1864 
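/*
 * Query device and per-port attributes once at probe time and cache the
 * P_Key and GID table sizes in mdev->port_caps.
 */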
1865 static int get_port_caps(struct mlx5_ib_dev *dev)
1866 {
1867 	struct ib_device_attr *dprops = NULL;
1868 	struct ib_port_attr *pprops = NULL;
1869 	int err = -ENOMEM;
1870 	int port;
1871 	struct ib_udata uhw = {.inlen = 0, .outlen = 0};
1872 
1873 	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
1874 	if (!pprops)
1875 		goto out;
1876 
1877 	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
1878 	if (!dprops)
1879 		goto out;
1880 
1881 	err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
1882 	if (err) {
1883 		mlx5_ib_warn(dev, "query_device failed %d\n", err);
1884 		goto out;
1885 	}
1886 
1887 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
1888 		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
1889 		if (err) {
1890 			mlx5_ib_warn(dev, "query_port %d failed %d\n",
1891 				     port, err);
1892 			break;
1893 		}
1894 		dev->mdev->port_caps[port - 1].pkey_table_len =
1895 						dprops->max_pkeys;
1896 		dev->mdev->port_caps[port - 1].gid_table_len =
1897 						pprops->gid_tbl_len;
1898 		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
1899 			    dprops->max_pkeys, pprops->gid_tbl_len);
1900 	}
1901 
1902 out:
1903 	kfree(pprops);
1904 	kfree(dprops);
1905 
1906 	return err;
1907 }
1908 
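/* Tear down the MR cache and the UMR PD/CQ/QP set up in create_umr_res(). */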
1909 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
1910 {
1911 	int err;
1912 
1913 	err = mlx5_mr_cache_cleanup(dev);
1914 	if (err)
1915 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
1916 
1917 	mlx5_ib_destroy_qp(dev->umrc.qp);
1918 	ib_free_cq(dev->umrc.cq);
1919 	ib_dealloc_pd(dev->umrc.pd);
1920 }
1921 
1922 enum {
1923 	MAX_UMR_WR = 128,
1924 };
1925 
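/*
 * Create the resources used to post UMR (user memory registration) work
 * requests: a PD, a softirq-polled CQ and a driver-internal QP of type
 * MLX5_IB_QPT_REG_UMR.  The semaphore caps the number of outstanding UMR
 * work requests at MAX_UMR_WR.
 */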
1926 static int create_umr_res(struct mlx5_ib_dev *dev)
1927 {
1928 	struct ib_qp_init_attr *init_attr = NULL;
1929 	struct ib_qp_attr *attr = NULL;
1930 	struct ib_pd *pd;
1931 	struct ib_cq *cq;
1932 	struct ib_qp *qp;
1933 	int ret;
1934 
1935 	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
1936 	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
1937 	if (!attr || !init_attr) {
1938 		ret = -ENOMEM;
1939 		goto error_0;
1940 	}
1941 
1942 	pd = ib_alloc_pd(&dev->ib_dev);
1943 	if (IS_ERR(pd)) {
1944 		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
1945 		ret = PTR_ERR(pd);
1946 		goto error_0;
1947 	}
1948 
1949 	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
1950 	if (IS_ERR(cq)) {
1951 		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
1952 		ret = PTR_ERR(cq);
1953 		goto error_2;
1954 	}
1955 
1956 	init_attr->send_cq = cq;
1957 	init_attr->recv_cq = cq;
1958 	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
1959 	init_attr->cap.max_send_wr = MAX_UMR_WR;
1960 	init_attr->cap.max_send_sge = 1;
1961 	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
1962 	init_attr->port_num = 1;
1963 	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
1964 	if (IS_ERR(qp)) {
1965 		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
1966 		ret = PTR_ERR(qp);
1967 		goto error_3;
1968 	}
1969 	qp->device     = &dev->ib_dev;
1970 	qp->real_qp    = qp;
1971 	qp->uobject    = NULL;
1972 	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
1973 
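	/*
	 * Drive the UMR QP through the INIT -> RTR -> RTS transitions by
	 * hand; as a driver-internal QP type it needs no address vector or
	 * remote QPN.
	 */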
1974 	attr->qp_state = IB_QPS_INIT;
1975 	attr->port_num = 1;
1976 	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
1977 				IB_QP_PORT, NULL);
1978 	if (ret) {
1979 		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
1980 		goto error_4;
1981 	}
1982 
1983 	memset(attr, 0, sizeof(*attr));
1984 	attr->qp_state = IB_QPS_RTR;
1985 	attr->path_mtu = IB_MTU_256;
1986 
1987 	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
1988 	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify UMR QP to RTR\n");
1990 		goto error_4;
1991 	}
1992 
1993 	memset(attr, 0, sizeof(*attr));
1994 	attr->qp_state = IB_QPS_RTS;
1995 	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
1996 	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify UMR QP to RTS\n");
1998 		goto error_4;
1999 	}
2000 
2001 	dev->umrc.qp = qp;
2002 	dev->umrc.cq = cq;
2003 	dev->umrc.pd = pd;
2004 
2005 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
2006 	ret = mlx5_mr_cache_init(dev);
2007 	if (ret) {
2008 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2009 		goto error_4;
2010 	}
2011 
2012 	kfree(attr);
2013 	kfree(init_attr);
2014 
2015 	return 0;
2016 
2017 error_4:
2018 	mlx5_ib_destroy_qp(qp);
2019 
2020 error_3:
2021 	ib_free_cq(cq);
2022 
2023 error_2:
2024 	ib_dealloc_pd(pd);
2025 
2026 error_0:
2027 	kfree(attr);
2028 	kfree(init_attr);
2029 	return ret;
2030 }
2031 
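/*
 * Create the device-global verbs resources used internally by the
 * driver: a PD (p0), a CQ (c0), two XRC domains (x0, x1), an XRC SRQ
 * (s0) and a basic SRQ (s1), plus the per-port P_Key change work items.
 */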
2032 static int create_dev_resources(struct mlx5_ib_resources *devr)
2033 {
2034 	struct ib_srq_init_attr attr;
2035 	struct mlx5_ib_dev *dev;
2036 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
2037 	int port;
2038 	int ret = 0;
2039 
2040 	dev = container_of(devr, struct mlx5_ib_dev, devr);
2041 
2042 	mutex_init(&devr->mutex);
2043 
2044 	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
2045 	if (IS_ERR(devr->p0)) {
2046 		ret = PTR_ERR(devr->p0);
2047 		goto error0;
2048 	}
2049 	devr->p0->device  = &dev->ib_dev;
2050 	devr->p0->uobject = NULL;
2051 	atomic_set(&devr->p0->usecnt, 0);
2052 
2053 	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
2054 	if (IS_ERR(devr->c0)) {
2055 		ret = PTR_ERR(devr->c0);
2056 		goto error1;
2057 	}
2058 	devr->c0->device        = &dev->ib_dev;
2059 	devr->c0->uobject       = NULL;
2060 	devr->c0->comp_handler  = NULL;
2061 	devr->c0->event_handler = NULL;
2062 	devr->c0->cq_context    = NULL;
2063 	atomic_set(&devr->c0->usecnt, 0);
2064 
2065 	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2066 	if (IS_ERR(devr->x0)) {
2067 		ret = PTR_ERR(devr->x0);
2068 		goto error2;
2069 	}
2070 	devr->x0->device = &dev->ib_dev;
2071 	devr->x0->inode = NULL;
2072 	atomic_set(&devr->x0->usecnt, 0);
2073 	mutex_init(&devr->x0->tgt_qp_mutex);
2074 	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2075 
2076 	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2077 	if (IS_ERR(devr->x1)) {
2078 		ret = PTR_ERR(devr->x1);
2079 		goto error3;
2080 	}
2081 	devr->x1->device = &dev->ib_dev;
2082 	devr->x1->inode = NULL;
2083 	atomic_set(&devr->x1->usecnt, 0);
2084 	mutex_init(&devr->x1->tgt_qp_mutex);
2085 	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2086 
2087 	memset(&attr, 0, sizeof(attr));
2088 	attr.attr.max_sge = 1;
2089 	attr.attr.max_wr = 1;
2090 	attr.srq_type = IB_SRQT_XRC;
2091 	attr.ext.xrc.cq = devr->c0;
2092 	attr.ext.xrc.xrcd = devr->x0;
2093 
2094 	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2095 	if (IS_ERR(devr->s0)) {
2096 		ret = PTR_ERR(devr->s0);
2097 		goto error4;
2098 	}
2099 	devr->s0->device	= &dev->ib_dev;
2100 	devr->s0->pd		= devr->p0;
2101 	devr->s0->uobject       = NULL;
2102 	devr->s0->event_handler = NULL;
2103 	devr->s0->srq_context   = NULL;
2104 	devr->s0->srq_type      = IB_SRQT_XRC;
2105 	devr->s0->ext.xrc.xrcd	= devr->x0;
2106 	devr->s0->ext.xrc.cq	= devr->c0;
2107 	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2108 	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
2109 	atomic_inc(&devr->p0->usecnt);
2110 	atomic_set(&devr->s0->usecnt, 0);
2111 
2112 	memset(&attr, 0, sizeof(attr));
2113 	attr.attr.max_sge = 1;
2114 	attr.attr.max_wr = 1;
2115 	attr.srq_type = IB_SRQT_BASIC;
2116 	devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2117 	if (IS_ERR(devr->s1)) {
2118 		ret = PTR_ERR(devr->s1);
2119 		goto error5;
2120 	}
2121 	devr->s1->device	= &dev->ib_dev;
2122 	devr->s1->pd		= devr->p0;
2123 	devr->s1->uobject       = NULL;
2124 	devr->s1->event_handler = NULL;
2125 	devr->s1->srq_context   = NULL;
2126 	devr->s1->srq_type      = IB_SRQT_BASIC;
2127 	devr->s1->ext.xrc.cq	= devr->c0;
2128 	atomic_inc(&devr->p0->usecnt);
	atomic_set(&devr->s1->usecnt, 0);
2130 
2131 	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
2132 		INIT_WORK(&devr->ports[port].pkey_change_work,
2133 			  pkey_change_handler);
2134 		devr->ports[port].devr = devr;
2135 	}
2136 
2137 	return 0;
2138 
2139 error5:
2140 	mlx5_ib_destroy_srq(devr->s0);
2141 error4:
2142 	mlx5_ib_dealloc_xrcd(devr->x1);
2143 error3:
2144 	mlx5_ib_dealloc_xrcd(devr->x0);
2145 error2:
2146 	mlx5_ib_destroy_cq(devr->c0);
2147 error1:
2148 	mlx5_ib_dealloc_pd(devr->p0);
2149 error0:
2150 	return ret;
2151 }
2152 
2153 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
2154 {
2155 	struct mlx5_ib_dev *dev =
2156 		container_of(devr, struct mlx5_ib_dev, devr);
2157 	int port;
2158 
2159 	mlx5_ib_destroy_srq(devr->s1);
2160 	mlx5_ib_destroy_srq(devr->s0);
2161 	mlx5_ib_dealloc_xrcd(devr->x0);
2162 	mlx5_ib_dealloc_xrcd(devr->x1);
2163 	mlx5_ib_destroy_cq(devr->c0);
2164 	mlx5_ib_dealloc_pd(devr->p0);
2165 
	/* Make sure no P_Key change work items are still executing */
2167 	for (port = 0; port < dev->num_ports; ++port)
2168 		cancel_work_sync(&devr->ports[port].pkey_change_work);
2169 }
2170 
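/*
 * Report the RDMA core port capabilities: plain IB when the link layer
 * is InfiniBand; otherwise RoCE v1 and/or v2 according to the device's
 * roce_version capability, provided both IPv4 and IPv6 L3 types are
 * supported.
 */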
2171 static u32 get_core_cap_flags(struct ib_device *ibdev)
2172 {
2173 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
2174 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
2175 	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
2176 	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
2177 	u32 ret = 0;
2178 
2179 	if (ll == IB_LINK_LAYER_INFINIBAND)
2180 		return RDMA_CORE_PORT_IBA_IB;
2181 
2182 	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
2183 		return 0;
2184 
2185 	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
2186 		return 0;
2187 
2188 	if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
2189 		ret |= RDMA_CORE_PORT_IBA_ROCE;
2190 
2191 	if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
2192 		ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
2193 
2194 	return ret;
2195 }
2196 
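/* Per-port data cached by the IB core when the device is registered. */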
2197 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
2198 			       struct ib_port_immutable *immutable)
2199 {
2200 	struct ib_port_attr attr;
2201 	int err;
2202 
2203 	err = mlx5_ib_query_port(ibdev, port_num, &attr);
2204 	if (err)
2205 		return err;
2206 
2207 	immutable->pkey_tbl_len = attr.pkey_tbl_len;
2208 	immutable->gid_tbl_len = attr.gid_tbl_len;
2209 	immutable->core_cap_flags = get_core_cap_flags(ibdev);
2210 	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
2211 
2212 	return 0;
2213 }
2214 
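/*
 * RoCE requires a netdevice notifier, to track the net devices backing
 * the ports, and RoCE enabled on the NIC vport.
 */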
2215 static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
2216 {
2217 	int err;
2218 
2219 	dev->roce.nb.notifier_call = mlx5_netdev_event;
2220 	err = register_netdevice_notifier(&dev->roce.nb);
2221 	if (err)
2222 		return err;
2223 
2224 	err = mlx5_nic_vport_enable_roce(dev->mdev);
2225 	if (err)
2226 		goto err_unregister_netdevice_notifier;
2227 
2228 	return 0;
2229 
2230 err_unregister_netdevice_notifier:
2231 	unregister_netdevice_notifier(&dev->roce.nb);
2232 	return err;
2233 }
2234 
2235 static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
2236 {
2237 	mlx5_nic_vport_disable_roce(dev->mdev);
2238 	unregister_netdevice_notifier(&dev->roce.nb);
2239 }
2240 
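/*
 * Probe one mlx5 core device: allocate the IB device, fill in the verbs
 * ops and uverbs command masks, then bring up RoCE (Ethernet link layer
 * only) and the device resources, initialize ODP, register with the IB
 * core, create the UMR QP and the sysfs attributes, unwinding in reverse
 * order on any failure.
 */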
2241 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
2242 {
2243 	struct mlx5_ib_dev *dev;
2244 	enum rdma_link_layer ll;
2245 	int port_type_cap;
2246 	int err;
2247 	int i;
2248 
2249 	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
2250 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
2251 
2252 	if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce))
2253 		return NULL;
2254 
2255 	printk_once(KERN_INFO "%s", mlx5_version);
2256 
2257 	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
2258 	if (!dev)
2259 		return NULL;
2260 
2261 	dev->mdev = mdev;
2262 
2263 	rwlock_init(&dev->roce.netdev_lock);
2264 	err = get_port_caps(dev);
2265 	if (err)
2266 		goto err_dealloc;
2267 
2268 	if (mlx5_use_mad_ifc(dev))
2269 		get_ext_port_caps(dev);
2270 
2271 	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
2272 
2273 	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
2274 	dev->ib_dev.owner		= THIS_MODULE;
2275 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
2276 	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
2277 	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
2278 	dev->ib_dev.phys_port_cnt     = dev->num_ports;
2279 	dev->ib_dev.num_comp_vectors    =
2280 		dev->mdev->priv.eq_table.num_comp_vectors;
2281 	dev->ib_dev.dma_device	= &mdev->pdev->dev;
2282 
2283 	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
2284 	dev->ib_dev.uverbs_cmd_mask	=
2285 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
2286 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
2287 		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
2288 		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
2289 		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
2290 		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
2291 		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
2292 		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
2293 		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
2294 		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
2295 		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
2296 		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
2297 		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
2298 		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
2299 		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
2300 		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
2301 		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
2302 		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
2303 		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
2304 		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
2305 		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
2306 		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
2307 		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
2308 		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
2309 	dev->ib_dev.uverbs_ex_cmd_mask =
2310 		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)	|
2311 		(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)	|
2312 		(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
2313 
2314 	dev->ib_dev.query_device	= mlx5_ib_query_device;
2315 	dev->ib_dev.query_port		= mlx5_ib_query_port;
2316 	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
2317 	if (ll == IB_LINK_LAYER_ETHERNET)
2318 		dev->ib_dev.get_netdev	= mlx5_ib_get_netdev;
2319 	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
2320 	dev->ib_dev.add_gid		= mlx5_ib_add_gid;
2321 	dev->ib_dev.del_gid		= mlx5_ib_del_gid;
2322 	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
2323 	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
2324 	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
2325 	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
2326 	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
2327 	dev->ib_dev.mmap		= mlx5_ib_mmap;
2328 	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
2329 	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
2330 	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
2331 	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
2332 	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
2333 	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
2334 	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
2335 	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
2336 	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
2337 	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
2338 	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
2339 	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
2340 	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
2341 	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
2342 	dev->ib_dev.post_send		= mlx5_ib_post_send;
2343 	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
2344 	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
2345 	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
2346 	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
2347 	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
2348 	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
2349 	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
2350 	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
2351 	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
2352 	dev->ib_dev.rereg_user_mr	= mlx5_ib_rereg_user_mr;
2353 	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
2354 	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
2355 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
2356 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
2357 	dev->ib_dev.alloc_mr		= mlx5_ib_alloc_mr;
2358 	dev->ib_dev.map_mr_sg		= mlx5_ib_map_mr_sg;
2359 	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
2360 	dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
2361 	if (mlx5_core_is_pf(mdev)) {
2362 		dev->ib_dev.get_vf_config	= mlx5_ib_get_vf_config;
2363 		dev->ib_dev.set_vf_link_state	= mlx5_ib_set_vf_link_state;
2364 		dev->ib_dev.get_vf_stats	= mlx5_ib_get_vf_stats;
2365 		dev->ib_dev.set_vf_guid		= mlx5_ib_set_vf_guid;
2366 	}
2367 
2368 	mlx5_ib_internal_fill_odp_caps(dev);
2369 
2370 	if (MLX5_CAP_GEN(mdev, imaicl)) {
2371 		dev->ib_dev.alloc_mw		= mlx5_ib_alloc_mw;
2372 		dev->ib_dev.dealloc_mw		= mlx5_ib_dealloc_mw;
2373 		dev->ib_dev.uverbs_cmd_mask |=
2374 			(1ull << IB_USER_VERBS_CMD_ALLOC_MW)	|
2375 			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
2376 	}
2377 
2378 	if (MLX5_CAP_GEN(mdev, xrc)) {
2379 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
2380 		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
2381 		dev->ib_dev.uverbs_cmd_mask |=
2382 			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
2383 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
2384 	}
2385 
2386 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2387 	    IB_LINK_LAYER_ETHERNET) {
2388 		dev->ib_dev.create_flow	= mlx5_ib_create_flow;
2389 		dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
2390 		dev->ib_dev.uverbs_ex_cmd_mask |=
2391 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
2392 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
2393 	}
2394 	err = init_node_data(dev);
2395 	if (err)
2396 		goto err_dealloc;
2397 
2398 	mutex_init(&dev->flow_db.lock);
2399 	mutex_init(&dev->cap_mask_mutex);
2400 
2401 	if (ll == IB_LINK_LAYER_ETHERNET) {
2402 		err = mlx5_enable_roce(dev);
2403 		if (err)
2404 			goto err_dealloc;
2405 	}
2406 
2407 	err = create_dev_resources(&dev->devr);
2408 	if (err)
2409 		goto err_disable_roce;
2410 
2411 	err = mlx5_ib_odp_init_one(dev);
2412 	if (err)
2413 		goto err_rsrc;
2414 
2415 	err = ib_register_device(&dev->ib_dev, NULL);
2416 	if (err)
2417 		goto err_odp;
2418 
2419 	err = create_umr_res(dev);
2420 	if (err)
2421 		goto err_dev;
2422 
2423 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2424 		err = device_create_file(&dev->ib_dev.dev,
2425 					 mlx5_class_attributes[i]);
2426 		if (err)
2427 			goto err_umrc;
2428 	}
2429 
2430 	dev->ib_active = true;
2431 
2432 	return dev;
2433 
2434 err_umrc:
2435 	destroy_umrc_res(dev);
2436 
2437 err_dev:
2438 	ib_unregister_device(&dev->ib_dev);
2439 
2440 err_odp:
2441 	mlx5_ib_odp_remove_one(dev);
2442 
2443 err_rsrc:
2444 	destroy_dev_resources(&dev->devr);
2445 
2446 err_disable_roce:
2447 	if (ll == IB_LINK_LAYER_ETHERNET)
2448 		mlx5_disable_roce(dev);
2449 
2450 err_dealloc:
	ib_dealloc_device(&dev->ib_dev);
2452 
2453 	return NULL;
2454 }
2455 
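/* Tear down in the reverse order of mlx5_ib_add(). */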
2456 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
2457 {
2458 	struct mlx5_ib_dev *dev = context;
2459 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
2460 
2461 	ib_unregister_device(&dev->ib_dev);
2462 	destroy_umrc_res(dev);
2463 	mlx5_ib_odp_remove_one(dev);
2464 	destroy_dev_resources(&dev->devr);
2465 	if (ll == IB_LINK_LAYER_ETHERNET)
2466 		mlx5_disable_roce(dev);
2467 	ib_dealloc_device(&dev->ib_dev);
2468 }
2469 
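/*
 * mlx5_ib attaches to mlx5_core as an interface client rather than as a
 * PCI driver: mlx5_core calls .add()/.remove() for each core device and
 * forwards asynchronous events through .event().
 */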
2470 static struct mlx5_interface mlx5_ib_interface = {
2471 	.add            = mlx5_ib_add,
2472 	.remove         = mlx5_ib_remove,
2473 	.event          = mlx5_ib_event,
2474 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
2475 };
2476 
2477 static int __init mlx5_ib_init(void)
2478 {
2479 	int err;
2480 
2481 	if (deprecated_prof_sel != 2)
2482 		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
2483 
2484 	err = mlx5_ib_odp_init();
2485 	if (err)
2486 		return err;
2487 
2488 	err = mlx5_register_interface(&mlx5_ib_interface);
2489 	if (err)
2490 		goto clean_odp;
2491 
2492 	return err;
2493 
2494 clean_odp:
2495 	mlx5_ib_odp_cleanup();
2496 	return err;
2497 }
2498 
2499 static void __exit mlx5_ib_cleanup(void)
2500 {
2501 	mlx5_unregister_interface(&mlx5_ib_interface);
2502 	mlx5_ib_odp_cleanup();
2503 }
2504 
2505 module_init(mlx5_ib_init);
2506 module_exit(mlx5_ib_cleanup);
2507