xref: /openbmc/linux/drivers/infiniband/hw/mlx5/main.c (revision 3e26a691)
/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/io-mapping.h>
#include <linux/sched.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <linux/mlx5/port.h>
#include <linux/mlx5/vport.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <linux/in.h>
#include <linux/etherdevice.h>
#include <linux/mlx5/fs.h>
#include "user.h"
#include "mlx5_ib.h"

#define DRIVER_NAME "mlx5_ib"
#define DRIVER_VERSION "2.2-1"
#define DRIVER_RELDATE	"Feb 2014"

MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRIVER_VERSION);

static int deprecated_prof_sel = 2;
module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");

static char mlx5_version[] =
	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";

enum {
	MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
};

static enum rdma_link_layer
mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
{
	switch (port_type_cap) {
	case MLX5_CAP_PORT_TYPE_IB:
		return IB_LINK_LAYER_INFINIBAND;
	case MLX5_CAP_PORT_TYPE_ETH:
		return IB_LINK_LAYER_ETHERNET;
	default:
		return IB_LINK_LAYER_UNSPECIFIED;
	}
}

static enum rdma_link_layer
mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);

	return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
}

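/*
 * Netdevice notifier: cache the net_device that belongs to this HCA's
 * PCI function on NETDEV_REGISTER and drop it on NETDEV_UNREGISTER, so
 * the RoCE code can always find its companion Ethernet interface.
 */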
static int mlx5_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
						 roce.nb);

	if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER))
		return NOTIFY_DONE;

	write_lock(&ibdev->roce.netdev_lock);
	if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
		ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev;
	write_unlock(&ibdev->roce.netdev_lock);

	return NOTIFY_DONE;
}

static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
					     u8 port_num)
{
	struct mlx5_ib_dev *ibdev = to_mdev(device);
	struct net_device *ndev;

	/* Ensure ndev does not disappear before we invoke dev_hold() */
	read_lock(&ibdev->roce.netdev_lock);
	ndev = ibdev->roce.netdev;
	if (ndev)
		dev_hold(ndev);
	read_unlock(&ibdev->roce.netdev_lock);

	return ndev;
}

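/*
 * RoCE port query: most attributes come from device capabilities; link
 * state and MTU are derived from the companion netdevice when one is
 * registered, otherwise the port is reported as down.
 */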
static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
				struct ib_port_attr *props)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct net_device *ndev;
	enum ib_mtu ndev_ib_mtu;
	u16 qkey_viol_cntr;

	memset(props, 0, sizeof(*props));

	props->port_cap_flags  |= IB_PORT_CM_SUP;
	props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;

	props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
						roce_address_table_size);
	props->max_mtu          = IB_MTU_4096;
	props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
	props->pkey_tbl_len     = 1;
	props->state            = IB_PORT_DOWN;
	props->phys_state       = 3;

	mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
	props->qkey_viol_cntr = qkey_viol_cntr;

	ndev = mlx5_ib_get_netdev(device, port_num);
	if (!ndev)
		return 0;

	if (netif_running(ndev) && netif_carrier_ok(ndev)) {
		props->state      = IB_PORT_ACTIVE;
		props->phys_state = 5;
	}

	ndev_ib_mtu = iboe_get_mtu(ndev->mtu);

	dev_put(ndev);

	props->active_mtu	= min(props->max_mtu, ndev_ib_mtu);

	props->active_width	= IB_WIDTH_4X;  /* TODO */
	props->active_speed	= IB_SPEED_QDR; /* TODO */

	return 0;
}

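/*
 * Pack a GID and its attributes into the firmware roce_addr_layout:
 * source MAC and optional VLAN come from the attached netdevice, the
 * RoCE version from the GID type, and for RoCE v2 the L3 type and
 * address reflect whether the GID is a v4-mapped IPv6 address.
 */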
static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
				     const struct ib_gid_attr *attr,
				     void *mlx5_addr)
{
#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
	char *mlx5_addr_l3_addr	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
					       source_l3_address);
	void *mlx5_addr_mac	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
					       source_mac_47_32);

	if (!gid)
		return;

	ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);

	if (is_vlan_dev(attr->ndev)) {
		MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
		MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
	}

	switch (attr->gid_type) {
	case IB_GID_TYPE_IB:
		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
		break;
	case IB_GID_TYPE_ROCE_UDP_ENCAP:
		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
		break;
	default:
		WARN_ON(true);
	}

	if (attr->gid_type != IB_GID_TYPE_IB) {
		if (ipv6_addr_v4mapped((void *)gid))
			MLX5_SET_RA(mlx5_addr, roce_l3_type,
				    MLX5_ROCE_L3_TYPE_IPV4);
		else
			MLX5_SET_RA(mlx5_addr, roce_l3_type,
				    MLX5_ROCE_L3_TYPE_IPV6);
	}

	if ((attr->gid_type == IB_GID_TYPE_IB) ||
	    !ipv6_addr_v4mapped((void *)gid))
		memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
	else
		memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
}

static int set_roce_addr(struct ib_device *device, u8 port_num,
			 unsigned int index,
			 const union ib_gid *gid,
			 const struct ib_gid_attr *attr)
{
	struct mlx5_ib_dev *dev	= to_mdev(device);
	u32  in[MLX5_ST_SZ_DW(set_roce_address_in)];
	u32 out[MLX5_ST_SZ_DW(set_roce_address_out)];
	void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
	enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);

	if (ll != IB_LINK_LAYER_ETHERNET)
		return -EINVAL;

	memset(in, 0, sizeof(in));

	ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);

	MLX5_SET(set_roce_address_in, in, roce_address_index, index);
	MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);

	memset(out, 0, sizeof(out));
	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
			   unsigned int index, const union ib_gid *gid,
			   const struct ib_gid_attr *attr,
			   __always_unused void **context)
{
	return set_roce_addr(device, port_num, index, gid, attr);
}

static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
			   unsigned int index, __always_unused void **context)
{
	return set_roce_addr(device, port_num, index, NULL, NULL);
}

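/*
 * Return the UDP source port to use for a RoCE v2 flow, or 0 if the
 * GID at @index is absent or is not a RoCE v2 (UDP-encapsulated) GID.
 * The current implementation always returns the minimum source UDP
 * port the device supports.
 */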
__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
			       int index)
{
	struct ib_gid_attr attr;
	union ib_gid gid;

	if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
		return 0;

	if (!attr.ndev)
		return 0;

	dev_put(attr.ndev);

	if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
		return 0;

	return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
}

static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
	return !MLX5_CAP_GEN(dev->mdev, ib_virt);
}

enum {
	MLX5_VPORT_ACCESS_METHOD_MAD,
	MLX5_VPORT_ACCESS_METHOD_HCA,
	MLX5_VPORT_ACCESS_METHOD_NIC,
};

static int mlx5_get_vport_access_method(struct ib_device *ibdev)
{
	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
		return MLX5_VPORT_ACCESS_METHOD_MAD;

	if (mlx5_ib_port_link_layer(ibdev, 1) ==
	    IB_LINK_LAYER_ETHERNET)
		return MLX5_VPORT_ACCESS_METHOD_NIC;

	return MLX5_VPORT_ACCESS_METHOD_HCA;
}

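/*
 * Advertise IB_ATOMIC_HCA only when firmware supports both 8 byte
 * compare-and-swap and fetch-and-add and can respond in host
 * endianness; otherwise report no atomic support.
 */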
static void get_atomic_caps(struct mlx5_ib_dev *dev,
			    struct ib_device_attr *props)
{
	u8 tmp;
	u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
	u8 atomic_req_8B_endianness_mode =
		MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);

	/* Check if HW supports 8 byte standard atomic operations and is
	 * capable of responding in host endianness
	 */
	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
	if (((atomic_operations & tmp) == tmp) &&
	    (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
	    (atomic_req_8B_endianness_mode)) {
		props->atomic_cap = IB_ATOMIC_HCA;
	} else {
		props->atomic_cap = IB_ATOMIC_NONE;
	}
}

static int mlx5_query_system_image_guid(struct ib_device *ibdev,
					__be64 *sys_image_guid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_system_image_guid(ibdev,
							    sys_image_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
		break;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
		break;

	default:
		return -EINVAL;
	}

	if (!err)
		*sys_image_guid = cpu_to_be64(tmp);

	return err;
}

static int mlx5_query_max_pkeys(struct ib_device *ibdev,
				u16 *max_pkeys)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
						pkey_table_size));
		return 0;

	default:
		return -EINVAL;
	}
}

static int mlx5_query_vendor_id(struct ib_device *ibdev,
				u32 *vendor_id)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);

	default:
		return -EINVAL;
	}
}

static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
				__be64 *node_guid)
{
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_node_guid(dev, node_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
		break;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
		break;

	default:
		return -EINVAL;
	}

	if (!err)
		*node_guid = cpu_to_be64(tmp);

	return err;
}

struct mlx5_reg_node_desc {
	u8	desc[64];
};

static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
{
	struct mlx5_reg_node_desc in;

	if (mlx5_use_mad_ifc(dev))
		return mlx5_query_mad_ifc_node_desc(dev, node_desc);

	memset(&in, 0, sizeof(in));

	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
				    sizeof(struct mlx5_reg_node_desc),
				    MLX5_REG_NODE_DESC, 0, 0);
}

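/*
 * Fill struct ib_device_attr from the device capability pages. Most
 * limits are stored by firmware as log2 values, hence the 1 << ...
 * conversions below.
 */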
static int mlx5_ib_query_device(struct ib_device *ibdev,
				struct ib_device_attr *props,
				struct ib_udata *uhw)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	int err = -ENOMEM;
	int max_rq_sg;
	int max_sq_sg;
	u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);

	if (uhw->inlen || uhw->outlen)
		return -EINVAL;

	memset(props, 0, sizeof(*props));
	err = mlx5_query_system_image_guid(ibdev,
					   &props->sys_image_guid);
	if (err)
		return err;

	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
	if (err)
		return err;

	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
	if (err)
		return err;

	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
		(fw_rev_min(dev->mdev) << 16) |
		fw_rev_sub(dev->mdev);
	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
		IB_DEVICE_PORT_ACTIVE_EVENT		|
		IB_DEVICE_SYS_IMAGE_GUID		|
		IB_DEVICE_RC_RNR_NAK_GEN;

	if (MLX5_CAP_GEN(mdev, pkv))
		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
	if (MLX5_CAP_GEN(mdev, qkv))
		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
	if (MLX5_CAP_GEN(mdev, apm))
		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
	if (MLX5_CAP_GEN(mdev, xrc))
		props->device_cap_flags |= IB_DEVICE_XRC;
	if (MLX5_CAP_GEN(mdev, imaicl)) {
		props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
					   IB_DEVICE_MEM_WINDOW_TYPE_2B;
		props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
		/* We support 'Gappy' memory registration too */
		props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
	}
	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
	if (MLX5_CAP_GEN(mdev, sho)) {
		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
		/* At this stage no support for signature handover */
		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
				      IB_PROT_T10DIF_TYPE_2 |
				      IB_PROT_T10DIF_TYPE_3;
		props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
				       IB_GUARD_T10DIF_CSUM;
	}
	if (MLX5_CAP_GEN(mdev, block_lb_mc))
		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;

	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
	    (MLX5_CAP_ETH(dev->mdev, csum_cap)))
		props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;

	if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
		props->device_cap_flags |= IB_DEVICE_UD_TSO;
	}

	props->vendor_part_id	   = mdev->pdev->device;
	props->hw_ver		   = mdev->pdev->revision;

	props->max_mr_size	   = ~0ull;
	props->page_size_cap	   = ~(min_page_size - 1);
	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
	max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
		     sizeof(struct mlx5_wqe_data_seg);
	max_sq_sg = (MLX5_CAP_GEN(mdev, max_wqe_sz_sq) -
		     sizeof(struct mlx5_wqe_ctrl_seg)) /
		     sizeof(struct mlx5_wqe_data_seg);
	props->max_sge = min(max_rq_sg, max_sq_sg);
	props->max_sge_rd = props->max_sge;
	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
	props->max_srq_sge	   = max_rq_sg - 1;
	props->max_fast_reg_page_list_len =
		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
	get_atomic_caps(dev, props);
	props->masked_atomic_cap   = IB_ATOMIC_NONE;
	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
					   props->max_mcast_grp;
	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
	props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
	props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	if (MLX5_CAP_GEN(mdev, pg))
		props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
	props->odp_caps = dev->odp_caps;
#endif

	if (MLX5_CAP_GEN(mdev, cd))
		props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;

	if (!mlx5_core_is_pf(mdev))
		props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;

	return 0;
}

enum mlx5_ib_width {
	MLX5_IB_WIDTH_1X	= 1 << 0,
	MLX5_IB_WIDTH_2X	= 1 << 1,
	MLX5_IB_WIDTH_4X	= 1 << 2,
	MLX5_IB_WIDTH_8X	= 1 << 3,
	MLX5_IB_WIDTH_12X	= 1 << 4
};

static int translate_active_width(struct ib_device *ibdev, u8 active_width,
				  u8 *ib_width)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	int err = 0;

	if (active_width & MLX5_IB_WIDTH_1X) {
		*ib_width = IB_WIDTH_1X;
	} else if (active_width & MLX5_IB_WIDTH_2X) {
		mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n",
			    (int)active_width);
		err = -EINVAL;
	} else if (active_width & MLX5_IB_WIDTH_4X) {
		*ib_width = IB_WIDTH_4X;
	} else if (active_width & MLX5_IB_WIDTH_8X) {
		*ib_width = IB_WIDTH_8X;
	} else if (active_width & MLX5_IB_WIDTH_12X) {
		*ib_width = IB_WIDTH_12X;
	} else {
		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
			    (int)active_width);
		err = -EINVAL;
	}

	return err;
}

static int mlx5_mtu_to_ib_mtu(int mtu)
{
	switch (mtu) {
	case 256: return 1;
	case 512: return 2;
	case 1024: return 3;
	case 2048: return 4;
	case 4096: return 5;
	default:
		pr_warn("invalid mtu\n");
		return -1;
	}
}

enum ib_max_vl_num {
	__IB_MAX_VL_0		= 1,
	__IB_MAX_VL_0_1		= 2,
	__IB_MAX_VL_0_3		= 3,
	__IB_MAX_VL_0_7		= 4,
	__IB_MAX_VL_0_14	= 5,
};

enum mlx5_vl_hw_cap {
	MLX5_VL_HW_0	= 1,
	MLX5_VL_HW_0_1	= 2,
	MLX5_VL_HW_0_2	= 3,
	MLX5_VL_HW_0_3	= 4,
	MLX5_VL_HW_0_4	= 5,
	MLX5_VL_HW_0_5	= 6,
	MLX5_VL_HW_0_6	= 7,
	MLX5_VL_HW_0_7	= 8,
	MLX5_VL_HW_0_14	= 15
};

static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
				u8 *max_vl_num)
{
	switch (vl_hw_cap) {
	case MLX5_VL_HW_0:
		*max_vl_num = __IB_MAX_VL_0;
		break;
	case MLX5_VL_HW_0_1:
		*max_vl_num = __IB_MAX_VL_0_1;
		break;
	case MLX5_VL_HW_0_3:
		*max_vl_num = __IB_MAX_VL_0_3;
		break;
	case MLX5_VL_HW_0_7:
		*max_vl_num = __IB_MAX_VL_0_7;
		break;
	case MLX5_VL_HW_0_14:
		*max_vl_num = __IB_MAX_VL_0_14;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
			       struct ib_port_attr *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_hca_vport_context *rep;
	int max_mtu;
	int oper_mtu;
	int err;
	u8 ib_link_width_oper;
	u8 vl_hw_cap;

	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (!rep) {
		err = -ENOMEM;
		goto out;
	}

	memset(props, 0, sizeof(*props));

	err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
	if (err)
		goto out;

	props->lid		= rep->lid;
	props->lmc		= rep->lmc;
	props->sm_lid		= rep->sm_lid;
	props->sm_sl		= rep->sm_sl;
	props->state		= rep->vport_state;
	props->phys_state	= rep->port_physical_state;
	props->port_cap_flags	= rep->cap_mask1;
	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
	props->bad_pkey_cntr	= rep->pkey_violation_counter;
	props->qkey_viol_cntr	= rep->qkey_violation_counter;
	props->subnet_timeout	= rep->subnet_timeout;
	props->init_type_reply	= rep->init_type_reply;
	props->grh_required	= rep->grh_required;

	err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
	if (err)
		goto out;

	err = translate_active_width(ibdev, ib_link_width_oper,
				     &props->active_width);
	if (err)
		goto out;
	err = mlx5_query_port_proto_oper(mdev, &props->active_speed, MLX5_PTYS_IB,
					 port);
	if (err)
		goto out;

	mlx5_query_port_max_mtu(mdev, &max_mtu, port);

	props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);

	mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);

	props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);

	err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
	if (err)
		goto out;

	err = translate_max_vl_num(ibdev, vl_hw_cap,
				   &props->max_vl_num);
out:
	kfree(rep);
	return err;
}

int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
		       struct ib_port_attr *props)
{
	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_port(ibdev, port, props);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		return mlx5_query_hca_port(ibdev, port, props);

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_query_port_roce(ibdev, port, props);

	default:
		return -EINVAL;
	}
}

static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
			     union ib_gid *gid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);

	default:
		return -EINVAL;
	}
}

static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
			      u16 *pkey)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index,
						 pkey);
	default:
		return -EINVAL;
	}
}

static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
				 struct ib_device_modify *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_reg_node_desc in;
	struct mlx5_reg_node_desc out;
	int err;

	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
		return -EOPNOTSUPP;

	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
		return 0;

	/*
	 * If possible, pass the node descriptor to FW so it can generate
	 * a trap 144 (local change notification).
	 */
	memcpy(&in, props->node_desc, 64);
	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
	if (err)
		return err;

	memcpy(ibdev->node_desc, props->node_desc, 64);

	return err;
}

static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
			       struct ib_port_modify *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct ib_port_attr attr;
	u32 tmp;
	int err;

	mutex_lock(&dev->cap_mask_mutex);

	err = mlx5_ib_query_port(ibdev, port, &attr);
	if (err)
		goto out;

	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
		~props->clr_port_cap_mask;

	err = mlx5_set_port_caps(dev->mdev, port, tmp);

out:
	mutex_unlock(&dev->cap_mask_mutex);
	return err;
}

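/*
 * Allocate a user context: parse the (v0 or v2) request, carve out the
 * requested number of uuars from freshly allocated UAR pages, set up an
 * optional transport domain and report the device limits back to
 * userspace in the response.
 */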
static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
						  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
	struct mlx5_ib_alloc_ucontext_resp resp = {};
	struct mlx5_ib_ucontext *context;
	struct mlx5_uuar_info *uuari;
	struct mlx5_uar *uars;
	int gross_uuars;
	int num_uars;
	int ver;
	int uuarn;
	int err;
	int i;
	size_t reqlen;
	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
				     max_cqe_version);

	if (!dev->ib_active)
		return ERR_PTR(-EAGAIN);

	if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
		return ERR_PTR(-EINVAL);

	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
		ver = 0;
	else if (reqlen >= min_req_v2)
		ver = 2;
	else
		return ERR_PTR(-EINVAL);

	err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
	if (err)
		return ERR_PTR(err);

	if (req.flags)
		return ERR_PTR(-EINVAL);

	if (req.total_num_uuars > MLX5_MAX_UUARS)
		return ERR_PTR(-ENOMEM);

	if (req.total_num_uuars == 0)
		return ERR_PTR(-EINVAL);

	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
		return ERR_PTR(-EOPNOTSUPP);

	if (reqlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 reqlen - sizeof(req)))
		return ERR_PTR(-EOPNOTSUPP);

	req.total_num_uuars = ALIGN(req.total_num_uuars,
				    MLX5_NON_FP_BF_REGS_PER_PAGE);
	if (req.num_low_latency_uuars > req.total_num_uuars - 1)
		return ERR_PTR(-EINVAL);

	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
	resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
	resp.cache_line_size = L1_CACHE_BYTES;
	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
	resp.cqe_version = min_t(__u8,
				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
				 req.max_cqe_version);
	resp.response_length = min(offsetof(typeof(resp), response_length) +
				   sizeof(resp.response_length), udata->outlen);

	context = kzalloc(sizeof(*context), GFP_KERNEL);
	if (!context)
		return ERR_PTR(-ENOMEM);

	uuari = &context->uuari;
	mutex_init(&uuari->lock);
	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
	if (!uars) {
		err = -ENOMEM;
		goto out_ctx;
	}

	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
				sizeof(*uuari->bitmap),
				GFP_KERNEL);
	if (!uuari->bitmap) {
		err = -ENOMEM;
		goto out_uar_ctx;
	}
	/*
	 * Reserve the fast path uuars: in every group of four BF
	 * registers, slots 2 and 3 are marked in the bitmap up front so
	 * they are never handed out as regular uuars.
	 */
	for (i = 0; i < gross_uuars; i++) {
		uuarn = i & 3;
		if (uuarn == 2 || uuarn == 3)
			set_bit(i, uuari->bitmap);
	}

	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
	if (!uuari->count) {
		err = -ENOMEM;
		goto out_bitmap;
	}

	for (i = 0; i < num_uars; i++) {
		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
		if (err)
			goto out_count;
	}

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
#endif

	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
		err = mlx5_core_alloc_transport_domain(dev->mdev,
						       &context->tdn);
		if (err)
			goto out_uars;
	}

	INIT_LIST_HEAD(&context->db_page_list);
	mutex_init(&context->db_page_mutex);

	resp.tot_uuars = req.total_num_uuars;
	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);

	if (field_avail(typeof(resp), cqe_version, udata->outlen))
		resp.response_length += sizeof(resp.cqe_version);

	if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
		resp.comp_mask |=
			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
		resp.hca_core_clock_offset =
			offsetof(struct mlx5_init_seg, internal_timer_h) %
			PAGE_SIZE;
		resp.response_length += sizeof(resp.hca_core_clock_offset) +
					sizeof(resp.reserved2) +
					sizeof(resp.reserved3);
	}

	err = ib_copy_to_udata(udata, &resp, resp.response_length);
	if (err)
		goto out_td;

	uuari->ver = ver;
	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
	uuari->uars = uars;
	uuari->num_uars = num_uars;
	context->cqe_version = resp.cqe_version;

	return &context->ibucontext;

out_td:
	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);

out_uars:
	for (i--; i >= 0; i--)
		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
out_count:
	kfree(uuari->count);

out_bitmap:
	kfree(uuari->bitmap);

out_uar_ctx:
	kfree(uars);

out_ctx:
	kfree(context);
	return ERR_PTR(err);
}

static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
	struct mlx5_uuar_info *uuari = &context->uuari;
	int i;

	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);

	for (i = 0; i < uuari->num_uars; i++) {
		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
	}

	kfree(uuari->count);
	kfree(uuari->bitmap);
	kfree(uuari->uars);
	kfree(context);

	return 0;
}

static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
{
	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
}

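/*
 * The mmap offset encodes a command in its high bits and an argument
 * (e.g. a UAR index) in its low MLX5_IB_MMAP_CMD_SHIFT bits.
 */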
static int get_command(unsigned long offset)
{
	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
}

static int get_arg(unsigned long offset)
{
	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
}

static int get_index(unsigned long offset)
{
	return get_arg(offset);
}

static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
{
	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
	struct mlx5_uuar_info *uuari = &context->uuari;
	unsigned long command;
	unsigned long idx;
	phys_addr_t pfn;

	command = get_command(vma->vm_pgoff);
	switch (command) {
	case MLX5_IB_MMAP_REGULAR_PAGE:
		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EINVAL;

		idx = get_index(vma->vm_pgoff);
		if (idx >= uuari->num_uars)
			return -EINVAL;

		pfn = uar_index2pfn(dev, uuari->uars[idx].index);
		mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
			    (unsigned long long)pfn);

		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
				       PAGE_SIZE, vma->vm_page_prot))
			return -EAGAIN;

		mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n",
			    vma->vm_start,
			    (unsigned long long)pfn << PAGE_SHIFT);
		break;

	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
		return -ENOSYS;

	case MLX5_IB_MMAP_CORE_CLOCK:
		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EINVAL;

		if (vma->vm_flags & (VM_WRITE | VM_EXEC))
			return -EPERM;

		/* Don't expose to user-space information it shouldn't have */
		if (PAGE_SIZE > 4096)
			return -EOPNOTSUPP;

		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
		pfn = (dev->mdev->iseg_base +
		       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
			PAGE_SHIFT;
		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
				       PAGE_SIZE, vma->vm_page_prot))
			return -EAGAIN;

		mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
			    vma->vm_start,
			    (unsigned long long)pfn << PAGE_SHIFT);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
				      struct ib_ucontext *context,
				      struct ib_udata *udata)
{
	struct mlx5_ib_alloc_pd_resp resp;
	struct mlx5_ib_pd *pd;
	int err;

	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return ERR_PTR(-ENOMEM);

	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
	if (err) {
		kfree(pd);
		return ERR_PTR(err);
	}

	if (context) {
		resp.pdn = pd->pdn;
		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
			kfree(pd);
			return ERR_PTR(-EFAULT);
		}
	}

	return &pd->ibpd;
}

static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
{
	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
	struct mlx5_ib_pd *mpd = to_mpd(pd);

	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
	kfree(mpd);

	return 0;
}

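/*
 * Return true if every byte of the outer-headers match criteria is
 * zero, i.e. the flow specs did not constrain the outer headers at
 * all. The first-byte test plus the memcmp of the buffer against
 * itself shifted by one byte verifies that all bytes are equal to 0.
 */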
static bool outer_header_zero(u32 *match_criteria)
{
	int size = MLX5_ST_SZ_BYTES(fte_match_param);
	char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria,
					     outer_headers);

	return outer_headers_c[0] == 0 && !memcmp(outer_headers_c,
						  outer_headers_c + 1,
						  size - 1);
}

static int parse_flow_attr(u32 *match_c, u32 *match_v,
			   union ib_flow_spec *ib_spec)
{
	void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
					     outer_headers);
	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
					     outer_headers);
	switch (ib_spec->type) {
	case IB_FLOW_SPEC_ETH:
		if (ib_spec->size != sizeof(ib_spec->eth))
			return -EINVAL;

		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
					     dmac_47_16),
				ib_spec->eth.mask.dst_mac);
		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
					     dmac_47_16),
				ib_spec->eth.val.dst_mac);

		if (ib_spec->eth.mask.vlan_tag) {
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 vlan_tag, 1);
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 vlan_tag, 1);

			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));

			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 first_cfi,
				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 first_cfi,
				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);

			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
				 first_prio,
				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
				 first_prio,
				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
		}
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
			 ethertype, ntohs(ib_spec->eth.val.ether_type));
		break;
	case IB_FLOW_SPEC_IPV4:
		if (ib_spec->size != sizeof(ib_spec->ipv4))
			return -EINVAL;

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
			 ethertype, 0xffff);
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
			 ethertype, ETH_P_IP);

		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.mask.src_ip,
		       sizeof(ib_spec->ipv4.mask.src_ip));
		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.val.src_ip,
		       sizeof(ib_spec->ipv4.val.src_ip));
		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.mask.dst_ip,
		       sizeof(ib_spec->ipv4.mask.dst_ip));
		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
		       &ib_spec->ipv4.val.dst_ip,
		       sizeof(ib_spec->ipv4.val.dst_ip));
		break;
	case IB_FLOW_SPEC_TCP:
		if (ib_spec->size != sizeof(ib_spec->tcp_udp))
			return -EINVAL;

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
			 0xff);
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
			 IPPROTO_TCP);

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport,
			 ntohs(ib_spec->tcp_udp.mask.src_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport,
			 ntohs(ib_spec->tcp_udp.val.src_port));

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport,
			 ntohs(ib_spec->tcp_udp.mask.dst_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport,
			 ntohs(ib_spec->tcp_udp.val.dst_port));
		break;
	case IB_FLOW_SPEC_UDP:
		if (ib_spec->size != sizeof(ib_spec->tcp_udp))
			return -EINVAL;

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
			 0xff);
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
			 IPPROTO_UDP);

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport,
			 ntohs(ib_spec->tcp_udp.mask.src_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport,
			 ntohs(ib_spec->tcp_udp.val.src_port));

		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport,
			 ntohs(ib_spec->tcp_udp.mask.dst_port));
		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport,
			 ntohs(ib_spec->tcp_udp.val.dst_port));
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/* If a flow could catch both multicast and unicast packets,
 * it won't fall into the multicast flow steering table and this rule
 * could steal other multicast packets.
 */
static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
{
	struct ib_flow_spec_eth *eth_spec;

	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
	    ib_attr->size < sizeof(struct ib_flow_attr) +
	    sizeof(struct ib_flow_spec_eth) ||
	    ib_attr->num_of_specs < 1)
		return false;

	eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
	if (eth_spec->type != IB_FLOW_SPEC_ETH ||
	    eth_spec->size != sizeof(*eth_spec))
		return false;

	return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
	       is_multicast_ether_addr(eth_spec->val.dst_mac);
}

static bool is_valid_attr(struct ib_flow_attr *flow_attr)
{
	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
	bool has_ipv4_spec = false;
	bool eth_type_ipv4 = true;
	unsigned int spec_index;

	/* Validate that ethertype is correct */
	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
		if (ib_spec->type == IB_FLOW_SPEC_ETH &&
		    ib_spec->eth.mask.ether_type) {
			if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
			      ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
				eth_type_ipv4 = false;
		} else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
			has_ipv4_spec = true;
		}
		ib_spec = (void *)ib_spec + ib_spec->size;
	}
	return !has_ipv4_spec || eth_type_ipv4;
}

static void put_flow_table(struct mlx5_ib_dev *dev,
			   struct mlx5_ib_flow_prio *prio, bool ft_added)
{
	prio->refcount -= !!ft_added;
	if (!prio->refcount) {
		mlx5_destroy_flow_table(prio->flow_table);
		prio->flow_table = NULL;
	}
}

static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
{
	struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
							  struct mlx5_ib_flow_handler,
							  ibflow);
	struct mlx5_ib_flow_handler *iter, *tmp;

	mutex_lock(&dev->flow_db.lock);

	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
		mlx5_del_flow_rule(iter->rule);
		list_del(&iter->list);
		kfree(iter);
	}

	mlx5_del_flow_rule(handler->rule);
	put_flow_table(dev, &dev->flow_db.prios[handler->prio], true);
	mutex_unlock(&dev->flow_db.lock);

	kfree(handler);

	return 0;
}

static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
{
	priority *= 2;
	if (!dont_trap)
		priority++;
	return priority;
}

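/*
 * Pick (and lazily create) the flow table backing an ib_flow_attr:
 * NORMAL rules land in the bypass namespace at a priority derived from
 * the attribute, while ALL_DEFAULT/MC_DEFAULT rules share the
 * leftovers namespace.
 */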
#define MLX5_FS_MAX_TYPES	 10
#define MLX5_FS_MAX_ENTRIES	 32000UL
static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
						struct ib_flow_attr *flow_attr)
{
	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
	struct mlx5_flow_namespace *ns = NULL;
	struct mlx5_ib_flow_prio *prio;
	struct mlx5_flow_table *ft;
	int num_entries;
	int num_groups;
	int priority;
	int err = 0;

	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
		if (flow_is_multicast_only(flow_attr) &&
		    !dont_trap)
			priority = MLX5_IB_FLOW_MCAST_PRIO;
		else
			priority = ib_prio_to_core_prio(flow_attr->priority,
							dont_trap);
		ns = mlx5_get_flow_namespace(dev->mdev,
					     MLX5_FLOW_NAMESPACE_BYPASS);
		num_entries = MLX5_FS_MAX_ENTRIES;
		num_groups = MLX5_FS_MAX_TYPES;
		prio = &dev->flow_db.prios[priority];
	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
		ns = mlx5_get_flow_namespace(dev->mdev,
					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
		build_leftovers_ft_param(&priority,
					 &num_entries,
					 &num_groups);
		prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
	}

	if (!ns)
		return ERR_PTR(-ENOTSUPP);

	ft = prio->flow_table;
	if (!ft) {
		ft = mlx5_create_auto_grouped_flow_table(ns, priority,
							 num_entries,
							 num_groups);

		if (!IS_ERR(ft)) {
			prio->refcount = 0;
			prio->flow_table = ft;
		} else {
			err = PTR_ERR(ft);
		}
	}

	return err ? ERR_PTR(err) : prio;
}

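/*
 * Translate the user flow specs into mlx5 match criteria/values and
 * install a rule in @ft_prio's table. With a NULL @dst the rule only
 * forwards to the next priority, which is how don't-trap rules are
 * built.
 */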
static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
						     struct mlx5_ib_flow_prio *ft_prio,
						     struct ib_flow_attr *flow_attr,
						     struct mlx5_flow_destination *dst)
{
	struct mlx5_flow_table	*ft = ft_prio->flow_table;
	struct mlx5_ib_flow_handler *handler;
	void *ib_flow = flow_attr + 1;
	u8 match_criteria_enable = 0;
	unsigned int spec_index;
	u32 *match_c;
	u32 *match_v;
	u32 action;
	int err = 0;

	if (!is_valid_attr(flow_attr))
		return ERR_PTR(-EINVAL);

	match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
	match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
	if (!handler || !match_c || !match_v) {
		err = -ENOMEM;
		goto free;
	}

	INIT_LIST_HEAD(&handler->list);

	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
		err = parse_flow_attr(match_c, match_v, ib_flow);
		if (err < 0)
			goto free;

		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
	}

	/* Outer header support only */
	match_criteria_enable = (!outer_header_zero(match_c)) << 0;
	action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
		MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
	handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable,
					   match_c, match_v,
					   action,
					   MLX5_FS_DEFAULT_FLOW_TAG,
					   dst);

	if (IS_ERR(handler->rule)) {
		err = PTR_ERR(handler->rule);
		goto free;
	}

	handler->prio = ft_prio - dev->flow_db.prios;

	ft_prio->flow_table = ft;
free:
	if (err)
		kfree(handler);
	kfree(match_c);
	kfree(match_v);
	return err ? ERR_PTR(err) : handler;
}

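/*
 * A don't-trap rule is installed as a pair: one rule that forwards to
 * the next priority (so other consumers still see the traffic) and a
 * linked rule that forwards to the QP's destination.
 */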
static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
							  struct mlx5_ib_flow_prio *ft_prio,
							  struct ib_flow_attr *flow_attr,
							  struct mlx5_flow_destination *dst)
{
	struct mlx5_ib_flow_handler *handler_dst = NULL;
	struct mlx5_ib_flow_handler *handler = NULL;

	handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
	if (!IS_ERR(handler)) {
		handler_dst = create_flow_rule(dev, ft_prio,
					       flow_attr, dst);
		if (IS_ERR(handler_dst)) {
			mlx5_del_flow_rule(handler->rule);
			kfree(handler);
			handler = handler_dst;
		} else {
			list_add(&handler_dst->list, &handler->list);
		}
	}

	return handler;
}

enum {
	LEFTOVERS_MC,
	LEFTOVERS_UC,
};

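/*
 * Leftovers rules catch traffic no other rule matched: a multicast
 * catch-all is always installed, and for ALL_DEFAULT a unicast
 * catch-all is chained to it.
 */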
static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
							  struct mlx5_ib_flow_prio *ft_prio,
							  struct ib_flow_attr *flow_attr,
							  struct mlx5_flow_destination *dst)
{
	struct mlx5_ib_flow_handler *handler_ucast = NULL;
	struct mlx5_ib_flow_handler *handler = NULL;

	static struct {
		struct ib_flow_attr	flow_attr;
		struct ib_flow_spec_eth eth_flow;
	} leftovers_specs[] = {
		[LEFTOVERS_MC] = {
			.flow_attr = {
				.num_of_specs = 1,
				.size = sizeof(leftovers_specs[0])
			},
			.eth_flow = {
				.type = IB_FLOW_SPEC_ETH,
				.size = sizeof(struct ib_flow_spec_eth),
				.mask = {.dst_mac = {0x1} },
				.val =  {.dst_mac = {0x1} }
			}
		},
		[LEFTOVERS_UC] = {
			.flow_attr = {
				.num_of_specs = 1,
				.size = sizeof(leftovers_specs[0])
			},
			.eth_flow = {
				.type = IB_FLOW_SPEC_ETH,
				.size = sizeof(struct ib_flow_spec_eth),
				.mask = {.dst_mac = {0x1} },
				.val = {.dst_mac = {} }
			}
		}
	};

	handler = create_flow_rule(dev, ft_prio,
				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
				   dst);
	if (!IS_ERR(handler) &&
	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
		handler_ucast = create_flow_rule(dev, ft_prio,
						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
						 dst);
		if (IS_ERR(handler_ucast)) {
			/* Don't leak the multicast rule on failure */
			mlx5_del_flow_rule(handler->rule);
			kfree(handler);
			handler = handler_ucast;
		} else {
			list_add(&handler_ucast->list, &handler->list);
		}
	}

	return handler;
}

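/*
 * Top-level ib_create_flow handler: validate the attribute, find or
 * create the right flow table under flow_db.lock, and install the rule
 * (normal, don't-trap pair or leftovers) pointing at the QP's TIR.
 */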
static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
					   struct ib_flow_attr *flow_attr,
					   int domain)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->device);
	struct mlx5_ib_flow_handler *handler = NULL;
	struct mlx5_flow_destination *dst = NULL;
	struct mlx5_ib_flow_prio *ft_prio;
	int err;

	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
		return ERR_PTR(-ENOSPC);

	if (domain != IB_FLOW_DOMAIN_USER ||
	    flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
	    (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
		return ERR_PTR(-EINVAL);

	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
	if (!dst)
		return ERR_PTR(-ENOMEM);

	mutex_lock(&dev->flow_db.lock);

	ft_prio = get_flow_table(dev, flow_attr);
	if (IS_ERR(ft_prio)) {
		err = PTR_ERR(ft_prio);
		goto unlock;
	}

	dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
	dst->tir_num = to_mqp(qp)->raw_packet_qp.rq.tirn;

	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) {
			handler = create_dont_trap_rule(dev, ft_prio,
							flow_attr, dst);
		} else {
			handler = create_flow_rule(dev, ft_prio, flow_attr,
						   dst);
		}
	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
		handler = create_leftovers_rule(dev, ft_prio, flow_attr,
						dst);
	} else {
		err = -EINVAL;
		goto destroy_ft;
	}

	if (IS_ERR(handler)) {
		err = PTR_ERR(handler);
		handler = NULL;
		goto destroy_ft;
	}

	ft_prio->refcount++;
	mutex_unlock(&dev->flow_db.lock);
	kfree(dst);

	return &handler->ibflow;

destroy_ft:
	put_flow_table(dev, ft_prio, false);
unlock:
	mutex_unlock(&dev->flow_db.lock);
	kfree(dst);
	kfree(handler);
	return ERR_PTR(err);
}

static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
	int err;

	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
	if (err)
		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
			     ibqp->qp_num, gid->raw);

	return err;
}

static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
	int err;

	err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
	if (err)
		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
			     ibqp->qp_num, gid->raw);

	return err;
}

static int init_node_data(struct mlx5_ib_dev *dev)
{
	int err;

	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
	if (err)
		return err;

	dev->mdev->rev_id = dev->mdev->pdev->revision;

	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
}

static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
			     char *buf)
{
	struct mlx5_ib_dev *dev =
		container_of(device, struct mlx5_ib_dev, ib_dev.dev);

	return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
}

static ssize_t show_reg_pages(struct device *device,
			      struct device_attribute *attr, char *buf)
{
	struct mlx5_ib_dev *dev =
		container_of(device, struct mlx5_ib_dev, ib_dev.dev);

	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
}

static ssize_t show_hca(struct device *device, struct device_attribute *attr,
			char *buf)
{
	struct mlx5_ib_dev *dev =
		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
}

static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
			   char *buf)
{
	struct mlx5_ib_dev *dev =
		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
	return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev),
		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
}

static ssize_t show_rev(struct device *device, struct device_attribute *attr,
			char *buf)
{
	struct mlx5_ib_dev *dev =
		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
	return sprintf(buf, "%x\n", dev->mdev->rev_id);
}

static ssize_t show_board(struct device *device, struct device_attribute *attr,
			  char *buf)
{
	struct mlx5_ib_dev *dev =
		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
		       dev->mdev->board_id);
}

static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);

static struct device_attribute *mlx5_class_attributes[] = {
	&dev_attr_hw_rev,
	&dev_attr_fw_ver,
	&dev_attr_hca_type,
	&dev_attr_board_id,
	&dev_attr_fw_pages,
	&dev_attr_reg_pages,
};

static void pkey_change_handler(struct work_struct *work)
{
	struct mlx5_ib_port_resources *ports =
		container_of(work, struct mlx5_ib_port_resources,
			     pkey_change_work);

	mutex_lock(&ports->devr->mutex);
	mlx5_ib_gsi_pkey_change(ports->gsi);
	mutex_unlock(&ports->devr->mutex);
}

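/*
 * Translate mlx5_core events into IB events and dispatch them to
 * registered clients. Events are only delivered while the IB device is
 * marked active; PKEY changes additionally kick the per-port GSI
 * pkey_change_work.
 */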
1790 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
1791 			  enum mlx5_dev_event event, unsigned long param)
1792 {
1793 	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
1794 	struct ib_event ibev;
1795 	bool fatal = false;
1796 	u8 port = 0;
1797 
1798 	switch (event) {
1799 	case MLX5_DEV_EVENT_SYS_ERROR:
1800 		ibev.event = IB_EVENT_DEVICE_FATAL;
1801 		fatal = true;
1802 		break;
1803 
1804 	case MLX5_DEV_EVENT_PORT_UP:
1805 		ibev.event = IB_EVENT_PORT_ACTIVE;
1806 		port = (u8)param;
1807 		break;
1808 
1809 	case MLX5_DEV_EVENT_PORT_DOWN:
1810 		ibev.event = IB_EVENT_PORT_ERR;
1811 		port = (u8)param;
1812 		break;
1813 
1814 	case MLX5_DEV_EVENT_PORT_INITIALIZED:
1815 		/* not used by ULPs */
1816 		return;
1817 
1818 	case MLX5_DEV_EVENT_LID_CHANGE:
1819 		ibev.event = IB_EVENT_LID_CHANGE;
1820 		port = (u8)param;
1821 		break;
1822 
1823 	case MLX5_DEV_EVENT_PKEY_CHANGE:
1824 		ibev.event = IB_EVENT_PKEY_CHANGE;
1825 		port = (u8)param;
1826 
1827 		schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
1828 		break;
1829 
1830 	case MLX5_DEV_EVENT_GUID_CHANGE:
1831 		ibev.event = IB_EVENT_GID_CHANGE;
1832 		port = (u8)param;
1833 		break;
1834 
1835 	case MLX5_DEV_EVENT_CLIENT_REREG:
1836 		ibev.event = IB_EVENT_CLIENT_REREGISTER;
1837 		port = (u8)param;
1838 		break;
	default:
		/* unknown event: ibev.event would be left uninitialized */
		return;
1839 	}
1840 
1841 	ibev.device	      = &ibdev->ib_dev;
1842 	ibev.element.port_num = port;
1843 
1844 	if (!fatal && (port < 1 || port > ibdev->num_ports)) {
1845 		mlx5_ib_warn(ibdev, "event on invalid port %d\n", port);
1846 		return;
1847 	}
1848 
1849 	if (ibdev->ib_active)
1850 		ib_dispatch_event(&ibev);

	/* clear ib_active only after the fatal event has been dispatched */
	if (fatal)
		ibdev->ib_active = false;
1851 }
1852 
1853 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
1854 {
1855 	int port;
1856 
1857 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
1858 		mlx5_query_ext_port_caps(dev, port);
1859 }
1860 
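/*
 * Query device and per-port attributes once at probe time and cache the
 * P_Key and GID table sizes in the core device's port_caps array for use
 * by later MAD and address-resolution code.
 */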
1861 static int get_port_caps(struct mlx5_ib_dev *dev)
1862 {
1863 	struct ib_device_attr *dprops = NULL;
1864 	struct ib_port_attr *pprops = NULL;
1865 	int err = -ENOMEM;
1866 	int port;
1867 	struct ib_udata uhw = {.inlen = 0, .outlen = 0};
1868 
1869 	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
1870 	if (!pprops)
1871 		goto out;
1872 
1873 	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
1874 	if (!dprops)
1875 		goto out;
1876 
1877 	err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
1878 	if (err) {
1879 		mlx5_ib_warn(dev, "query_device failed %d\n", err);
1880 		goto out;
1881 	}
1882 
1883 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
1884 		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
1885 		if (err) {
1886 			mlx5_ib_warn(dev, "query_port %d failed %d\n",
1887 				     port, err);
1888 			break;
1889 		}
1890 		dev->mdev->port_caps[port - 1].pkey_table_len =
1891 						dprops->max_pkeys;
1892 		dev->mdev->port_caps[port - 1].gid_table_len =
1893 						pprops->gid_tbl_len;
1894 		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
1895 			    dprops->max_pkeys, pprops->gid_tbl_len);
1896 	}
1897 
1898 out:
1899 	kfree(pprops);
1900 	kfree(dprops);
1901 
1902 	return err;
1903 }
1904 
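/*
 * Tear down the UMR resources created by create_umr_res(). The MR cache
 * is flushed first so that no cache work still posts to the UMR QP while
 * it is being destroyed.
 */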
1905 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
1906 {
1907 	int err;
1908 
1909 	err = mlx5_mr_cache_cleanup(dev);
1910 	if (err)
1911 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
1912 
1913 	mlx5_ib_destroy_qp(dev->umrc.qp);
1914 	ib_free_cq(dev->umrc.cq);
1915 	ib_dealloc_pd(dev->umrc.pd);
1916 }
1917 
1918 enum {
1919 	MAX_UMR_WR = 128,
1920 };
1921 
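/*
 * Create the resources used for user memory registration (UMR): a PD, a
 * softirq-polled CQ and a special REG_UMR QP. The QP is kernel-owned, so
 * it is walked through its state machine by hand:
 *
 *	RESET -> INIT -> RTR -> RTS
 *
 * The semaphore bounds the number of in-flight UMR work requests to
 * MAX_UMR_WR.
 */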
1922 static int create_umr_res(struct mlx5_ib_dev *dev)
1923 {
1924 	struct ib_qp_init_attr *init_attr = NULL;
1925 	struct ib_qp_attr *attr = NULL;
1926 	struct ib_pd *pd;
1927 	struct ib_cq *cq;
1928 	struct ib_qp *qp;
1929 	int ret;
1930 
1931 	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
1932 	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
1933 	if (!attr || !init_attr) {
1934 		ret = -ENOMEM;
1935 		goto error_0;
1936 	}
1937 
1938 	pd = ib_alloc_pd(&dev->ib_dev);
1939 	if (IS_ERR(pd)) {
1940 		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
1941 		ret = PTR_ERR(pd);
1942 		goto error_0;
1943 	}
1944 
1945 	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
1946 	if (IS_ERR(cq)) {
1947 		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
1948 		ret = PTR_ERR(cq);
1949 		goto error_2;
1950 	}
1951 
1952 	init_attr->send_cq = cq;
1953 	init_attr->recv_cq = cq;
1954 	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
1955 	init_attr->cap.max_send_wr = MAX_UMR_WR;
1956 	init_attr->cap.max_send_sge = 1;
1957 	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
1958 	init_attr->port_num = 1;
1959 	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
1960 	if (IS_ERR(qp)) {
1961 		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
1962 		ret = PTR_ERR(qp);
1963 		goto error_3;
1964 	}
1965 	qp->device     = &dev->ib_dev;
1966 	qp->real_qp    = qp;
1967 	qp->uobject    = NULL;
1968 	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
1969 
1970 	attr->qp_state = IB_QPS_INIT;
1971 	attr->port_num = 1;
1972 	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
1973 				IB_QP_PORT, NULL);
1974 	if (ret) {
1975 		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
1976 		goto error_4;
1977 	}
1978 
1979 	memset(attr, 0, sizeof(*attr));
1980 	attr->qp_state = IB_QPS_RTR;
1981 	attr->path_mtu = IB_MTU_256;
1982 
1983 	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
1984 	if (ret) {
1985 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
1986 		goto error_4;
1987 	}
1988 
1989 	memset(attr, 0, sizeof(*attr));
1990 	attr->qp_state = IB_QPS_RTS;
1991 	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
1992 	if (ret) {
1993 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
1994 		goto error_4;
1995 	}
1996 
1997 	dev->umrc.qp = qp;
1998 	dev->umrc.cq = cq;
1999 	dev->umrc.pd = pd;
2000 
2001 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
2002 	ret = mlx5_mr_cache_init(dev);
2003 	if (ret) {
2004 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2005 		goto error_4;
2006 	}
2007 
2008 	kfree(attr);
2009 	kfree(init_attr);
2010 
2011 	return 0;
2012 
2013 error_4:
2014 	mlx5_ib_destroy_qp(qp);
2015 
2016 error_3:
2017 	ib_free_cq(cq);
2018 
2019 error_2:
2020 	ib_dealloc_pd(pd);
2021 
2022 error_0:
2023 	kfree(attr);
2024 	kfree(init_attr);
2025 	return ret;
2026 }
2027 
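/*
 * Allocate the device-global verbs objects (PD p0, CQ c0, XRC domains
 * x0/x1, SRQs s0/s1) used internally by the driver. Since these are
 * created without a user context, the bookkeeping that uverbs would
 * normally do (device, uobject, usecnt, ...) is filled in by hand.
 */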
2028 static int create_dev_resources(struct mlx5_ib_resources *devr)
2029 {
2030 	struct ib_srq_init_attr attr;
2031 	struct mlx5_ib_dev *dev;
2032 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
2033 	int port;
2034 	int ret = 0;
2035 
2036 	dev = container_of(devr, struct mlx5_ib_dev, devr);
2037 
2038 	mutex_init(&devr->mutex);
2039 
2040 	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
2041 	if (IS_ERR(devr->p0)) {
2042 		ret = PTR_ERR(devr->p0);
2043 		goto error0;
2044 	}
2045 	devr->p0->device  = &dev->ib_dev;
2046 	devr->p0->uobject = NULL;
2047 	atomic_set(&devr->p0->usecnt, 0);
2048 
2049 	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
2050 	if (IS_ERR(devr->c0)) {
2051 		ret = PTR_ERR(devr->c0);
2052 		goto error1;
2053 	}
2054 	devr->c0->device        = &dev->ib_dev;
2055 	devr->c0->uobject       = NULL;
2056 	devr->c0->comp_handler  = NULL;
2057 	devr->c0->event_handler = NULL;
2058 	devr->c0->cq_context    = NULL;
2059 	atomic_set(&devr->c0->usecnt, 0);
2060 
2061 	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2062 	if (IS_ERR(devr->x0)) {
2063 		ret = PTR_ERR(devr->x0);
2064 		goto error2;
2065 	}
2066 	devr->x0->device = &dev->ib_dev;
2067 	devr->x0->inode = NULL;
2068 	atomic_set(&devr->x0->usecnt, 0);
2069 	mutex_init(&devr->x0->tgt_qp_mutex);
2070 	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2071 
2072 	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2073 	if (IS_ERR(devr->x1)) {
2074 		ret = PTR_ERR(devr->x1);
2075 		goto error3;
2076 	}
2077 	devr->x1->device = &dev->ib_dev;
2078 	devr->x1->inode = NULL;
2079 	atomic_set(&devr->x1->usecnt, 0);
2080 	mutex_init(&devr->x1->tgt_qp_mutex);
2081 	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2082 
2083 	memset(&attr, 0, sizeof(attr));
2084 	attr.attr.max_sge = 1;
2085 	attr.attr.max_wr = 1;
2086 	attr.srq_type = IB_SRQT_XRC;
2087 	attr.ext.xrc.cq = devr->c0;
2088 	attr.ext.xrc.xrcd = devr->x0;
2089 
2090 	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2091 	if (IS_ERR(devr->s0)) {
2092 		ret = PTR_ERR(devr->s0);
2093 		goto error4;
2094 	}
2095 	devr->s0->device	= &dev->ib_dev;
2096 	devr->s0->pd		= devr->p0;
2097 	devr->s0->uobject       = NULL;
2098 	devr->s0->event_handler = NULL;
2099 	devr->s0->srq_context   = NULL;
2100 	devr->s0->srq_type      = IB_SRQT_XRC;
2101 	devr->s0->ext.xrc.xrcd	= devr->x0;
2102 	devr->s0->ext.xrc.cq	= devr->c0;
2103 	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2104 	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
2105 	atomic_inc(&devr->p0->usecnt);
2106 	atomic_set(&devr->s0->usecnt, 0);
2107 
2108 	memset(&attr, 0, sizeof(attr));
2109 	attr.attr.max_sge = 1;
2110 	attr.attr.max_wr = 1;
2111 	attr.srq_type = IB_SRQT_BASIC;
2112 	devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2113 	if (IS_ERR(devr->s1)) {
2114 		ret = PTR_ERR(devr->s1);
2115 		goto error5;
2116 	}
2117 	devr->s1->device	= &dev->ib_dev;
2118 	devr->s1->pd		= devr->p0;
2119 	devr->s1->uobject       = NULL;
2120 	devr->s1->event_handler = NULL;
2121 	devr->s1->srq_context   = NULL;
2122 	devr->s1->srq_type      = IB_SRQT_BASIC;
2123 	devr->s1->ext.xrc.cq	= devr->c0;
2124 	atomic_inc(&devr->p0->usecnt);
2125 	atomic_set(&devr->s1->usecnt, 0);
2126 
2127 	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
2128 		INIT_WORK(&devr->ports[port].pkey_change_work,
2129 			  pkey_change_handler);
2130 		devr->ports[port].devr = devr;
2131 	}
2132 
2133 	return 0;
2134 
2135 error5:
2136 	mlx5_ib_destroy_srq(devr->s0);
2137 error4:
2138 	mlx5_ib_dealloc_xrcd(devr->x1);
2139 error3:
2140 	mlx5_ib_dealloc_xrcd(devr->x0);
2141 error2:
2142 	mlx5_ib_destroy_cq(devr->c0);
2143 error1:
2144 	mlx5_ib_dealloc_pd(devr->p0);
2145 error0:
2146 	return ret;
2147 }
2148 
2149 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
2150 {
2151 	struct mlx5_ib_dev *dev =
2152 		container_of(devr, struct mlx5_ib_dev, devr);
2153 	int port;
2154 
2155 	mlx5_ib_destroy_srq(devr->s1);
2156 	mlx5_ib_destroy_srq(devr->s0);
2157 	mlx5_ib_dealloc_xrcd(devr->x0);
2158 	mlx5_ib_dealloc_xrcd(devr->x1);
2159 	mlx5_ib_destroy_cq(devr->c0);
2160 	mlx5_ib_dealloc_pd(devr->p0);
2161 
2162 	/* Make sure no P_Key change work items are still executing */
2163 	for (port = 0; port < dev->num_ports; ++port)
2164 		cancel_work_sync(&devr->ports[port].pkey_change_work);
2165 }
2166 
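/*
 * Derive the RDMA core port capability flags from the port's link layer
 * and the HCA's RoCE capabilities. RoCE is advertised only when both the
 * IPv4 and IPv6 L3 types are supported; RoCE v1 and v2 (UDP encap) are
 * then reported independently according to roce_version.
 */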
2167 static u32 get_core_cap_flags(struct ib_device *ibdev)
2168 {
2169 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
2170 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
2171 	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
2172 	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
2173 	u32 ret = 0;
2174 
2175 	if (ll == IB_LINK_LAYER_INFINIBAND)
2176 		return RDMA_CORE_PORT_IBA_IB;
2177 
2178 	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
2179 		return 0;
2180 
2181 	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
2182 		return 0;
2183 
2184 	if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
2185 		ret |= RDMA_CORE_PORT_IBA_ROCE;
2186 
2187 	if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
2188 		ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
2189 
2190 	return ret;
2191 }
2192 
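/*
 * Fill in the per-port data that the IB core caches at registration time:
 * table sizes, core capability flags and the maximum MAD size.
 */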
2193 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
2194 			       struct ib_port_immutable *immutable)
2195 {
2196 	struct ib_port_attr attr;
2197 	int err;
2198 
2199 	err = mlx5_ib_query_port(ibdev, port_num, &attr);
2200 	if (err)
2201 		return err;
2202 
2203 	immutable->pkey_tbl_len = attr.pkey_tbl_len;
2204 	immutable->gid_tbl_len = attr.gid_tbl_len;
2205 	immutable->core_cap_flags = get_core_cap_flags(ibdev);
2206 	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
2207 
2208 	return 0;
2209 }
2210 
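/*
 * RoCE bring-up: register the netdevice notifier before enabling RoCE on
 * the NIC vport so that no netdev state change is missed; teardown in
 * mlx5_disable_roce() reverses the order.
 */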
2211 static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
2212 {
2213 	int err;
2214 
2215 	dev->roce.nb.notifier_call = mlx5_netdev_event;
2216 	err = register_netdevice_notifier(&dev->roce.nb);
2217 	if (err)
2218 		return err;
2219 
2220 	err = mlx5_nic_vport_enable_roce(dev->mdev);
2221 	if (err)
2222 		goto err_unregister_netdevice_notifier;
2223 
2224 	return 0;
2225 
2226 err_unregister_netdevice_notifier:
2227 	unregister_netdevice_notifier(&dev->roce.nb);
2228 	return err;
2229 }
2230 
2231 static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
2232 {
2233 	mlx5_nic_vport_disable_roce(dev->mdev);
2234 	unregister_netdevice_notifier(&dev->roce.nb);
2235 }
2236 
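/*
 * Probe one mlx5 core device: allocate the ib_device, wire up the verbs
 * ops according to the device capabilities, then bring up RoCE, device
 * resources, ODP, device registration, UMR resources and sysfs files, in
 * that order. The error labels unwind in reverse order and must be kept
 * in sync with mlx5_ib_remove().
 */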
2237 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
2238 {
2239 	struct mlx5_ib_dev *dev;
2240 	enum rdma_link_layer ll;
2241 	int port_type_cap;
2242 	int err;
2243 	int i;
2244 
2245 	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
2246 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
2247 
2248 	if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce))
2249 		return NULL;
2250 
2251 	printk_once(KERN_INFO "%s", mlx5_version);
2252 
2253 	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
2254 	if (!dev)
2255 		return NULL;
2256 
2257 	dev->mdev = mdev;
2258 
2259 	rwlock_init(&dev->roce.netdev_lock);
2260 	err = get_port_caps(dev);
2261 	if (err)
2262 		goto err_dealloc;
2263 
2264 	if (mlx5_use_mad_ifc(dev))
2265 		get_ext_port_caps(dev);
2266 
2267 	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
2268 
2269 	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
2270 	dev->ib_dev.owner		= THIS_MODULE;
2271 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
2272 	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
2273 	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
2274 	dev->ib_dev.phys_port_cnt     = dev->num_ports;
2275 	dev->ib_dev.num_comp_vectors    =
2276 		dev->mdev->priv.eq_table.num_comp_vectors;
2277 	dev->ib_dev.dma_device	= &mdev->pdev->dev;
2278 
2279 	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
2280 	dev->ib_dev.uverbs_cmd_mask	=
2281 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
2282 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
2283 		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
2284 		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
2285 		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
2286 		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
2287 		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
2288 		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
2289 		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
2290 		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
2291 		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
2292 		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
2293 		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
2294 		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
2295 		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
2296 		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
2297 		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
2298 		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
2299 		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
2300 		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
2301 		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
2302 		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
2303 		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
2304 		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
2305 	dev->ib_dev.uverbs_ex_cmd_mask =
2306 		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)	|
2307 		(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)	|
2308 		(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
2309 
2310 	dev->ib_dev.query_device	= mlx5_ib_query_device;
2311 	dev->ib_dev.query_port		= mlx5_ib_query_port;
2312 	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
2313 	if (ll == IB_LINK_LAYER_ETHERNET)
2314 		dev->ib_dev.get_netdev	= mlx5_ib_get_netdev;
2315 	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
2316 	dev->ib_dev.add_gid		= mlx5_ib_add_gid;
2317 	dev->ib_dev.del_gid		= mlx5_ib_del_gid;
2318 	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
2319 	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
2320 	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
2321 	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
2322 	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
2323 	dev->ib_dev.mmap		= mlx5_ib_mmap;
2324 	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
2325 	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
2326 	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
2327 	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
2328 	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
2329 	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
2330 	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
2331 	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
2332 	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
2333 	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
2334 	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
2335 	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
2336 	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
2337 	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
2338 	dev->ib_dev.post_send		= mlx5_ib_post_send;
2339 	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
2340 	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
2341 	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
2342 	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
2343 	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
2344 	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
2345 	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
2346 	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
2347 	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
2348 	dev->ib_dev.rereg_user_mr	= mlx5_ib_rereg_user_mr;
2349 	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
2350 	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
2351 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
2352 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
2353 	dev->ib_dev.alloc_mr		= mlx5_ib_alloc_mr;
2354 	dev->ib_dev.map_mr_sg		= mlx5_ib_map_mr_sg;
2355 	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
2356 	dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
2357 	if (mlx5_core_is_pf(mdev)) {
2358 		dev->ib_dev.get_vf_config	= mlx5_ib_get_vf_config;
2359 		dev->ib_dev.set_vf_link_state	= mlx5_ib_set_vf_link_state;
2360 		dev->ib_dev.get_vf_stats	= mlx5_ib_get_vf_stats;
2361 		dev->ib_dev.set_vf_guid		= mlx5_ib_set_vf_guid;
2362 	}
2363 
2364 	mlx5_ib_internal_fill_odp_caps(dev);
2365 
2366 	if (MLX5_CAP_GEN(mdev, imaicl)) {
2367 		dev->ib_dev.alloc_mw		= mlx5_ib_alloc_mw;
2368 		dev->ib_dev.dealloc_mw		= mlx5_ib_dealloc_mw;
2369 		dev->ib_dev.uverbs_cmd_mask |=
2370 			(1ull << IB_USER_VERBS_CMD_ALLOC_MW)	|
2371 			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
2372 	}
2373 
2374 	if (MLX5_CAP_GEN(mdev, xrc)) {
2375 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
2376 		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
2377 		dev->ib_dev.uverbs_cmd_mask |=
2378 			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
2379 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
2380 	}
2381 
2382 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2383 	    IB_LINK_LAYER_ETHERNET) {
2384 		dev->ib_dev.create_flow	= mlx5_ib_create_flow;
2385 		dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
2386 		dev->ib_dev.uverbs_ex_cmd_mask |=
2387 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
2388 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
2389 	}
2390 	err = init_node_data(dev);
2391 	if (err)
2392 		goto err_dealloc;
2393 
2394 	mutex_init(&dev->flow_db.lock);
2395 	mutex_init(&dev->cap_mask_mutex);
2396 
2397 	if (ll == IB_LINK_LAYER_ETHERNET) {
2398 		err = mlx5_enable_roce(dev);
2399 		if (err)
2400 			goto err_dealloc;
2401 	}
2402 
2403 	err = create_dev_resources(&dev->devr);
2404 	if (err)
2405 		goto err_disable_roce;
2406 
2407 	err = mlx5_ib_odp_init_one(dev);
2408 	if (err)
2409 		goto err_rsrc;
2410 
2411 	err = ib_register_device(&dev->ib_dev, NULL);
2412 	if (err)
2413 		goto err_odp;
2414 
2415 	err = create_umr_res(dev);
2416 	if (err)
2417 		goto err_dev;
2418 
2419 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2420 		err = device_create_file(&dev->ib_dev.dev,
2421 					 mlx5_class_attributes[i]);
2422 		if (err)
2423 			goto err_umrc;
2424 	}
2425 
2426 	dev->ib_active = true;
2427 
2428 	return dev;
2429 
2430 err_umrc:
2431 	destroy_umrc_res(dev);
2432 
2433 err_dev:
2434 	ib_unregister_device(&dev->ib_dev);
2435 
2436 err_odp:
2437 	mlx5_ib_odp_remove_one(dev);
2438 
2439 err_rsrc:
2440 	destroy_dev_resources(&dev->devr);
2441 
2442 err_disable_roce:
2443 	if (ll == IB_LINK_LAYER_ETHERNET)
2444 		mlx5_disable_roce(dev);
2445 
2446 err_dealloc:
2447 	ib_dealloc_device(&dev->ib_dev);
2448 
2449 	return NULL;
2450 }
2451 
2452 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
2453 {
2454 	struct mlx5_ib_dev *dev = context;
2455 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
2456 
2457 	ib_unregister_device(&dev->ib_dev);
2458 	destroy_umrc_res(dev);
2459 	mlx5_ib_odp_remove_one(dev);
2460 	destroy_dev_resources(&dev->devr);
2461 	if (ll == IB_LINK_LAYER_ETHERNET)
2462 		mlx5_disable_roce(dev);
2463 	ib_dealloc_device(&dev->ib_dev);
2464 }
2465 
2466 static struct mlx5_interface mlx5_ib_interface = {
2467 	.add            = mlx5_ib_add,
2468 	.remove         = mlx5_ib_remove,
2469 	.event          = mlx5_ib_event,
2470 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
2471 };
2472 
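/*
 * Module init: ODP state must be set up before registering with
 * mlx5_core, since the .add callback may run (and rely on ODP) as soon
 * as registration succeeds.
 */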
2473 static int __init mlx5_ib_init(void)
2474 {
2475 	int err;
2476 
2477 	if (deprecated_prof_sel != 2)
2478 		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
2479 
2480 	err = mlx5_ib_odp_init();
2481 	if (err)
2482 		return err;
2483 
2484 	err = mlx5_register_interface(&mlx5_ib_interface);
2485 	if (err)
2486 		goto clean_odp;
2487 
2488 	return err;
2489 
2490 clean_odp:
2491 	mlx5_ib_odp_cleanup();
2492 	return err;
2493 }
2494 
2495 static void __exit mlx5_ib_cleanup(void)
2496 {
2497 	mlx5_unregister_interface(&mlx5_ib_interface);
2498 	mlx5_ib_odp_cleanup();
2499 }
2500 
2501 module_init(mlx5_ib_init);
2502 module_exit(mlx5_ib_cleanup);
2503