xref: /openbmc/linux/drivers/cxl/mem.c (revision 163b0991)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
3 #include <uapi/linux/cxl_mem.h>
4 #include <linux/security.h>
5 #include <linux/debugfs.h>
6 #include <linux/module.h>
7 #include <linux/mutex.h>
8 #include <linux/cdev.h>
9 #include <linux/idr.h>
10 #include <linux/pci.h>
11 #include <linux/io.h>
12 #include <linux/io-64-nonatomic-lo-hi.h>
13 #include "pci.h"
14 #include "cxl.h"
15 
16 /**
17  * DOC: cxl mem
18  *
19  * This implements a CXL memory device ("type-3") as it is defined by the
20  * Compute Express Link specification.
21  *
22  * The driver has several responsibilities, mainly:
23  *  - Create the memX device and register on the CXL bus.
24  *  - Enumerate the device's register interface and map it.
25  *  - Probe the device attributes to establish a sysfs interface.
26  *  - Provide an IOCTL interface to userspace to communicate with the device for
27  *    things like firmware update.
28  *  - Support management of interleave sets.
29  *  - Handle and manage error conditions.
30  */
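/*
 * Illustrative sketch of a userspace caller of the IOCTL interface described
 * above. The command IDs and structure layout come from
 * include/uapi/linux/cxl_mem.h, and the device node is created below as
 * /dev/cxl/memN; id_buf and parse_identify() are hypothetical placeholders.
 *
 *	struct cxl_send_command cmd = {
 *		.id = CXL_MEM_COMMAND_ID_IDENTIFY,
 *		.out.size = sizeof(id_buf),
 *		.out.payload = (__u64)(uintptr_t)id_buf,
 *	};
 *	int fd = open("/dev/cxl/mem0", O_RDWR);
 *
 *	if (fd >= 0 && ioctl(fd, CXL_MEM_SEND_COMMAND, &cmd) == 0 &&
 *	    cmd.retval == 0)
 *		parse_identify(id_buf, cmd.out.size);
 */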
31 
32 /*
33  * An entire PCI topology full of devices should be enough for any
34  * config
35  */
36 #define CXL_MEM_MAX_DEVS 65536
37 
38 #define cxl_doorbell_busy(cxlm)                                                \
39 	(readl((cxlm)->mbox_regs + CXLDEV_MBOX_CTRL_OFFSET) &                  \
40 	 CXLDEV_MBOX_CTRL_DOORBELL)
41 
42 /* CXL 2.0 - 8.2.8.4: 2 second timeout (value is in jiffies despite the _MS name) */
43 #define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
44 
45 enum opcode {
46 	CXL_MBOX_OP_INVALID		= 0x0000,
47 	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
48 	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
49 	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
50 	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
51 	CXL_MBOX_OP_GET_LOG		= 0x0401,
52 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
53 	CXL_MBOX_OP_GET_PARTITION_INFO	= 0x4100,
54 	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
55 	CXL_MBOX_OP_GET_LSA		= 0x4102,
56 	CXL_MBOX_OP_SET_LSA		= 0x4103,
57 	CXL_MBOX_OP_GET_HEALTH_INFO	= 0x4200,
58 	CXL_MBOX_OP_SET_SHUTDOWN_STATE	= 0x4204,
59 	CXL_MBOX_OP_SCAN_MEDIA		= 0x4304,
60 	CXL_MBOX_OP_GET_SCAN_MEDIA	= 0x4305,
61 	CXL_MBOX_OP_MAX			= 0x10000
62 };
63 
64 /**
65  * struct mbox_cmd - A command to be submitted to hardware.
66  * @opcode: (input) The command set and command submitted to hardware.
67  * @payload_in: (input) Pointer to the input payload.
68  * @payload_out: (output) Pointer to the output payload. Must be allocated by
69  *		 the caller.
70  * @size_in: (input) Number of bytes to load from @payload_in.
71  * @size_out: (input) Max number of bytes loaded into @payload_out.
72  *            (output) Number of bytes generated by the device. For fixed size
73  *            output commands this is always expected to be deterministic. For
74  *            variable sized output commands, it tells the exact number of bytes
75  *            written.
76  * @return_code: (output) Error code returned from hardware.
77  *
78  * This is the primary mechanism used to send commands to the hardware.
79  * All the fields except @payload_* correspond exactly to the fields described in
80  * Command Register section of the CXL 2.0 8.2.8.4.5. @payload_in and
81  * @payload_out are written to, and read from the Command Payload Registers
82  * defined in CXL 2.0 8.2.8.4.8.
83  */
84 struct mbox_cmd {
85 	u16 opcode;
86 	void *payload_in;
87 	void *payload_out;
88 	size_t size_in;
89 	size_t size_out;
90 	u16 return_code;
91 #define CXL_MBOX_SUCCESS 0
92 };
93 
94 /**
95  * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device
96  * @dev: driver core device object
97  * @cdev: char dev core object for ioctl operations
98  * @cxlm: pointer to the parent device driver data
99  * @ops_active: active user of @cxlm in ops handlers
100  * @ops_dead: completion when all @cxlm ops users have exited
101  * @id: id number of this memdev instance.
102  */
103 struct cxl_memdev {
104 	struct device dev;
105 	struct cdev cdev;
106 	struct cxl_mem *cxlm;
107 	struct percpu_ref ops_active;
108 	struct completion ops_dead;
109 	int id;
110 };
111 
112 static int cxl_mem_major;
113 static DEFINE_IDA(cxl_memdev_ida);
114 static struct dentry *cxl_debugfs;
115 static bool cxl_raw_allow_all;
116 
117 enum {
118 	CEL_UUID,
119 	VENDOR_DEBUG_UUID,
120 };
121 
122 /* See CXL 2.0 Table 170. Get Log Input Payload */
123 static const uuid_t log_uuid[] = {
124 	[CEL_UUID] = UUID_INIT(0xda9c0b5, 0xbf41, 0x4b78, 0x8f, 0x79, 0x96,
125 			       0xb1, 0x62, 0x3b, 0x3f, 0x17),
126 	[VENDOR_DEBUG_UUID] = UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f,
127 					0xd6, 0x07, 0x19, 0x40, 0x3d, 0x86),
128 };
129 
130 /**
131  * struct cxl_mem_command - Driver representation of a memory device command
132  * @info: Command information as it exists for the UAPI
133  * @opcode: The actual bits used for the mailbox protocol
134  * @flags: Set of flags affecting driver behavior.
135  *
136  *  * %CXL_CMD_FLAG_FORCE_ENABLE: In cases of error, commands with this flag
137  *    will be enabled by the driver regardless of what hardware may have
138  *    advertised.
139  *
140  * The cxl_mem_command is the driver's internal representation of commands that
141  * are supported by the driver. Some of these commands may not be supported by
142  * the hardware. The driver will use @info to validate the fields passed in by
143  * the user and then submit the @opcode to the hardware.
144  *
145  * See struct cxl_command_info.
146  */
147 struct cxl_mem_command {
148 	struct cxl_command_info info;
149 	enum opcode opcode;
150 	u32 flags;
151 #define CXL_CMD_FLAG_NONE 0
152 #define CXL_CMD_FLAG_FORCE_ENABLE BIT(0)
153 };
154 
155 #define CXL_CMD(_id, sin, sout, _flags)                                        \
156 	[CXL_MEM_COMMAND_ID_##_id] = {                                         \
157 	.info =	{                                                              \
158 			.id = CXL_MEM_COMMAND_ID_##_id,                        \
159 			.size_in = sin,                                        \
160 			.size_out = sout,                                      \
161 		},                                                             \
162 	.opcode = CXL_MBOX_OP_##_id,                                           \
163 	.flags = _flags,                                                       \
164 	}
165 
166 /*
167  * This table defines the supported mailbox commands for the driver. Each entry
168  * embeds the UAPI &struct cxl_command_info. Non-negative size values in the
169  * table are validated against the user's input. For example, if size_in is
170  * 0 and the user passed in 1, it is an error.
171  */
172 static struct cxl_mem_command mem_commands[] = {
173 	CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE),
174 #ifdef CONFIG_CXL_MEM_RAW_COMMANDS
175 	CXL_CMD(RAW, ~0, ~0, 0),
176 #endif
177 	CXL_CMD(GET_SUPPORTED_LOGS, 0, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
178 	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
179 	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
180 	CXL_CMD(GET_LSA, 0x8, ~0, 0),
181 	CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
182 	CXL_CMD(GET_LOG, 0x18, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
183 };
184 
185 /*
186  * Commands that RAW doesn't permit. The rationale for each:
187  *
188  * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment /
189  * coordination of transaction timeout values at the root bridge level.
190  *
191  * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live
192  * and needs to be coordinated with HDM updates.
193  *
194  * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the
195  * driver, and any writes from userspace invalidate those contents.
196  *
197  * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes
198  * to the device after it is marked clean; userspace cannot make that
199  * assertion.
200  *
201  * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that
202  * is kept up to date with patrol notifications and error management.
203  */
204 static u16 cxl_disabled_raw_commands[] = {
205 	CXL_MBOX_OP_ACTIVATE_FW,
206 	CXL_MBOX_OP_SET_PARTITION_INFO,
207 	CXL_MBOX_OP_SET_LSA,
208 	CXL_MBOX_OP_SET_SHUTDOWN_STATE,
209 	CXL_MBOX_OP_SCAN_MEDIA,
210 	CXL_MBOX_OP_GET_SCAN_MEDIA,
211 };
212 
213 /*
214  * Command sets that RAW doesn't permit. All opcodes in this set are
215  * disabled because they pass plain text security payloads over the
216  * user/kernel boundary. This functionality is intended to be wrapped
217  * behind the keys ABI, which allows for encrypted payloads in the UAPI.
218  */
219 static u8 security_command_sets[] = {
220 	0x44, /* Sanitize */
221 	0x45, /* Persistent Memory Data-at-rest Security */
222 	0x46, /* Security Passthrough */
223 };
224 
225 #define cxl_for_each_cmd(cmd)                                                  \
226 	for ((cmd) = &mem_commands[0];                                         \
227 	     ((cmd) - mem_commands) < ARRAY_SIZE(mem_commands); (cmd)++)
228 
229 #define cxl_cmd_count ARRAY_SIZE(mem_commands)
230 
231 static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
232 {
233 	const unsigned long start = jiffies;
234 	unsigned long end = start;
235 
236 	while (cxl_doorbell_busy(cxlm)) {
237 		end = jiffies;
238 
239 		if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
240 			/* Check again in case preempted before timeout test */
241 			if (!cxl_doorbell_busy(cxlm))
242 				break;
243 			return -ETIMEDOUT;
244 		}
245 		cpu_relax();
246 	}
247 
248 	dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
249 		jiffies_to_msecs(end) - jiffies_to_msecs(start));
250 	return 0;
251 }
252 
253 static bool cxl_is_security_command(u16 opcode)
254 {
255 	int i;
256 
257 	for (i = 0; i < ARRAY_SIZE(security_command_sets); i++)
258 		if (security_command_sets[i] == (opcode >> 8))
259 			return true;
260 	return false;
261 }
262 
263 static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
264 				 struct mbox_cmd *mbox_cmd)
265 {
266 	struct device *dev = &cxlm->pdev->dev;
267 
268 	dev_dbg(dev, "Mailbox command (opcode: %#x size: %zub) timed out\n",
269 		mbox_cmd->opcode, mbox_cmd->size_in);
270 }
271 
272 /**
273  * __cxl_mem_mbox_send_cmd() - Execute a mailbox command
274  * @cxlm: The CXL memory device to communicate with.
275  * @mbox_cmd: Command to send to the memory device.
276  *
277  * Context: Any context. Expects mbox_mutex to be held.
278  * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
279  *         Caller should check the return code in @mbox_cmd to make sure it
280  *         succeeded.
281  *
282  * This is a generic form of the CXL mailbox send command, and thus only uses the
283  * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
284  * devices, and perhaps other types of CXL devices, may have further information
285  * available upon error conditions. Driver facilities wishing to send mailbox
286  * commands should use the wrapper command.
287  *
288  * The CXL spec allows for up to two mailboxes. The intention is for the primary
289  * mailbox to be OS controlled and the secondary mailbox to be used by system
290  * firmware. This allows the OS and firmware to communicate with the device and
291  * not need to coordinate with each other. The driver only uses the primary
292  * mailbox.
293  */
294 static int __cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
295 				   struct mbox_cmd *mbox_cmd)
296 {
297 	void __iomem *payload = cxlm->mbox_regs + CXLDEV_MBOX_PAYLOAD_OFFSET;
298 	u64 cmd_reg, status_reg;
299 	size_t out_len;
300 	int rc;
301 
302 	lockdep_assert_held(&cxlm->mbox_mutex);
303 
304 	/*
305 	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
306 	 *   1. Caller reads MB Control Register to verify doorbell is clear
307 	 *   2. Caller writes Command Register
308 	 *   3. Caller writes Command Payload Registers if input payload is non-empty
309 	 *   4. Caller writes MB Control Register to set doorbell
310 	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
311 	 *   6. Caller reads MB Status Register to fetch Return code
312 	 *   7. If command successful, Caller reads Command Register to get Payload Length
313 	 *   8. If output payload is non-empty, host reads Command Payload Registers
314 	 *
315 	 * Hardware is free to do whatever it wants before the doorbell is rung,
316 	 * and isn't allowed to change anything after it clears the doorbell. As
317 	 * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
318 	 * also happen in any order (though some orders might not make sense).
319 	 */
320 
321 	/* #1 */
322 	if (cxl_doorbell_busy(cxlm)) {
323 		dev_err_ratelimited(&cxlm->pdev->dev,
324 				    "Mailbox re-busy after acquiring\n");
325 		return -EBUSY;
326 	}
327 
328 	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
329 			     mbox_cmd->opcode);
330 	if (mbox_cmd->size_in) {
331 		if (WARN_ON(!mbox_cmd->payload_in))
332 			return -EINVAL;
333 
334 		cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
335 				      mbox_cmd->size_in);
336 		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
337 	}
338 
339 	/* #2, #3 */
340 	writeq(cmd_reg, cxlm->mbox_regs + CXLDEV_MBOX_CMD_OFFSET);
341 
342 	/* #4 */
343 	dev_dbg(&cxlm->pdev->dev, "Sending command\n");
344 	writel(CXLDEV_MBOX_CTRL_DOORBELL,
345 	       cxlm->mbox_regs + CXLDEV_MBOX_CTRL_OFFSET);
346 
347 	/* #5 */
348 	rc = cxl_mem_wait_for_doorbell(cxlm);
349 	if (rc == -ETIMEDOUT) {
350 		cxl_mem_mbox_timeout(cxlm, mbox_cmd);
351 		return rc;
352 	}
353 
354 	/* #6 */
355 	status_reg = readq(cxlm->mbox_regs + CXLDEV_MBOX_STATUS_OFFSET);
356 	mbox_cmd->return_code =
357 		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
358 
359 	if (mbox_cmd->return_code != 0) {
360 		dev_dbg(&cxlm->pdev->dev, "Mailbox operation had an error\n");
361 		return 0;
362 	}
363 
364 	/* #7 */
365 	cmd_reg = readq(cxlm->mbox_regs + CXLDEV_MBOX_CMD_OFFSET);
366 	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);
367 
368 	/* #8 */
369 	if (out_len && mbox_cmd->payload_out) {
370 		/*
371 		 * Sanitize the copy. If hardware misbehaves, out_len per the
372 		 * spec can actually be greater than the max allowed size (21
373 		 * bits available but spec defined 1M max). The caller also may
374 		 * have requested less data than the hardware supplied even
375 		 * within spec.
376 		 */
377 		size_t n = min3(mbox_cmd->size_out, cxlm->payload_size, out_len);
378 
379 		memcpy_fromio(mbox_cmd->payload_out, payload, n);
380 		mbox_cmd->size_out = n;
381 	} else {
382 		mbox_cmd->size_out = 0;
383 	}
384 
385 	return 0;
386 }
387 
388 /**
389  * cxl_mem_mbox_get() - Acquire exclusive access to the mailbox.
390  * @cxlm: The memory device to gain access to.
391  *
392  * Context: Any context. Takes the mbox_mutex.
393  * Return: 0 if exclusive access was acquired.
394  */
395 static int cxl_mem_mbox_get(struct cxl_mem *cxlm)
396 {
397 	struct device *dev = &cxlm->pdev->dev;
398 	u64 md_status;
399 	int rc;
400 
401 	mutex_lock_io(&cxlm->mbox_mutex);
402 
403 	/*
404 	 * XXX: There is some amount of ambiguity in the 2.0 version of the spec
405 	 * around the mailbox interface ready (8.2.8.5.1.1).  The purpose of the
406 	 * bit is to allow firmware running on the device to notify the driver
407 	 * that it's ready to receive commands. It is unclear if the bit needs
408 	 * to be read for each transaction mailbox, ie. the firmware can switch
409 	 * it on and off as needed. Second, there is no defined timeout for
410 	 * mailbox ready, like there is for the doorbell interface.
411 	 *
412 	 * Assumptions:
413 	 * 1. The firmware might toggle the Mailbox Interface Ready bit, check
414 	 *    it for every command.
415 	 *
416 	 * 2. If the doorbell is clear, the firmware should have first set the
417 	 *    Mailbox Interface Ready bit. Therefore, waiting for the doorbell
418 	 *    to be ready is sufficient.
419 	 */
420 	rc = cxl_mem_wait_for_doorbell(cxlm);
421 	if (rc) {
422 		dev_warn(dev, "Mailbox interface not ready\n");
423 		goto out;
424 	}
425 
426 	md_status = readq(cxlm->memdev_regs + CXLMDEV_STATUS_OFFSET);
427 	if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
428 		dev_err(dev, "mbox: reported doorbell ready, but not mbox ready\n");
429 		rc = -EBUSY;
430 		goto out;
431 	}
432 
433 	/*
434 	 * Hardware shouldn't allow a ready status but also have failure bits
435 	 * set. Spit out an error; this should be a bug report.
436 	 */
437 	rc = -EFAULT;
438 	if (md_status & CXLMDEV_DEV_FATAL) {
439 		dev_err(dev, "mbox: reported ready, but fatal\n");
440 		goto out;
441 	}
442 	if (md_status & CXLMDEV_FW_HALT) {
443 		dev_err(dev, "mbox: reported ready, but halted\n");
444 		goto out;
445 	}
446 	if (CXLMDEV_RESET_NEEDED(md_status)) {
447 		dev_err(dev, "mbox: reported ready, but reset needed\n");
448 		goto out;
449 	}
450 
451 	/* with lock held */
452 	return 0;
453 
454 out:
455 	mutex_unlock(&cxlm->mbox_mutex);
456 	return rc;
457 }
458 
459 /**
460  * cxl_mem_mbox_put() - Release exclusive access to the mailbox.
461  * @cxlm: The CXL memory device to communicate with.
462  *
463  * Context: Any context. Expects mbox_mutex to be held.
464  */
465 static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
466 {
467 	mutex_unlock(&cxlm->mbox_mutex);
468 }
469 
470 /**
471  * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
472  * @cxlm: The CXL memory device to communicate with.
473  * @cmd: The validated command.
474  * @in_payload: Pointer to userspace's input payload.
475  * @out_payload: Pointer to userspace's output payload.
476  * @size_out: (Input) Max payload size to copy out.
477  *            (Output) Payload size hardware generated.
478  * @retval: Hardware generated return code from the operation.
479  *
480  * Return:
481  *  * %0	- Mailbox transaction succeeded. This implies the mailbox
482  *		  protocol completed successfully, not that the operation itself
483  *		  was successful.
484  *  * %-ENOMEM  - Couldn't allocate a bounce buffer.
485  *  * %-EFAULT	- Copying to or from userspace failed.
486  *  * %-EINTR	- Mailbox acquisition interrupted.
487  *  * %-EXXX	- Transaction level failures.
488  *
489  * Creates the appropriate mailbox command and dispatches it on behalf of a
490  * userspace request. The input and output payloads are copied between
491  * userspace and kernel bounce buffers.
492  *
493  * See cxl_send_cmd().
494  */
495 static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
496 					const struct cxl_mem_command *cmd,
497 					u64 in_payload, u64 out_payload,
498 					s32 *size_out, u32 *retval)
499 {
500 	struct device *dev = &cxlm->pdev->dev;
501 	struct mbox_cmd mbox_cmd = {
502 		.opcode = cmd->opcode,
503 		.size_in = cmd->info.size_in,
504 		.size_out = cmd->info.size_out,
505 	};
506 	int rc;
507 
508 	if (cmd->info.size_out) {
509 		mbox_cmd.payload_out = kvzalloc(cmd->info.size_out, GFP_KERNEL);
510 		if (!mbox_cmd.payload_out)
511 			return -ENOMEM;
512 	}
513 
514 	if (cmd->info.size_in) {
515 		mbox_cmd.payload_in = vmemdup_user(u64_to_user_ptr(in_payload),
516 						   cmd->info.size_in);
517 		if (IS_ERR(mbox_cmd.payload_in)) {
518 			kvfree(mbox_cmd.payload_out);
519 			return PTR_ERR(mbox_cmd.payload_in);
520 		}
521 	}
522 
523 	rc = cxl_mem_mbox_get(cxlm);
524 	if (rc)
525 		goto out;
526 
527 	dev_dbg(dev,
528 		"Submitting %s command for user\n"
529 		"\topcode: %x\n"
530 		"\tsize: %ub\n",
531 		cxl_command_names[cmd->info.id].name, mbox_cmd.opcode,
532 		cmd->info.size_in);
533 
534 	dev_WARN_ONCE(dev, cmd->info.id == CXL_MEM_COMMAND_ID_RAW,
535 		      "raw command path used\n");
536 
537 	rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
538 	cxl_mem_mbox_put(cxlm);
539 	if (rc)
540 		goto out;
541 
542 	/*
543 	 * @size_out contains the max size that's allowed to be written back out
544 	 * to userspace. If the hardware produced more output than this, the
545 	 * excess is ignored.
546 	 */
547 	if (mbox_cmd.size_out) {
548 		dev_WARN_ONCE(dev, mbox_cmd.size_out > *size_out,
549 			      "Invalid return size\n");
550 		if (copy_to_user(u64_to_user_ptr(out_payload),
551 				 mbox_cmd.payload_out, mbox_cmd.size_out)) {
552 			rc = -EFAULT;
553 			goto out;
554 		}
555 	}
556 
557 	*size_out = mbox_cmd.size_out;
558 	*retval = mbox_cmd.return_code;
559 
560 out:
561 	kvfree(mbox_cmd.payload_in);
562 	kvfree(mbox_cmd.payload_out);
563 	return rc;
564 }
565 
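/*
 * RAW command policy: raw passthrough must be compiled in and is refused when
 * the kernel is locked down; opcodes in the security command sets or on the
 * disabled list above are rejected unless the raw_allow_all debugfs knob is
 * set.
 */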
566 static bool cxl_mem_raw_command_allowed(u16 opcode)
567 {
568 	int i;
569 
570 	if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS))
571 		return false;
572 
573 	if (security_locked_down(LOCKDOWN_NONE))
574 		return false;
575 
576 	if (cxl_raw_allow_all)
577 		return true;
578 
579 	if (cxl_is_security_command(opcode))
580 		return false;
581 
582 	for (i = 0; i < ARRAY_SIZE(cxl_disabled_raw_commands); i++)
583 		if (cxl_disabled_raw_commands[i] == opcode)
584 			return false;
585 
586 	return true;
587 }
588 
589 /**
590  * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
591  * @cxlm: &struct cxl_mem device whose mailbox will be used.
592  * @send_cmd: &struct cxl_send_command copied in from userspace.
593  * @out_cmd: Sanitized and populated &struct cxl_mem_command.
594  *
595  * Return:
596  *  * %0	- @out_cmd is ready to send.
597  *  * %-ENOTTY	- Invalid command specified.
598  *  * %-EINVAL	- Reserved fields or invalid values were used.
599  *  * %-ENOMEM	- Input or output buffer wasn't sized properly.
600  *  * %-EPERM	- Attempted to use a protected command.
601  *
602  * The result of this command is a fully validated command in @out_cmd that is
603  * safe to send to the hardware.
604  *
605  * See handle_mailbox_cmd_from_user()
606  */
607 static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
608 				      const struct cxl_send_command *send_cmd,
609 				      struct cxl_mem_command *out_cmd)
610 {
611 	const struct cxl_command_info *info;
612 	struct cxl_mem_command *c;
613 
614 	if (send_cmd->id == 0 || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX)
615 		return -ENOTTY;
616 
617 	/*
618 	 * The user can never specify an input payload larger than what hardware
619 	 * supports, but output can be arbitrarily large (simply write out as
620 	 * much data as the hardware provides).
621 	 */
622 	if (send_cmd->in.size > cxlm->payload_size)
623 		return -EINVAL;
624 
625 	/*
626 	 * Checks are bypassed for raw commands but a WARN/taint will occur
627 	 * later in the callchain
628 	 */
629 	if (send_cmd->id == CXL_MEM_COMMAND_ID_RAW) {
630 		const struct cxl_mem_command temp = {
631 			.info = {
632 				.id = CXL_MEM_COMMAND_ID_RAW,
633 				.flags = 0,
634 				.size_in = send_cmd->in.size,
635 				.size_out = send_cmd->out.size,
636 			},
637 			.opcode = send_cmd->raw.opcode
638 		};
639 
640 		if (send_cmd->raw.rsvd)
641 			return -EINVAL;
642 
643 		/*
644 		 * Unlike supported commands, the output size of RAW commands
645 		 * gets passed along without further checking, so it must be
646 		 * validated here.
647 		 */
648 		if (send_cmd->out.size > cxlm->payload_size)
649 			return -EINVAL;
650 
651 		if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode))
652 			return -EPERM;
653 
654 		memcpy(out_cmd, &temp, sizeof(temp));
655 
656 		return 0;
657 	}
658 
659 	if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK)
660 		return -EINVAL;
661 
662 	if (send_cmd->rsvd)
663 		return -EINVAL;
664 
665 	if (send_cmd->in.rsvd || send_cmd->out.rsvd)
666 		return -EINVAL;
667 
668 	/* Convert user's command into the internal representation */
669 	c = &mem_commands[send_cmd->id];
670 	info = &c->info;
671 
672 	/* Check that the command is enabled for hardware */
673 	if (!test_bit(info->id, cxlm->enabled_cmds))
674 		return -ENOTTY;
675 
676 	/* Check the input buffer is the expected size */
677 	if (info->size_in >= 0 && info->size_in != send_cmd->in.size)
678 		return -ENOMEM;
679 
680 	/* Check the output buffer is at least large enough */
681 	if (info->size_out >= 0 && send_cmd->out.size < info->size_out)
682 		return -ENOMEM;
683 
684 	memcpy(out_cmd, c, sizeof(*c));
685 	out_cmd->info.size_in = send_cmd->in.size;
686 	/*
687 	 * XXX: out_cmd->info.size_out will be controlled by the driver, and the
688 	 * specified number of bytes @send_cmd->out.size will be copied back out
689 	 * to userspace.
690 	 */
691 
692 	return 0;
693 }
694 
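/*
 * CXL_MEM_QUERY_COMMANDS: report the commands known to the driver. If
 * userspace passes n_commands == 0 only the total count is returned,
 * otherwise up to n_commands struct cxl_command_info entries are copied out.
 */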
695 static int cxl_query_cmd(struct cxl_memdev *cxlmd,
696 			 struct cxl_mem_query_commands __user *q)
697 {
698 	struct device *dev = &cxlmd->dev;
699 	struct cxl_mem_command *cmd;
700 	u32 n_commands;
701 	int j = 0;
702 
703 	dev_dbg(dev, "Query IOCTL\n");
704 
705 	if (get_user(n_commands, &q->n_commands))
706 		return -EFAULT;
707 
708 	/* returns the total number if 0 elements are requested. */
709 	if (n_commands == 0)
710 		return put_user(cxl_cmd_count, &q->n_commands);
711 
712 	/*
713 	 * otherwise, return min(n_commands, total commands) cxl_command_info
714 	 * structures.
715 	 */
716 	cxl_for_each_cmd(cmd) {
717 		const struct cxl_command_info *info = &cmd->info;
718 
719 		if (copy_to_user(&q->commands[j++], info, sizeof(*info)))
720 			return -EFAULT;
721 
722 		if (j == n_commands)
723 			break;
724 	}
725 
726 	return 0;
727 }
728 
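/*
 * CXL_MEM_SEND_COMMAND: validate the user's request, size the output buffer
 * for variable sized commands, dispatch the mailbox command, and copy the
 * updated cxl_send_command (output size and return code) back to userspace.
 */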
729 static int cxl_send_cmd(struct cxl_memdev *cxlmd,
730 			struct cxl_send_command __user *s)
731 {
732 	struct cxl_mem *cxlm = cxlmd->cxlm;
733 	struct device *dev = &cxlmd->dev;
734 	struct cxl_send_command send;
735 	struct cxl_mem_command c;
736 	int rc;
737 
738 	dev_dbg(dev, "Send IOCTL\n");
739 
740 	if (copy_from_user(&send, s, sizeof(send)))
741 		return -EFAULT;
742 
743 	rc = cxl_validate_cmd_from_user(cxlmd->cxlm, &send, &c);
744 	if (rc)
745 		return rc;
746 
747 	/* Prepare to handle a full payload for variable sized output */
748 	if (c.info.size_out < 0)
749 		c.info.size_out = cxlm->payload_size;
750 
751 	rc = handle_mailbox_cmd_from_user(cxlm, &c, send.in.payload,
752 					  send.out.payload, &send.out.size,
753 					  &send.retval);
754 	if (rc)
755 		return rc;
756 
757 	if (copy_to_user(s, &send, sizeof(send)))
758 		return -EFAULT;
759 
760 	return 0;
761 }
762 
763 static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd,
764 			       unsigned long arg)
765 {
766 	switch (cmd) {
767 	case CXL_MEM_QUERY_COMMANDS:
768 		return cxl_query_cmd(cxlmd, (void __user *)arg);
769 	case CXL_MEM_SEND_COMMAND:
770 		return cxl_send_cmd(cxlmd, (void __user *)arg);
771 	default:
772 		return -ENOTTY;
773 	}
774 }
775 
776 static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
777 			     unsigned long arg)
778 {
779 	struct cxl_memdev *cxlmd;
780 	struct inode *inode;
781 	int rc = -ENOTTY;
782 
783 	inode = file_inode(file);
784 	cxlmd = container_of(inode->i_cdev, typeof(*cxlmd), cdev);
785 
786 	if (!percpu_ref_tryget_live(&cxlmd->ops_active))
787 		return -ENXIO;
788 
789 	rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
790 
791 	percpu_ref_put(&cxlmd->ops_active);
792 
793 	return rc;
794 }
795 
796 static const struct file_operations cxl_memdev_fops = {
797 	.owner = THIS_MODULE,
798 	.unlocked_ioctl = cxl_memdev_ioctl,
799 	.compat_ioctl = compat_ptr_ioctl,
800 	.llseek = noop_llseek,
801 };
802 
803 static inline struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
804 {
805 	struct cxl_mem_command *c;
806 
807 	cxl_for_each_cmd(c)
808 		if (c->opcode == opcode)
809 			return c;
810 
811 	return NULL;
812 }
813 
814 /**
815  * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
816  * @cxlm: The CXL memory device to communicate with.
817  * @opcode: Opcode for the mailbox command.
818  * @in: The input payload for the mailbox command.
819  * @in_size: The length of the input payload
820  * @out: Caller allocated buffer for the output.
821  * @out_size: Expected size of output.
822  *
823  * Context: Any context. Will acquire and release mbox_mutex.
824  * Return:
825  *  * %0	- Success; any output has been copied to @out.
826  *  * %-E2BIG	- Payload is too large for hardware.
827  *  * %-EBUSY	- Couldn't acquire exclusive mailbox access.
828  *  * %-EFAULT	- Hardware error occurred.
829  *  * %-ENXIO	- Command completed, but device reported an error.
830  *  * %-EIO	- Unexpected output size.
831  *
832  * Mailbox commands may execute successfully even though the device itself reports an
833  * error. While this distinction can be useful for commands from userspace, the
834  * kernel will only be able to use results when both are successful.
835  *
836  * See __cxl_mem_mbox_send_cmd()
837  */
838 static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode,
839 				 void *in, size_t in_size,
840 				 void *out, size_t out_size)
841 {
842 	const struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
843 	struct mbox_cmd mbox_cmd = {
844 		.opcode = opcode,
845 		.payload_in = in,
846 		.size_in = in_size,
847 		.size_out = out_size,
848 		.payload_out = out,
849 	};
850 	int rc;
851 
852 	if (out_size > cxlm->payload_size)
853 		return -E2BIG;
854 
855 	rc = cxl_mem_mbox_get(cxlm);
856 	if (rc)
857 		return rc;
858 
859 	rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
860 	cxl_mem_mbox_put(cxlm);
861 	if (rc)
862 		return rc;
863 
864 	/* TODO: Map return code to proper kernel style errno */
865 	if (mbox_cmd.return_code != CXL_MBOX_SUCCESS)
866 		return -ENXIO;
867 
868 	/*
869 	 * Variable sized commands can't be validated and so it's up to the
870 	 * caller to do that if they wish.
871 	 */
872 	if (cmd->info.size_out >= 0 && mbox_cmd.size_out != out_size)
873 		return -EIO;
874 
875 	return 0;
876 }
877 
878 /**
879  * cxl_mem_setup_regs() - Setup necessary MMIO.
880  * @cxlm: The CXL memory device to communicate with.
881  *
882  * Return: 0 if all necessary registers mapped.
883  *
884  * A memory device is required by spec to implement a certain set of MMIO
885  * regions. The purpose of this function is to enumerate and map those
886  * registers.
887  */
888 static int cxl_mem_setup_regs(struct cxl_mem *cxlm)
889 {
890 	struct device *dev = &cxlm->pdev->dev;
891 	int cap, cap_count;
892 	u64 cap_array;
893 
894 	cap_array = readq(cxlm->regs + CXLDEV_CAP_ARRAY_OFFSET);
895 	if (FIELD_GET(CXLDEV_CAP_ARRAY_ID_MASK, cap_array) !=
896 	    CXLDEV_CAP_ARRAY_CAP_ID)
897 		return -ENODEV;
898 
899 	cap_count = FIELD_GET(CXLDEV_CAP_ARRAY_COUNT_MASK, cap_array);
900 
901 	for (cap = 1; cap <= cap_count; cap++) {
902 		void __iomem *register_block;
903 		u32 offset;
904 		u16 cap_id;
905 
906 		cap_id = FIELD_GET(CXLDEV_CAP_HDR_CAP_ID_MASK,
907 				   readl(cxlm->regs + cap * 0x10));
908 		offset = readl(cxlm->regs + cap * 0x10 + 0x4);
909 		register_block = cxlm->regs + offset;
910 
911 		switch (cap_id) {
912 		case CXLDEV_CAP_CAP_ID_DEVICE_STATUS:
913 			dev_dbg(dev, "found Status capability (0x%x)\n", offset);
914 			cxlm->status_regs = register_block;
915 			break;
916 		case CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX:
917 			dev_dbg(dev, "found Mailbox capability (0x%x)\n", offset);
918 			cxlm->mbox_regs = register_block;
919 			break;
920 		case CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX:
921 			dev_dbg(dev, "found Secondary Mailbox capability (0x%x)\n", offset);
922 			break;
923 		case CXLDEV_CAP_CAP_ID_MEMDEV:
924 			dev_dbg(dev, "found Memory Device capability (0x%x)\n", offset);
925 			cxlm->memdev_regs = register_block;
926 			break;
927 		default:
928 			dev_dbg(dev, "Unknown cap ID: %d (0x%x)\n", cap_id, offset);
929 			break;
930 		}
931 	}
932 
933 	if (!cxlm->status_regs || !cxlm->mbox_regs || !cxlm->memdev_regs) {
934 		dev_err(dev, "registers not found: %s%s%s\n",
935 			!cxlm->status_regs ? "status " : "",
936 			!cxlm->mbox_regs ? "mbox " : "",
937 			!cxlm->memdev_regs ? "memdev" : "");
938 		return -ENXIO;
939 	}
940 
941 	return 0;
942 }
943 
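/*
 * Read the mailbox capabilities register and record the payload size the
 * hardware advertises, soft limited to the 1M maximum defined by CXL 2.0.
 */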
944 static int cxl_mem_setup_mailbox(struct cxl_mem *cxlm)
945 {
946 	const int cap = readl(cxlm->mbox_regs + CXLDEV_MBOX_CAPS_OFFSET);
947 
948 	cxlm->payload_size =
949 		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);
950 
951 	/*
952 	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
953 	 *
954 	 * If the size is too small, mandatory commands will not work and so
955 	 * there's no point in going forward. If the size is too large, there's
956 	 * no harm in soft limiting it.
957 	 */
958 	cxlm->payload_size = min_t(size_t, cxlm->payload_size, SZ_1M);
959 	if (cxlm->payload_size < 256) {
960 		dev_err(&cxlm->pdev->dev, "Mailbox is too small (%zub)",
961 			cxlm->payload_size);
962 		return -ENXIO;
963 	}
964 
965 	dev_dbg(&cxlm->pdev->dev, "Mailbox payload sized %zu",
966 		cxlm->payload_size);
967 
968 	return 0;
969 }
970 
971 static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev, u32 reg_lo,
972 				      u32 reg_hi)
973 {
974 	struct device *dev = &pdev->dev;
975 	struct cxl_mem *cxlm;
976 	void __iomem *regs;
977 	u64 offset;
978 	u8 bar;
979 	int rc;
980 
981 	cxlm = devm_kzalloc(&pdev->dev, sizeof(*cxlm), GFP_KERNEL);
982 	if (!cxlm) {
983 		dev_err(dev, "No memory available\n");
984 		return NULL;
985 	}
986 
987 	offset = ((u64)reg_hi << 32) | FIELD_GET(CXL_REGLOC_ADDR_MASK, reg_lo);
988 	bar = FIELD_GET(CXL_REGLOC_BIR_MASK, reg_lo);
989 
990 	/* Basic sanity check that BAR is big enough */
991 	if (pci_resource_len(pdev, bar) < offset) {
992 		dev_err(dev, "BAR%d: %pr: too small (offset: %#llx)\n", bar,
993 			&pdev->resource[bar], (unsigned long long)offset);
994 		return NULL;
995 	}
996 
997 	rc = pcim_iomap_regions(pdev, BIT(bar), pci_name(pdev));
998 	if (rc) {
999 		dev_err(dev, "failed to map registers\n");
1000 		return NULL;
1001 	}
1002 	regs = pcim_iomap_table(pdev)[bar];
1003 
1004 	mutex_init(&cxlm->mbox_mutex);
1005 	cxlm->pdev = pdev;
1006 	cxlm->regs = regs + offset;
1007 	cxlm->enabled_cmds =
1008 		devm_kmalloc_array(dev, BITS_TO_LONGS(cxl_cmd_count),
1009 				   sizeof(unsigned long),
1010 				   GFP_KERNEL | __GFP_ZERO);
1011 	if (!cxlm->enabled_cmds) {
1012 		dev_err(dev, "No memory available for bitmap\n");
1013 		return NULL;
1014 	}
1015 
1016 	dev_dbg(dev, "Mapped CXL Memory Device resource\n");
1017 	return cxlm;
1018 }
1019 
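/*
 * Walk the PCIe extended capability list for a Designated Vendor Specific
 * Extended Capability (DVSEC) matching the CXL vendor ID and the requested
 * DVSEC ID. Returns the config space offset of the capability, or 0 if not
 * found.
 */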
1020 static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
1021 {
1022 	int pos;
1023 
1024 	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DVSEC);
1025 	if (!pos)
1026 		return 0;
1027 
1028 	while (pos) {
1029 		u16 vendor, id;
1030 
1031 		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vendor);
1032 		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, &id);
1033 		if (vendor == PCI_DVSEC_VENDOR_ID_CXL && dvsec == id)
1034 			return pos;
1035 
1036 		pos = pci_find_next_ext_capability(pdev, pos,
1037 						   PCI_EXT_CAP_ID_DVSEC);
1038 	}
1039 
1040 	return 0;
1041 }
1042 
1043 static struct cxl_memdev *to_cxl_memdev(struct device *dev)
1044 {
1045 	return container_of(dev, struct cxl_memdev, dev);
1046 }
1047 
1048 static void cxl_memdev_release(struct device *dev)
1049 {
1050 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
1051 
1052 	percpu_ref_exit(&cxlmd->ops_active);
1053 	ida_free(&cxl_memdev_ida, cxlmd->id);
1054 	kfree(cxlmd);
1055 }
1056 
1057 static char *cxl_memdev_devnode(struct device *dev, umode_t *mode, kuid_t *uid,
1058 				kgid_t *gid)
1059 {
1060 	return kasprintf(GFP_KERNEL, "cxl/%s", dev_name(dev));
1061 }
1062 
1063 static ssize_t firmware_version_show(struct device *dev,
1064 				     struct device_attribute *attr, char *buf)
1065 {
1066 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
1067 	struct cxl_mem *cxlm = cxlmd->cxlm;
1068 
1069 	return sprintf(buf, "%.16s\n", cxlm->firmware_version);
1070 }
1071 static DEVICE_ATTR_RO(firmware_version);
1072 
1073 static ssize_t payload_max_show(struct device *dev,
1074 				struct device_attribute *attr, char *buf)
1075 {
1076 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
1077 	struct cxl_mem *cxlm = cxlmd->cxlm;
1078 
1079 	return sprintf(buf, "%zu\n", cxlm->payload_size);
1080 }
1081 static DEVICE_ATTR_RO(payload_max);
1082 
1083 static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
1084 			     char *buf)
1085 {
1086 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
1087 	struct cxl_mem *cxlm = cxlmd->cxlm;
1088 	unsigned long long len = range_len(&cxlm->ram_range);
1089 
1090 	return sprintf(buf, "%#llx\n", len);
1091 }
1092 
1093 static struct device_attribute dev_attr_ram_size =
1094 	__ATTR(size, 0444, ram_size_show, NULL);
1095 
1096 static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
1097 			      char *buf)
1098 {
1099 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
1100 	struct cxl_mem *cxlm = cxlmd->cxlm;
1101 	unsigned long long len = range_len(&cxlm->pmem_range);
1102 
1103 	return sprintf(buf, "%#llx\n", len);
1104 }
1105 
1106 static struct device_attribute dev_attr_pmem_size =
1107 	__ATTR(size, 0444, pmem_size_show, NULL);
1108 
1109 static struct attribute *cxl_memdev_attributes[] = {
1110 	&dev_attr_firmware_version.attr,
1111 	&dev_attr_payload_max.attr,
1112 	NULL,
1113 };
1114 
1115 static struct attribute *cxl_memdev_pmem_attributes[] = {
1116 	&dev_attr_pmem_size.attr,
1117 	NULL,
1118 };
1119 
1120 static struct attribute *cxl_memdev_ram_attributes[] = {
1121 	&dev_attr_ram_size.attr,
1122 	NULL,
1123 };
1124 
1125 static struct attribute_group cxl_memdev_attribute_group = {
1126 	.attrs = cxl_memdev_attributes,
1127 };
1128 
1129 static struct attribute_group cxl_memdev_ram_attribute_group = {
1130 	.name = "ram",
1131 	.attrs = cxl_memdev_ram_attributes,
1132 };
1133 
1134 static struct attribute_group cxl_memdev_pmem_attribute_group = {
1135 	.name = "pmem",
1136 	.attrs = cxl_memdev_pmem_attributes,
1137 };
1138 
1139 static const struct attribute_group *cxl_memdev_attribute_groups[] = {
1140 	&cxl_memdev_attribute_group,
1141 	&cxl_memdev_ram_attribute_group,
1142 	&cxl_memdev_pmem_attribute_group,
1143 	NULL,
1144 };
1145 
1146 static const struct device_type cxl_memdev_type = {
1147 	.name = "cxl_memdev",
1148 	.release = cxl_memdev_release,
1149 	.devnode = cxl_memdev_devnode,
1150 	.groups = cxl_memdev_attribute_groups,
1151 };
1152 
1153 static void cxlmdev_unregister(void *_cxlmd)
1154 {
1155 	struct cxl_memdev *cxlmd = _cxlmd;
1156 	struct device *dev = &cxlmd->dev;
1157 
1158 	percpu_ref_kill(&cxlmd->ops_active);
1159 	cdev_device_del(&cxlmd->cdev, dev);
1160 	wait_for_completion(&cxlmd->ops_dead);
1161 	cxlmd->cxlm = NULL;
1162 	put_device(dev);
1163 }
1164 
1165 static void cxlmdev_ops_active_release(struct percpu_ref *ref)
1166 {
1167 	struct cxl_memdev *cxlmd =
1168 		container_of(ref, typeof(*cxlmd), ops_active);
1169 
1170 	complete(&cxlmd->ops_dead);
1171 }
1172 
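/*
 * Create and register the memX character device that exposes the IOCTL and
 * sysfs interfaces. @ops_active tracks in-flight users of @cxlm so that the
 * driver-private data can be safely torn down at unbind time.
 */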
1173 static int cxl_mem_add_memdev(struct cxl_mem *cxlm)
1174 {
1175 	struct pci_dev *pdev = cxlm->pdev;
1176 	struct cxl_memdev *cxlmd;
1177 	struct device *dev;
1178 	struct cdev *cdev;
1179 	int rc;
1180 
1181 	cxlmd = kzalloc(sizeof(*cxlmd), GFP_KERNEL);
1182 	if (!cxlmd)
1183 		return -ENOMEM;
1184 	init_completion(&cxlmd->ops_dead);
1185 
1186 	/*
1187 	 * @cxlm is deallocated when the driver unbinds so operations
1188 	 * that are using it need to hold a live reference.
1189 	 */
1190 	cxlmd->cxlm = cxlm;
1191 	rc = percpu_ref_init(&cxlmd->ops_active, cxlmdev_ops_active_release, 0,
1192 			     GFP_KERNEL);
1193 	if (rc)
1194 		goto err_ref;
1195 
1196 	rc = ida_alloc_range(&cxl_memdev_ida, 0, CXL_MEM_MAX_DEVS, GFP_KERNEL);
1197 	if (rc < 0)
1198 		goto err_id;
1199 	cxlmd->id = rc;
1200 
1201 	dev = &cxlmd->dev;
1202 	device_initialize(dev);
1203 	dev->parent = &pdev->dev;
1204 	dev->bus = &cxl_bus_type;
1205 	dev->devt = MKDEV(cxl_mem_major, cxlmd->id);
1206 	dev->type = &cxl_memdev_type;
1207 	dev_set_name(dev, "mem%d", cxlmd->id);
1208 
1209 	cdev = &cxlmd->cdev;
1210 	cdev_init(cdev, &cxl_memdev_fops);
1211 
1212 	rc = cdev_device_add(cdev, dev);
1213 	if (rc)
1214 		goto err_add;
1215 
1216 	return devm_add_action_or_reset(dev->parent, cxlmdev_unregister, cxlmd);
1217 
1218 err_add:
1219 	ida_free(&cxl_memdev_ida, cxlmd->id);
1220 err_id:
1221 	/*
1222 	 * Theoretically userspace could have already entered the fops,
1223 	 * so flush ops_active.
1224 	 */
1225 	percpu_ref_kill(&cxlmd->ops_active);
1226 	wait_for_completion(&cxlmd->ops_dead);
1227 	percpu_ref_exit(&cxlmd->ops_active);
1228 err_ref:
1229 	kfree(cxlmd);
1230 
1231 	return rc;
1232 }
1233 
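/*
 * Fetch @size bytes of the log identified by @uuid into @out, issuing as many
 * Get Log mailbox commands as needed since each transfer is bounded by the
 * mailbox payload size.
 */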
1234 static int cxl_xfer_log(struct cxl_mem *cxlm, uuid_t *uuid, u32 size, u8 *out)
1235 {
1236 	u32 remaining = size;
1237 	u32 offset = 0;
1238 
1239 	while (remaining) {
1240 		u32 xfer_size = min_t(u32, remaining, cxlm->payload_size);
1241 		struct cxl_mbox_get_log {
1242 			uuid_t uuid;
1243 			__le32 offset;
1244 			__le32 length;
1245 		} __packed log = {
1246 			.uuid = *uuid,
1247 			.offset = cpu_to_le32(offset),
1248 			.length = cpu_to_le32(xfer_size)
1249 		};
1250 		int rc;
1251 
1252 		rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_LOG, &log,
1253 					   sizeof(log), out, xfer_size);
1254 		if (rc < 0)
1255 			return rc;
1256 
1257 		out += xfer_size;
1258 		remaining -= xfer_size;
1259 		offset += xfer_size;
1260 	}
1261 
1262 	return 0;
1263 }
1264 
1265 /**
1266  * cxl_walk_cel() - Walk through the Command Effects Log.
1267  * @cxlm: Device.
1268  * @size: Length of the Command Effects Log.
1269  * @cel: Buffer containing the raw Command Effects Log.
1270  *
1271  * Iterate over each entry in the CEL and determine if the driver supports the
1272  * command. If so, the command is enabled for the device and can be used later.
1273  */
1274 static void cxl_walk_cel(struct cxl_mem *cxlm, size_t size, u8 *cel)
1275 {
1276 	struct cel_entry {
1277 		__le16 opcode;
1278 		__le16 effect;
1279 	} __packed * cel_entry;
1280 	const int cel_entries = size / sizeof(*cel_entry);
1281 	int i;
1282 
1283 	cel_entry = (struct cel_entry *)cel;
1284 
1285 	for (i = 0; i < cel_entries; i++) {
1286 		u16 opcode = le16_to_cpu(cel_entry[i].opcode);
1287 		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
1288 
1289 		if (!cmd) {
1290 			dev_dbg(&cxlm->pdev->dev,
1291 				"Opcode 0x%04x unsupported by driver", opcode);
1292 			continue;
1293 		}
1294 
1295 		set_bit(cmd->info.id, cxlm->enabled_cmds);
1296 	}
1297 }
1298 
1299 struct cxl_mbox_get_supported_logs {
1300 	__le16 entries;
1301 	u8 rsvd[6];
1302 	struct gsl_entry {
1303 		uuid_t uuid;
1304 		__le32 size;
1305 	} __packed entry[];
1306 } __packed;
1307 
1308 static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_mem *cxlm)
1309 {
1310 	struct cxl_mbox_get_supported_logs *ret;
1311 	int rc;
1312 
1313 	ret = kvmalloc(cxlm->payload_size, GFP_KERNEL);
1314 	if (!ret)
1315 		return ERR_PTR(-ENOMEM);
1316 
1317 	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_SUPPORTED_LOGS, NULL,
1318 				   0, ret, cxlm->payload_size);
1319 	if (rc < 0) {
1320 		kvfree(ret);
1321 		return ERR_PTR(rc);
1322 	}
1323 
1324 	return ret;
1325 }
1326 
1327 /**
1328  * cxl_mem_enumerate_cmds() - Enumerate commands for a device.
1329  * @cxlm: The device.
1330  *
1331  * Return: 0 if enumeration completed successfully.
1332  *
1333  * CXL devices have optional support for certain commands. This function will
1334  * determine the set of supported commands for the hardware and update the
1335  * enabled_cmds bitmap in the @cxlm.
1336  */
1337 static int cxl_mem_enumerate_cmds(struct cxl_mem *cxlm)
1338 {
1339 	struct cxl_mbox_get_supported_logs *gsl;
1340 	struct device *dev = &cxlm->pdev->dev;
1341 	struct cxl_mem_command *cmd;
1342 	int i, rc;
1343 
1344 	gsl = cxl_get_gsl(cxlm);
1345 	if (IS_ERR(gsl))
1346 		return PTR_ERR(gsl);
1347 
1348 	rc = -ENOENT;
1349 	for (i = 0; i < le16_to_cpu(gsl->entries); i++) {
1350 		u32 size = le32_to_cpu(gsl->entry[i].size);
1351 		uuid_t uuid = gsl->entry[i].uuid;
1352 		u8 *log;
1353 
1354 		dev_dbg(dev, "Found LOG type %pU of size %d", &uuid, size);
1355 
1356 		if (!uuid_equal(&uuid, &log_uuid[CEL_UUID]))
1357 			continue;
1358 
1359 		log = kvmalloc(size, GFP_KERNEL);
1360 		if (!log) {
1361 			rc = -ENOMEM;
1362 			goto out;
1363 		}
1364 
1365 		rc = cxl_xfer_log(cxlm, &uuid, size, log);
1366 		if (rc) {
1367 			kvfree(log);
1368 			goto out;
1369 		}
1370 
1371 		cxl_walk_cel(cxlm, size, log);
1372 		kvfree(log);
1373 
1374 		/* In case CEL was bogus, enable some default commands. */
1375 		cxl_for_each_cmd(cmd)
1376 			if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
1377 				set_bit(cmd->info.id, cxlm->enabled_cmds);
1378 
1379 		/* Found the required CEL */
1380 		rc = 0;
1381 	}
1382 
1383 out:
1384 	kvfree(gsl);
1385 	return rc;
1386 }
1387 
1388 /**
1389  * cxl_mem_identify() - Send the IDENTIFY command to the device.
1390  * @cxlm: The device to identify.
1391  *
1392  * Return: 0 if identify was executed successfully.
1393  *
1394  * This will dispatch the identify command to the device and on success populate
1395  * structures to be exported to sysfs.
1396  */
1397 static int cxl_mem_identify(struct cxl_mem *cxlm)
1398 {
1399 	struct cxl_mbox_identify {
1400 		char fw_revision[0x10];
1401 		__le64 total_capacity;
1402 		__le64 volatile_capacity;
1403 		__le64 persistent_capacity;
1404 		__le64 partition_align;
1405 		__le16 info_event_log_size;
1406 		__le16 warning_event_log_size;
1407 		__le16 failure_event_log_size;
1408 		__le16 fatal_event_log_size;
1409 		__le32 lsa_size;
1410 		u8 poison_list_max_mer[3];
1411 		__le16 inject_poison_limit;
1412 		u8 poison_caps;
1413 		u8 qos_telemetry_caps;
1414 	} __packed id;
1415 	int rc;
1416 
1417 	rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
1418 				   sizeof(id));
1419 	if (rc < 0)
1420 		return rc;
1421 
1422 	/*
1423 	 * TODO: enumerate DPA map, as 'ram' and 'pmem' do not alias.
1424 	 * For now, only the capacity is exported in sysfs
1425 	 */
1426 	cxlm->ram_range.start = 0;
1427 	cxlm->ram_range.end = le64_to_cpu(id.volatile_capacity) - 1;
1428 
1429 	cxlm->pmem_range.start = 0;
1430 	cxlm->pmem_range.end = le64_to_cpu(id.persistent_capacity) - 1;
1431 
1432 	memcpy(cxlm->firmware_version, id.fw_revision, sizeof(id.fw_revision));
1433 
1434 	return 0;
1435 }
1436 
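/*
 * Probe flow: locate the Register Locator DVSEC, find and map the Memory
 * Device register block, then set up the mailbox, enumerate supported
 * commands, identify the device, and register the memX device on the CXL bus.
 */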
1437 static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
1438 {
1439 	struct device *dev = &pdev->dev;
1440 	struct cxl_mem *cxlm = NULL;
1441 	u32 regloc_size, regblocks;
1442 	int rc, regloc, i;
1443 
1444 	rc = pcim_enable_device(pdev);
1445 	if (rc)
1446 		return rc;
1447 
1448 	regloc = cxl_mem_dvsec(pdev, PCI_DVSEC_ID_CXL_REGLOC_OFFSET);
1449 	if (!regloc) {
1450 		dev_err(dev, "register location dvsec not found\n");
1451 		return -ENXIO;
1452 	}
1453 
1454 	/* Get the size of the Register Locator DVSEC */
1455 	pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
1456 	regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
1457 
1458 	regloc += PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET;
1459 	regblocks = (regloc_size - PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET) / 8;
1460 
1461 	for (i = 0; i < regblocks; i++, regloc += 8) {
1462 		u32 reg_lo, reg_hi;
1463 		u8 reg_type;
1464 
1465 		/* "register low and high" contain other bits */
1466 		pci_read_config_dword(pdev, regloc, &reg_lo);
1467 		pci_read_config_dword(pdev, regloc + 4, &reg_hi);
1468 
1469 		reg_type = FIELD_GET(CXL_REGLOC_RBI_MASK, reg_lo);
1470 
1471 		if (reg_type == CXL_REGLOC_RBI_MEMDEV) {
1472 			cxlm = cxl_mem_create(pdev, reg_lo, reg_hi);
1473 			break;
1474 		}
1475 	}
1476 
1477 	if (!cxlm)
1478 		return -ENODEV;
1479 
1480 	rc = cxl_mem_setup_regs(cxlm);
1481 	if (rc)
1482 		return rc;
1483 
1484 	rc = cxl_mem_setup_mailbox(cxlm);
1485 	if (rc)
1486 		return rc;
1487 
1488 	rc = cxl_mem_enumerate_cmds(cxlm);
1489 	if (rc)
1490 		return rc;
1491 
1492 	rc = cxl_mem_identify(cxlm);
1493 	if (rc)
1494 		return rc;
1495 
1496 	return cxl_mem_add_memdev(cxlm);
1497 }
1498 
1499 static const struct pci_device_id cxl_mem_pci_tbl[] = {
1500 	/* PCI class code for CXL.mem Type-3 Devices */
1501 	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
1502 	{ /* terminate list */ },
1503 };
1504 MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
1505 
1506 static struct pci_driver cxl_mem_driver = {
1507 	.name			= KBUILD_MODNAME,
1508 	.id_table		= cxl_mem_pci_tbl,
1509 	.probe			= cxl_mem_probe,
1510 	.driver	= {
1511 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
1512 	},
1513 };
1514 
1515 static __init int cxl_mem_init(void)
1516 {
1517 	struct dentry *mbox_debugfs;
1518 	dev_t devt;
1519 	int rc;
1520 
1521 	rc = alloc_chrdev_region(&devt, 0, CXL_MEM_MAX_DEVS, "cxl");
1522 	if (rc)
1523 		return rc;
1524 
1525 	cxl_mem_major = MAJOR(devt);
1526 
1527 	rc = pci_register_driver(&cxl_mem_driver);
1528 	if (rc) {
1529 		unregister_chrdev_region(MKDEV(cxl_mem_major, 0),
1530 					 CXL_MEM_MAX_DEVS);
1531 		return rc;
1532 	}
1533 
1534 	cxl_debugfs = debugfs_create_dir("cxl", NULL);
1535 	mbox_debugfs = debugfs_create_dir("mbox", cxl_debugfs);
1536 	debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs,
1537 			    &cxl_raw_allow_all);
1538 
1539 	return 0;
1540 }
1541 
1542 static __exit void cxl_mem_exit(void)
1543 {
1544 	debugfs_remove_recursive(cxl_debugfs);
1545 	pci_unregister_driver(&cxl_mem_driver);
1546 	unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS);
1547 }
1548 
1549 MODULE_LICENSE("GPL v2");
1550 module_init(cxl_mem_init);
1551 module_exit(cxl_mem_exit);
1552 MODULE_IMPORT_NS(CXL);
1553