xref: /openbmc/linux/drivers/nvme/host/core.c (revision a8fe58ce)
1 /*
2  * NVM Express device driver
3  * Copyright (c) 2011-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 
15 #include <linux/blkdev.h>
16 #include <linux/blk-mq.h>
17 #include <linux/delay.h>
18 #include <linux/errno.h>
19 #include <linux/hdreg.h>
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list_sort.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/pr.h>
26 #include <linux/ptrace.h>
27 #include <linux/nvme_ioctl.h>
28 #include <linux/t10-pi.h>
29 #include <scsi/sg.h>
30 #include <asm/unaligned.h>
31 
32 #include "nvme.h"
33 
34 #define NVME_MINORS		(1U << MINORBITS)
35 
36 static int nvme_major;
37 module_param(nvme_major, int, 0);
38 
39 static int nvme_char_major;
40 module_param(nvme_char_major, int, 0);
41 
42 static LIST_HEAD(nvme_ctrl_list);
43 DEFINE_SPINLOCK(dev_list_lock);
44 
45 static struct class *nvme_class;
46 
47 static void nvme_free_ns(struct kref *kref)
48 {
49 	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
50 
51 	if (ns->type == NVME_NS_LIGHTNVM)
52 		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
53 
54 	spin_lock(&dev_list_lock);
55 	ns->disk->private_data = NULL;
56 	spin_unlock(&dev_list_lock);
57 
58 	nvme_put_ctrl(ns->ctrl);
59 	put_disk(ns->disk);
60 	kfree(ns);
61 }
62 
63 static void nvme_put_ns(struct nvme_ns *ns)
64 {
65 	kref_put(&ns->kref, nvme_free_ns);
66 }
67 
68 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
69 {
70 	struct nvme_ns *ns;
71 
72 	spin_lock(&dev_list_lock);
73 	ns = disk->private_data;
74 	if (ns && !kref_get_unless_zero(&ns->kref))
75 		ns = NULL;
76 	spin_unlock(&dev_list_lock);
77 
78 	return ns;
79 }
80 
81 void nvme_requeue_req(struct request *req)
82 {
83 	unsigned long flags;
84 
85 	blk_mq_requeue_request(req);
86 	spin_lock_irqsave(req->q->queue_lock, flags);
87 	if (!blk_queue_stopped(req->q))
88 		blk_mq_kick_requeue_list(req->q);
89 	spin_unlock_irqrestore(req->q->queue_lock, flags);
90 }
91 
92 struct request *nvme_alloc_request(struct request_queue *q,
93 		struct nvme_command *cmd, unsigned int flags)
94 {
95 	bool write = cmd->common.opcode & 1;
96 	struct request *req;
97 
98 	req = blk_mq_alloc_request(q, write, flags);
99 	if (IS_ERR(req))
100 		return req;
101 
102 	req->cmd_type = REQ_TYPE_DRV_PRIV;
103 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
104 	req->__data_len = 0;
105 	req->__sector = (sector_t) -1;
106 	req->bio = req->biotail = NULL;
107 
108 	req->cmd = (unsigned char *)cmd;
109 	req->cmd_len = sizeof(struct nvme_command);
110 	req->special = (void *)0;
111 
112 	return req;
113 }
114 
115 /*
116  * Returns 0 on success.  If the result is negative, it's a Linux error code;
117  * if the result is positive, it's an NVM Express status code
118  */
119 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
120 		void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
121 {
122 	struct request *req;
123 	int ret;
124 
125 	req = nvme_alloc_request(q, cmd, 0);
126 	if (IS_ERR(req))
127 		return PTR_ERR(req);
128 
129 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
130 
131 	if (buffer && bufflen) {
132 		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
133 		if (ret)
134 			goto out;
135 	}
136 
137 	blk_execute_rq(req->q, NULL, req, 0);
138 	if (result)
139 		*result = (u32)(uintptr_t)req->special;
140 	ret = req->errors;
141  out:
142 	blk_mq_free_request(req);
143 	return ret;
144 }
145 
146 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
147 		void *buffer, unsigned bufflen)
148 {
149 	return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
150 }
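/*
 * Example of how a caller is expected to consume the mixed return
 * convention documented above (a sketch only; building "cmd" is elided
 * and "ctrl" stands for any live struct nvme_ctrl):
 *
 *	ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, buf, len);
 *	if (ret < 0)			// Linux error, e.g. -ENOMEM
 *		return ret;
 *	if (ret > 0) {			// NVMe status returned by the device
 *		dev_warn(ctrl->dev, "command failed, status 0x%x\n", ret);
 *		return -EIO;
 *	}
 *	// ret == 0: success, buf holds any returned data
 */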
151 
152 int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
153 		void __user *ubuffer, unsigned bufflen,
154 		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
155 		u32 *result, unsigned timeout)
156 {
157 	bool write = cmd->common.opcode & 1;
158 	struct nvme_ns *ns = q->queuedata;
159 	struct gendisk *disk = ns ? ns->disk : NULL;
160 	struct request *req;
161 	struct bio *bio = NULL;
162 	void *meta = NULL;
163 	int ret;
164 
165 	req = nvme_alloc_request(q, cmd, 0);
166 	if (IS_ERR(req))
167 		return PTR_ERR(req);
168 
169 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
170 
171 	if (ubuffer && bufflen) {
172 		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
173 				GFP_KERNEL);
174 		if (ret)
175 			goto out;
176 		bio = req->bio;
177 
178 		if (!disk)
179 			goto submit;
180 		bio->bi_bdev = bdget_disk(disk, 0);
181 		if (!bio->bi_bdev) {
182 			ret = -ENODEV;
183 			goto out_unmap;
184 		}
185 
186 		if (meta_buffer) {
187 			struct bio_integrity_payload *bip;
188 
189 			meta = kmalloc(meta_len, GFP_KERNEL);
190 			if (!meta) {
191 				ret = -ENOMEM;
192 				goto out_unmap;
193 			}
194 
195 			if (write) {
196 				if (copy_from_user(meta, meta_buffer,
197 						meta_len)) {
198 					ret = -EFAULT;
199 					goto out_free_meta;
200 				}
201 			}
202 
203 			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
204 			if (IS_ERR(bip)) {
205 				ret = PTR_ERR(bip);
206 				goto out_free_meta;
207 			}
208 
209 			bip->bip_iter.bi_size = meta_len;
210 			bip->bip_iter.bi_sector = meta_seed;
211 
212 			ret = bio_integrity_add_page(bio, virt_to_page(meta),
213 					meta_len, offset_in_page(meta));
214 			if (ret != meta_len) {
215 				ret = -ENOMEM;
216 				goto out_free_meta;
217 			}
218 		}
219 	}
220  submit:
221 	blk_execute_rq(req->q, disk, req, 0);
222 	ret = req->errors;
223 	if (result)
224 		*result = (u32)(uintptr_t)req->special;
225 	if (meta && !ret && !write) {
226 		if (copy_to_user(meta_buffer, meta, meta_len))
227 			ret = -EFAULT;
228 	}
229  out_free_meta:
230 	kfree(meta);
231  out_unmap:
232 	if (bio) {
233 		if (disk && bio->bi_bdev)
234 			bdput(bio->bi_bdev);
235 		blk_rq_unmap_user(bio);
236 	}
237  out:
238 	blk_mq_free_request(req);
239 	return ret;
240 }
241 
242 int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
243 		void __user *ubuffer, unsigned bufflen, u32 *result,
244 		unsigned timeout)
245 {
246 	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
247 			result, timeout);
248 }
249 
250 int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
251 {
252 	struct nvme_command c = { };
253 	int error;
254 
255 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
256 	c.identify.opcode = nvme_admin_identify;
257 	c.identify.cns = cpu_to_le32(1);
258 
259 	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
260 	if (!*id)
261 		return -ENOMEM;
262 
263 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
264 			sizeof(struct nvme_id_ctrl));
265 	if (error)
266 		kfree(*id);
267 	return error;
268 }
269 
270 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
271 {
272 	struct nvme_command c = { };
273 
274 	c.identify.opcode = nvme_admin_identify;
275 	c.identify.cns = cpu_to_le32(2);
276 	c.identify.nsid = cpu_to_le32(nsid);
277 	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
278 }
279 
280 int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
281 		struct nvme_id_ns **id)
282 {
283 	struct nvme_command c = { };
284 	int error;
285 
286 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
287 	c.identify.opcode = nvme_admin_identify;
288 	c.identify.nsid = cpu_to_le32(nsid);
289 
290 	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
291 	if (!*id)
292 		return -ENOMEM;
293 
294 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
295 			sizeof(struct nvme_id_ns));
296 	if (error)
297 		kfree(*id);
298 	return error;
299 }
300 
301 int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
302 					dma_addr_t dma_addr, u32 *result)
303 {
304 	struct nvme_command c;
305 
306 	memset(&c, 0, sizeof(c));
307 	c.features.opcode = nvme_admin_get_features;
308 	c.features.nsid = cpu_to_le32(nsid);
309 	c.features.prp1 = cpu_to_le64(dma_addr);
310 	c.features.fid = cpu_to_le32(fid);
311 
312 	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
313 }
314 
315 int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
316 					dma_addr_t dma_addr, u32 *result)
317 {
318 	struct nvme_command c;
319 
320 	memset(&c, 0, sizeof(c));
321 	c.features.opcode = nvme_admin_set_features;
322 	c.features.prp1 = cpu_to_le64(dma_addr);
323 	c.features.fid = cpu_to_le32(fid);
324 	c.features.dword11 = cpu_to_le32(dword11);
325 
326 	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
327 }
328 
329 int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
330 {
331 	struct nvme_command c = { };
332 	int error;
333 
334 	c.common.opcode = nvme_admin_get_log_page;
335 	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
336 	c.common.cdw10[0] = cpu_to_le32(
337 			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
338 			 NVME_LOG_SMART);
339 
340 	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
341 	if (!*log)
342 		return -ENOMEM;
343 
344 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
345 			sizeof(struct nvme_smart_log));
346 	if (error)
347 		kfree(*log);
348 	return error;
349 }
350 
351 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
352 {
353 	u32 q_count = (*count - 1) | ((*count - 1) << 16);
354 	u32 result;
355 	int status, nr_io_queues;
356 
357 	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
358 			&result);
359 	if (status)
360 		return status;
361 
362 	nr_io_queues = min(result & 0xffff, result >> 16) + 1;
363 	*count = min(*count, nr_io_queues);
364 	return 0;
365 }
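/*
 * Worked example of the arithmetic above (numbers are illustrative): asking
 * for *count = 8 encodes q_count = 0x00070007 (zero-based NSQR/NCQR).  If
 * the controller completes with result = 0x00050003 (4 submission and 6
 * completion queues allocated, zero-based), then
 * nr_io_queues = min(3, 5) + 1 = 4 and *count is clamped to 4.
 */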
366 
367 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
368 {
369 	struct nvme_user_io io;
370 	struct nvme_command c;
371 	unsigned length, meta_len;
372 	void __user *metadata;
373 
374 	if (copy_from_user(&io, uio, sizeof(io)))
375 		return -EFAULT;
376 
377 	switch (io.opcode) {
378 	case nvme_cmd_write:
379 	case nvme_cmd_read:
380 	case nvme_cmd_compare:
381 		break;
382 	default:
383 		return -EINVAL;
384 	}
385 
386 	length = (io.nblocks + 1) << ns->lba_shift;
387 	meta_len = (io.nblocks + 1) * ns->ms;
388 	metadata = (void __user *)(uintptr_t)io.metadata;
389 
390 	if (ns->ext) {
391 		length += meta_len;
392 		meta_len = 0;
393 	} else if (meta_len) {
394 		if ((io.metadata & 3) || !io.metadata)
395 			return -EINVAL;
396 	}
397 
398 	memset(&c, 0, sizeof(c));
399 	c.rw.opcode = io.opcode;
400 	c.rw.flags = io.flags;
401 	c.rw.nsid = cpu_to_le32(ns->ns_id);
402 	c.rw.slba = cpu_to_le64(io.slba);
403 	c.rw.length = cpu_to_le16(io.nblocks);
404 	c.rw.control = cpu_to_le16(io.control);
405 	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
406 	c.rw.reftag = cpu_to_le32(io.reftag);
407 	c.rw.apptag = cpu_to_le16(io.apptag);
408 	c.rw.appmask = cpu_to_le16(io.appmask);
409 
410 	return __nvme_submit_user_cmd(ns->queue, &c,
411 			(void __user *)(uintptr_t)io.addr, length,
412 			metadata, meta_len, io.slba, NULL, 0);
413 }
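/*
 * User-space side of NVME_IOCTL_SUBMIT_IO, as a hedged sketch (the device
 * name, LBA 0 and the 4096-byte buffer are assumptions; error handling and
 * close() are omitted):
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nvme_ioctl.h>
 *
 *	int read_first_block(unsigned char *buf4k)
 *	{
 *		struct nvme_user_io io;
 *		int fd = open("/dev/nvme0n1", O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&io, 0, sizeof(io));
 *		io.opcode = 0x02;		// nvme_cmd_read
 *		io.nblocks = 0;			// zero-based: one block
 *		io.slba = 0;			// starting LBA, assumed
 *		io.addr = (unsigned long)buf4k;
 *		return ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
 *	}
 */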
414 
415 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
416 			struct nvme_passthru_cmd __user *ucmd)
417 {
418 	struct nvme_passthru_cmd cmd;
419 	struct nvme_command c;
420 	unsigned timeout = 0;
421 	int status;
422 
423 	if (!capable(CAP_SYS_ADMIN))
424 		return -EACCES;
425 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
426 		return -EFAULT;
427 
428 	memset(&c, 0, sizeof(c));
429 	c.common.opcode = cmd.opcode;
430 	c.common.flags = cmd.flags;
431 	c.common.nsid = cpu_to_le32(cmd.nsid);
432 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
433 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
434 	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
435 	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
436 	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
437 	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
438 	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
439 	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
440 
441 	if (cmd.timeout_ms)
442 		timeout = msecs_to_jiffies(cmd.timeout_ms);
443 
444 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
445 			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
446 			&cmd.result, timeout);
447 	if (status >= 0) {
448 		if (put_user(cmd.result, &ucmd->result))
449 			return -EFAULT;
450 	}
451 
452 	return status;
453 }
454 
455 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
456 		unsigned int cmd, unsigned long arg)
457 {
458 	struct nvme_ns *ns = bdev->bd_disk->private_data;
459 
460 	switch (cmd) {
461 	case NVME_IOCTL_ID:
462 		force_successful_syscall_return();
463 		return ns->ns_id;
464 	case NVME_IOCTL_ADMIN_CMD:
465 		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
466 	case NVME_IOCTL_IO_CMD:
467 		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
468 	case NVME_IOCTL_SUBMIT_IO:
469 		return nvme_submit_io(ns, (void __user *)arg);
470 #ifdef CONFIG_BLK_DEV_NVME_SCSI
471 	case SG_GET_VERSION_NUM:
472 		return nvme_sg_get_version_num((void __user *)arg);
473 	case SG_IO:
474 		return nvme_sg_io(ns, (void __user *)arg);
475 #endif
476 	default:
477 		return -ENOTTY;
478 	}
479 }
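/*
 * User-space side of the admin pass-through path handled above, as a hedged
 * sketch (an Identify Controller command; "/dev/nvme0n1" and the 4096-byte
 * buffer are assumptions):
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nvme_ioctl.h>
 *
 *	int identify_ctrl(void *buf4k)
 *	{
 *		struct nvme_passthru_cmd cmd;
 *		int fd = open("/dev/nvme0n1", O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&cmd, 0, sizeof(cmd));
 *		cmd.opcode = 0x06;		// nvme_admin_identify
 *		cmd.addr = (unsigned long)buf4k;
 *		cmd.data_len = 4096;
 *		cmd.cdw10 = 1;			// CNS = 1: controller data
 *		return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 *	}
 */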
480 
481 #ifdef CONFIG_COMPAT
482 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
483 			unsigned int cmd, unsigned long arg)
484 {
485 	switch (cmd) {
486 	case SG_IO:
487 		return -ENOIOCTLCMD;
488 	}
489 	return nvme_ioctl(bdev, mode, cmd, arg);
490 }
491 #else
492 #define nvme_compat_ioctl	NULL
493 #endif
494 
495 static int nvme_open(struct block_device *bdev, fmode_t mode)
496 {
497 	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
498 }
499 
500 static void nvme_release(struct gendisk *disk, fmode_t mode)
501 {
502 	nvme_put_ns(disk->private_data);
503 }
504 
505 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
506 {
507 	/* some standard values */
508 	geo->heads = 1 << 6;
509 	geo->sectors = 1 << 5;
510 	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
511 	return 0;
512 }
513 
514 #ifdef CONFIG_BLK_DEV_INTEGRITY
515 static void nvme_init_integrity(struct nvme_ns *ns)
516 {
517 	struct blk_integrity integrity;
518 
519 	switch (ns->pi_type) {
520 	case NVME_NS_DPS_PI_TYPE3:
521 		integrity.profile = &t10_pi_type3_crc;
522 		break;
523 	case NVME_NS_DPS_PI_TYPE1:
524 	case NVME_NS_DPS_PI_TYPE2:
525 		integrity.profile = &t10_pi_type1_crc;
526 		break;
527 	default:
528 		integrity.profile = NULL;
529 		break;
530 	}
531 	integrity.tuple_size = ns->ms;
532 	blk_integrity_register(ns->disk, &integrity);
533 	blk_queue_max_integrity_segments(ns->queue, 1);
534 }
535 #else
536 static void nvme_init_integrity(struct nvme_ns *ns)
537 {
538 }
539 #endif /* CONFIG_BLK_DEV_INTEGRITY */
540 
541 static void nvme_config_discard(struct nvme_ns *ns)
542 {
543 	u32 logical_block_size = queue_logical_block_size(ns->queue);
544 	ns->queue->limits.discard_zeroes_data = 0;
545 	ns->queue->limits.discard_alignment = logical_block_size;
546 	ns->queue->limits.discard_granularity = logical_block_size;
547 	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
548 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
549 }
550 
551 static int nvme_revalidate_disk(struct gendisk *disk)
552 {
553 	struct nvme_ns *ns = disk->private_data;
554 	struct nvme_id_ns *id;
555 	u8 lbaf, pi_type;
556 	u16 old_ms;
557 	unsigned short bs;
558 
559 	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
560 		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
561 				__func__, ns->ctrl->instance, ns->ns_id);
562 		return -ENODEV;
563 	}
564 	if (id->ncap == 0) {
565 		kfree(id);
566 		return -ENODEV;
567 	}
568 
569 	if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
570 		if (nvme_nvm_register(ns->queue, disk->disk_name)) {
571 			dev_warn(ns->ctrl->dev,
572 				"%s: LightNVM init failure\n", __func__);
573 			kfree(id);
574 			return -ENODEV;
575 		}
576 		ns->type = NVME_NS_LIGHTNVM;
577 	}
578 
579 	if (ns->ctrl->vs >= NVME_VS(1, 1))
580 		memcpy(ns->eui, id->eui64, sizeof(ns->eui));
581 	if (ns->ctrl->vs >= NVME_VS(1, 2))
582 		memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));
583 
584 	old_ms = ns->ms;
585 	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
586 	ns->lba_shift = id->lbaf[lbaf].ds;
587 	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
588 	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
589 
590 	/*
591 	 * If Identify Namespace did not report a block size, use a default of 512
592 	 * bytes so the block layer has a valid size before it fails I/O on the 0-capacity disk.
593 	 */
594 	if (ns->lba_shift == 0)
595 		ns->lba_shift = 9;
596 	bs = 1 << ns->lba_shift;
597 	/* XXX: the PI implementation requires the metadata size to equal the T10 PI tuple size */
598 	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
599 					id->dps & NVME_NS_DPS_PI_MASK : 0;
600 
601 	blk_mq_freeze_queue(disk->queue);
602 	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
603 				ns->ms != old_ms ||
604 				bs != queue_logical_block_size(disk->queue) ||
605 				(ns->ms && ns->ext)))
606 		blk_integrity_unregister(disk);
607 
608 	ns->pi_type = pi_type;
609 	blk_queue_logical_block_size(ns->queue, bs);
610 
611 	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
612 		nvme_init_integrity(ns);
613 	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
614 		set_capacity(disk, 0);
615 	else
616 		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
617 
618 	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
619 		nvme_config_discard(ns);
620 	blk_mq_unfreeze_queue(disk->queue);
621 
622 	kfree(id);
623 	return 0;
624 }
625 
626 static char nvme_pr_type(enum pr_type type)
627 {
628 	switch (type) {
629 	case PR_WRITE_EXCLUSIVE:
630 		return 1;
631 	case PR_EXCLUSIVE_ACCESS:
632 		return 2;
633 	case PR_WRITE_EXCLUSIVE_REG_ONLY:
634 		return 3;
635 	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
636 		return 4;
637 	case PR_WRITE_EXCLUSIVE_ALL_REGS:
638 		return 5;
639 	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
640 		return 6;
641 	default:
642 		return 0;
643 	}
644 }
645 
646 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
647 				u64 key, u64 sa_key, u8 op)
648 {
649 	struct nvme_ns *ns = bdev->bd_disk->private_data;
650 	struct nvme_command c;
651 	u8 data[16] = { 0, };
652 
653 	put_unaligned_le64(key, &data[0]);
654 	put_unaligned_le64(sa_key, &data[8]);
655 
656 	memset(&c, 0, sizeof(c));
657 	c.common.opcode = op;
658 	c.common.nsid = cpu_to_le32(ns->ns_id);
659 	c.common.cdw10[0] = cpu_to_le32(cdw10);
660 
661 	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
662 }
663 
664 static int nvme_pr_register(struct block_device *bdev, u64 old,
665 		u64 new, unsigned flags)
666 {
667 	u32 cdw10;
668 
669 	if (flags & ~PR_FL_IGNORE_KEY)
670 		return -EOPNOTSUPP;
671 
672 	cdw10 = old ? 2 : 0;
673 	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
674 	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
675 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
676 }
677 
678 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
679 		enum pr_type type, unsigned flags)
680 {
681 	u32 cdw10;
682 
683 	if (flags & ~PR_FL_IGNORE_KEY)
684 		return -EOPNOTSUPP;
685 
686 	cdw10 = nvme_pr_type(type) << 8;
687 	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
688 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
689 }
690 
691 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
692 		enum pr_type type, bool abort)
693 {
694 	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
695 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
696 }
697 
698 static int nvme_pr_clear(struct block_device *bdev, u64 key)
699 {
700 	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
701 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
702 }
703 
704 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
705 {
706 	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
707 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
708 }
709 
710 static const struct pr_ops nvme_pr_ops = {
711 	.pr_register	= nvme_pr_register,
712 	.pr_reserve	= nvme_pr_reserve,
713 	.pr_release	= nvme_pr_release,
714 	.pr_preempt	= nvme_pr_preempt,
715 	.pr_clear	= nvme_pr_clear,
716 };
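/*
 * These ops are reached through the generic persistent reservation ioctls of
 * the block layer.  A hedged user-space sketch (the device name and the
 * reservation key 0x1234 are arbitrary assumptions):
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/pr.h>
 *
 *	int reserve_ns(void)
 *	{
 *		struct pr_registration reg = { .new_key = 0x1234 };
 *		struct pr_reservation rsv = { .key = 0x1234,
 *					      .type = PR_WRITE_EXCLUSIVE };
 *		int fd = open("/dev/nvme0n1", O_RDWR);
 *
 *		if (fd < 0 || ioctl(fd, IOC_PR_REGISTER, &reg))
 *			return -1;
 *		return ioctl(fd, IOC_PR_RESERVE, &rsv);
 *	}
 */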
717 
718 static const struct block_device_operations nvme_fops = {
719 	.owner		= THIS_MODULE,
720 	.ioctl		= nvme_ioctl,
721 	.compat_ioctl	= nvme_compat_ioctl,
722 	.open		= nvme_open,
723 	.release	= nvme_release,
724 	.getgeo		= nvme_getgeo,
725 	.revalidate_disk = nvme_revalidate_disk,
726 	.pr_ops		= &nvme_pr_ops,
727 };
728 
729 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
730 {
731 	unsigned long timeout =
732 		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
733 	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
734 	int ret;
735 
736 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
737 		if ((csts & NVME_CSTS_RDY) == bit)
738 			break;
739 
740 		msleep(100);
741 		if (fatal_signal_pending(current))
742 			return -EINTR;
743 		if (time_after(jiffies, timeout)) {
744 			dev_err(ctrl->dev,
745 				"Device not ready; aborting %s\n", enabled ?
746 						"initialisation" : "reset");
747 			return -ENODEV;
748 		}
749 	}
750 
751 	return ret;
752 }
753 
754 /*
755  * If the device has been passed off to us in an enabled state, just clear
756  * the enabled bit.  The spec says we should set the 'shutdown notification
757  * bits', but doing so may cause the device to complete commands to the
758  * admin queue ... and we don't know what memory that might be pointing at!
759  */
760 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
761 {
762 	int ret;
763 
764 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
765 	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
766 
767 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
768 	if (ret)
769 		return ret;
770 	return nvme_wait_ready(ctrl, cap, false);
771 }
772 
773 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
774 {
775 	/*
776 	 * Default to a 4K page size, with the intention to update this
777 	 * path in the future to accommodate architectures with differing
778 	 * kernel and IO page sizes.
779 	 */
780 	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
781 	int ret;
782 
783 	if (page_shift < dev_page_min) {
784 		dev_err(ctrl->dev,
785 			"Minimum device page size %u too large for host (%u)\n",
786 			1 << dev_page_min, 1 << page_shift);
787 		return -ENODEV;
788 	}
789 
790 	ctrl->page_size = 1 << page_shift;
791 
792 	ctrl->ctrl_config = NVME_CC_CSS_NVM;
793 	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
794 	ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
795 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
796 	ctrl->ctrl_config |= NVME_CC_ENABLE;
797 
798 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
799 	if (ret)
800 		return ret;
801 	return nvme_wait_ready(ctrl, cap, true);
802 }
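/*
 * For the default 4k page size above, the resulting register value is (per
 * the NVMe CC field layout): EN = 1, CSS = 0 (NVM command set),
 * MPS = page_shift - 12 = 0, AMS = 0 (round robin), SHN = 0, IOSQES = 6
 * (64-byte SQ entries) and IOCQES = 4 (16-byte CQ entries), i.e.
 * CC = 0x00460001.
 */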
803 
804 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
805 {
806 	unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
807 	u32 csts;
808 	int ret;
809 
810 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
811 	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
812 
813 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
814 	if (ret)
815 		return ret;
816 
817 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
818 		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
819 			break;
820 
821 		msleep(100);
822 		if (fatal_signal_pending(current))
823 			return -EINTR;
824 		if (time_after(jiffies, timeout)) {
825 			dev_err(ctrl->dev,
826 				"Device shutdown incomplete; abort shutdown\n");
827 			return -ENODEV;
828 		}
829 	}
830 
831 	return ret;
832 }
833 
834 /*
835  * Initialize the cached copies of the Identify data and various controller
836  * registers in our nvme_ctrl structure.  This should be called as soon as
837  * the admin queue is fully up and running.
838  */
839 int nvme_init_identify(struct nvme_ctrl *ctrl)
840 {
841 	struct nvme_id_ctrl *id;
842 	u64 cap;
843 	int ret, page_shift;
844 
845 	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
846 	if (ret) {
847 		dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
848 		return ret;
849 	}
850 
851 	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
852 	if (ret) {
853 		dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
854 		return ret;
855 	}
856 	page_shift = NVME_CAP_MPSMIN(cap) + 12;
857 
858 	if (ctrl->vs >= NVME_VS(1, 1))
859 		ctrl->subsystem = NVME_CAP_NSSRC(cap);
860 
861 	ret = nvme_identify_ctrl(ctrl, &id);
862 	if (ret) {
863 		dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
864 		return -EIO;
865 	}
866 
867 	ctrl->oncs = le16_to_cpup(&id->oncs);
868 	atomic_set(&ctrl->abort_limit, id->acl + 1);
869 	ctrl->vwc = id->vwc;
870 	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
871 	memcpy(ctrl->model, id->mn, sizeof(id->mn));
872 	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
873 	if (id->mdts)
874 		ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
875 	else
876 		ctrl->max_hw_sectors = UINT_MAX;
877 
878 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
879 		unsigned int max_hw_sectors;
880 
881 		ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
882 		max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
883 		if (ctrl->max_hw_sectors) {
884 			ctrl->max_hw_sectors = min(max_hw_sectors,
885 							ctrl->max_hw_sectors);
886 		} else {
887 			ctrl->max_hw_sectors = max_hw_sectors;
888 		}
889 	}
890 
891 	kfree(id);
892 	return 0;
893 }
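/*
 * Worked example for the MDTS handling above (illustrative values): with
 * CAP.MPSMIN = 0 we get page_shift = 12, so id->mdts = 5 means a maximum
 * transfer of 2^5 * 4k = 128k and
 * max_hw_sectors = 1 << (5 + 12 - 9) = 256 512-byte sectors.
 */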
894 
895 static int nvme_dev_open(struct inode *inode, struct file *file)
896 {
897 	struct nvme_ctrl *ctrl;
898 	int instance = iminor(inode);
899 	int ret = -ENODEV;
900 
901 	spin_lock(&dev_list_lock);
902 	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
903 		if (ctrl->instance != instance)
904 			continue;
905 
906 		if (!ctrl->admin_q) {
907 			ret = -EWOULDBLOCK;
908 			break;
909 		}
910 		if (!kref_get_unless_zero(&ctrl->kref))
911 			break;
912 		file->private_data = ctrl;
913 		ret = 0;
914 		break;
915 	}
916 	spin_unlock(&dev_list_lock);
917 
918 	return ret;
919 }
920 
921 static int nvme_dev_release(struct inode *inode, struct file *file)
922 {
923 	nvme_put_ctrl(file->private_data);
924 	return 0;
925 }
926 
927 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
928 {
929 	struct nvme_ns *ns;
930 	int ret;
931 
932 	mutex_lock(&ctrl->namespaces_mutex);
933 	if (list_empty(&ctrl->namespaces)) {
934 		ret = -ENOTTY;
935 		goto out_unlock;
936 	}
937 
938 	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
939 	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
940 		dev_warn(ctrl->dev,
941 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
942 		ret = -EINVAL;
943 		goto out_unlock;
944 	}
945 
946 	dev_warn(ctrl->dev,
947 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
948 	kref_get(&ns->kref);
949 	mutex_unlock(&ctrl->namespaces_mutex);
950 
951 	ret = nvme_user_cmd(ctrl, ns, argp);
952 	nvme_put_ns(ns);
953 	return ret;
954 
955 out_unlock:
956 	mutex_unlock(&ctrl->namespaces_mutex);
957 	return ret;
958 }
959 
960 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
961 		unsigned long arg)
962 {
963 	struct nvme_ctrl *ctrl = file->private_data;
964 	void __user *argp = (void __user *)arg;
965 
966 	switch (cmd) {
967 	case NVME_IOCTL_ADMIN_CMD:
968 		return nvme_user_cmd(ctrl, NULL, argp);
969 	case NVME_IOCTL_IO_CMD:
970 		return nvme_dev_user_cmd(ctrl, argp);
971 	case NVME_IOCTL_RESET:
972 		dev_warn(ctrl->dev, "resetting controller\n");
973 		return ctrl->ops->reset_ctrl(ctrl);
974 	case NVME_IOCTL_SUBSYS_RESET:
975 		return nvme_reset_subsystem(ctrl);
976 	default:
977 		return -ENOTTY;
978 	}
979 }
980 
981 static const struct file_operations nvme_dev_fops = {
982 	.owner		= THIS_MODULE,
983 	.open		= nvme_dev_open,
984 	.release	= nvme_dev_release,
985 	.unlocked_ioctl	= nvme_dev_ioctl,
986 	.compat_ioctl	= nvme_dev_ioctl,
987 };
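/*
 * These file operations back the per-controller character device
 * (/dev/nvme<instance>).  Minimal user-space sketch of the reset path (the
 * device name is an assumption, error handling is omitted):
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nvme_ioctl.h>
 *
 *	int reset_ctrl(void)
 *	{
 *		int fd = open("/dev/nvme0", O_RDWR);
 *
 *		if (fd < 0)
 *			return -1;
 *		return ioctl(fd, NVME_IOCTL_RESET);
 *	}
 */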
988 
989 static ssize_t nvme_sysfs_reset(struct device *dev,
990 				struct device_attribute *attr, const char *buf,
991 				size_t count)
992 {
993 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
994 	int ret;
995 
996 	ret = ctrl->ops->reset_ctrl(ctrl);
997 	if (ret < 0)
998 		return ret;
999 	return count;
1000 }
1001 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
1002 
1003 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
1004 								char *buf)
1005 {
1006 	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1007 	return sprintf(buf, "%pU\n", ns->uuid);
1008 }
1009 static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
1010 
1011 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
1012 								char *buf)
1013 {
1014 	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1015 	return sprintf(buf, "%8phd\n", ns->eui);
1016 }
1017 static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
1018 
1019 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
1020 								char *buf)
1021 {
1022 	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1023 	return sprintf(buf, "%d\n", ns->ns_id);
1024 }
1025 static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
1026 
1027 static struct attribute *nvme_ns_attrs[] = {
1028 	&dev_attr_uuid.attr,
1029 	&dev_attr_eui.attr,
1030 	&dev_attr_nsid.attr,
1031 	NULL,
1032 };
1033 
1034 static umode_t nvme_attrs_are_visible(struct kobject *kobj,
1035 		struct attribute *a, int n)
1036 {
1037 	struct device *dev = container_of(kobj, struct device, kobj);
1038 	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1039 
1040 	if (a == &dev_attr_uuid.attr) {
1041 		if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
1042 			return 0;
1043 	}
1044 	if (a == &dev_attr_eui.attr) {
1045 		if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
1046 			return 0;
1047 	}
1048 	return a->mode;
1049 }
1050 
1051 static const struct attribute_group nvme_ns_attr_group = {
1052 	.attrs		= nvme_ns_attrs,
1053 	.is_visible	= nvme_attrs_are_visible,
1054 };
1055 
1056 #define nvme_show_function(field)						\
1057 static ssize_t  field##_show(struct device *dev,				\
1058 			    struct device_attribute *attr, char *buf)		\
1059 {										\
1060         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
1061         return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field);	\
1062 }										\
1063 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
1064 
1065 nvme_show_function(model);
1066 nvme_show_function(serial);
1067 nvme_show_function(firmware_rev);
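/*
 * For reference, nvme_show_function(model) expands to roughly:
 *
 *	static ssize_t model_show(struct device *dev,
 *			struct device_attribute *attr, char *buf)
 *	{
 *		struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 *		return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->model),
 *				ctrl->model);
 *	}
 *	static DEVICE_ATTR(model, S_IRUGO, model_show, NULL);
 */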
1068 
1069 static struct attribute *nvme_dev_attrs[] = {
1070 	&dev_attr_reset_controller.attr,
1071 	&dev_attr_model.attr,
1072 	&dev_attr_serial.attr,
1073 	&dev_attr_firmware_rev.attr,
1074 	NULL
1075 };
1076 
1077 static struct attribute_group nvme_dev_attrs_group = {
1078 	.attrs = nvme_dev_attrs,
1079 };
1080 
1081 static const struct attribute_group *nvme_dev_attr_groups[] = {
1082 	&nvme_dev_attrs_group,
1083 	NULL,
1084 };
1085 
1086 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
1087 {
1088 	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
1089 	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
1090 
1091 	return nsa->ns_id - nsb->ns_id;
1092 }
1093 
1094 static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1095 {
1096 	struct nvme_ns *ns;
1097 
1098 	lockdep_assert_held(&ctrl->namespaces_mutex);
1099 
1100 	list_for_each_entry(ns, &ctrl->namespaces, list) {
1101 		if (ns->ns_id == nsid)
1102 			return ns;
1103 		if (ns->ns_id > nsid)
1104 			break;
1105 	}
1106 	return NULL;
1107 }
1108 
1109 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1110 {
1111 	struct nvme_ns *ns;
1112 	struct gendisk *disk;
1113 	int node = dev_to_node(ctrl->dev);
1114 
1115 	lockdep_assert_held(&ctrl->namespaces_mutex);
1116 
1117 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
1118 	if (!ns)
1119 		return;
1120 
1121 	ns->queue = blk_mq_init_queue(ctrl->tagset);
1122 	if (IS_ERR(ns->queue))
1123 		goto out_free_ns;
1124 	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
1125 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1126 	ns->queue->queuedata = ns;
1127 	ns->ctrl = ctrl;
1128 
1129 	disk = alloc_disk_node(0, node);
1130 	if (!disk)
1131 		goto out_free_queue;
1132 
1133 	kref_init(&ns->kref);
1134 	ns->ns_id = nsid;
1135 	ns->disk = disk;
1136 	ns->lba_shift = 9; /* default to 512 bytes until the disk is validated */
1137 
1138 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1139 	if (ctrl->max_hw_sectors) {
1140 		blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
1141 		blk_queue_max_segments(ns->queue,
1142 			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
1143 	}
1144 	if (ctrl->stripe_size)
1145 		blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
1146 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1147 		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
1148 	blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
1149 
1150 	disk->major = nvme_major;
1151 	disk->first_minor = 0;
1152 	disk->fops = &nvme_fops;
1153 	disk->private_data = ns;
1154 	disk->queue = ns->queue;
1155 	disk->driverfs_dev = ctrl->device;
1156 	disk->flags = GENHD_FL_EXT_DEVT;
1157 	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
1158 
1159 	if (nvme_revalidate_disk(ns->disk))
1160 		goto out_free_disk;
1161 
1162 	list_add_tail(&ns->list, &ctrl->namespaces);
1163 	kref_get(&ctrl->kref);
1164 	if (ns->type == NVME_NS_LIGHTNVM)
1165 		return;
1166 
1167 	add_disk(ns->disk);
1168 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
1169 					&nvme_ns_attr_group))
1170 		pr_warn("%s: failed to create sysfs group for identification\n",
1171 			ns->disk->disk_name);
1172 	return;
1173  out_free_disk:
1174 	kfree(disk);
1175  out_free_queue:
1176 	blk_cleanup_queue(ns->queue);
1177  out_free_ns:
1178 	kfree(ns);
1179 }
1180 
1181 static void nvme_ns_remove(struct nvme_ns *ns)
1182 {
1183 	bool kill = nvme_io_incapable(ns->ctrl) &&
1184 			!blk_queue_dying(ns->queue);
1185 
1186 	lockdep_assert_held(&ns->ctrl->namespaces_mutex);
1187 
1188 	if (kill) {
1189 		blk_set_queue_dying(ns->queue);
1190 
1191 		/*
1192 		 * The controller was shutdown first if we got here through
1193 		 * device removal. The shutdown may requeue outstanding
1194 		 * requests. These need to be aborted immediately so
1195 		 * del_gendisk doesn't block indefinitely for their completion.
1196 		 */
1197 		blk_mq_abort_requeue_list(ns->queue);
1198 	}
1199 	if (ns->disk->flags & GENHD_FL_UP) {
1200 		if (blk_get_integrity(ns->disk))
1201 			blk_integrity_unregister(ns->disk);
1202 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
1203 					&nvme_ns_attr_group);
1204 		del_gendisk(ns->disk);
1205 	}
1206 	if (kill || !blk_queue_dying(ns->queue)) {
1207 		blk_mq_abort_requeue_list(ns->queue);
1208 		blk_cleanup_queue(ns->queue);
1209 	}
1210 	list_del_init(&ns->list);
1211 	nvme_put_ns(ns);
1212 }
1213 
1214 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1215 {
1216 	struct nvme_ns *ns;
1217 
1218 	ns = nvme_find_ns(ctrl, nsid);
1219 	if (ns) {
1220 		if (revalidate_disk(ns->disk))
1221 			nvme_ns_remove(ns);
1222 	} else
1223 		nvme_alloc_ns(ctrl, nsid);
1224 }
1225 
1226 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
1227 {
1228 	struct nvme_ns *ns;
1229 	__le32 *ns_list;
1230 	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
1231 	int ret = 0;
1232 
1233 	ns_list = kzalloc(0x1000, GFP_KERNEL);
1234 	if (!ns_list)
1235 		return -ENOMEM;
1236 
1237 	for (i = 0; i < num_lists; i++) {
1238 		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
1239 		if (ret)
1240 			goto out;
1241 
1242 		for (j = 0; j < min(nn, 1024U); j++) {
1243 			nsid = le32_to_cpu(ns_list[j]);
1244 			if (!nsid)
1245 				goto out;
1246 
1247 			nvme_validate_ns(ctrl, nsid);
1248 
1249 			while (++prev < nsid) {
1250 				ns = nvme_find_ns(ctrl, prev);
1251 				if (ns)
1252 					nvme_ns_remove(ns);
1253 			}
1254 		}
1255 		nn -= j;
1256 	}
1257  out:
1258 	kfree(ns_list);
1259 	return ret;
1260 }
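/*
 * Example of the list walk above (illustrative numbers): for nn = 1030
 * active namespaces, num_lists = 2; the first Identify (CNS 2, nsid 0)
 * returns up to 1024 NSIDs and the second continues after the last NSID
 * seen.  The "while (++prev < nsid)" loop removes any namespace whose NSID
 * is no longer reported, e.g. one detached between two reported NSIDs.
 */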
1261 
1262 static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
1263 {
1264 	struct nvme_ns *ns, *next;
1265 	unsigned i;
1266 
1267 	lockdep_assert_held(&ctrl->namespaces_mutex);
1268 
1269 	for (i = 1; i <= nn; i++)
1270 		nvme_validate_ns(ctrl, i);
1271 
1272 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
1273 		if (ns->ns_id > nn)
1274 			nvme_ns_remove(ns);
1275 	}
1276 }
1277 
1278 void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
1279 {
1280 	struct nvme_id_ctrl *id;
1281 	unsigned nn;
1282 
1283 	if (nvme_identify_ctrl(ctrl, &id))
1284 		return;
1285 
1286 	mutex_lock(&ctrl->namespaces_mutex);
1287 	nn = le32_to_cpu(id->nn);
1288 	if (ctrl->vs >= NVME_VS(1, 1) &&
1289 	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
1290 		if (!nvme_scan_ns_list(ctrl, nn))
1291 			goto done;
1292 	}
1293 	__nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
1294  done:
1295 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
1296 	mutex_unlock(&ctrl->namespaces_mutex);
1297 	kfree(id);
1298 }
1299 
1300 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
1301 {
1302 	struct nvme_ns *ns, *next;
1303 
1304 	mutex_lock(&ctrl->namespaces_mutex);
1305 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
1306 		nvme_ns_remove(ns);
1307 	mutex_unlock(&ctrl->namespaces_mutex);
1308 }
1309 
1310 static DEFINE_IDA(nvme_instance_ida);
1311 
1312 static int nvme_set_instance(struct nvme_ctrl *ctrl)
1313 {
1314 	int instance, error;
1315 
1316 	do {
1317 		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
1318 			return -ENODEV;
1319 
1320 		spin_lock(&dev_list_lock);
1321 		error = ida_get_new(&nvme_instance_ida, &instance);
1322 		spin_unlock(&dev_list_lock);
1323 	} while (error == -EAGAIN);
1324 
1325 	if (error)
1326 		return -ENODEV;
1327 
1328 	ctrl->instance = instance;
1329 	return 0;
1330 }
1331 
1332 static void nvme_release_instance(struct nvme_ctrl *ctrl)
1333 {
1334 	spin_lock(&dev_list_lock);
1335 	ida_remove(&nvme_instance_ida, ctrl->instance);
1336 	spin_unlock(&dev_list_lock);
1337 }
1338 
1339 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
1340 {
1341 	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
1342 
1343 	spin_lock(&dev_list_lock);
1344 	list_del(&ctrl->node);
1345 	spin_unlock(&dev_list_lock);
1346 }
1347 
1348 static void nvme_free_ctrl(struct kref *kref)
1349 {
1350 	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
1351 
1352 	put_device(ctrl->device);
1353 	nvme_release_instance(ctrl);
1354 
1355 	ctrl->ops->free_ctrl(ctrl);
1356 }
1357 
1358 void nvme_put_ctrl(struct nvme_ctrl *ctrl)
1359 {
1360 	kref_put(&ctrl->kref, nvme_free_ctrl);
1361 }
1362 
1363 /*
1364  * Initialize an NVMe controller structure.  This needs to be called during
1365  * the earliest initialization so that we have the initialized structure around
1366  * during probing.
1367  */
1368 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
1369 		const struct nvme_ctrl_ops *ops, unsigned long quirks)
1370 {
1371 	int ret;
1372 
1373 	INIT_LIST_HEAD(&ctrl->namespaces);
1374 	mutex_init(&ctrl->namespaces_mutex);
1375 	kref_init(&ctrl->kref);
1376 	ctrl->dev = dev;
1377 	ctrl->ops = ops;
1378 	ctrl->quirks = quirks;
1379 
1380 	ret = nvme_set_instance(ctrl);
1381 	if (ret)
1382 		goto out;
1383 
1384 	ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
1385 				MKDEV(nvme_char_major, ctrl->instance),
1386 				dev, nvme_dev_attr_groups,
1387 				"nvme%d", ctrl->instance);
1388 	if (IS_ERR(ctrl->device)) {
1389 		ret = PTR_ERR(ctrl->device);
1390 		goto out_release_instance;
1391 	}
1392 	get_device(ctrl->device);
1393 	dev_set_drvdata(ctrl->device, ctrl);
1394 
1395 	spin_lock(&dev_list_lock);
1396 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
1397 	spin_unlock(&dev_list_lock);
1398 
1399 	return 0;
1400 out_release_instance:
1401 	nvme_release_instance(ctrl);
1402 out:
1403 	return ret;
1404 }
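/*
 * A transport driver (PCIe, fabrics, ...) is expected to embed a struct
 * nvme_ctrl, implement the ops used by this file and register it here.  A
 * hedged sketch (my_dev, my_reg_read32 etc. are hypothetical names; only the
 * callbacks actually invoked in this file are shown, quirks are assumed 0):
 *
 *	static const struct nvme_ctrl_ops my_ctrl_ops = {
 *		.reg_read32	= my_reg_read32,
 *		.reg_write32	= my_reg_write32,
 *		.reg_read64	= my_reg_read64,
 *		.reset_ctrl	= my_reset_ctrl,
 *		.free_ctrl	= my_free_ctrl,
 *	};
 *
 *	ret = nvme_init_ctrl(&my_dev->ctrl, &pdev->dev, &my_ctrl_ops, 0);
 *	if (ret)
 *		goto out_disable;
 */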
1405 
1406 void nvme_stop_queues(struct nvme_ctrl *ctrl)
1407 {
1408 	struct nvme_ns *ns;
1409 
1410 	mutex_lock(&ctrl->namespaces_mutex);
1411 	list_for_each_entry(ns, &ctrl->namespaces, list) {
1412 		spin_lock_irq(ns->queue->queue_lock);
1413 		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
1414 		spin_unlock_irq(ns->queue->queue_lock);
1415 
1416 		blk_mq_cancel_requeue_work(ns->queue);
1417 		blk_mq_stop_hw_queues(ns->queue);
1418 	}
1419 	mutex_unlock(&ctrl->namespaces_mutex);
1420 }
1421 
1422 void nvme_start_queues(struct nvme_ctrl *ctrl)
1423 {
1424 	struct nvme_ns *ns;
1425 
1426 	mutex_lock(&ctrl->namespaces_mutex);
1427 	list_for_each_entry(ns, &ctrl->namespaces, list) {
1428 		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
1429 		blk_mq_start_stopped_hw_queues(ns->queue, true);
1430 		blk_mq_kick_requeue_list(ns->queue);
1431 	}
1432 	mutex_unlock(&ctrl->namespaces_mutex);
1433 }
1434 
1435 int __init nvme_core_init(void)
1436 {
1437 	int result;
1438 
1439 	result = register_blkdev(nvme_major, "nvme");
1440 	if (result < 0)
1441 		return result;
1442 	else if (result > 0)
1443 		nvme_major = result;
1444 
1445 	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
1446 							&nvme_dev_fops);
1447 	if (result < 0)
1448 		goto unregister_blkdev;
1449 	else if (result > 0)
1450 		nvme_char_major = result;
1451 
1452 	nvme_class = class_create(THIS_MODULE, "nvme");
1453 	if (IS_ERR(nvme_class)) {
1454 		result = PTR_ERR(nvme_class);
1455 		goto unregister_chrdev;
1456 	}
1457 
1458 	return 0;
1459 
1460  unregister_chrdev:
1461 	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1462  unregister_blkdev:
1463 	unregister_blkdev(nvme_major, "nvme");
1464 	return result;
1465 }
1466 
1467 void nvme_core_exit(void)
1468 {
1469 	unregister_blkdev(nvme_major, "nvme");
1470 	class_destroy(nvme_class);
1471 	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
1472 }
1473