xref: /openbmc/linux/drivers/nvme/host/core.c (revision 8ee90c5c)
1 /*
2  * NVM Express device driver
3  * Copyright (c) 2011-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 
15 #include <linux/blkdev.h>
16 #include <linux/blk-mq.h>
17 #include <linux/delay.h>
18 #include <linux/errno.h>
19 #include <linux/hdreg.h>
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list_sort.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/pr.h>
26 #include <linux/ptrace.h>
27 #include <linux/nvme_ioctl.h>
28 #include <linux/t10-pi.h>
29 #include <linux/pm_qos.h>
30 #include <asm/unaligned.h>
31 
32 #include "nvme.h"
33 #include "fabrics.h"
34 
35 #define NVME_MINORS		(1U << MINORBITS)
36 
37 unsigned char admin_timeout = 60;
38 module_param(admin_timeout, byte, 0644);
39 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
40 EXPORT_SYMBOL_GPL(admin_timeout);
41 
42 unsigned char nvme_io_timeout = 30;
43 module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
44 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
45 EXPORT_SYMBOL_GPL(nvme_io_timeout);
46 
47 static unsigned char shutdown_timeout = 5;
48 module_param(shutdown_timeout, byte, 0644);
49 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
50 
51 static u8 nvme_max_retries = 5;
52 module_param_named(max_retries, nvme_max_retries, byte, 0644);
53 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
54 
55 static int nvme_char_major;
56 module_param(nvme_char_major, int, 0);
57 
58 static unsigned long default_ps_max_latency_us = 100000;
59 module_param(default_ps_max_latency_us, ulong, 0644);
60 MODULE_PARM_DESC(default_ps_max_latency_us,
61 		 "max power saving latency for new devices; use PM QOS to change per device");
62 
63 static bool force_apst;
64 module_param(force_apst, bool, 0644);
65 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
66 
67 static bool streams;
68 module_param(streams, bool, 0644);
69 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
70 
71 struct workqueue_struct *nvme_wq;
72 EXPORT_SYMBOL_GPL(nvme_wq);
73 
74 static LIST_HEAD(nvme_ctrl_list);
75 static DEFINE_SPINLOCK(dev_list_lock);
76 
77 static struct class *nvme_class;
78 
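/*
 * Build Get Log Page dword 10: number of dwords to transfer (0's based) in
 * the upper 16 bits, log page identifier in the low byte.
 */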
79 static __le32 nvme_get_log_dw10(u8 lid, size_t size)
80 {
81 	return cpu_to_le32((((size / 4) - 1) << 16) | lid);
82 }
83 
84 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
85 {
86 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
87 		return -EBUSY;
88 	if (!queue_work(nvme_wq, &ctrl->reset_work))
89 		return -EBUSY;
90 	return 0;
91 }
92 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
93 
94 static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
95 {
96 	int ret;
97 
98 	ret = nvme_reset_ctrl(ctrl);
99 	if (!ret)
100 		flush_work(&ctrl->reset_work);
101 	return ret;
102 }
103 
104 static blk_status_t nvme_error_status(struct request *req)
105 {
106 	switch (nvme_req(req)->status & 0x7ff) {
107 	case NVME_SC_SUCCESS:
108 		return BLK_STS_OK;
109 	case NVME_SC_CAP_EXCEEDED:
110 		return BLK_STS_NOSPC;
111 	case NVME_SC_ONCS_NOT_SUPPORTED:
112 		return BLK_STS_NOTSUPP;
113 	case NVME_SC_WRITE_FAULT:
114 	case NVME_SC_READ_ERROR:
115 	case NVME_SC_UNWRITTEN_BLOCK:
116 	case NVME_SC_ACCESS_DENIED:
117 	case NVME_SC_READ_ONLY:
118 		return BLK_STS_MEDIUM;
119 	case NVME_SC_GUARD_CHECK:
120 	case NVME_SC_APPTAG_CHECK:
121 	case NVME_SC_REFTAG_CHECK:
122 	case NVME_SC_INVALID_PI:
123 		return BLK_STS_PROTECTION;
124 	case NVME_SC_RESERVATION_CONFLICT:
125 		return BLK_STS_NEXUS;
126 	default:
127 		return BLK_STS_IOERR;
128 	}
129 }
130 
131 static inline bool nvme_req_needs_retry(struct request *req)
132 {
133 	if (blk_noretry_request(req))
134 		return false;
135 	if (nvme_req(req)->status & NVME_SC_DNR)
136 		return false;
137 	if (nvme_req(req)->retries >= nvme_max_retries)
138 		return false;
139 	return true;
140 }
141 
142 void nvme_complete_rq(struct request *req)
143 {
144 	if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
145 		nvme_req(req)->retries++;
146 		blk_mq_requeue_request(req, true);
147 		return;
148 	}
149 
150 	blk_mq_end_request(req, nvme_error_status(req));
151 }
152 EXPORT_SYMBOL_GPL(nvme_complete_rq);
153 
154 void nvme_cancel_request(struct request *req, void *data, bool reserved)
155 {
156 	int status;
157 
158 	if (!blk_mq_request_started(req))
159 		return;
160 
161 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
162 				"Cancelling I/O %d", req->tag);
163 
164 	status = NVME_SC_ABORT_REQ;
165 	if (blk_queue_dying(req->q))
166 		status |= NVME_SC_DNR;
167 	nvme_req(req)->status = status;
168 	blk_mq_complete_request(req);
169 
170 }
171 EXPORT_SYMBOL_GPL(nvme_cancel_request);
172 
173 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
174 		enum nvme_ctrl_state new_state)
175 {
176 	enum nvme_ctrl_state old_state;
177 	unsigned long flags;
178 	bool changed = false;
179 
180 	spin_lock_irqsave(&ctrl->lock, flags);
181 
182 	old_state = ctrl->state;
183 	switch (new_state) {
184 	case NVME_CTRL_LIVE:
185 		switch (old_state) {
186 		case NVME_CTRL_NEW:
187 		case NVME_CTRL_RESETTING:
188 		case NVME_CTRL_RECONNECTING:
189 			changed = true;
190 			/* FALLTHRU */
191 		default:
192 			break;
193 		}
194 		break;
195 	case NVME_CTRL_RESETTING:
196 		switch (old_state) {
197 		case NVME_CTRL_NEW:
198 		case NVME_CTRL_LIVE:
199 			changed = true;
200 			/* FALLTHRU */
201 		default:
202 			break;
203 		}
204 		break;
205 	case NVME_CTRL_RECONNECTING:
206 		switch (old_state) {
207 		case NVME_CTRL_LIVE:
208 			changed = true;
209 			/* FALLTHRU */
210 		default:
211 			break;
212 		}
213 		break;
214 	case NVME_CTRL_DELETING:
215 		switch (old_state) {
216 		case NVME_CTRL_LIVE:
217 		case NVME_CTRL_RESETTING:
218 		case NVME_CTRL_RECONNECTING:
219 			changed = true;
220 			/* FALLTHRU */
221 		default:
222 			break;
223 		}
224 		break;
225 	case NVME_CTRL_DEAD:
226 		switch (old_state) {
227 		case NVME_CTRL_DELETING:
228 			changed = true;
229 			/* FALLTHRU */
230 		default:
231 			break;
232 		}
233 		break;
234 	default:
235 		break;
236 	}
237 
238 	if (changed)
239 		ctrl->state = new_state;
240 
241 	spin_unlock_irqrestore(&ctrl->lock, flags);
242 
243 	return changed;
244 }
245 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
246 
247 static void nvme_free_ns(struct kref *kref)
248 {
249 	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
250 
251 	if (ns->ndev)
252 		nvme_nvm_unregister(ns);
253 
254 	if (ns->disk) {
255 		spin_lock(&dev_list_lock);
256 		ns->disk->private_data = NULL;
257 		spin_unlock(&dev_list_lock);
258 	}
259 
260 	put_disk(ns->disk);
261 	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
262 	nvme_put_ctrl(ns->ctrl);
263 	kfree(ns);
264 }
265 
266 static void nvme_put_ns(struct nvme_ns *ns)
267 {
268 	kref_put(&ns->kref, nvme_free_ns);
269 }
270 
271 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
272 {
273 	struct nvme_ns *ns;
274 
275 	spin_lock(&dev_list_lock);
276 	ns = disk->private_data;
277 	if (ns) {
278 		if (!kref_get_unless_zero(&ns->kref))
279 			goto fail;
280 		if (!try_module_get(ns->ctrl->ops->module))
281 			goto fail_put_ns;
282 	}
283 	spin_unlock(&dev_list_lock);
284 
285 	return ns;
286 
287 fail_put_ns:
288 	kref_put(&ns->kref, nvme_free_ns);
289 fail:
290 	spin_unlock(&dev_list_lock);
291 	return NULL;
292 }
293 
294 struct request *nvme_alloc_request(struct request_queue *q,
295 		struct nvme_command *cmd, unsigned int flags, int qid)
296 {
297 	unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
298 	struct request *req;
299 
300 	if (qid == NVME_QID_ANY) {
301 		req = blk_mq_alloc_request(q, op, flags);
302 	} else {
303 		req = blk_mq_alloc_request_hctx(q, op, flags,
304 				qid ? qid - 1 : 0);
305 	}
306 	if (IS_ERR(req))
307 		return req;
308 
309 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
310 	nvme_req(req)->cmd = cmd;
311 
312 	return req;
313 }
314 EXPORT_SYMBOL_GPL(nvme_alloc_request);
315 
316 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
317 {
318 	struct nvme_command c;
319 
320 	memset(&c, 0, sizeof(c));
321 
322 	c.directive.opcode = nvme_admin_directive_send;
323 	c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
324 	c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
325 	c.directive.dtype = NVME_DIR_IDENTIFY;
326 	c.directive.tdtype = NVME_DIR_STREAMS;
327 	c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
328 
329 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
330 }
331 
332 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
333 {
334 	return nvme_toggle_streams(ctrl, false);
335 }
336 
337 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
338 {
339 	return nvme_toggle_streams(ctrl, true);
340 }
341 
342 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
343 				  struct streams_directive_params *s, u32 nsid)
344 {
345 	struct nvme_command c;
346 
347 	memset(&c, 0, sizeof(c));
348 	memset(s, 0, sizeof(*s));
349 
350 	c.directive.opcode = nvme_admin_directive_recv;
351 	c.directive.nsid = cpu_to_le32(nsid);
352 	c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
353 	c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
354 	c.directive.dtype = NVME_DIR_STREAMS;
355 
356 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
357 }
358 
359 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
360 {
361 	struct streams_directive_params s;
362 	int ret;
363 
364 	if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
365 		return 0;
366 	if (!streams)
367 		return 0;
368 
369 	ret = nvme_enable_streams(ctrl);
370 	if (ret)
371 		return ret;
372 
373 	ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
374 	if (ret)
375 		return ret;
376 
377 	ctrl->nssa = le16_to_cpu(s.nssa);
378 	if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
379 		dev_info(ctrl->device, "too few streams (%u) available\n",
380 					ctrl->nssa);
381 		nvme_disable_streams(ctrl);
382 		return 0;
383 	}
384 
385 	ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
386 	dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
387 	return 0;
388 }
389 
390 /*
391  * Check if 'req' has a write hint associated with it. If it does, assign
392  * a valid namespace stream to the write.
393  */
394 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
395 				     struct request *req, u16 *control,
396 				     u32 *dsmgmt)
397 {
398 	enum rw_hint streamid = req->write_hint;
399 
400 	if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
401 		streamid = 0;
402 	else {
403 		streamid--;
404 		if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
405 			return;
406 
407 		*control |= NVME_RW_DTYPE_STREAMS;
408 		*dsmgmt |= streamid << 16;
409 	}
410 
411 	if (streamid < ARRAY_SIZE(req->q->write_hints))
412 		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
413 }
414 
415 static inline void nvme_setup_flush(struct nvme_ns *ns,
416 		struct nvme_command *cmnd)
417 {
418 	memset(cmnd, 0, sizeof(*cmnd));
419 	cmnd->common.opcode = nvme_cmd_flush;
420 	cmnd->common.nsid = cpu_to_le32(ns->ns_id);
421 }
422 
423 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
424 		struct nvme_command *cmnd)
425 {
426 	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
427 	struct nvme_dsm_range *range;
428 	struct bio *bio;
429 
430 	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
431 	if (!range)
432 		return BLK_STS_RESOURCE;
433 
434 	__rq_for_each_bio(bio, req) {
435 		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
436 		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
437 
438 		range[n].cattr = cpu_to_le32(0);
439 		range[n].nlb = cpu_to_le32(nlb);
440 		range[n].slba = cpu_to_le64(slba);
441 		n++;
442 	}
443 
444 	if (WARN_ON_ONCE(n != segments)) {
445 		kfree(range);
446 		return BLK_STS_IOERR;
447 	}
448 
449 	memset(cmnd, 0, sizeof(*cmnd));
450 	cmnd->dsm.opcode = nvme_cmd_dsm;
451 	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
452 	cmnd->dsm.nr = cpu_to_le32(segments - 1);
453 	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
454 
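	/*
	 * Attach the range array as a "special" payload: the transport maps it
	 * as the command's data buffer and the driver frees it when the
	 * request completes.
	 */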
455 	req->special_vec.bv_page = virt_to_page(range);
456 	req->special_vec.bv_offset = offset_in_page(range);
457 	req->special_vec.bv_len = sizeof(*range) * segments;
458 	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
459 
460 	return BLK_STS_OK;
461 }
462 
463 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
464 		struct request *req, struct nvme_command *cmnd)
465 {
466 	struct nvme_ctrl *ctrl = ns->ctrl;
467 	u16 control = 0;
468 	u32 dsmgmt = 0;
469 
470 	/*
471 	 * If formatted with metadata, require that the block layer provide a buffer
472 	 * unless this namespace is formatted such that the metadata can be
473 	 * stripped/generated by the controller with PRACT=1.
474 	 */
475 	if (ns && ns->ms &&
476 	    (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
477 	    !blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
478 		return BLK_STS_NOTSUPP;
479 
480 	if (req->cmd_flags & REQ_FUA)
481 		control |= NVME_RW_FUA;
482 	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
483 		control |= NVME_RW_LR;
484 
485 	if (req->cmd_flags & REQ_RAHEAD)
486 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
487 
488 	memset(cmnd, 0, sizeof(*cmnd));
489 	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
490 	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
491 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
492 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
493 
494 	if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
495 		nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
496 
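	/*
	 * For namespaces formatted with protection information, enable the
	 * matching check bits: Types 1 and 2 also check the reference tag,
	 * Type 3 only the guard tag.  Without an integrity payload attached,
	 * PRACT=1 lets the controller generate and strip the PI itself.
	 */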
497 	if (ns->ms) {
498 		switch (ns->pi_type) {
499 		case NVME_NS_DPS_PI_TYPE3:
500 			control |= NVME_RW_PRINFO_PRCHK_GUARD;
501 			break;
502 		case NVME_NS_DPS_PI_TYPE1:
503 		case NVME_NS_DPS_PI_TYPE2:
504 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
505 					NVME_RW_PRINFO_PRCHK_REF;
506 			cmnd->rw.reftag = cpu_to_le32(
507 					nvme_block_nr(ns, blk_rq_pos(req)));
508 			break;
509 		}
510 		if (!blk_integrity_rq(req))
511 			control |= NVME_RW_PRINFO_PRACT;
512 	}
513 
514 	cmnd->rw.control = cpu_to_le16(control);
515 	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
516 	return 0;
517 }
518 
519 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
520 		struct nvme_command *cmd)
521 {
522 	blk_status_t ret = BLK_STS_OK;
523 
524 	if (!(req->rq_flags & RQF_DONTPREP)) {
525 		nvme_req(req)->retries = 0;
526 		nvme_req(req)->flags = 0;
527 		req->rq_flags |= RQF_DONTPREP;
528 	}
529 
530 	switch (req_op(req)) {
531 	case REQ_OP_DRV_IN:
532 	case REQ_OP_DRV_OUT:
533 		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
534 		break;
535 	case REQ_OP_FLUSH:
536 		nvme_setup_flush(ns, cmd);
537 		break;
538 	case REQ_OP_WRITE_ZEROES:
539 		/* currently only aliased to deallocate for a few ctrls: */
540 	case REQ_OP_DISCARD:
541 		ret = nvme_setup_discard(ns, req, cmd);
542 		break;
543 	case REQ_OP_READ:
544 	case REQ_OP_WRITE:
545 		ret = nvme_setup_rw(ns, req, cmd);
546 		break;
547 	default:
548 		WARN_ON_ONCE(1);
549 		return BLK_STS_IOERR;
550 	}
551 
552 	cmd->common.command_id = req->tag;
553 	return ret;
554 }
555 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
556 
557 /*
558  * Returns 0 on success.  If the result is negative, it's a Linux error code;
559  * if the result is positive, it's an NVM Express status code.
560  */
561 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
562 		union nvme_result *result, void *buffer, unsigned bufflen,
563 		unsigned timeout, int qid, int at_head, int flags)
564 {
565 	struct request *req;
566 	int ret;
567 
568 	req = nvme_alloc_request(q, cmd, flags, qid);
569 	if (IS_ERR(req))
570 		return PTR_ERR(req);
571 
572 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
573 
574 	if (buffer && bufflen) {
575 		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
576 		if (ret)
577 			goto out;
578 	}
579 
580 	blk_execute_rq(req->q, NULL, req, at_head);
581 	if (result)
582 		*result = nvme_req(req)->result;
583 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
584 		ret = -EINTR;
585 	else
586 		ret = nvme_req(req)->status;
587  out:
588 	blk_mq_free_request(req);
589 	return ret;
590 }
591 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
592 
593 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
594 		void *buffer, unsigned bufflen)
595 {
596 	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
597 			NVME_QID_ANY, 0, 0);
598 }
599 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
600 
601 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
602 		unsigned len, u32 seed, bool write)
603 {
604 	struct bio_integrity_payload *bip;
605 	int ret = -ENOMEM;
606 	void *buf;
607 
608 	buf = kmalloc(len, GFP_KERNEL);
609 	if (!buf)
610 		goto out;
611 
612 	ret = -EFAULT;
613 	if (write && copy_from_user(buf, ubuf, len))
614 		goto out_free_meta;
615 
616 	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
617 	if (IS_ERR(bip)) {
618 		ret = PTR_ERR(bip);
619 		goto out_free_meta;
620 	}
621 
622 	bip->bip_iter.bi_size = len;
623 	bip->bip_iter.bi_sector = seed;
624 	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
625 			offset_in_page(buf));
626 	if (ret == len)
627 		return buf;
628 	ret = -ENOMEM;
629 out_free_meta:
630 	kfree(buf);
631 out:
632 	return ERR_PTR(ret);
633 }
634 
635 static int nvme_submit_user_cmd(struct request_queue *q,
636 		struct nvme_command *cmd, void __user *ubuffer,
637 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
638 		u32 meta_seed, u32 *result, unsigned timeout)
639 {
640 	bool write = nvme_is_write(cmd);
641 	struct nvme_ns *ns = q->queuedata;
642 	struct gendisk *disk = ns ? ns->disk : NULL;
643 	struct request *req;
644 	struct bio *bio = NULL;
645 	void *meta = NULL;
646 	int ret;
647 
648 	req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
649 	if (IS_ERR(req))
650 		return PTR_ERR(req);
651 
652 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
653 
654 	if (ubuffer && bufflen) {
655 		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
656 				GFP_KERNEL);
657 		if (ret)
658 			goto out;
659 		bio = req->bio;
660 		bio->bi_disk = disk;
661 		if (disk && meta_buffer && meta_len) {
662 			meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
663 					meta_seed, write);
664 			if (IS_ERR(meta)) {
665 				ret = PTR_ERR(meta);
666 				goto out_unmap;
667 			}
668 		}
669 	}
670 
671 	blk_execute_rq(req->q, disk, req, 0);
672 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
673 		ret = -EINTR;
674 	else
675 		ret = nvme_req(req)->status;
676 	if (result)
677 		*result = le32_to_cpu(nvme_req(req)->result.u32);
678 	if (meta && !ret && !write) {
679 		if (copy_to_user(meta_buffer, meta, meta_len))
680 			ret = -EFAULT;
681 	}
682 	kfree(meta);
683  out_unmap:
684 	if (bio)
685 		blk_rq_unmap_user(bio);
686  out:
687 	blk_mq_free_request(req);
688 	return ret;
689 }
690 
691 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
692 {
693 	struct nvme_ctrl *ctrl = rq->end_io_data;
694 
695 	blk_mq_free_request(rq);
696 
697 	if (status) {
698 		dev_err(ctrl->device,
699 			"failed nvme_keep_alive_end_io error=%d\n",
700 				status);
701 		return;
702 	}
703 
704 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
705 }
706 
707 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
708 {
709 	struct nvme_command c;
710 	struct request *rq;
711 
712 	memset(&c, 0, sizeof(c));
713 	c.common.opcode = nvme_admin_keep_alive;
714 
715 	rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
716 			NVME_QID_ANY);
717 	if (IS_ERR(rq))
718 		return PTR_ERR(rq);
719 
720 	rq->timeout = ctrl->kato * HZ;
721 	rq->end_io_data = ctrl;
722 
723 	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
724 
725 	return 0;
726 }
727 
728 static void nvme_keep_alive_work(struct work_struct *work)
729 {
730 	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
731 			struct nvme_ctrl, ka_work);
732 
733 	if (nvme_keep_alive(ctrl)) {
734 		/* allocation failure, reset the controller */
735 		dev_err(ctrl->device, "keep-alive failed\n");
736 		nvme_reset_ctrl(ctrl);
737 		return;
738 	}
739 }
740 
741 void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
742 {
743 	if (unlikely(ctrl->kato == 0))
744 		return;
745 
746 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
747 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
748 }
749 EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
750 
751 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
752 {
753 	if (unlikely(ctrl->kato == 0))
754 		return;
755 
756 	cancel_delayed_work_sync(&ctrl->ka_work);
757 }
758 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
759 
760 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
761 {
762 	struct nvme_command c = { };
763 	int error;
764 
765 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
766 	c.identify.opcode = nvme_admin_identify;
767 	c.identify.cns = NVME_ID_CNS_CTRL;
768 
769 	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
770 	if (!*id)
771 		return -ENOMEM;
772 
773 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
774 			sizeof(struct nvme_id_ctrl));
775 	if (error)
776 		kfree(*id);
777 	return error;
778 }
779 
780 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
781 		u8 *eui64, u8 *nguid, uuid_t *uuid)
782 {
783 	struct nvme_command c = { };
784 	int status;
785 	void *data;
786 	int pos;
787 	int len;
788 
789 	c.identify.opcode = nvme_admin_identify;
790 	c.identify.nsid = cpu_to_le32(nsid);
791 	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
792 
793 	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
794 	if (!data)
795 		return -ENOMEM;
796 
797 	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
798 				      NVME_IDENTIFY_DATA_SIZE);
799 	if (status)
800 		goto free_data;
801 
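	/*
	 * Walk the Namespace Identification Descriptor list: each entry is a
	 * 4-byte header (type, length) followed by nidl bytes of identifier,
	 * and a zero-length descriptor terminates the list.
	 */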
802 	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
803 		struct nvme_ns_id_desc *cur = data + pos;
804 
805 		if (cur->nidl == 0)
806 			break;
807 
808 		switch (cur->nidt) {
809 		case NVME_NIDT_EUI64:
810 			if (cur->nidl != NVME_NIDT_EUI64_LEN) {
811 				dev_warn(ctrl->device,
812 					 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
813 					 cur->nidl);
814 				goto free_data;
815 			}
816 			len = NVME_NIDT_EUI64_LEN;
817 			memcpy(eui64, data + pos + sizeof(*cur), len);
818 			break;
819 		case NVME_NIDT_NGUID:
820 			if (cur->nidl != NVME_NIDT_NGUID_LEN) {
821 				dev_warn(ctrl->device,
822 					 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
823 					 cur->nidl);
824 				goto free_data;
825 			}
826 			len = NVME_NIDT_NGUID_LEN;
827 			memcpy(nguid, data + pos + sizeof(*cur), len);
828 			break;
829 		case NVME_NIDT_UUID:
830 			if (cur->nidl != NVME_NIDT_UUID_LEN) {
831 				dev_warn(ctrl->device,
832 					 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
833 					 cur->nidl);
834 				goto free_data;
835 			}
836 			len = NVME_NIDT_UUID_LEN;
837 			uuid_copy(uuid, data + pos + sizeof(*cur));
838 			break;
839 		default:
840 			/* Skip unknown types */
841 			len = cur->nidl;
842 			break;
843 		}
844 
845 		len += sizeof(*cur);
846 	}
847 free_data:
848 	kfree(data);
849 	return status;
850 }
851 
852 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
853 {
854 	struct nvme_command c = { };
855 
856 	c.identify.opcode = nvme_admin_identify;
857 	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
858 	c.identify.nsid = cpu_to_le32(nsid);
859 	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
860 }
861 
862 static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
863 		unsigned nsid)
864 {
865 	struct nvme_id_ns *id;
866 	struct nvme_command c = { };
867 	int error;
868 
869 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
870 	c.identify.opcode = nvme_admin_identify;
871 	c.identify.nsid = cpu_to_le32(nsid);
872 	c.identify.cns = NVME_ID_CNS_NS;
873 
874 	id = kmalloc(sizeof(*id), GFP_KERNEL);
875 	if (!id)
876 		return NULL;
877 
878 	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
879 	if (error) {
880 		dev_warn(ctrl->device, "Identify namespace failed\n");
881 		kfree(id);
882 		return NULL;
883 	}
884 
885 	return id;
886 }
887 
888 static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
889 		      void *buffer, size_t buflen, u32 *result)
890 {
891 	struct nvme_command c;
892 	union nvme_result res;
893 	int ret;
894 
895 	memset(&c, 0, sizeof(c));
896 	c.features.opcode = nvme_admin_set_features;
897 	c.features.fid = cpu_to_le32(fid);
898 	c.features.dword11 = cpu_to_le32(dword11);
899 
900 	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
901 			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
902 	if (ret >= 0 && result)
903 		*result = le32_to_cpu(res.u32);
904 	return ret;
905 }
906 
907 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
908 {
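	/*
	 * Number of Queues feature, dword 11: the requested submission and
	 * completion queue counts are 0's based, with NSQR in bits 15:0 and
	 * NCQR in bits 31:16.
	 */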
909 	u32 q_count = (*count - 1) | ((*count - 1) << 16);
910 	u32 result;
911 	int status, nr_io_queues;
912 
913 	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
914 			&result);
915 	if (status < 0)
916 		return status;
917 
918 	/*
919 	 * Degraded controllers might return an error when setting the queue
920 	 * count.  We still want to be able to bring them online and offer
921 	 * access to the admin queue, as that might be the only way to fix them up.
922 	 */
923 	if (status > 0) {
924 		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
925 		*count = 0;
926 	} else {
927 		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
928 		*count = min(*count, nr_io_queues);
929 	}
930 
931 	return 0;
932 }
933 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
934 
935 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
936 {
937 	struct nvme_user_io io;
938 	struct nvme_command c;
939 	unsigned length, meta_len;
940 	void __user *metadata;
941 
942 	if (copy_from_user(&io, uio, sizeof(io)))
943 		return -EFAULT;
944 	if (io.flags)
945 		return -EINVAL;
946 
947 	switch (io.opcode) {
948 	case nvme_cmd_write:
949 	case nvme_cmd_read:
950 	case nvme_cmd_compare:
951 		break;
952 	default:
953 		return -EINVAL;
954 	}
955 
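	/* io.nblocks is 0's based, so the transfer covers (nblocks + 1) logical blocks. */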
956 	length = (io.nblocks + 1) << ns->lba_shift;
957 	meta_len = (io.nblocks + 1) * ns->ms;
958 	metadata = (void __user *)(uintptr_t)io.metadata;
959 
960 	if (ns->ext) {
961 		length += meta_len;
962 		meta_len = 0;
963 	} else if (meta_len) {
964 		if ((io.metadata & 3) || !io.metadata)
965 			return -EINVAL;
966 	}
967 
968 	memset(&c, 0, sizeof(c));
969 	c.rw.opcode = io.opcode;
970 	c.rw.flags = io.flags;
971 	c.rw.nsid = cpu_to_le32(ns->ns_id);
972 	c.rw.slba = cpu_to_le64(io.slba);
973 	c.rw.length = cpu_to_le16(io.nblocks);
974 	c.rw.control = cpu_to_le16(io.control);
975 	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
976 	c.rw.reftag = cpu_to_le32(io.reftag);
977 	c.rw.apptag = cpu_to_le16(io.apptag);
978 	c.rw.appmask = cpu_to_le16(io.appmask);
979 
980 	return nvme_submit_user_cmd(ns->queue, &c,
981 			(void __user *)(uintptr_t)io.addr, length,
982 			metadata, meta_len, io.slba, NULL, 0);
983 }
984 
985 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
986 			struct nvme_passthru_cmd __user *ucmd)
987 {
988 	struct nvme_passthru_cmd cmd;
989 	struct nvme_command c;
990 	unsigned timeout = 0;
991 	int status;
992 
993 	if (!capable(CAP_SYS_ADMIN))
994 		return -EACCES;
995 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
996 		return -EFAULT;
997 	if (cmd.flags)
998 		return -EINVAL;
999 
1000 	memset(&c, 0, sizeof(c));
1001 	c.common.opcode = cmd.opcode;
1002 	c.common.flags = cmd.flags;
1003 	c.common.nsid = cpu_to_le32(cmd.nsid);
1004 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1005 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1006 	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1007 	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1008 	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1009 	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1010 	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1011 	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1012 
1013 	if (cmd.timeout_ms)
1014 		timeout = msecs_to_jiffies(cmd.timeout_ms);
1015 
1016 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1017 			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1018 			(void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
1019 			0, &cmd.result, timeout);
1020 	if (status >= 0) {
1021 		if (put_user(cmd.result, &ucmd->result))
1022 			return -EFAULT;
1023 	}
1024 
1025 	return status;
1026 }
1027 
1028 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1029 		unsigned int cmd, unsigned long arg)
1030 {
1031 	struct nvme_ns *ns = bdev->bd_disk->private_data;
1032 
1033 	switch (cmd) {
1034 	case NVME_IOCTL_ID:
1035 		force_successful_syscall_return();
1036 		return ns->ns_id;
1037 	case NVME_IOCTL_ADMIN_CMD:
1038 		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
1039 	case NVME_IOCTL_IO_CMD:
1040 		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
1041 	case NVME_IOCTL_SUBMIT_IO:
1042 		return nvme_submit_io(ns, (void __user *)arg);
1043 	default:
1044 #ifdef CONFIG_NVM
1045 		if (ns->ndev)
1046 			return nvme_nvm_ioctl(ns, cmd, arg);
1047 #endif
1048 		if (is_sed_ioctl(cmd))
1049 			return sed_ioctl(ns->ctrl->opal_dev, cmd,
1050 					 (void __user *) arg);
1051 		return -ENOTTY;
1052 	}
1053 }
1054 
1055 #ifdef CONFIG_COMPAT
1056 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1057 			unsigned int cmd, unsigned long arg)
1058 {
1059 	return nvme_ioctl(bdev, mode, cmd, arg);
1060 }
1061 #else
1062 #define nvme_compat_ioctl	NULL
1063 #endif
1064 
1065 static int nvme_open(struct block_device *bdev, fmode_t mode)
1066 {
1067 	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
1068 }
1069 
1070 static void nvme_release(struct gendisk *disk, fmode_t mode)
1071 {
1072 	struct nvme_ns *ns = disk->private_data;
1073 
1074 	module_put(ns->ctrl->ops->module);
1075 	nvme_put_ns(ns);
1076 }
1077 
1078 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1079 {
1080 	/* some standard values */
1081 	geo->heads = 1 << 6;
1082 	geo->sectors = 1 << 5;
1083 	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1084 	return 0;
1085 }
1086 
1087 #ifdef CONFIG_BLK_DEV_INTEGRITY
1088 static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
1089 		u16 bs)
1090 {
1091 	struct nvme_ns *ns = disk->private_data;
1092 	u16 old_ms = ns->ms;
1093 	u8 pi_type = 0;
1094 
1095 	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1096 	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1097 
1098 	/* The PI implementation requires the metadata size to equal the T10 PI tuple size */
1099 	if (ns->ms == sizeof(struct t10_pi_tuple))
1100 		pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1101 
1102 	if (blk_get_integrity(disk) &&
1103 	    (ns->pi_type != pi_type || ns->ms != old_ms ||
1104 	     bs != queue_logical_block_size(disk->queue) ||
1105 	     (ns->ms && ns->ext)))
1106 		blk_integrity_unregister(disk);
1107 
1108 	ns->pi_type = pi_type;
1109 }
1110 
1111 static void nvme_init_integrity(struct nvme_ns *ns)
1112 {
1113 	struct blk_integrity integrity;
1114 
1115 	memset(&integrity, 0, sizeof(integrity));
1116 	switch (ns->pi_type) {
1117 	case NVME_NS_DPS_PI_TYPE3:
1118 		integrity.profile = &t10_pi_type3_crc;
1119 		integrity.tag_size = sizeof(u16) + sizeof(u32);
1120 		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1121 		break;
1122 	case NVME_NS_DPS_PI_TYPE1:
1123 	case NVME_NS_DPS_PI_TYPE2:
1124 		integrity.profile = &t10_pi_type1_crc;
1125 		integrity.tag_size = sizeof(u16);
1126 		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1127 		break;
1128 	default:
1129 		integrity.profile = NULL;
1130 		break;
1131 	}
1132 	integrity.tuple_size = ns->ms;
1133 	blk_integrity_register(ns->disk, &integrity);
1134 	blk_queue_max_integrity_segments(ns->queue, 1);
1135 }
1136 #else
1137 static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
1138 		u16 bs)
1139 {
1140 }
1141 static void nvme_init_integrity(struct nvme_ns *ns)
1142 {
1143 }
1144 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1145 
1146 static void nvme_set_chunk_size(struct nvme_ns *ns)
1147 {
1148 	u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
1149 	blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1150 }
1151 
1152 static void nvme_config_discard(struct nvme_ns *ns)
1153 {
1154 	struct nvme_ctrl *ctrl = ns->ctrl;
1155 	u32 logical_block_size = queue_logical_block_size(ns->queue);
1156 
1157 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1158 			NVME_DSM_MAX_RANGES);
1159 
1160 	if (ctrl->nr_streams && ns->sws && ns->sgs) {
1161 		unsigned int sz = logical_block_size * ns->sws * ns->sgs;
1162 
1163 		ns->queue->limits.discard_alignment = sz;
1164 		ns->queue->limits.discard_granularity = sz;
1165 	} else {
1166 		ns->queue->limits.discard_alignment = logical_block_size;
1167 		ns->queue->limits.discard_granularity = logical_block_size;
1168 	}
1169 	blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
1170 	blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
1171 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
1172 
1173 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1174 		blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
1175 }
1176 
1177 static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1178 		struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid)
1179 {
1180 	if (ctrl->vs >= NVME_VS(1, 1, 0))
1181 		memcpy(eui64, id->eui64, sizeof(id->eui64));
1182 	if (ctrl->vs >= NVME_VS(1, 2, 0))
1183 		memcpy(nguid, id->nguid, sizeof(id->nguid));
1184 	if (ctrl->vs >= NVME_VS(1, 3, 0)) {
1185 		 /* Don't treat an error as fatal; we potentially
1186 		  * already have an NGUID or EUI-64
1187 		  */
1188 		if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid))
1189 			dev_warn(ctrl->device,
1190 				 "%s: Identify Descriptors failed\n", __func__);
1191 	}
1192 }
1193 
1194 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1195 {
1196 	struct nvme_ns *ns = disk->private_data;
1197 	struct nvme_ctrl *ctrl = ns->ctrl;
1198 	u16 bs;
1199 
1200 	/*
1201 	 * If Identify Namespace failed, use a default 512 byte block size so
1202 	 * the block layer can still be used before read/write fails on the zero-capacity device.
1203 	 */
1204 	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1205 	if (ns->lba_shift == 0)
1206 		ns->lba_shift = 9;
1207 	bs = 1 << ns->lba_shift;
1208 	ns->noiob = le16_to_cpu(id->noiob);
1209 
1210 	blk_mq_freeze_queue(disk->queue);
1211 
1212 	if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
1213 		nvme_prep_integrity(disk, id, bs);
1214 	blk_queue_logical_block_size(ns->queue, bs);
1215 	if (ns->noiob)
1216 		nvme_set_chunk_size(ns);
1217 	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
1218 		nvme_init_integrity(ns);
1219 	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
1220 		set_capacity(disk, 0);
1221 	else
1222 		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1223 
1224 	if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1225 		nvme_config_discard(ns);
1226 	blk_mq_unfreeze_queue(disk->queue);
1227 }
1228 
1229 static int nvme_revalidate_disk(struct gendisk *disk)
1230 {
1231 	struct nvme_ns *ns = disk->private_data;
1232 	struct nvme_ctrl *ctrl = ns->ctrl;
1233 	struct nvme_id_ns *id;
1234 	u8 eui64[8] = { 0 }, nguid[16] = { 0 };
1235 	uuid_t uuid = uuid_null;
1236 	int ret = 0;
1237 
1238 	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
1239 		set_capacity(disk, 0);
1240 		return -ENODEV;
1241 	}
1242 
1243 	id = nvme_identify_ns(ctrl, ns->ns_id);
1244 	if (!id)
1245 		return -ENODEV;
1246 
1247 	if (id->ncap == 0) {
1248 		ret = -ENODEV;
1249 		goto out;
1250 	}
1251 
1252 	nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid);
1253 	if (!uuid_equal(&ns->uuid, &uuid) ||
1254 	    memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) ||
1255 	    memcmp(&ns->eui, &eui64, sizeof(ns->eui))) {
1256 		dev_err(ctrl->device,
1257 			"identifiers changed for nsid %d\n", ns->ns_id);
1258 		ret = -ENODEV;
1259 	}
1260 
1261 out:
1262 	kfree(id);
1263 	return ret;
1264 }
1265 
1266 static char nvme_pr_type(enum pr_type type)
1267 {
1268 	switch (type) {
1269 	case PR_WRITE_EXCLUSIVE:
1270 		return 1;
1271 	case PR_EXCLUSIVE_ACCESS:
1272 		return 2;
1273 	case PR_WRITE_EXCLUSIVE_REG_ONLY:
1274 		return 3;
1275 	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
1276 		return 4;
1277 	case PR_WRITE_EXCLUSIVE_ALL_REGS:
1278 		return 5;
1279 	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
1280 		return 6;
1281 	default:
1282 		return 0;
1283 	}
1284 };
1285 
1286 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1287 				u64 key, u64 sa_key, u8 op)
1288 {
1289 	struct nvme_ns *ns = bdev->bd_disk->private_data;
1290 	struct nvme_command c;
1291 	u8 data[16] = { 0, };
1292 
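	/*
	 * Reservation data buffer: the current key goes in bytes 0-7 and the
	 * new (or preempt) key in bytes 8-15, both little endian.
	 */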
1293 	put_unaligned_le64(key, &data[0]);
1294 	put_unaligned_le64(sa_key, &data[8]);
1295 
1296 	memset(&c, 0, sizeof(c));
1297 	c.common.opcode = op;
1298 	c.common.nsid = cpu_to_le32(ns->ns_id);
1299 	c.common.cdw10[0] = cpu_to_le32(cdw10);
1300 
1301 	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1302 }
1303 
1304 static int nvme_pr_register(struct block_device *bdev, u64 old,
1305 		u64 new, unsigned flags)
1306 {
1307 	u32 cdw10;
1308 
1309 	if (flags & ~PR_FL_IGNORE_KEY)
1310 		return -EOPNOTSUPP;
1311 
1312 	cdw10 = old ? 2 : 0;
1313 	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
1314 	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
1315 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
1316 }
1317 
1318 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
1319 		enum pr_type type, unsigned flags)
1320 {
1321 	u32 cdw10;
1322 
1323 	if (flags & ~PR_FL_IGNORE_KEY)
1324 		return -EOPNOTSUPP;
1325 
1326 	cdw10 = nvme_pr_type(type) << 8;
1327 	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
1328 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
1329 }
1330 
1331 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
1332 		enum pr_type type, bool abort)
1333 {
1334 	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
1335 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
1336 }
1337 
1338 static int nvme_pr_clear(struct block_device *bdev, u64 key)
1339 {
1340 	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1341 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
1342 }
1343 
1344 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
1345 {
1346 	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
1347 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1348 }
1349 
1350 static const struct pr_ops nvme_pr_ops = {
1351 	.pr_register	= nvme_pr_register,
1352 	.pr_reserve	= nvme_pr_reserve,
1353 	.pr_release	= nvme_pr_release,
1354 	.pr_preempt	= nvme_pr_preempt,
1355 	.pr_clear	= nvme_pr_clear,
1356 };
1357 
1358 #ifdef CONFIG_BLK_SED_OPAL
1359 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
1360 		bool send)
1361 {
1362 	struct nvme_ctrl *ctrl = data;
1363 	struct nvme_command cmd;
1364 
1365 	memset(&cmd, 0, sizeof(cmd));
1366 	if (send)
1367 		cmd.common.opcode = nvme_admin_security_send;
1368 	else
1369 		cmd.common.opcode = nvme_admin_security_recv;
1370 	cmd.common.nsid = 0;
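	/*
	 * Dword 10: Security Protocol in bits 31:24, SP Specific field in bits
	 * 23:8; dword 11 carries the transfer length in bytes.
	 */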
1371 	cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1372 	cmd.common.cdw10[1] = cpu_to_le32(len);
1373 
1374 	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1375 				      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
1376 }
1377 EXPORT_SYMBOL_GPL(nvme_sec_submit);
1378 #endif /* CONFIG_BLK_SED_OPAL */
1379 
1380 static const struct block_device_operations nvme_fops = {
1381 	.owner		= THIS_MODULE,
1382 	.ioctl		= nvme_ioctl,
1383 	.compat_ioctl	= nvme_compat_ioctl,
1384 	.open		= nvme_open,
1385 	.release	= nvme_release,
1386 	.getgeo		= nvme_getgeo,
1387 	.revalidate_disk= nvme_revalidate_disk,
1388 	.pr_ops		= &nvme_pr_ops,
1389 };
1390 
1391 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1392 {
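	/* CAP.TO is in 500ms units, so wait at most (TO + 1) / 2 seconds. */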
1393 	unsigned long timeout =
1394 		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1395 	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
1396 	int ret;
1397 
1398 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1399 		if (csts == ~0)
1400 			return -ENODEV;
1401 		if ((csts & NVME_CSTS_RDY) == bit)
1402 			break;
1403 
1404 		msleep(100);
1405 		if (fatal_signal_pending(current))
1406 			return -EINTR;
1407 		if (time_after(jiffies, timeout)) {
1408 			dev_err(ctrl->device,
1409 				"Device not ready; aborting %s\n", enabled ?
1410 						"initialisation" : "reset");
1411 			return -ENODEV;
1412 		}
1413 	}
1414 
1415 	return ret;
1416 }
1417 
1418 /*
1419  * If the device has been passed off to us in an enabled state, just clear
1420  * the enabled bit.  The spec says we should set the 'shutdown notification
1421  * bits', but doing so may cause the device to complete commands to the
1422  * admin queue ... and we don't know what memory that might be pointing at!
1423  */
1424 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1425 {
1426 	int ret;
1427 
1428 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1429 	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
1430 
1431 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1432 	if (ret)
1433 		return ret;
1434 
1435 	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
1436 		msleep(NVME_QUIRK_DELAY_AMOUNT);
1437 
1438 	return nvme_wait_ready(ctrl, cap, false);
1439 }
1440 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
1441 
1442 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1443 {
1444 	/*
1445 	 * Default to a 4K page size, with the intention to update this
1446 	 * path in the future to accommodate architectures with differing
1447 	 * kernel and IO page sizes.
1448 	 */
1449 	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
1450 	int ret;
1451 
1452 	if (page_shift < dev_page_min) {
1453 		dev_err(ctrl->device,
1454 			"Minimum device page size %u too large for host (%u)\n",
1455 			1 << dev_page_min, 1 << page_shift);
1456 		return -ENODEV;
1457 	}
1458 
1459 	ctrl->page_size = 1 << page_shift;
1460 
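	/*
	 * Program CC: NVM command set, round-robin arbitration, no shutdown
	 * notification, 64-byte submission and 16-byte completion queue
	 * entries, then set the enable bit.
	 */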
1461 	ctrl->ctrl_config = NVME_CC_CSS_NVM;
1462 	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1463 	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
1464 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1465 	ctrl->ctrl_config |= NVME_CC_ENABLE;
1466 
1467 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1468 	if (ret)
1469 		return ret;
1470 	return nvme_wait_ready(ctrl, cap, true);
1471 }
1472 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
1473 
1474 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
1475 {
1476 	unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
1477 	u32 csts;
1478 	int ret;
1479 
1480 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1481 	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
1482 
1483 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1484 	if (ret)
1485 		return ret;
1486 
1487 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1488 		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
1489 			break;
1490 
1491 		msleep(100);
1492 		if (fatal_signal_pending(current))
1493 			return -EINTR;
1494 		if (time_after(jiffies, timeout)) {
1495 			dev_err(ctrl->device,
1496 				"Device shutdown incomplete; abort shutdown\n");
1497 			return -ENODEV;
1498 		}
1499 	}
1500 
1501 	return ret;
1502 }
1503 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
1504 
1505 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1506 		struct request_queue *q)
1507 {
1508 	bool vwc = false;
1509 
1510 	if (ctrl->max_hw_sectors) {
1511 		u32 max_segments =
1512 			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1513 
1514 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1515 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1516 	}
1517 	if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
1518 		blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
1519 	blk_queue_virt_boundary(q, ctrl->page_size - 1);
1520 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1521 		vwc = true;
1522 	blk_queue_write_cache(q, vwc, vwc);
1523 }
1524 
1525 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
1526 {
1527 	__le64 ts;
1528 	int ret;
1529 
1530 	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
1531 		return 0;
1532 
1533 	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
1534 	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
1535 			NULL);
1536 	if (ret)
1537 		dev_warn_once(ctrl->device,
1538 			"could not set timestamp (%d)\n", ret);
1539 	return ret;
1540 }
1541 
1542 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
1543 {
1544 	/*
1545 	 * APST (Autonomous Power State Transition) lets us program a
1546 	 * table of power state transitions that the controller will
1547 	 * perform automatically.  We configure it with a simple
1548 	 * heuristic: we are willing to spend at most 2% of the time
1549 	 * transitioning between power states.  Therefore, when running
1550 	 * in any given state, we will enter the next lower-power
1551 	 * non-operational state after waiting 50 * (enlat + exlat)
1552 	 * microseconds, as long as that state's exit latency is under
1553 	 * the requested maximum latency.
1554 	 *
1555 	 * We will not autonomously enter any non-operational state for
1556 	 * which the total latency exceeds ps_max_latency_us.  Users
1557 	 * can set ps_max_latency_us to zero to turn off APST.
1558 	 */
1559 
1560 	unsigned apste;
1561 	struct nvme_feat_auto_pst *table;
1562 	u64 max_lat_us = 0;
1563 	int max_ps = -1;
1564 	int ret;
1565 
1566 	/*
1567 	 * If APST isn't supported or if we haven't been initialized yet,
1568 	 * then don't do anything.
1569 	 */
1570 	if (!ctrl->apsta)
1571 		return 0;
1572 
1573 	if (ctrl->npss > 31) {
1574 		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
1575 		return 0;
1576 	}
1577 
1578 	table = kzalloc(sizeof(*table), GFP_KERNEL);
1579 	if (!table)
1580 		return 0;
1581 
1582 	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
1583 		/* Turn off APST. */
1584 		apste = 0;
1585 		dev_dbg(ctrl->device, "APST disabled\n");
1586 	} else {
1587 		__le64 target = cpu_to_le64(0);
1588 		int state;
1589 
1590 		/*
1591 		 * Walk through all states from lowest- to highest-power.
1592 		 * According to the spec, lower-numbered states use more
1593 		 * power.  NPSS, despite the name, is the index of the
1594 		 * lowest-power state, not the number of states.
1595 		 */
1596 		for (state = (int)ctrl->npss; state >= 0; state--) {
1597 			u64 total_latency_us, exit_latency_us, transition_ms;
1598 
1599 			if (target)
1600 				table->entries[state] = target;
1601 
1602 			/*
1603 			 * Don't allow transitions to the deepest state
1604 			 * if it's quirked off.
1605 			 */
1606 			if (state == ctrl->npss &&
1607 			    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
1608 				continue;
1609 
1610 			/*
1611 			 * Is this state a useful non-operational state for
1612 			 * higher-power states to autonomously transition to?
1613 			 */
1614 			if (!(ctrl->psd[state].flags &
1615 			      NVME_PS_FLAGS_NON_OP_STATE))
1616 				continue;
1617 
1618 			exit_latency_us =
1619 				(u64)le32_to_cpu(ctrl->psd[state].exit_lat);
1620 			if (exit_latency_us > ctrl->ps_max_latency_us)
1621 				continue;
1622 
1623 			total_latency_us =
1624 				exit_latency_us +
1625 				le32_to_cpu(ctrl->psd[state].entry_lat);
1626 
1627 			/*
1628 			 * This state is good.  Use it as the APST idle
1629 			 * target for higher power states.
1630 			 */
1631 			transition_ms = total_latency_us + 19;
1632 			do_div(transition_ms, 20);
1633 			if (transition_ms > (1 << 24) - 1)
1634 				transition_ms = (1 << 24) - 1;
1635 
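			/*
			 * Encode the APST table entry: idle transition power
			 * state in bits 7:3, idle time prior to transition (in
			 * milliseconds, i.e. 50 * (enlat + exlat) us) in bits
			 * 31:8.
			 */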
1636 			target = cpu_to_le64((state << 3) |
1637 					     (transition_ms << 8));
1638 
1639 			if (max_ps == -1)
1640 				max_ps = state;
1641 
1642 			if (total_latency_us > max_lat_us)
1643 				max_lat_us = total_latency_us;
1644 		}
1645 
1646 		apste = 1;
1647 
1648 		if (max_ps == -1) {
1649 			dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
1650 		} else {
1651 			dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
1652 				max_ps, max_lat_us, (int)sizeof(*table), table);
1653 		}
1654 	}
1655 
1656 	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
1657 				table, sizeof(*table), NULL);
1658 	if (ret)
1659 		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
1660 
1661 	kfree(table);
1662 	return ret;
1663 }
1664 
1665 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
1666 {
1667 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1668 	u64 latency;
1669 
1670 	switch (val) {
1671 	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
1672 	case PM_QOS_LATENCY_ANY:
1673 		latency = U64_MAX;
1674 		break;
1675 
1676 	default:
1677 		latency = val;
1678 	}
1679 
1680 	if (ctrl->ps_max_latency_us != latency) {
1681 		ctrl->ps_max_latency_us = latency;
1682 		nvme_configure_apst(ctrl);
1683 	}
1684 }
1685 
1686 struct nvme_core_quirk_entry {
1687 	/*
1688 	 * NVMe model and firmware strings are padded with spaces.  For
1689 	 * simplicity, strings in the quirk table are padded with NULLs
1690 	 * instead.
1691 	 */
1692 	u16 vid;
1693 	const char *mn;
1694 	const char *fr;
1695 	unsigned long quirks;
1696 };
1697 
1698 static const struct nvme_core_quirk_entry core_quirks[] = {
1699 	{
1700 		/*
1701 		 * This Toshiba device seems to die using any APST states.  See:
1702 		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
1703 		 */
1704 		.vid = 0x1179,
1705 		.mn = "THNSF5256GPUK TOSHIBA",
1706 		.quirks = NVME_QUIRK_NO_APST,
1707 	}
1708 };
1709 
1710 /* match is null-terminated but idstr is space-padded. */
1711 static bool string_matches(const char *idstr, const char *match, size_t len)
1712 {
1713 	size_t matchlen;
1714 
1715 	if (!match)
1716 		return true;
1717 
1718 	matchlen = strlen(match);
1719 	WARN_ON_ONCE(matchlen > len);
1720 
1721 	if (memcmp(idstr, match, matchlen))
1722 		return false;
1723 
1724 	for (; matchlen < len; matchlen++)
1725 		if (idstr[matchlen] != ' ')
1726 			return false;
1727 
1728 	return true;
1729 }
1730 
1731 static bool quirk_matches(const struct nvme_id_ctrl *id,
1732 			  const struct nvme_core_quirk_entry *q)
1733 {
1734 	return q->vid == le16_to_cpu(id->vid) &&
1735 		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
1736 		string_matches(id->fr, q->fr, sizeof(id->fr));
1737 }
1738 
1739 static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
1740 {
1741 	size_t nqnlen;
1742 	int off;
1743 
1744 	nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
1745 	if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
1746 		strcpy(ctrl->subnqn, id->subnqn);
1747 		return;
1748 	}
1749 
1750 	if (ctrl->vs >= NVME_VS(1, 2, 1))
1751 		dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
1752 
1753 	/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
1754 	off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE,
1755 			"nqn.2014.08.org.nvmexpress:%4x%4x",
1756 			le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
1757 	memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn));
1758 	off += sizeof(id->sn);
1759 	memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn));
1760 	off += sizeof(id->mn);
1761 	memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off);
1762 }
1763 
1764 /*
1765  * Initialize the cached copies of the Identify data and various controller
1766  * register in our nvme_ctrl structure.  This should be called as soon as
1767  * the admin queue is fully up and running.
1768  */
1769 int nvme_init_identify(struct nvme_ctrl *ctrl)
1770 {
1771 	struct nvme_id_ctrl *id;
1772 	u64 cap;
1773 	int ret, page_shift;
1774 	u32 max_hw_sectors;
1775 	bool prev_apst_enabled;
1776 
1777 	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
1778 	if (ret) {
1779 		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
1780 		return ret;
1781 	}
1782 
1783 	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
1784 	if (ret) {
1785 		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
1786 		return ret;
1787 	}
1788 	page_shift = NVME_CAP_MPSMIN(cap) + 12;
1789 
1790 	if (ctrl->vs >= NVME_VS(1, 1, 0))
1791 		ctrl->subsystem = NVME_CAP_NSSRC(cap);
1792 
1793 	ret = nvme_identify_ctrl(ctrl, &id);
1794 	if (ret) {
1795 		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
1796 		return -EIO;
1797 	}
1798 
1799 	nvme_init_subnqn(ctrl, id);
1800 
1801 	if (!ctrl->identified) {
1802 		/*
1803 		 * Check for quirks.  Quirk can depend on firmware version,
1804 		 * so, in principle, the set of quirks present can change
1805 		 * across a reset.  As a possible future enhancement, we
1806 		 * could re-scan for quirks every time we reinitialize
1807 		 * the device, but we'd have to make sure that the driver
1808 		 * behaves intelligently if the quirks change.
1809 		 */
1810 
1811 		int i;
1812 
1813 		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
1814 			if (quirk_matches(id, &core_quirks[i]))
1815 				ctrl->quirks |= core_quirks[i].quirks;
1816 		}
1817 	}
1818 
1819 	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
1820 		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
1821 		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
1822 	}
1823 
1824 	ctrl->oacs = le16_to_cpu(id->oacs);
1825 	ctrl->vid = le16_to_cpu(id->vid);
1826 	ctrl->oncs = le16_to_cpup(&id->oncs);
1827 	atomic_set(&ctrl->abort_limit, id->acl + 1);
1828 	ctrl->vwc = id->vwc;
1829 	ctrl->cntlid = le16_to_cpup(&id->cntlid);
1830 	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
1831 	memcpy(ctrl->model, id->mn, sizeof(id->mn));
1832 	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
1833 	if (id->mdts)
1834 		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
1835 	else
1836 		max_hw_sectors = UINT_MAX;
1837 	ctrl->max_hw_sectors =
1838 		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
1839 
1840 	nvme_set_queue_limits(ctrl, ctrl->admin_q);
1841 	ctrl->sgls = le32_to_cpu(id->sgls);
1842 	ctrl->kas = le16_to_cpu(id->kas);
1843 
1844 	if (id->rtd3e) {
1845 		/* us -> s */
1846 		u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
1847 
1848 		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
1849 						 shutdown_timeout, 60);
1850 
1851 		if (ctrl->shutdown_timeout != shutdown_timeout)
1852 			dev_warn(ctrl->device,
1853 				 "Shutdown timeout set to %u seconds\n",
1854 				 ctrl->shutdown_timeout);
1855 	} else
1856 		ctrl->shutdown_timeout = shutdown_timeout;
1857 
1858 	ctrl->npss = id->npss;
1859 	ctrl->apsta = id->apsta;
1860 	prev_apst_enabled = ctrl->apst_enabled;
1861 	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
1862 		if (force_apst && id->apsta) {
1863 			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
1864 			ctrl->apst_enabled = true;
1865 		} else {
1866 			ctrl->apst_enabled = false;
1867 		}
1868 	} else {
1869 		ctrl->apst_enabled = id->apsta;
1870 	}
1871 	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
1872 
1873 	if (ctrl->ops->flags & NVME_F_FABRICS) {
1874 		ctrl->icdoff = le16_to_cpu(id->icdoff);
1875 		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
1876 		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
1877 		ctrl->maxcmd = le16_to_cpu(id->maxcmd);
1878 
1879 		/*
1880 		 * In fabrics we need to verify the cntlid matches the
1881 		 * admin connect
1882 		 */
1883 		if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
1884 			ret = -EINVAL;
1885 			goto out_free;
1886 		}
1887 
1888 		if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
1889 			dev_err(ctrl->device,
1890 				"keep-alive support is mandatory for fabrics\n");
1891 			ret = -EINVAL;
1892 			goto out_free;
1893 		}
1894 	} else {
1895 		ctrl->cntlid = le16_to_cpu(id->cntlid);
1896 		ctrl->hmpre = le32_to_cpu(id->hmpre);
1897 		ctrl->hmmin = le32_to_cpu(id->hmmin);
1898 		ctrl->hmminds = le32_to_cpu(id->hmminds);
1899 		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
1900 	}
1901 
1902 	kfree(id);
1903 
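	/*
	 * Exposing the latency tolerance creates a writable
	 * power/pm_qos_latency_tolerance_us sysfs attribute for this
	 * controller's device, which is how userspace tunes APST per
	 * device.
	 */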
1904 	if (ctrl->apst_enabled && !prev_apst_enabled)
1905 		dev_pm_qos_expose_latency_tolerance(ctrl->device);
1906 	else if (!ctrl->apst_enabled && prev_apst_enabled)
1907 		dev_pm_qos_hide_latency_tolerance(ctrl->device);
1908 
1909 	ret = nvme_configure_apst(ctrl);
1910 	if (ret < 0)
1911 		return ret;
1912 
1913 	ret = nvme_configure_timestamp(ctrl);
1914 	if (ret < 0)
1915 		return ret;
1916 
1917 	ret = nvme_configure_directives(ctrl);
1918 	if (ret < 0)
1919 		return ret;
1920 
1921 	ctrl->identified = true;
1922 
1923 	return 0;
1924 
1925 out_free:
1926 	kfree(id);
1927 	return ret;
1928 }
1929 EXPORT_SYMBOL_GPL(nvme_init_identify);
1930 
1931 static int nvme_dev_open(struct inode *inode, struct file *file)
1932 {
1933 	struct nvme_ctrl *ctrl;
1934 	int instance = iminor(inode);
1935 	int ret = -ENODEV;
1936 
1937 	spin_lock(&dev_list_lock);
1938 	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
1939 		if (ctrl->instance != instance)
1940 			continue;
1941 
1942 		if (!ctrl->admin_q) {
1943 			ret = -EWOULDBLOCK;
1944 			break;
1945 		}
1946 		if (!kref_get_unless_zero(&ctrl->kref))
1947 			break;
1948 		file->private_data = ctrl;
1949 		ret = 0;
1950 		break;
1951 	}
1952 	spin_unlock(&dev_list_lock);
1953 
1954 	return ret;
1955 }
1956 
1957 static int nvme_dev_release(struct inode *inode, struct file *file)
1958 {
1959 	nvme_put_ctrl(file->private_data);
1960 	return 0;
1961 }
1962 
1963 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
1964 {
1965 	struct nvme_ns *ns;
1966 	int ret;
1967 
1968 	mutex_lock(&ctrl->namespaces_mutex);
1969 	if (list_empty(&ctrl->namespaces)) {
1970 		ret = -ENOTTY;
1971 		goto out_unlock;
1972 	}
1973 
1974 	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
1975 	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
1976 		dev_warn(ctrl->device,
1977 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
1978 		ret = -EINVAL;
1979 		goto out_unlock;
1980 	}
1981 
1982 	dev_warn(ctrl->device,
1983 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
1984 	kref_get(&ns->kref);
1985 	mutex_unlock(&ctrl->namespaces_mutex);
1986 
1987 	ret = nvme_user_cmd(ctrl, ns, argp);
1988 	nvme_put_ns(ns);
1989 	return ret;
1990 
1991 out_unlock:
1992 	mutex_unlock(&ctrl->namespaces_mutex);
1993 	return ret;
1994 }
1995 
1996 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
1997 		unsigned long arg)
1998 {
1999 	struct nvme_ctrl *ctrl = file->private_data;
2000 	void __user *argp = (void __user *)arg;
2001 
2002 	switch (cmd) {
2003 	case NVME_IOCTL_ADMIN_CMD:
2004 		return nvme_user_cmd(ctrl, NULL, argp);
2005 	case NVME_IOCTL_IO_CMD:
2006 		return nvme_dev_user_cmd(ctrl, argp);
2007 	case NVME_IOCTL_RESET:
2008 		dev_warn(ctrl->device, "resetting controller\n");
2009 		return nvme_reset_ctrl_sync(ctrl);
2010 	case NVME_IOCTL_SUBSYS_RESET:
2011 		return nvme_reset_subsystem(ctrl);
2012 	case NVME_IOCTL_RESCAN:
2013 		nvme_queue_scan(ctrl);
2014 		return 0;
2015 	default:
2016 		return -ENOTTY;
2017 	}
2018 }
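/*
 * Illustrative userspace sketch (not part of this driver) of the admin
 * pass-through path above, in the style of nvme-cli; "buf" is assumed
 * to be a 4 KiB buffer and the opcode/CNS values are only an example:
 *
 *	int fd = open("/dev/nvme0", O_RDWR);
 *	struct nvme_admin_cmd cmd = {
 *		.opcode   = 0x06,			// Identify
 *		.addr     = (__u64)(uintptr_t)buf,
 *		.data_len = 4096,
 *		.cdw10    = 1,				// CNS 01h: Identify Controller
 *	};
 *	ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 */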
2019 
2020 static const struct file_operations nvme_dev_fops = {
2021 	.owner		= THIS_MODULE,
2022 	.open		= nvme_dev_open,
2023 	.release	= nvme_dev_release,
2024 	.unlocked_ioctl	= nvme_dev_ioctl,
2025 	.compat_ioctl	= nvme_dev_ioctl,
2026 };
2027 
2028 static ssize_t nvme_sysfs_reset(struct device *dev,
2029 				struct device_attribute *attr, const char *buf,
2030 				size_t count)
2031 {
2032 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2033 	int ret;
2034 
2035 	ret = nvme_reset_ctrl_sync(ctrl);
2036 	if (ret < 0)
2037 		return ret;
2038 	return count;
2039 }
2040 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
2041 
2042 static ssize_t nvme_sysfs_rescan(struct device *dev,
2043 				struct device_attribute *attr, const char *buf,
2044 				size_t count)
2045 {
2046 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2047 
2048 	nvme_queue_scan(ctrl);
2049 	return count;
2050 }
2051 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
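/*
 * Both attributes live on the controller's class device; e.g. writing
 * "1" to /sys/class/nvme/nvme0/reset_controller triggers a synchronous
 * reset, and /sys/class/nvme/nvme0/rescan_controller queues a namespace
 * rescan (the written value itself is ignored).
 */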
2052 
2053 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
2054 								char *buf)
2055 {
2056 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2057 	struct nvme_ctrl *ctrl = ns->ctrl;
2058 	int serial_len = sizeof(ctrl->serial);
2059 	int model_len = sizeof(ctrl->model);
2060 
2061 	if (!uuid_is_null(&ns->uuid))
2062 		return sprintf(buf, "uuid.%pU\n", &ns->uuid);
2063 
2064 	if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
2065 		return sprintf(buf, "eui.%16phN\n", ns->nguid);
2066 
2067 	if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
2068 		return sprintf(buf, "eui.%8phN\n", ns->eui);
2069 
2070 	while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
2071 				  ctrl->serial[serial_len - 1] == '\0'))
2072 		serial_len--;
2073 	while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
2074 				 ctrl->model[model_len - 1] == '\0'))
2075 		model_len--;
2076 
2077 	return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
2078 		serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
2079 }
2080 static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
2081 
2082 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
2083 			  char *buf)
2084 {
2085 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2086 	return sprintf(buf, "%pU\n", ns->nguid);
2087 }
2088 static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL);
2089 
2090 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
2091 								char *buf)
2092 {
2093 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2094 
2095 	/* For backward compatibility expose the NGUID to userspace if
2096 	 * we have no UUID set
2097 	 */
2098 	if (uuid_is_null(&ns->uuid)) {
2099 		printk_ratelimited(KERN_WARNING
2100 				   "No UUID available providing old NGUID\n");
2101 		return sprintf(buf, "%pU\n", ns->nguid);
2102 	}
2103 	return sprintf(buf, "%pU\n", &ns->uuid);
2104 }
2105 static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
2106 
2107 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
2108 								char *buf)
2109 {
2110 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2111 	return sprintf(buf, "%8phd\n", ns->eui);
2112 }
2113 static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
2114 
2115 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
2116 								char *buf)
2117 {
2118 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2119 	return sprintf(buf, "%d\n", ns->ns_id);
2120 }
2121 static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
2122 
2123 static struct attribute *nvme_ns_attrs[] = {
2124 	&dev_attr_wwid.attr,
2125 	&dev_attr_uuid.attr,
2126 	&dev_attr_nguid.attr,
2127 	&dev_attr_eui.attr,
2128 	&dev_attr_nsid.attr,
2129 	NULL,
2130 };
2131 
2132 static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
2133 		struct attribute *a, int n)
2134 {
2135 	struct device *dev = container_of(kobj, struct device, kobj);
2136 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
2137 
2138 	if (a == &dev_attr_uuid.attr) {
2139 		if (uuid_is_null(&ns->uuid) &&
2140 		    !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
2141 			return 0;
2142 	}
2143 	if (a == &dev_attr_nguid.attr) {
2144 		if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
2145 			return 0;
2146 	}
2147 	if (a == &dev_attr_eui.attr) {
2148 		if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
2149 			return 0;
2150 	}
2151 	return a->mode;
2152 }
2153 
2154 static const struct attribute_group nvme_ns_attr_group = {
2155 	.attrs		= nvme_ns_attrs,
2156 	.is_visible	= nvme_ns_attrs_are_visible,
2157 };
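/*
 * This group is installed on the namespace's gendisk in nvme_alloc_ns(),
 * so the identifiers appear as e.g. /sys/block/nvme0n1/wwid, uuid,
 * nguid, eui and nsid.
 */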
2158 
2159 #define nvme_show_str_function(field)						\
2160 static ssize_t  field##_show(struct device *dev,				\
2161 			    struct device_attribute *attr, char *buf)		\
2162 {										\
2163         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
2164         return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field);	\
2165 }										\
2166 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2167 
2168 #define nvme_show_int_function(field)						\
2169 static ssize_t  field##_show(struct device *dev,				\
2170 			    struct device_attribute *attr, char *buf)		\
2171 {										\
2172         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
2173         return sprintf(buf, "%d\n", ctrl->field);				\
2174 }										\
2175 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2176 
2177 nvme_show_str_function(model);
2178 nvme_show_str_function(serial);
2179 nvme_show_str_function(firmware_rev);
2180 nvme_show_int_function(cntlid);
2181 
2182 static ssize_t nvme_sysfs_delete(struct device *dev,
2183 				struct device_attribute *attr, const char *buf,
2184 				size_t count)
2185 {
2186 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2187 
2188 	if (device_remove_file_self(dev, attr))
2189 		ctrl->ops->delete_ctrl(ctrl);
2190 	return count;
2191 }
2192 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
2193 
2194 static ssize_t nvme_sysfs_show_transport(struct device *dev,
2195 					 struct device_attribute *attr,
2196 					 char *buf)
2197 {
2198 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2199 
2200 	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
2201 }
2202 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
2203 
2204 static ssize_t nvme_sysfs_show_state(struct device *dev,
2205 				     struct device_attribute *attr,
2206 				     char *buf)
2207 {
2208 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2209 	static const char *const state_name[] = {
2210 		[NVME_CTRL_NEW]		= "new",
2211 		[NVME_CTRL_LIVE]	= "live",
2212 		[NVME_CTRL_RESETTING]	= "resetting",
2213 		[NVME_CTRL_RECONNECTING] = "reconnecting",
2214 		[NVME_CTRL_DELETING]	= "deleting",
2215 		[NVME_CTRL_DEAD]	= "dead",
2216 	};
2217 
2218 	if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
2219 	    state_name[ctrl->state])
2220 		return sprintf(buf, "%s\n", state_name[ctrl->state]);
2221 
2222 	return sprintf(buf, "unknown state\n");
2223 }
2224 
2225 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
2226 
2227 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
2228 					 struct device_attribute *attr,
2229 					 char *buf)
2230 {
2231 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2232 
2233 	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn);
2234 }
2235 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
2236 
2237 static ssize_t nvme_sysfs_show_address(struct device *dev,
2238 					 struct device_attribute *attr,
2239 					 char *buf)
2240 {
2241 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2242 
2243 	return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
2244 }
2245 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
2246 
2247 static struct attribute *nvme_dev_attrs[] = {
2248 	&dev_attr_reset_controller.attr,
2249 	&dev_attr_rescan_controller.attr,
2250 	&dev_attr_model.attr,
2251 	&dev_attr_serial.attr,
2252 	&dev_attr_firmware_rev.attr,
2253 	&dev_attr_cntlid.attr,
2254 	&dev_attr_delete_controller.attr,
2255 	&dev_attr_transport.attr,
2256 	&dev_attr_subsysnqn.attr,
2257 	&dev_attr_address.attr,
2258 	&dev_attr_state.attr,
2259 	NULL
2260 };
2261 
2262 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
2263 		struct attribute *a, int n)
2264 {
2265 	struct device *dev = container_of(kobj, struct device, kobj);
2266 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2267 
2268 	if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
2269 		return 0;
2270 	if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
2271 		return 0;
2272 
2273 	return a->mode;
2274 }
2275 
2276 static struct attribute_group nvme_dev_attrs_group = {
2277 	.attrs		= nvme_dev_attrs,
2278 	.is_visible	= nvme_dev_attrs_are_visible,
2279 };
2280 
2281 static const struct attribute_group *nvme_dev_attr_groups[] = {
2282 	&nvme_dev_attrs_group,
2283 	NULL,
2284 };
2285 
2286 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
2287 {
2288 	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
2289 	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
2290 
2291 	return nsa->ns_id - nsb->ns_id;
2292 }
2293 
2294 static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2295 {
2296 	struct nvme_ns *ns, *ret = NULL;
2297 
2298 	mutex_lock(&ctrl->namespaces_mutex);
2299 	list_for_each_entry(ns, &ctrl->namespaces, list) {
2300 		if (ns->ns_id == nsid) {
2301 			kref_get(&ns->kref);
2302 			ret = ns;
2303 			break;
2304 		}
2305 		if (ns->ns_id > nsid)
2306 			break;
2307 	}
2308 	mutex_unlock(&ctrl->namespaces_mutex);
2309 	return ret;
2310 }
2311 
2312 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
2313 {
2314 	struct streams_directive_params s;
2315 	int ret;
2316 
2317 	if (!ctrl->nr_streams)
2318 		return 0;
2319 
2320 	ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
2321 	if (ret)
2322 		return ret;
2323 
2324 	ns->sws = le32_to_cpu(s.sws);
2325 	ns->sgs = le16_to_cpu(s.sgs);
2326 
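	/*
	 * sws is in logical blocks and sgs in units of sws; e.g. with
	 * 4 KiB blocks, sws == 16 and sgs == 4 give a 64 KiB io_min and
	 * a 256 KiB io_opt.
	 */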
2327 	if (ns->sws) {
2328 		unsigned int bs = 1 << ns->lba_shift;
2329 
2330 		blk_queue_io_min(ns->queue, bs * ns->sws);
2331 		if (ns->sgs)
2332 			blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
2333 	}
2334 
2335 	return 0;
2336 }
2337 
2338 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2339 {
2340 	struct nvme_ns *ns;
2341 	struct gendisk *disk;
2342 	struct nvme_id_ns *id;
2343 	char disk_name[DISK_NAME_LEN];
2344 	int node = dev_to_node(ctrl->dev);
2345 
2346 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2347 	if (!ns)
2348 		return;
2349 
2350 	ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
2351 	if (ns->instance < 0)
2352 		goto out_free_ns;
2353 
2354 	ns->queue = blk_mq_init_queue(ctrl->tagset);
2355 	if (IS_ERR(ns->queue))
2356 		goto out_release_instance;
2357 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2358 	ns->queue->queuedata = ns;
2359 	ns->ctrl = ctrl;
2360 
2361 	kref_init(&ns->kref);
2362 	ns->ns_id = nsid;
2363 	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
2364 
2365 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2366 	nvme_set_queue_limits(ctrl, ns->queue);
2367 	nvme_setup_streams_ns(ctrl, ns);
2368 
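	/* e.g. "nvme0n1" for the first namespace of controller instance 0 */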
2369 	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
2370 
2371 	id = nvme_identify_ns(ctrl, nsid);
2372 	if (!id)
2373 		goto out_free_queue;
2374 
2375 	if (id->ncap == 0)
2376 		goto out_free_id;
2377 
2378 	nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid);
2379 
2380 	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
2381 		if (nvme_nvm_register(ns, disk_name, node)) {
2382 			dev_warn(ctrl->device, "LightNVM init failure\n");
2383 			goto out_free_id;
2384 		}
2385 	}
2386 
2387 	disk = alloc_disk_node(0, node);
2388 	if (!disk)
2389 		goto out_free_id;
2390 
2391 	disk->fops = &nvme_fops;
2392 	disk->private_data = ns;
2393 	disk->queue = ns->queue;
2394 	disk->flags = GENHD_FL_EXT_DEVT;
2395 	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
2396 	ns->disk = disk;
2397 
2398 	__nvme_revalidate_disk(disk, id);
2399 
2400 	mutex_lock(&ctrl->namespaces_mutex);
2401 	list_add_tail(&ns->list, &ctrl->namespaces);
2402 	mutex_unlock(&ctrl->namespaces_mutex);
2403 
2404 	kref_get(&ctrl->kref);
2405 
2406 	kfree(id);
2407 
2408 	device_add_disk(ctrl->device, ns->disk);
2409 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
2410 					&nvme_ns_attr_group))
2411 		pr_warn("%s: failed to create sysfs group for identification\n",
2412 			ns->disk->disk_name);
2413 	if (ns->ndev && nvme_nvm_register_sysfs(ns))
2414 		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
2415 			ns->disk->disk_name);
2416 	return;
2417  out_free_id:
2418 	kfree(id);
2419  out_free_queue:
2420 	blk_cleanup_queue(ns->queue);
2421  out_release_instance:
2422 	ida_simple_remove(&ctrl->ns_ida, ns->instance);
2423  out_free_ns:
2424 	kfree(ns);
2425 }
2426 
2427 static void nvme_ns_remove(struct nvme_ns *ns)
2428 {
2429 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
2430 		return;
2431 
2432 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
2433 		if (blk_get_integrity(ns->disk))
2434 			blk_integrity_unregister(ns->disk);
2435 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
2436 					&nvme_ns_attr_group);
2437 		if (ns->ndev)
2438 			nvme_nvm_unregister_sysfs(ns);
2439 		del_gendisk(ns->disk);
2440 		blk_cleanup_queue(ns->queue);
2441 	}
2442 
2443 	mutex_lock(&ns->ctrl->namespaces_mutex);
2444 	list_del_init(&ns->list);
2445 	mutex_unlock(&ns->ctrl->namespaces_mutex);
2446 
2447 	nvme_put_ns(ns);
2448 }
2449 
2450 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2451 {
2452 	struct nvme_ns *ns;
2453 
2454 	ns = nvme_find_get_ns(ctrl, nsid);
2455 	if (ns) {
2456 		if (ns->disk && revalidate_disk(ns->disk))
2457 			nvme_ns_remove(ns);
2458 		nvme_put_ns(ns);
2459 	} else
2460 		nvme_alloc_ns(ctrl, nsid);
2461 }
2462 
2463 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
2464 					unsigned nsid)
2465 {
2466 	struct nvme_ns *ns, *next;
2467 
2468 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
2469 		if (ns->ns_id > nsid)
2470 			nvme_ns_remove(ns);
2471 	}
2472 }
2473 
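/*
 * Scan using the active namespace ID list (Identify, CNS 02h): each 4 KiB
 * page returns up to 1024 active NSIDs above 'prev'.  NSIDs skipped in the
 * returned list, and anything above the last reported NSID, are removed.
 */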
2474 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
2475 {
2476 	struct nvme_ns *ns;
2477 	__le32 *ns_list;
2478 	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
2479 	int ret = 0;
2480 
2481 	ns_list = kzalloc(0x1000, GFP_KERNEL);
2482 	if (!ns_list)
2483 		return -ENOMEM;
2484 
2485 	for (i = 0; i < num_lists; i++) {
2486 		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
2487 		if (ret)
2488 			goto free;
2489 
2490 		for (j = 0; j < min(nn, 1024U); j++) {
2491 			nsid = le32_to_cpu(ns_list[j]);
2492 			if (!nsid)
2493 				goto out;
2494 
2495 			nvme_validate_ns(ctrl, nsid);
2496 
2497 			while (++prev < nsid) {
2498 				ns = nvme_find_get_ns(ctrl, prev);
2499 				if (ns) {
2500 					nvme_ns_remove(ns);
2501 					nvme_put_ns(ns);
2502 				}
2503 			}
2504 		}
2505 		nn -= j;
2506 	}
2507  out:
2508 	nvme_remove_invalid_namespaces(ctrl, prev);
2509  free:
2510 	kfree(ns_list);
2511 	return ret;
2512 }
2513 
2514 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
2515 {
2516 	unsigned i;
2517 
2518 	for (i = 1; i <= nn; i++)
2519 		nvme_validate_ns(ctrl, i);
2520 
2521 	nvme_remove_invalid_namespaces(ctrl, nn);
2522 }
2523 
2524 static void nvme_scan_work(struct work_struct *work)
2525 {
2526 	struct nvme_ctrl *ctrl =
2527 		container_of(work, struct nvme_ctrl, scan_work);
2528 	struct nvme_id_ctrl *id;
2529 	unsigned nn;
2530 
2531 	if (ctrl->state != NVME_CTRL_LIVE)
2532 		return;
2533 
2534 	if (nvme_identify_ctrl(ctrl, &id))
2535 		return;
2536 
2537 	nn = le32_to_cpu(id->nn);
2538 	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
2539 	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
2540 		if (!nvme_scan_ns_list(ctrl, nn))
2541 			goto done;
2542 	}
2543 	nvme_scan_ns_sequential(ctrl, nn);
2544  done:
2545 	mutex_lock(&ctrl->namespaces_mutex);
2546 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
2547 	mutex_unlock(&ctrl->namespaces_mutex);
2548 	kfree(id);
2549 }
2550 
2551 void nvme_queue_scan(struct nvme_ctrl *ctrl)
2552 {
2553 	/*
2554 	 * Do not queue new scan work when a controller is reset during
2555 	 * removal.
2556 	 */
2557 	if (ctrl->state == NVME_CTRL_LIVE)
2558 		queue_work(nvme_wq, &ctrl->scan_work);
2559 }
2560 EXPORT_SYMBOL_GPL(nvme_queue_scan);
2561 
2562 /*
2563  * This function iterates the namespace list unlocked to allow recovery from
2564  * controller failure. It is up to the caller to ensure the namespace list is
2565  * not modified by scan work while this function is executing.
2566  */
2567 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
2568 {
2569 	struct nvme_ns *ns, *next;
2570 
2571 	/*
2572 	 * The dead state indicates that the controller was not gracefully
2573 	 * disconnected. In that case, we won't be able to flush any data while
2574 	 * removing the namespaces' disks; fail all the queues now to avoid
2575 	 * potentially having to clean up the failed sync later.
2576 	 */
2577 	if (ctrl->state == NVME_CTRL_DEAD)
2578 		nvme_kill_queues(ctrl);
2579 
2580 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
2581 		nvme_ns_remove(ns);
2582 }
2583 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
2584 
2585 static void nvme_async_event_work(struct work_struct *work)
2586 {
2587 	struct nvme_ctrl *ctrl =
2588 		container_of(work, struct nvme_ctrl, async_event_work);
2589 
2590 	spin_lock_irq(&ctrl->lock);
2591 	while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) {
2592 		int aer_idx = --ctrl->event_limit;
2593 
2594 		spin_unlock_irq(&ctrl->lock);
2595 		ctrl->ops->submit_async_event(ctrl, aer_idx);
2596 		spin_lock_irq(&ctrl->lock);
2597 	}
2598 	spin_unlock_irq(&ctrl->lock);
2599 }
2600 
2601 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
2602 {
2603 
2604 	u32 csts;
2605 
2606 	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
2607 		return false;
2608 
2609 	if (csts == ~0)
2610 		return false;
2611 
2612 	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
2613 }
2614 
2615 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
2616 {
2617 	struct nvme_command c = { };
2618 	struct nvme_fw_slot_info_log *log;
2619 
2620 	log = kmalloc(sizeof(*log), GFP_KERNEL);
2621 	if (!log)
2622 		return;
2623 
2624 	c.common.opcode = nvme_admin_get_log_page;
2625 	c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
2626 	c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log));
2627 
2628 	if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log)))
2629 		dev_warn(ctrl->device,
2630 				"Get FW SLOT INFO log error\n");
2631 	kfree(log);
2632 }
2633 
2634 static void nvme_fw_act_work(struct work_struct *work)
2635 {
2636 	struct nvme_ctrl *ctrl = container_of(work,
2637 				struct nvme_ctrl, fw_act_work);
2638 	unsigned long fw_act_timeout;
2639 
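	/*
	 * MTFA (maximum time for firmware activation) is reported in
	 * units of 100 ms, e.g. an mtfa of 20 allows up to 2 seconds;
	 * without it, fall back to the admin command timeout.
	 */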
2640 	if (ctrl->mtfa)
2641 		fw_act_timeout = jiffies +
2642 				msecs_to_jiffies(ctrl->mtfa * 100);
2643 	else
2644 		fw_act_timeout = jiffies +
2645 				msecs_to_jiffies(admin_timeout * 1000);
2646 
2647 	nvme_stop_queues(ctrl);
2648 	while (nvme_ctrl_pp_status(ctrl)) {
2649 		if (time_after(jiffies, fw_act_timeout)) {
2650 			dev_warn(ctrl->device,
2651 				"Fw activation timeout, reset controller\n");
2652 			nvme_reset_ctrl(ctrl);
2653 			break;
2654 		}
2655 		msleep(100);
2656 	}
2657 
2658 	if (ctrl->state != NVME_CTRL_LIVE)
2659 		return;
2660 
2661 	nvme_start_queues(ctrl);
2662 	/* read FW slot information to clear the AER */
2663 	nvme_get_fw_slot_info(ctrl);
2664 }
2665 
2666 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
2667 		union nvme_result *res)
2668 {
2669 	u32 result = le32_to_cpu(res->u32);
2670 	bool done = true;
2671 
2672 	switch (le16_to_cpu(status) >> 1) {
2673 	case NVME_SC_SUCCESS:
2674 		done = false;
2675 		/*FALLTHRU*/
2676 	case NVME_SC_ABORT_REQ:
2677 		++ctrl->event_limit;
2678 		if (ctrl->state == NVME_CTRL_LIVE)
2679 			queue_work(nvme_wq, &ctrl->async_event_work);
2680 		break;
2681 	default:
2682 		break;
2683 	}
2684 
2685 	if (done)
2686 		return;
2687 
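	/*
	 * Mask off the event type (bits 2:0) and event information
	 * (bits 15:8) from completion dword 0; the log page identifier
	 * in bits 23:16 is not needed to route the notices below.
	 */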
2688 	switch (result & 0xff07) {
2689 	case NVME_AER_NOTICE_NS_CHANGED:
2690 		dev_info(ctrl->device, "rescanning\n");
2691 		nvme_queue_scan(ctrl);
2692 		break;
2693 	case NVME_AER_NOTICE_FW_ACT_STARTING:
2694 		queue_work(nvme_wq, &ctrl->fw_act_work);
2695 		break;
2696 	default:
2697 		dev_warn(ctrl->device, "async event result %08x\n", result);
2698 	}
2699 }
2700 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
2701 
2702 void nvme_queue_async_events(struct nvme_ctrl *ctrl)
2703 {
2704 	ctrl->event_limit = NVME_NR_AERS;
2705 	queue_work(nvme_wq, &ctrl->async_event_work);
2706 }
2707 EXPORT_SYMBOL_GPL(nvme_queue_async_events);
2708 
2709 static DEFINE_IDA(nvme_instance_ida);
2710 
2711 static int nvme_set_instance(struct nvme_ctrl *ctrl)
2712 {
2713 	int instance, error;
2714 
2715 	do {
2716 		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
2717 			return -ENODEV;
2718 
2719 		spin_lock(&dev_list_lock);
2720 		error = ida_get_new(&nvme_instance_ida, &instance);
2721 		spin_unlock(&dev_list_lock);
2722 	} while (error == -EAGAIN);
2723 
2724 	if (error)
2725 		return -ENODEV;
2726 
2727 	ctrl->instance = instance;
2728 	return 0;
2729 }
2730 
2731 static void nvme_release_instance(struct nvme_ctrl *ctrl)
2732 {
2733 	spin_lock(&dev_list_lock);
2734 	ida_remove(&nvme_instance_ida, ctrl->instance);
2735 	spin_unlock(&dev_list_lock);
2736 }
2737 
2738 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
2739 {
2740 	nvme_stop_keep_alive(ctrl);
2741 	flush_work(&ctrl->async_event_work);
2742 	flush_work(&ctrl->scan_work);
2743 	cancel_work_sync(&ctrl->fw_act_work);
2744 }
2745 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
2746 
2747 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
2748 {
2749 	if (ctrl->kato)
2750 		nvme_start_keep_alive(ctrl);
2751 
2752 	if (ctrl->queue_count > 1) {
2753 		nvme_queue_scan(ctrl);
2754 		nvme_queue_async_events(ctrl);
2755 		nvme_start_queues(ctrl);
2756 	}
2757 }
2758 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
2759 
2760 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
2761 {
2762 	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
2763 
2764 	spin_lock(&dev_list_lock);
2765 	list_del(&ctrl->node);
2766 	spin_unlock(&dev_list_lock);
2767 }
2768 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
2769 
2770 static void nvme_free_ctrl(struct kref *kref)
2771 {
2772 	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
2773 
2774 	put_device(ctrl->device);
2775 	nvme_release_instance(ctrl);
2776 	ida_destroy(&ctrl->ns_ida);
2777 
2778 	ctrl->ops->free_ctrl(ctrl);
2779 }
2780 
2781 void nvme_put_ctrl(struct nvme_ctrl *ctrl)
2782 {
2783 	kref_put(&ctrl->kref, nvme_free_ctrl);
2784 }
2785 EXPORT_SYMBOL_GPL(nvme_put_ctrl);
2786 
2787 /*
2788  * Initialize an NVMe controller structure.  This needs to be called during
2789  * the earliest initialization so that we have the initialized structure around
2790  * during probing.
2791  */
2792 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
2793 		const struct nvme_ctrl_ops *ops, unsigned long quirks)
2794 {
2795 	int ret;
2796 
2797 	ctrl->state = NVME_CTRL_NEW;
2798 	spin_lock_init(&ctrl->lock);
2799 	INIT_LIST_HEAD(&ctrl->namespaces);
2800 	mutex_init(&ctrl->namespaces_mutex);
2801 	kref_init(&ctrl->kref);
2802 	ctrl->dev = dev;
2803 	ctrl->ops = ops;
2804 	ctrl->quirks = quirks;
2805 	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
2806 	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
2807 	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
2808 
2809 	ret = nvme_set_instance(ctrl);
2810 	if (ret)
2811 		goto out;
2812 
2813 	ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
2814 				MKDEV(nvme_char_major, ctrl->instance),
2815 				ctrl, nvme_dev_attr_groups,
2816 				"nvme%d", ctrl->instance);
2817 	if (IS_ERR(ctrl->device)) {
2818 		ret = PTR_ERR(ctrl->device);
2819 		goto out_release_instance;
2820 	}
2821 	get_device(ctrl->device);
2822 	ida_init(&ctrl->ns_ida);
2823 
2824 	spin_lock(&dev_list_lock);
2825 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
2826 	spin_unlock(&dev_list_lock);
2827 
2828 	/*
2829 	 * Initialize latency tolerance controls.  The sysfs files won't
2830 	 * be visible to userspace unless the device actually supports APST.
2831 	 */
2832 	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
2833 	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
2834 		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
2835 
2836 	return 0;
2837 out_release_instance:
2838 	nvme_release_instance(ctrl);
2839 out:
2840 	return ret;
2841 }
2842 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
2843 
2844 /**
2845  * nvme_kill_queues() - Ends all namespace queues
2846  * @ctrl: the dead controller whose namespace queues need to be ended
2847  *
2848  * Call this function when the driver determines it is unable to get the
2849  * controller in a state capable of servicing IO.
2850  */
2851 void nvme_kill_queues(struct nvme_ctrl *ctrl)
2852 {
2853 	struct nvme_ns *ns;
2854 
2855 	mutex_lock(&ctrl->namespaces_mutex);
2856 
2857 	/* Forcibly unquiesce queues to avoid blocking dispatch */
2858 	if (ctrl->admin_q)
2859 		blk_mq_unquiesce_queue(ctrl->admin_q);
2860 
2861 	list_for_each_entry(ns, &ctrl->namespaces, list) {
2862 		/*
2863 		 * Revalidating a dead namespace sets its capacity to 0. This
2864 		 * stops buffered writers from dirtying pages that can't be synced.
2865 		 */
2866 		if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
2867 			continue;
2868 		revalidate_disk(ns->disk);
2869 		blk_set_queue_dying(ns->queue);
2870 
2871 		/* Forcibly unquiesce queues to avoid blocking dispatch */
2872 		blk_mq_unquiesce_queue(ns->queue);
2873 	}
2874 	mutex_unlock(&ctrl->namespaces_mutex);
2875 }
2876 EXPORT_SYMBOL_GPL(nvme_kill_queues);
2877 
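/*
 * The freeze/unfreeze helpers below are typically paired around a reset:
 * nvme_start_freeze() before tearing down the I/O queues,
 * nvme_wait_freeze() or nvme_wait_freeze_timeout() once the new queues
 * are set up, and nvme_unfreeze() to resume normal operation.
 */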
2878 void nvme_unfreeze(struct nvme_ctrl *ctrl)
2879 {
2880 	struct nvme_ns *ns;
2881 
2882 	mutex_lock(&ctrl->namespaces_mutex);
2883 	list_for_each_entry(ns, &ctrl->namespaces, list)
2884 		blk_mq_unfreeze_queue(ns->queue);
2885 	mutex_unlock(&ctrl->namespaces_mutex);
2886 }
2887 EXPORT_SYMBOL_GPL(nvme_unfreeze);
2888 
2889 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
2890 {
2891 	struct nvme_ns *ns;
2892 
2893 	mutex_lock(&ctrl->namespaces_mutex);
2894 	list_for_each_entry(ns, &ctrl->namespaces, list) {
2895 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
2896 		if (timeout <= 0)
2897 			break;
2898 	}
2899 	mutex_unlock(&ctrl->namespaces_mutex);
2900 }
2901 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
2902 
2903 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
2904 {
2905 	struct nvme_ns *ns;
2906 
2907 	mutex_lock(&ctrl->namespaces_mutex);
2908 	list_for_each_entry(ns, &ctrl->namespaces, list)
2909 		blk_mq_freeze_queue_wait(ns->queue);
2910 	mutex_unlock(&ctrl->namespaces_mutex);
2911 }
2912 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
2913 
2914 void nvme_start_freeze(struct nvme_ctrl *ctrl)
2915 {
2916 	struct nvme_ns *ns;
2917 
2918 	mutex_lock(&ctrl->namespaces_mutex);
2919 	list_for_each_entry(ns, &ctrl->namespaces, list)
2920 		blk_freeze_queue_start(ns->queue);
2921 	mutex_unlock(&ctrl->namespaces_mutex);
2922 }
2923 EXPORT_SYMBOL_GPL(nvme_start_freeze);
2924 
2925 void nvme_stop_queues(struct nvme_ctrl *ctrl)
2926 {
2927 	struct nvme_ns *ns;
2928 
2929 	mutex_lock(&ctrl->namespaces_mutex);
2930 	list_for_each_entry(ns, &ctrl->namespaces, list)
2931 		blk_mq_quiesce_queue(ns->queue);
2932 	mutex_unlock(&ctrl->namespaces_mutex);
2933 }
2934 EXPORT_SYMBOL_GPL(nvme_stop_queues);
2935 
2936 void nvme_start_queues(struct nvme_ctrl *ctrl)
2937 {
2938 	struct nvme_ns *ns;
2939 
2940 	mutex_lock(&ctrl->namespaces_mutex);
2941 	list_for_each_entry(ns, &ctrl->namespaces, list)
2942 		blk_mq_unquiesce_queue(ns->queue);
2943 	mutex_unlock(&ctrl->namespaces_mutex);
2944 }
2945 EXPORT_SYMBOL_GPL(nvme_start_queues);
2946 
2947 int __init nvme_core_init(void)
2948 {
2949 	int result;
2950 
2951 	nvme_wq = alloc_workqueue("nvme-wq",
2952 			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
2953 	if (!nvme_wq)
2954 		return -ENOMEM;
2955 
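	/*
	 * Passing a major of 0 makes __register_chrdev() pick one
	 * dynamically and return it; the per-controller nodes are then
	 * typically created by udev as /dev/nvme0, /dev/nvme1, ...
	 */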
2956 	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
2957 							&nvme_dev_fops);
2958 	if (result < 0)
2959 		goto destroy_wq;
2960 	else if (result > 0)
2961 		nvme_char_major = result;
2962 
2963 	nvme_class = class_create(THIS_MODULE, "nvme");
2964 	if (IS_ERR(nvme_class)) {
2965 		result = PTR_ERR(nvme_class);
2966 		goto unregister_chrdev;
2967 	}
2968 
2969 	return 0;
2970 
2971 unregister_chrdev:
2972 	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
2973 destroy_wq:
2974 	destroy_workqueue(nvme_wq);
2975 	return result;
2976 }
2977 
2978 void nvme_core_exit(void)
2979 {
2980 	class_destroy(nvme_class);
2981 	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
2982 	destroy_workqueue(nvme_wq);
2983 }
2984 
2985 MODULE_LICENSE("GPL");
2986 MODULE_VERSION("1.0");
2987 module_init(nvme_core_init);
2988 module_exit(nvme_core_exit);
2989