xref: /openbmc/linux/drivers/nvme/host/core.c (revision b240b419db5d624ce7a5a397d6f62a1a686009ec)
1 /*
2  * NVM Express device driver
3  * Copyright (c) 2011-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 
15 #include <linux/blkdev.h>
16 #include <linux/blk-mq.h>
17 #include <linux/delay.h>
18 #include <linux/errno.h>
19 #include <linux/hdreg.h>
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list_sort.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/pr.h>
26 #include <linux/ptrace.h>
27 #include <linux/nvme_ioctl.h>
28 #include <linux/t10-pi.h>
29 #include <linux/pm_qos.h>
30 #include <asm/unaligned.h>
31 
32 #define CREATE_TRACE_POINTS
33 #include "trace.h"
34 
35 #include "nvme.h"
36 #include "fabrics.h"
37 
38 #define NVME_MINORS		(1U << MINORBITS)
39 
40 unsigned int admin_timeout = 60;
41 module_param(admin_timeout, uint, 0644);
42 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
43 EXPORT_SYMBOL_GPL(admin_timeout);
44 
45 unsigned int nvme_io_timeout = 30;
46 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
47 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
48 EXPORT_SYMBOL_GPL(nvme_io_timeout);
49 
50 static unsigned char shutdown_timeout = 5;
51 module_param(shutdown_timeout, byte, 0644);
52 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
53 
54 static u8 nvme_max_retries = 5;
55 module_param_named(max_retries, nvme_max_retries, byte, 0644);
56 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
57 
58 static unsigned long default_ps_max_latency_us = 100000;
59 module_param(default_ps_max_latency_us, ulong, 0644);
60 MODULE_PARM_DESC(default_ps_max_latency_us,
61 		 "max power saving latency for new devices; use PM QOS to change per device");
62 
63 static bool force_apst;
64 module_param(force_apst, bool, 0644);
65 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
66 
67 static bool streams;
68 module_param(streams, bool, 0644);
69 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
70 
71 /*
72  * nvme_wq - hosts nvme related works that are not reset or delete
73  * nvme_reset_wq - hosts nvme reset works
74  * nvme_delete_wq - hosts nvme delete works
75  *
76  * nvme_wq will host works such as scan, aen handling, fw activation,
77  * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
78  * runs reset works which also flush works hosted on nvme_wq for
79  * serialization purposes. nvme_delete_wq hosts controller deletion
80  * works which flush reset works for serialization.
81  */
82 struct workqueue_struct *nvme_wq;
83 EXPORT_SYMBOL_GPL(nvme_wq);
84 
85 struct workqueue_struct *nvme_reset_wq;
86 EXPORT_SYMBOL_GPL(nvme_reset_wq);
87 
88 struct workqueue_struct *nvme_delete_wq;
89 EXPORT_SYMBOL_GPL(nvme_delete_wq);
90 
91 static DEFINE_IDA(nvme_subsystems_ida);
92 static LIST_HEAD(nvme_subsystems);
93 static DEFINE_MUTEX(nvme_subsystems_lock);
94 
95 static DEFINE_IDA(nvme_instance_ida);
96 static dev_t nvme_chr_devt;
97 static struct class *nvme_class;
98 static struct class *nvme_subsys_class;
99 
100 static void nvme_ns_remove(struct nvme_ns *ns);
101 static int nvme_revalidate_disk(struct gendisk *disk);
102 
103 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
104 {
105 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
106 		return -EBUSY;
107 	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
108 		return -EBUSY;
109 	return 0;
110 }
111 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
112 
113 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
114 {
115 	int ret;
116 
117 	ret = nvme_reset_ctrl(ctrl);
118 	if (!ret) {
119 		flush_work(&ctrl->reset_work);
120 		if (ctrl->state != NVME_CTRL_LIVE)
121 			ret = -ENETRESET;
122 	}
123 
124 	return ret;
125 }
126 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
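
/*
 * Editorial sketch (not part of this revision): how a hypothetical transport
 * driver might use the synchronous reset helper above from a context that may
 * sleep.  example_recover_ctrl() is an assumption for illustration only.
 */
static int example_recover_ctrl(struct nvme_ctrl *ctrl)
{
	int ret;

	/* Queue the reset on nvme_reset_wq and wait for it to complete. */
	ret = nvme_reset_ctrl_sync(ctrl);
	if (ret)
		dev_warn(ctrl->device, "controller recovery failed: %d\n", ret);
	return ret;
}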
127 
128 static void nvme_delete_ctrl_work(struct work_struct *work)
129 {
130 	struct nvme_ctrl *ctrl =
131 		container_of(work, struct nvme_ctrl, delete_work);
132 
133 	dev_info(ctrl->device,
134 		 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
135 
136 	flush_work(&ctrl->reset_work);
137 	nvme_stop_ctrl(ctrl);
138 	nvme_remove_namespaces(ctrl);
139 	ctrl->ops->delete_ctrl(ctrl);
140 	nvme_uninit_ctrl(ctrl);
141 	nvme_put_ctrl(ctrl);
142 }
143 
144 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
145 {
146 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
147 		return -EBUSY;
148 	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
149 		return -EBUSY;
150 	return 0;
151 }
152 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
153 
154 int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
155 {
156 	int ret = 0;
157 
158 	/*
159 	 * Keep a reference until the work is flushed since ->delete_ctrl
160 	 * can free the controller.
161 	 */
162 	nvme_get_ctrl(ctrl);
163 	ret = nvme_delete_ctrl(ctrl);
164 	if (!ret)
165 		flush_work(&ctrl->delete_work);
166 	nvme_put_ctrl(ctrl);
167 	return ret;
168 }
169 EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
170 
171 static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
172 {
173 	return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
174 }
175 
176 static blk_status_t nvme_error_status(struct request *req)
177 {
178 	switch (nvme_req(req)->status & 0x7ff) {
179 	case NVME_SC_SUCCESS:
180 		return BLK_STS_OK;
181 	case NVME_SC_CAP_EXCEEDED:
182 		return BLK_STS_NOSPC;
183 	case NVME_SC_LBA_RANGE:
184 		return BLK_STS_TARGET;
185 	case NVME_SC_BAD_ATTRIBUTES:
186 	case NVME_SC_ONCS_NOT_SUPPORTED:
187 	case NVME_SC_INVALID_OPCODE:
188 	case NVME_SC_INVALID_FIELD:
189 	case NVME_SC_INVALID_NS:
190 		return BLK_STS_NOTSUPP;
191 	case NVME_SC_WRITE_FAULT:
192 	case NVME_SC_READ_ERROR:
193 	case NVME_SC_UNWRITTEN_BLOCK:
194 	case NVME_SC_ACCESS_DENIED:
195 	case NVME_SC_READ_ONLY:
196 	case NVME_SC_COMPARE_FAILED:
197 		return BLK_STS_MEDIUM;
198 	case NVME_SC_GUARD_CHECK:
199 	case NVME_SC_APPTAG_CHECK:
200 	case NVME_SC_REFTAG_CHECK:
201 	case NVME_SC_INVALID_PI:
202 		return BLK_STS_PROTECTION;
203 	case NVME_SC_RESERVATION_CONFLICT:
204 		return BLK_STS_NEXUS;
205 	default:
206 		return BLK_STS_IOERR;
207 	}
208 }
209 
210 static inline bool nvme_req_needs_retry(struct request *req)
211 {
212 	if (blk_noretry_request(req))
213 		return false;
214 	if (nvme_req(req)->status & NVME_SC_DNR)
215 		return false;
216 	if (nvme_req(req)->retries >= nvme_max_retries)
217 		return false;
218 	return true;
219 }
220 
221 void nvme_complete_rq(struct request *req)
222 {
223 	blk_status_t status = nvme_error_status(req);
224 
225 	trace_nvme_complete_rq(req);
226 
227 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
228 		if (nvme_req_needs_failover(req, status)) {
229 			nvme_failover_req(req);
230 			return;
231 		}
232 
233 		if (!blk_queue_dying(req->q)) {
234 			nvme_req(req)->retries++;
235 			blk_mq_requeue_request(req, true);
236 			return;
237 		}
238 	}
239 	blk_mq_end_request(req, status);
240 }
241 EXPORT_SYMBOL_GPL(nvme_complete_rq);
242 
243 void nvme_cancel_request(struct request *req, void *data, bool reserved)
244 {
245 	if (!blk_mq_request_started(req))
246 		return;
247 
248 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
249 				"Cancelling I/O %d", req->tag);
250 
251 	nvme_req(req)->status = NVME_SC_ABORT_REQ;
252 	blk_mq_complete_request(req);
253 
254 }
255 EXPORT_SYMBOL_GPL(nvme_cancel_request);
256 
257 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
258 		enum nvme_ctrl_state new_state)
259 {
260 	enum nvme_ctrl_state old_state;
261 	unsigned long flags;
262 	bool changed = false;
263 
264 	spin_lock_irqsave(&ctrl->lock, flags);
265 
266 	old_state = ctrl->state;
267 	switch (new_state) {
268 	case NVME_CTRL_ADMIN_ONLY:
269 		switch (old_state) {
270 		case NVME_CTRL_CONNECTING:
271 			changed = true;
272 			/* FALLTHRU */
273 		default:
274 			break;
275 		}
276 		break;
277 	case NVME_CTRL_LIVE:
278 		switch (old_state) {
279 		case NVME_CTRL_NEW:
280 		case NVME_CTRL_RESETTING:
281 		case NVME_CTRL_CONNECTING:
282 			changed = true;
283 			/* FALLTHRU */
284 		default:
285 			break;
286 		}
287 		break;
288 	case NVME_CTRL_RESETTING:
289 		switch (old_state) {
290 		case NVME_CTRL_NEW:
291 		case NVME_CTRL_LIVE:
292 		case NVME_CTRL_ADMIN_ONLY:
293 			changed = true;
294 			/* FALLTHRU */
295 		default:
296 			break;
297 		}
298 		break;
299 	case NVME_CTRL_CONNECTING:
300 		switch (old_state) {
301 		case NVME_CTRL_NEW:
302 		case NVME_CTRL_RESETTING:
303 			changed = true;
304 			/* FALLTHRU */
305 		default:
306 			break;
307 		}
308 		break;
309 	case NVME_CTRL_DELETING:
310 		switch (old_state) {
311 		case NVME_CTRL_LIVE:
312 		case NVME_CTRL_ADMIN_ONLY:
313 		case NVME_CTRL_RESETTING:
314 		case NVME_CTRL_CONNECTING:
315 			changed = true;
316 			/* FALLTHRU */
317 		default:
318 			break;
319 		}
320 		break;
321 	case NVME_CTRL_DEAD:
322 		switch (old_state) {
323 		case NVME_CTRL_DELETING:
324 			changed = true;
325 			/* FALLTHRU */
326 		default:
327 			break;
328 		}
329 		break;
330 	default:
331 		break;
332 	}
333 
334 	if (changed)
335 		ctrl->state = new_state;
336 
337 	spin_unlock_irqrestore(&ctrl->lock, flags);
338 	if (changed && ctrl->state == NVME_CTRL_LIVE)
339 		nvme_kick_requeue_lists(ctrl);
340 	return changed;
341 }
342 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
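
/*
 * Editorial sketch (not part of this revision): the switch statement above
 * encodes the controller state machine; an equivalent, purely illustrative
 * table-driven check is shown below.  example_allowed_from[] and
 * example_state_change_allowed() are assumptions, not kernel APIs.
 */
static const unsigned long example_allowed_from[] = {
	[NVME_CTRL_ADMIN_ONLY]	= BIT(NVME_CTRL_CONNECTING),
	[NVME_CTRL_LIVE]	= BIT(NVME_CTRL_NEW) | BIT(NVME_CTRL_RESETTING) |
				  BIT(NVME_CTRL_CONNECTING),
	[NVME_CTRL_RESETTING]	= BIT(NVME_CTRL_NEW) | BIT(NVME_CTRL_LIVE) |
				  BIT(NVME_CTRL_ADMIN_ONLY),
	[NVME_CTRL_CONNECTING]	= BIT(NVME_CTRL_NEW) | BIT(NVME_CTRL_RESETTING),
	[NVME_CTRL_DELETING]	= BIT(NVME_CTRL_LIVE) | BIT(NVME_CTRL_ADMIN_ONLY) |
				  BIT(NVME_CTRL_RESETTING) |
				  BIT(NVME_CTRL_CONNECTING),
	[NVME_CTRL_DEAD]	= BIT(NVME_CTRL_DELETING),
};

static bool example_state_change_allowed(enum nvme_ctrl_state old_state,
					 enum nvme_ctrl_state new_state)
{
	return test_bit(old_state, &example_allowed_from[new_state]);
}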
343 
344 static void nvme_free_ns_head(struct kref *ref)
345 {
346 	struct nvme_ns_head *head =
347 		container_of(ref, struct nvme_ns_head, ref);
348 
349 	nvme_mpath_remove_disk(head);
350 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
351 	list_del_init(&head->entry);
352 	cleanup_srcu_struct(&head->srcu);
353 	kfree(head);
354 }
355 
356 static void nvme_put_ns_head(struct nvme_ns_head *head)
357 {
358 	kref_put(&head->ref, nvme_free_ns_head);
359 }
360 
361 static void nvme_free_ns(struct kref *kref)
362 {
363 	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
364 
365 	if (ns->ndev)
366 		nvme_nvm_unregister(ns);
367 
368 	put_disk(ns->disk);
369 	nvme_put_ns_head(ns->head);
370 	nvme_put_ctrl(ns->ctrl);
371 	kfree(ns);
372 }
373 
374 static void nvme_put_ns(struct nvme_ns *ns)
375 {
376 	kref_put(&ns->kref, nvme_free_ns);
377 }
378 
379 struct request *nvme_alloc_request(struct request_queue *q,
380 		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
381 {
382 	unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
383 	struct request *req;
384 
385 	if (qid == NVME_QID_ANY) {
386 		req = blk_mq_alloc_request(q, op, flags);
387 	} else {
388 		req = blk_mq_alloc_request_hctx(q, op, flags,
389 				qid ? qid - 1 : 0);
390 	}
391 	if (IS_ERR(req))
392 		return req;
393 
394 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
395 	nvme_req(req)->cmd = cmd;
396 
397 	return req;
398 }
399 EXPORT_SYMBOL_GPL(nvme_alloc_request);
400 
401 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
402 {
403 	struct nvme_command c;
404 
405 	memset(&c, 0, sizeof(c));
406 
407 	c.directive.opcode = nvme_admin_directive_send;
408 	c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
409 	c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
410 	c.directive.dtype = NVME_DIR_IDENTIFY;
411 	c.directive.tdtype = NVME_DIR_STREAMS;
412 	c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
413 
414 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
415 }
416 
417 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
418 {
419 	return nvme_toggle_streams(ctrl, false);
420 }
421 
422 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
423 {
424 	return nvme_toggle_streams(ctrl, true);
425 }
426 
427 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
428 				  struct streams_directive_params *s, u32 nsid)
429 {
430 	struct nvme_command c;
431 
432 	memset(&c, 0, sizeof(c));
433 	memset(s, 0, sizeof(*s));
434 
435 	c.directive.opcode = nvme_admin_directive_recv;
436 	c.directive.nsid = cpu_to_le32(nsid);
437 	c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
438 	c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
439 	c.directive.dtype = NVME_DIR_STREAMS;
440 
441 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
442 }
443 
444 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
445 {
446 	struct streams_directive_params s;
447 	int ret;
448 
449 	if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
450 		return 0;
451 	if (!streams)
452 		return 0;
453 
454 	ret = nvme_enable_streams(ctrl);
455 	if (ret)
456 		return ret;
457 
458 	ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
459 	if (ret)
460 		return ret;
461 
462 	ctrl->nssa = le16_to_cpu(s.nssa);
463 	if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
464 		dev_info(ctrl->device, "too few streams (%u) available\n",
465 					ctrl->nssa);
466 		nvme_disable_streams(ctrl);
467 		return 0;
468 	}
469 
470 	ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
471 	dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
472 	return 0;
473 }
474 
475 /*
476  * Check if 'req' has a write hint associated with it. If it does, assign
477  * a valid namespace stream to the write.
478  */
479 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
480 				     struct request *req, u16 *control,
481 				     u32 *dsmgmt)
482 {
483 	enum rw_hint streamid = req->write_hint;
484 
485 	if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
486 		streamid = 0;
487 	else {
488 		streamid--;
489 		if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
490 			return;
491 
492 		*control |= NVME_RW_DTYPE_STREAMS;
493 		*dsmgmt |= streamid << 16;
494 	}
495 
496 	if (streamid < ARRAY_SIZE(req->q->write_hints))
497 		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
498 }
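
/*
 * Editorial worked example (illustration only): a request tagged with
 * WRITE_LIFE_SHORT (enum rw_hint value 2) maps to stream ID 1 above, so the
 * command gets NVME_RW_DTYPE_STREAMS set in its control word and directive
 * specific value 1 in the upper half of dsmgmt (*dsmgmt |= 1 << 16).  Hints
 * of WRITE_LIFE_NOT_SET or WRITE_LIFE_NONE leave the command without a
 * stream directive and only account the bytes against write_hints[0].
 */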
499 
500 static inline void nvme_setup_flush(struct nvme_ns *ns,
501 		struct nvme_command *cmnd)
502 {
503 	memset(cmnd, 0, sizeof(*cmnd));
504 	cmnd->common.opcode = nvme_cmd_flush;
505 	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
506 }
507 
508 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
509 		struct nvme_command *cmnd)
510 {
511 	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
512 	struct nvme_dsm_range *range;
513 	struct bio *bio;
514 
515 	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
516 	if (!range)
517 		return BLK_STS_RESOURCE;
518 
519 	__rq_for_each_bio(bio, req) {
520 		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
521 		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
522 
523 		if (n < segments) {
524 			range[n].cattr = cpu_to_le32(0);
525 			range[n].nlb = cpu_to_le32(nlb);
526 			range[n].slba = cpu_to_le64(slba);
527 		}
528 		n++;
529 	}
530 
531 	if (WARN_ON_ONCE(n != segments)) {
532 		kfree(range);
533 		return BLK_STS_IOERR;
534 	}
535 
536 	memset(cmnd, 0, sizeof(*cmnd));
537 	cmnd->dsm.opcode = nvme_cmd_dsm;
538 	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
539 	cmnd->dsm.nr = cpu_to_le32(segments - 1);
540 	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
541 
542 	req->special_vec.bv_page = virt_to_page(range);
543 	req->special_vec.bv_offset = offset_in_page(range);
544 	req->special_vec.bv_len = sizeof(*range) * segments;
545 	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
546 
547 	return BLK_STS_OK;
548 }
549 
550 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
551 		struct request *req, struct nvme_command *cmnd)
552 {
553 	struct nvme_ctrl *ctrl = ns->ctrl;
554 	u16 control = 0;
555 	u32 dsmgmt = 0;
556 
557 	if (req->cmd_flags & REQ_FUA)
558 		control |= NVME_RW_FUA;
559 	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
560 		control |= NVME_RW_LR;
561 
562 	if (req->cmd_flags & REQ_RAHEAD)
563 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
564 
565 	memset(cmnd, 0, sizeof(*cmnd));
566 	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
567 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
568 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
569 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
570 
571 	if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
572 		nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
573 
574 	if (ns->ms) {
575 		/*
576 		 * If formatted with metadata, the block layer always provides a
577 		 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
578 		 * we enable the PRACT bit for protection information or set the
579 		 * namespace capacity to zero to prevent any I/O.
580 		 */
581 		if (!blk_integrity_rq(req)) {
582 			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
583 				return BLK_STS_NOTSUPP;
584 			control |= NVME_RW_PRINFO_PRACT;
585 		}
586 
587 		switch (ns->pi_type) {
588 		case NVME_NS_DPS_PI_TYPE3:
589 			control |= NVME_RW_PRINFO_PRCHK_GUARD;
590 			break;
591 		case NVME_NS_DPS_PI_TYPE1:
592 		case NVME_NS_DPS_PI_TYPE2:
593 			control |= NVME_RW_PRINFO_PRCHK_GUARD |
594 					NVME_RW_PRINFO_PRCHK_REF;
595 			cmnd->rw.reftag = cpu_to_le32(
596 					nvme_block_nr(ns, blk_rq_pos(req)));
597 			break;
598 		}
599 	}
600 
601 	cmnd->rw.control = cpu_to_le16(control);
602 	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
603 	return BLK_STS_OK;
604 }
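
/*
 * Editorial worked example (illustration only): for a namespace formatted
 * with 4KiB LBAs (lba_shift = 12), nvme_block_nr() converts the 512-byte
 * sector from blk_rq_pos() by shifting right by (lba_shift - 9) = 3, and the
 * zero-based length field is (blk_rq_bytes(req) >> 12) - 1.  A 64KiB read at
 * byte offset 1MiB therefore becomes slba = 256, length = 15.
 */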
605 
606 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
607 		struct nvme_command *cmd)
608 {
609 	blk_status_t ret = BLK_STS_OK;
610 
611 	if (!(req->rq_flags & RQF_DONTPREP)) {
612 		nvme_req(req)->retries = 0;
613 		nvme_req(req)->flags = 0;
614 		req->rq_flags |= RQF_DONTPREP;
615 	}
616 
617 	switch (req_op(req)) {
618 	case REQ_OP_DRV_IN:
619 	case REQ_OP_DRV_OUT:
620 		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
621 		break;
622 	case REQ_OP_FLUSH:
623 		nvme_setup_flush(ns, cmd);
624 		break;
625 	case REQ_OP_WRITE_ZEROES:
626 		/* currently only aliased to deallocate for a few ctrls: */
627 	case REQ_OP_DISCARD:
628 		ret = nvme_setup_discard(ns, req, cmd);
629 		break;
630 	case REQ_OP_READ:
631 	case REQ_OP_WRITE:
632 		ret = nvme_setup_rw(ns, req, cmd);
633 		break;
634 	default:
635 		WARN_ON_ONCE(1);
636 		return BLK_STS_IOERR;
637 	}
638 
639 	cmd->common.command_id = req->tag;
640 	if (ns)
641 		trace_nvme_setup_nvm_cmd(req->q->id, cmd);
642 	else
643 		trace_nvme_setup_admin_cmd(cmd);
644 	return ret;
645 }
646 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
647 
648 /*
649  * Returns 0 on success.  If the result is negative, it's a Linux error code;
650  * if the result is positive, it's an NVM Express status code
651  */
652 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
653 		union nvme_result *result, void *buffer, unsigned bufflen,
654 		unsigned timeout, int qid, int at_head,
655 		blk_mq_req_flags_t flags)
656 {
657 	struct request *req;
658 	int ret;
659 
660 	req = nvme_alloc_request(q, cmd, flags, qid);
661 	if (IS_ERR(req))
662 		return PTR_ERR(req);
663 
664 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
665 
666 	if (buffer && bufflen) {
667 		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
668 		if (ret)
669 			goto out;
670 	}
671 
672 	blk_execute_rq(req->q, NULL, req, at_head);
673 	if (result)
674 		*result = nvme_req(req)->result;
675 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
676 		ret = -EINTR;
677 	else
678 		ret = nvme_req(req)->status;
679  out:
680 	blk_mq_free_request(req);
681 	return ret;
682 }
683 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
684 
685 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
686 		void *buffer, unsigned bufflen)
687 {
688 	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
689 			NVME_QID_ANY, 0, 0);
690 }
691 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
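
/*
 * Editorial sketch (not part of this revision): the synchronous helpers above
 * can carry arbitrary admin commands.  A hypothetical Get Features call,
 * mirroring nvme_set_features() below, might look like this;
 * example_get_feature() is an assumption for illustration only.
 */
static int example_get_feature(struct nvme_ctrl *ctrl, unsigned fid,
			       u32 *result)
{
	struct nvme_command c = { };
	union nvme_result res;
	int ret;

	c.features.opcode = nvme_admin_get_features;
	c.features.fid = cpu_to_le32(fid);

	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &c, &res, NULL, 0, 0,
				     NVME_QID_ANY, 0, 0);
	if (ret >= 0 && result)
		*result = le32_to_cpu(res.u32);
	return ret;
}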
692 
693 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
694 		unsigned len, u32 seed, bool write)
695 {
696 	struct bio_integrity_payload *bip;
697 	int ret = -ENOMEM;
698 	void *buf;
699 
700 	buf = kmalloc(len, GFP_KERNEL);
701 	if (!buf)
702 		goto out;
703 
704 	ret = -EFAULT;
705 	if (write && copy_from_user(buf, ubuf, len))
706 		goto out_free_meta;
707 
708 	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
709 	if (IS_ERR(bip)) {
710 		ret = PTR_ERR(bip);
711 		goto out_free_meta;
712 	}
713 
714 	bip->bip_iter.bi_size = len;
715 	bip->bip_iter.bi_sector = seed;
716 	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
717 			offset_in_page(buf));
718 	if (ret == len)
719 		return buf;
720 	ret = -ENOMEM;
721 out_free_meta:
722 	kfree(buf);
723 out:
724 	return ERR_PTR(ret);
725 }
726 
727 static int nvme_submit_user_cmd(struct request_queue *q,
728 		struct nvme_command *cmd, void __user *ubuffer,
729 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
730 		u32 meta_seed, u32 *result, unsigned timeout)
731 {
732 	bool write = nvme_is_write(cmd);
733 	struct nvme_ns *ns = q->queuedata;
734 	struct gendisk *disk = ns ? ns->disk : NULL;
735 	struct request *req;
736 	struct bio *bio = NULL;
737 	void *meta = NULL;
738 	int ret;
739 
740 	req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
741 	if (IS_ERR(req))
742 		return PTR_ERR(req);
743 
744 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
745 
746 	if (ubuffer && bufflen) {
747 		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
748 				GFP_KERNEL);
749 		if (ret)
750 			goto out;
751 		bio = req->bio;
752 		bio->bi_disk = disk;
753 		if (disk && meta_buffer && meta_len) {
754 			meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
755 					meta_seed, write);
756 			if (IS_ERR(meta)) {
757 				ret = PTR_ERR(meta);
758 				goto out_unmap;
759 			}
760 		}
761 	}
762 
763 	blk_execute_rq(req->q, disk, req, 0);
764 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
765 		ret = -EINTR;
766 	else
767 		ret = nvme_req(req)->status;
768 	if (result)
769 		*result = le32_to_cpu(nvme_req(req)->result.u32);
770 	if (meta && !ret && !write) {
771 		if (copy_to_user(meta_buffer, meta, meta_len))
772 			ret = -EFAULT;
773 	}
774 	kfree(meta);
775  out_unmap:
776 	if (bio)
777 		blk_rq_unmap_user(bio);
778  out:
779 	blk_mq_free_request(req);
780 	return ret;
781 }
782 
783 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
784 {
785 	struct nvme_ctrl *ctrl = rq->end_io_data;
786 
787 	blk_mq_free_request(rq);
788 
789 	if (status) {
790 		dev_err(ctrl->device,
791 			"failed nvme_keep_alive_end_io error=%d\n",
792 				status);
793 		return;
794 	}
795 
796 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
797 }
798 
799 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
800 {
801 	struct request *rq;
802 
803 	rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED,
804 			NVME_QID_ANY);
805 	if (IS_ERR(rq))
806 		return PTR_ERR(rq);
807 
808 	rq->timeout = ctrl->kato * HZ;
809 	rq->end_io_data = ctrl;
810 
811 	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
812 
813 	return 0;
814 }
815 
816 static void nvme_keep_alive_work(struct work_struct *work)
817 {
818 	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
819 			struct nvme_ctrl, ka_work);
820 
821 	if (nvme_keep_alive(ctrl)) {
822 		/* allocation failure, reset the controller */
823 		dev_err(ctrl->device, "keep-alive failed\n");
824 		nvme_reset_ctrl(ctrl);
825 		return;
826 	}
827 }
828 
829 void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
830 {
831 	if (unlikely(ctrl->kato == 0))
832 		return;
833 
834 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
835 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
836 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
837 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
838 }
839 EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
840 
841 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
842 {
843 	if (unlikely(ctrl->kato == 0))
844 		return;
845 
846 	cancel_delayed_work_sync(&ctrl->ka_work);
847 }
848 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
849 
850 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
851 {
852 	struct nvme_command c = { };
853 	int error;
854 
855 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
856 	c.identify.opcode = nvme_admin_identify;
857 	c.identify.cns = NVME_ID_CNS_CTRL;
858 
859 	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
860 	if (!*id)
861 		return -ENOMEM;
862 
863 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
864 			sizeof(struct nvme_id_ctrl));
865 	if (error)
866 		kfree(*id);
867 	return error;
868 }
869 
870 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
871 		struct nvme_ns_ids *ids)
872 {
873 	struct nvme_command c = { };
874 	int status;
875 	void *data;
876 	int pos;
877 	int len;
878 
879 	c.identify.opcode = nvme_admin_identify;
880 	c.identify.nsid = cpu_to_le32(nsid);
881 	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
882 
883 	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
884 	if (!data)
885 		return -ENOMEM;
886 
887 	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
888 				      NVME_IDENTIFY_DATA_SIZE);
889 	if (status)
890 		goto free_data;
891 
892 	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
893 		struct nvme_ns_id_desc *cur = data + pos;
894 
895 		if (cur->nidl == 0)
896 			break;
897 
898 		switch (cur->nidt) {
899 		case NVME_NIDT_EUI64:
900 			if (cur->nidl != NVME_NIDT_EUI64_LEN) {
901 				dev_warn(ctrl->device,
902 					 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
903 					 cur->nidl);
904 				goto free_data;
905 			}
906 			len = NVME_NIDT_EUI64_LEN;
907 			memcpy(ids->eui64, data + pos + sizeof(*cur), len);
908 			break;
909 		case NVME_NIDT_NGUID:
910 			if (cur->nidl != NVME_NIDT_NGUID_LEN) {
911 				dev_warn(ctrl->device,
912 					 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
913 					 cur->nidl);
914 				goto free_data;
915 			}
916 			len = NVME_NIDT_NGUID_LEN;
917 			memcpy(ids->nguid, data + pos + sizeof(*cur), len);
918 			break;
919 		case NVME_NIDT_UUID:
920 			if (cur->nidl != NVME_NIDT_UUID_LEN) {
921 				dev_warn(ctrl->device,
922 					 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
923 					 cur->nidl);
924 				goto free_data;
925 			}
926 			len = NVME_NIDT_UUID_LEN;
927 			uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
928 			break;
929 		default:
930 			/* Skip unknown types */
931 			len = cur->nidl;
932 			break;
933 		}
934 
935 		len += sizeof(*cur);
936 	}
937 free_data:
938 	kfree(data);
939 	return status;
940 }
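
/*
 * Editorial note (illustration only): the descriptor list parsed above is a
 * packed sequence of 4-byte struct nvme_ns_id_desc headers, each followed by
 * nidl bytes of payload and terminated by a zeroed header.  A controller
 * reporting both an NGUID and a UUID might return:
 *
 *	[nidt=NVME_NIDT_NGUID nidl=16][16-byte NGUID]
 *	[nidt=NVME_NIDT_UUID  nidl=16][16-byte UUID][zeroed 4-byte terminator]
 */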
941 
942 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
943 {
944 	struct nvme_command c = { };
945 
946 	c.identify.opcode = nvme_admin_identify;
947 	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
948 	c.identify.nsid = cpu_to_le32(nsid);
949 	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
950 				    NVME_IDENTIFY_DATA_SIZE);
951 }
952 
953 static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
954 		unsigned nsid)
955 {
956 	struct nvme_id_ns *id;
957 	struct nvme_command c = { };
958 	int error;
959 
960 	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
961 	c.identify.opcode = nvme_admin_identify;
962 	c.identify.nsid = cpu_to_le32(nsid);
963 	c.identify.cns = NVME_ID_CNS_NS;
964 
965 	id = kmalloc(sizeof(*id), GFP_KERNEL);
966 	if (!id)
967 		return NULL;
968 
969 	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
970 	if (error) {
971 		dev_warn(ctrl->device, "Identify namespace failed\n");
972 		kfree(id);
973 		return NULL;
974 	}
975 
976 	return id;
977 }
978 
979 static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
980 		      void *buffer, size_t buflen, u32 *result)
981 {
982 	struct nvme_command c;
983 	union nvme_result res;
984 	int ret;
985 
986 	memset(&c, 0, sizeof(c));
987 	c.features.opcode = nvme_admin_set_features;
988 	c.features.fid = cpu_to_le32(fid);
989 	c.features.dword11 = cpu_to_le32(dword11);
990 
991 	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
992 			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
993 	if (ret >= 0 && result)
994 		*result = le32_to_cpu(res.u32);
995 	return ret;
996 }
997 
998 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
999 {
1000 	u32 q_count = (*count - 1) | ((*count - 1) << 16);
1001 	u32 result;
1002 	int status, nr_io_queues;
1003 
1004 	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1005 			&result);
1006 	if (status < 0)
1007 		return status;
1008 
1009 	/*
1010 	 * Degraded controllers might return an error when setting the queue
1011 	 * count.  We still want to be able to bring them online and offer
1012 	 * access to the admin queue, as that might be the only way to fix them up.
1013 	 */
1014 	if (status > 0) {
1015 		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1016 		*count = 0;
1017 	} else {
1018 		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1019 		*count = min(*count, nr_io_queues);
1020 	}
1021 
1022 	return 0;
1023 }
1024 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
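
/*
 * Editorial worked example (illustration only): asking for *count = 8 queues
 * encodes q_count = 7 | (7 << 16) = 0x00070007 (zero's based NSQR/NCQR in
 * dword 11).  If the controller completes with result = 0x00020004 it
 * allocated 5 submission and 3 completion queues, so nr_io_queues becomes
 * min(4, 2) + 1 = 3 and *count is clamped from 8 down to 3.
 */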
1025 
1026 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1027 {
1028 	struct nvme_user_io io;
1029 	struct nvme_command c;
1030 	unsigned length, meta_len;
1031 	void __user *metadata;
1032 
1033 	if (copy_from_user(&io, uio, sizeof(io)))
1034 		return -EFAULT;
1035 	if (io.flags)
1036 		return -EINVAL;
1037 
1038 	switch (io.opcode) {
1039 	case nvme_cmd_write:
1040 	case nvme_cmd_read:
1041 	case nvme_cmd_compare:
1042 		break;
1043 	default:
1044 		return -EINVAL;
1045 	}
1046 
1047 	length = (io.nblocks + 1) << ns->lba_shift;
1048 	meta_len = (io.nblocks + 1) * ns->ms;
1049 	metadata = (void __user *)(uintptr_t)io.metadata;
1050 
1051 	if (ns->ext) {
1052 		length += meta_len;
1053 		meta_len = 0;
1054 	} else if (meta_len) {
1055 		if ((io.metadata & 3) || !io.metadata)
1056 			return -EINVAL;
1057 	}
1058 
1059 	memset(&c, 0, sizeof(c));
1060 	c.rw.opcode = io.opcode;
1061 	c.rw.flags = io.flags;
1062 	c.rw.nsid = cpu_to_le32(ns->head->ns_id);
1063 	c.rw.slba = cpu_to_le64(io.slba);
1064 	c.rw.length = cpu_to_le16(io.nblocks);
1065 	c.rw.control = cpu_to_le16(io.control);
1066 	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1067 	c.rw.reftag = cpu_to_le32(io.reftag);
1068 	c.rw.apptag = cpu_to_le16(io.apptag);
1069 	c.rw.appmask = cpu_to_le16(io.appmask);
1070 
1071 	return nvme_submit_user_cmd(ns->queue, &c,
1072 			(void __user *)(uintptr_t)io.addr, length,
1073 			metadata, meta_len, io.slba, NULL, 0);
1074 }
1075 
1076 static u32 nvme_known_admin_effects(u8 opcode)
1077 {
1078 	switch (opcode) {
1079 	case nvme_admin_format_nvm:
1080 		return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
1081 					NVME_CMD_EFFECTS_CSE_MASK;
1082 	case nvme_admin_sanitize_nvm:
1083 		return NVME_CMD_EFFECTS_CSE_MASK;
1084 	default:
1085 		break;
1086 	}
1087 	return 0;
1088 }
1089 
1090 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1091 								u8 opcode)
1092 {
1093 	u32 effects = 0;
1094 
1095 	if (ns) {
1096 		if (ctrl->effects)
1097 			effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1098 		if (effects & ~NVME_CMD_EFFECTS_CSUPP)
1099 			dev_warn(ctrl->device,
1100 				 "IO command:%02x has unhandled effects:%08x\n",
1101 				 opcode, effects);
1102 		return 0;
1103 	}
1104 
1105 	if (ctrl->effects)
1106 		effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1107 	else
1108 		effects = nvme_known_admin_effects(opcode);
1109 
1110 	/*
1111 	 * For simplicity, IO to all namespaces is quiesced even if the command
1112 	 * effects say only one namespace is affected.
1113 	 */
1114 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1115 		nvme_start_freeze(ctrl);
1116 		nvme_wait_freeze(ctrl);
1117 	}
1118 	return effects;
1119 }
1120 
1121 static void nvme_update_formats(struct nvme_ctrl *ctrl)
1122 {
1123 	struct nvme_ns *ns, *next;
1124 	LIST_HEAD(rm_list);
1125 
1126 	down_write(&ctrl->namespaces_rwsem);
1127 	list_for_each_entry(ns, &ctrl->namespaces, list) {
1128 		if (ns->disk && nvme_revalidate_disk(ns->disk)) {
1129 			list_move_tail(&ns->list, &rm_list);
1130 		}
1131 	}
1132 	up_write(&ctrl->namespaces_rwsem);
1133 
1134 	list_for_each_entry_safe(ns, next, &rm_list, list)
1135 		nvme_ns_remove(ns);
1136 }
1137 
1138 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1139 {
1140 	/*
1141 	 * Revalidate LBA changes prior to unfreezing. This is necessary to
1142 	 * prevent memory corruption if a logical block size was changed by
1143 	 * this command.
1144 	 */
1145 	if (effects & NVME_CMD_EFFECTS_LBCC)
1146 		nvme_update_formats(ctrl);
1147 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
1148 		nvme_unfreeze(ctrl);
1149 	if (effects & NVME_CMD_EFFECTS_CCC)
1150 		nvme_init_identify(ctrl);
1151 	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
1152 		nvme_queue_scan(ctrl);
1153 }
1154 
1155 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1156 			struct nvme_passthru_cmd __user *ucmd)
1157 {
1158 	struct nvme_passthru_cmd cmd;
1159 	struct nvme_command c;
1160 	unsigned timeout = 0;
1161 	u32 effects;
1162 	int status;
1163 
1164 	if (!capable(CAP_SYS_ADMIN))
1165 		return -EACCES;
1166 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1167 		return -EFAULT;
1168 	if (cmd.flags)
1169 		return -EINVAL;
1170 
1171 	memset(&c, 0, sizeof(c));
1172 	c.common.opcode = cmd.opcode;
1173 	c.common.flags = cmd.flags;
1174 	c.common.nsid = cpu_to_le32(cmd.nsid);
1175 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1176 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1177 	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1178 	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1179 	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1180 	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1181 	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1182 	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1183 
1184 	if (cmd.timeout_ms)
1185 		timeout = msecs_to_jiffies(cmd.timeout_ms);
1186 
1187 	effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
1188 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1189 			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1190 			(void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
1191 			0, &cmd.result, timeout);
1192 	nvme_passthru_end(ctrl, effects);
1193 
1194 	if (status >= 0) {
1195 		if (put_user(cmd.result, &ucmd->result))
1196 			return -EFAULT;
1197 	}
1198 
1199 	return status;
1200 }
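
/*
 * Editorial sketch (not part of this revision): nvme_user_cmd() above is what
 * userspace reaches through NVME_IOCTL_ADMIN_CMD.  A hypothetical userspace
 * caller issuing Identify Controller could look like the snippet below; the
 * opcode 0x06 and CNS value 1 come from the NVMe specification, and
 * example_identify_ctrl() is an assumption for illustration only.
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/nvme_ioctl.h>
 *
 *	static int example_identify_ctrl(int fd, void *buf4k)
 *	{
 *		struct nvme_admin_cmd cmd;
 *
 *		memset(&cmd, 0, sizeof(cmd));
 *		cmd.opcode = 0x06;		// Identify
 *		cmd.addr = (unsigned long)buf4k;
 *		cmd.data_len = 4096;
 *		cmd.cdw10 = 1;			// CNS 01h: Identify Controller
 *		return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
 *	}
 */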
1201 
1202 /*
1203  * Issue ioctl requests on the first available path.  Note that unlike normal
1204  * block layer requests we will not retry a failed request on another controller.
1205  */
1206 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1207 		struct nvme_ns_head **head, int *srcu_idx)
1208 {
1209 #ifdef CONFIG_NVME_MULTIPATH
1210 	if (disk->fops == &nvme_ns_head_ops) {
1211 		*head = disk->private_data;
1212 		*srcu_idx = srcu_read_lock(&(*head)->srcu);
1213 		return nvme_find_path(*head);
1214 	}
1215 #endif
1216 	*head = NULL;
1217 	*srcu_idx = -1;
1218 	return disk->private_data;
1219 }
1220 
1221 static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1222 {
1223 	if (head)
1224 		srcu_read_unlock(&head->srcu, idx);
1225 }
1226 
1227 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
1228 {
1229 	switch (cmd) {
1230 	case NVME_IOCTL_ID:
1231 		force_successful_syscall_return();
1232 		return ns->head->ns_id;
1233 	case NVME_IOCTL_ADMIN_CMD:
1234 		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
1235 	case NVME_IOCTL_IO_CMD:
1236 		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
1237 	case NVME_IOCTL_SUBMIT_IO:
1238 		return nvme_submit_io(ns, (void __user *)arg);
1239 	default:
1240 #ifdef CONFIG_NVM
1241 		if (ns->ndev)
1242 			return nvme_nvm_ioctl(ns, cmd, arg);
1243 #endif
1244 		if (is_sed_ioctl(cmd))
1245 			return sed_ioctl(ns->ctrl->opal_dev, cmd,
1246 					 (void __user *) arg);
1247 		return -ENOTTY;
1248 	}
1249 }
1250 
1251 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1252 		unsigned int cmd, unsigned long arg)
1253 {
1254 	struct nvme_ns_head *head = NULL;
1255 	struct nvme_ns *ns;
1256 	int srcu_idx, ret;
1257 
1258 	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1259 	if (unlikely(!ns))
1260 		ret = -EWOULDBLOCK;
1261 	else
1262 		ret = nvme_ns_ioctl(ns, cmd, arg);
1263 	nvme_put_ns_from_disk(head, srcu_idx);
1264 	return ret;
1265 }
1266 
1267 static int nvme_open(struct block_device *bdev, fmode_t mode)
1268 {
1269 	struct nvme_ns *ns = bdev->bd_disk->private_data;
1270 
1271 #ifdef CONFIG_NVME_MULTIPATH
1272 	/* should never be called due to GENHD_FL_HIDDEN */
1273 	if (WARN_ON_ONCE(ns->head->disk))
1274 		goto fail;
1275 #endif
1276 	if (!kref_get_unless_zero(&ns->kref))
1277 		goto fail;
1278 	if (!try_module_get(ns->ctrl->ops->module))
1279 		goto fail_put_ns;
1280 
1281 	return 0;
1282 
1283 fail_put_ns:
1284 	nvme_put_ns(ns);
1285 fail:
1286 	return -ENXIO;
1287 }
1288 
1289 static void nvme_release(struct gendisk *disk, fmode_t mode)
1290 {
1291 	struct nvme_ns *ns = disk->private_data;
1292 
1293 	module_put(ns->ctrl->ops->module);
1294 	nvme_put_ns(ns);
1295 }
1296 
1297 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1298 {
1299 	/* some standard values */
1300 	geo->heads = 1 << 6;
1301 	geo->sectors = 1 << 5;
1302 	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1303 	return 0;
1304 }
1305 
1306 #ifdef CONFIG_BLK_DEV_INTEGRITY
1307 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1308 {
1309 	struct blk_integrity integrity;
1310 
1311 	memset(&integrity, 0, sizeof(integrity));
1312 	switch (pi_type) {
1313 	case NVME_NS_DPS_PI_TYPE3:
1314 		integrity.profile = &t10_pi_type3_crc;
1315 		integrity.tag_size = sizeof(u16) + sizeof(u32);
1316 		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1317 		break;
1318 	case NVME_NS_DPS_PI_TYPE1:
1319 	case NVME_NS_DPS_PI_TYPE2:
1320 		integrity.profile = &t10_pi_type1_crc;
1321 		integrity.tag_size = sizeof(u16);
1322 		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1323 		break;
1324 	default:
1325 		integrity.profile = NULL;
1326 		break;
1327 	}
1328 	integrity.tuple_size = ms;
1329 	blk_integrity_register(disk, &integrity);
1330 	blk_queue_max_integrity_segments(disk->queue, 1);
1331 }
1332 #else
1333 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1334 {
1335 }
1336 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1337 
1338 static void nvme_set_chunk_size(struct nvme_ns *ns)
1339 {
1340 	u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
1341 	blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1342 }
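
/*
 * Editorial worked example (illustration only): a namespace reporting
 * noiob = 1024 with 4KiB LBAs (lba_shift = 12) yields chunk_size =
 * 1024 << 3 = 8192 512-byte sectors, i.e. 4MiB boundaries that
 * blk_queue_chunk_sectors() keeps merged requests from straddling.
 */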
1343 
1344 static void nvme_config_discard(struct nvme_ctrl *ctrl,
1345 		unsigned stream_alignment, struct request_queue *queue)
1346 {
1347 	u32 size = queue_logical_block_size(queue);
1348 
1349 	if (stream_alignment)
1350 		size *= stream_alignment;
1351 
1352 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1353 			NVME_DSM_MAX_RANGES);
1354 
1355 	queue->limits.discard_alignment = 0;
1356 	queue->limits.discard_granularity = size;
1357 
1358 	blk_queue_max_discard_sectors(queue, UINT_MAX);
1359 	blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
1360 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, queue);
1361 
1362 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1363 		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1364 }
1365 
1366 static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1367 		struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1368 {
1369 	memset(ids, 0, sizeof(*ids));
1370 
1371 	if (ctrl->vs >= NVME_VS(1, 1, 0))
1372 		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1373 	if (ctrl->vs >= NVME_VS(1, 2, 0))
1374 		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1375 	if (ctrl->vs >= NVME_VS(1, 3, 0)) {
1376 		 /* Don't treat error as fatal; we potentially
1377 		  * already have a NGUID or EUI-64
1378 		  */
1379 		if (nvme_identify_ns_descs(ctrl, nsid, ids))
1380 			dev_warn(ctrl->device,
1381 				 "%s: Identify Descriptors failed\n", __func__);
1382 	}
1383 }
1384 
1385 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
1386 {
1387 	return !uuid_is_null(&ids->uuid) ||
1388 		memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
1389 		memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
1390 }
1391 
1392 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1393 {
1394 	return uuid_equal(&a->uuid, &b->uuid) &&
1395 		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1396 		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
1397 }
1398 
1399 static void nvme_update_disk_info(struct gendisk *disk,
1400 		struct nvme_ns *ns, struct nvme_id_ns *id)
1401 {
1402 	sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
1403 	unsigned short bs = 1 << ns->lba_shift;
1404 	unsigned stream_alignment = 0;
1405 
1406 	if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
1407 		stream_alignment = ns->sws * ns->sgs;
1408 
1409 	blk_mq_freeze_queue(disk->queue);
1410 	blk_integrity_unregister(disk);
1411 
1412 	blk_queue_logical_block_size(disk->queue, bs);
1413 	blk_queue_physical_block_size(disk->queue, bs);
1414 	blk_queue_io_min(disk->queue, bs);
1415 
1416 	if (ns->ms && !ns->ext &&
1417 	    (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1418 		nvme_init_integrity(disk, ns->ms, ns->pi_type);
1419 	if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
1420 		capacity = 0;
1421 	set_capacity(disk, capacity);
1422 
1423 	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
1424 		nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
1425 	blk_mq_unfreeze_queue(disk->queue);
1426 }
1427 
1428 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1429 {
1430 	struct nvme_ns *ns = disk->private_data;
1431 
1432 	/*
1433 	 * If identify namespace failed, use default 512 byte block size so
1434 	 * the block layer can use it before failing read/write for 0 capacity.
1435 	 */
1436 	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1437 	if (ns->lba_shift == 0)
1438 		ns->lba_shift = 9;
1439 	ns->noiob = le16_to_cpu(id->noiob);
1440 	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1441 	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1442 	/* the PI implementation requires metadata equal to the T10 PI tuple size */
1443 	if (ns->ms == sizeof(struct t10_pi_tuple))
1444 		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1445 	else
1446 		ns->pi_type = 0;
1447 
1448 	if (ns->noiob)
1449 		nvme_set_chunk_size(ns);
1450 	nvme_update_disk_info(disk, ns, id);
1451 	if (ns->ndev)
1452 		nvme_nvm_update_nvm_info(ns);
1453 #ifdef CONFIG_NVME_MULTIPATH
1454 	if (ns->head->disk)
1455 		nvme_update_disk_info(ns->head->disk, ns, id);
1456 #endif
1457 }
1458 
1459 static int nvme_revalidate_disk(struct gendisk *disk)
1460 {
1461 	struct nvme_ns *ns = disk->private_data;
1462 	struct nvme_ctrl *ctrl = ns->ctrl;
1463 	struct nvme_id_ns *id;
1464 	struct nvme_ns_ids ids;
1465 	int ret = 0;
1466 
1467 	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
1468 		set_capacity(disk, 0);
1469 		return -ENODEV;
1470 	}
1471 
1472 	id = nvme_identify_ns(ctrl, ns->head->ns_id);
1473 	if (!id)
1474 		return -ENODEV;
1475 
1476 	if (id->ncap == 0) {
1477 		ret = -ENODEV;
1478 		goto out;
1479 	}
1480 
1481 	__nvme_revalidate_disk(disk, id);
1482 	nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
1483 	if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
1484 		dev_err(ctrl->device,
1485 			"identifiers changed for nsid %d\n", ns->head->ns_id);
1486 		ret = -ENODEV;
1487 	}
1488 
1489 out:
1490 	kfree(id);
1491 	return ret;
1492 }
1493 
1494 static char nvme_pr_type(enum pr_type type)
1495 {
1496 	switch (type) {
1497 	case PR_WRITE_EXCLUSIVE:
1498 		return 1;
1499 	case PR_EXCLUSIVE_ACCESS:
1500 		return 2;
1501 	case PR_WRITE_EXCLUSIVE_REG_ONLY:
1502 		return 3;
1503 	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
1504 		return 4;
1505 	case PR_WRITE_EXCLUSIVE_ALL_REGS:
1506 		return 5;
1507 	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
1508 		return 6;
1509 	default:
1510 		return 0;
1511 	}
1512 };
1513 
1514 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1515 				u64 key, u64 sa_key, u8 op)
1516 {
1517 	struct nvme_ns_head *head = NULL;
1518 	struct nvme_ns *ns;
1519 	struct nvme_command c;
1520 	int srcu_idx, ret;
1521 	u8 data[16] = { 0, };
1522 
1523 	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1524 	if (unlikely(!ns))
1525 		return -EWOULDBLOCK;
1526 
1527 	put_unaligned_le64(key, &data[0]);
1528 	put_unaligned_le64(sa_key, &data[8]);
1529 
1530 	memset(&c, 0, sizeof(c));
1531 	c.common.opcode = op;
1532 	c.common.nsid = cpu_to_le32(ns->head->ns_id);
1533 	c.common.cdw10[0] = cpu_to_le32(cdw10);
1534 
1535 	ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1536 	nvme_put_ns_from_disk(head, srcu_idx);
1537 	return ret;
1538 }
1539 
1540 static int nvme_pr_register(struct block_device *bdev, u64 old,
1541 		u64 new, unsigned flags)
1542 {
1543 	u32 cdw10;
1544 
1545 	if (flags & ~PR_FL_IGNORE_KEY)
1546 		return -EOPNOTSUPP;
1547 
1548 	cdw10 = old ? 2 : 0;
1549 	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
1550 	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
1551 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
1552 }
1553 
1554 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
1555 		enum pr_type type, unsigned flags)
1556 {
1557 	u32 cdw10;
1558 
1559 	if (flags & ~PR_FL_IGNORE_KEY)
1560 		return -EOPNOTSUPP;
1561 
1562 	cdw10 = nvme_pr_type(type) << 8;
1563 	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
1564 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
1565 }
1566 
1567 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
1568 		enum pr_type type, bool abort)
1569 {
1570 	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
1571 	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
1572 }
1573 
1574 static int nvme_pr_clear(struct block_device *bdev, u64 key)
1575 {
1576 	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1577 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
1578 }
1579 
1580 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
1581 {
1582 	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
1583 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1584 }
1585 
1586 static const struct pr_ops nvme_pr_ops = {
1587 	.pr_register	= nvme_pr_register,
1588 	.pr_reserve	= nvme_pr_reserve,
1589 	.pr_release	= nvme_pr_release,
1590 	.pr_preempt	= nvme_pr_preempt,
1591 	.pr_clear	= nvme_pr_clear,
1592 };
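
/*
 * Editorial sketch (not part of this revision): the pr_ops above back the
 * generic block layer persistent reservation ioctls.  A hypothetical
 * userspace registration on an nvme block device might look like the snippet
 * below; the key value is arbitrary and purely illustrative.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/pr.h>
 *
 *	static int example_pr_register(int fd)
 *	{
 *		struct pr_registration reg = {
 *			.old_key = 0,
 *			.new_key = 0x123456789abcdef0ULL,
 *			.flags	 = 0,
 *		};
 *
 *		return ioctl(fd, IOC_PR_REGISTER, &reg);
 *	}
 */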
1593 
1594 #ifdef CONFIG_BLK_SED_OPAL
1595 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
1596 		bool send)
1597 {
1598 	struct nvme_ctrl *ctrl = data;
1599 	struct nvme_command cmd;
1600 
1601 	memset(&cmd, 0, sizeof(cmd));
1602 	if (send)
1603 		cmd.common.opcode = nvme_admin_security_send;
1604 	else
1605 		cmd.common.opcode = nvme_admin_security_recv;
1606 	cmd.common.nsid = 0;
1607 	cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1608 	cmd.common.cdw10[1] = cpu_to_le32(len);
1609 
1610 	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1611 				      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
1612 }
1613 EXPORT_SYMBOL_GPL(nvme_sec_submit);
1614 #endif /* CONFIG_BLK_SED_OPAL */
1615 
1616 static const struct block_device_operations nvme_fops = {
1617 	.owner		= THIS_MODULE,
1618 	.ioctl		= nvme_ioctl,
1619 	.compat_ioctl	= nvme_ioctl,
1620 	.open		= nvme_open,
1621 	.release	= nvme_release,
1622 	.getgeo		= nvme_getgeo,
1623 	.revalidate_disk= nvme_revalidate_disk,
1624 	.pr_ops		= &nvme_pr_ops,
1625 };
1626 
1627 #ifdef CONFIG_NVME_MULTIPATH
1628 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
1629 {
1630 	struct nvme_ns_head *head = bdev->bd_disk->private_data;
1631 
1632 	if (!kref_get_unless_zero(&head->ref))
1633 		return -ENXIO;
1634 	return 0;
1635 }
1636 
1637 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
1638 {
1639 	nvme_put_ns_head(disk->private_data);
1640 }
1641 
1642 const struct block_device_operations nvme_ns_head_ops = {
1643 	.owner		= THIS_MODULE,
1644 	.open		= nvme_ns_head_open,
1645 	.release	= nvme_ns_head_release,
1646 	.ioctl		= nvme_ioctl,
1647 	.compat_ioctl	= nvme_ioctl,
1648 	.getgeo		= nvme_getgeo,
1649 	.pr_ops		= &nvme_pr_ops,
1650 };
1651 #endif /* CONFIG_NVME_MULTIPATH */
1652 
1653 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1654 {
1655 	unsigned long timeout =
1656 		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1657 	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
1658 	int ret;
1659 
1660 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1661 		if (csts == ~0)
1662 			return -ENODEV;
1663 		if ((csts & NVME_CSTS_RDY) == bit)
1664 			break;
1665 
1666 		msleep(100);
1667 		if (fatal_signal_pending(current))
1668 			return -EINTR;
1669 		if (time_after(jiffies, timeout)) {
1670 			dev_err(ctrl->device,
1671 				"Device not ready; aborting %s\n", enabled ?
1672 						"initialisation" : "reset");
1673 			return -ENODEV;
1674 		}
1675 	}
1676 
1677 	return ret;
1678 }
1679 
1680 /*
1681  * If the device has been passed off to us in an enabled state, just clear
1682  * the enabled bit.  The spec says we should set the 'shutdown notification
1683  * bits', but doing so may cause the device to complete commands to the
1684  * admin queue ... and we don't know what memory that might be pointing at!
1685  */
1686 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1687 {
1688 	int ret;
1689 
1690 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1691 	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
1692 
1693 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1694 	if (ret)
1695 		return ret;
1696 
1697 	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
1698 		msleep(NVME_QUIRK_DELAY_AMOUNT);
1699 
1700 	return nvme_wait_ready(ctrl, cap, false);
1701 }
1702 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
1703 
1704 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1705 {
1706 	/*
1707 	 * Default to a 4K page size, with the intention to update this
1708 	 * path in the future to accommodate architectures with differing
1709 	 * kernel and IO page sizes.
1710 	 */
1711 	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
1712 	int ret;
1713 
1714 	if (page_shift < dev_page_min) {
1715 		dev_err(ctrl->device,
1716 			"Minimum device page size %u too large for host (%u)\n",
1717 			1 << dev_page_min, 1 << page_shift);
1718 		return -ENODEV;
1719 	}
1720 
1721 	ctrl->page_size = 1 << page_shift;
1722 
1723 	ctrl->ctrl_config = NVME_CC_CSS_NVM;
1724 	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1725 	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
1726 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1727 	ctrl->ctrl_config |= NVME_CC_ENABLE;
1728 
1729 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1730 	if (ret)
1731 		return ret;
1732 	return nvme_wait_ready(ctrl, cap, true);
1733 }
1734 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
1735 
1736 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
1737 {
1738 	unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
1739 	u32 csts;
1740 	int ret;
1741 
1742 	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1743 	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
1744 
1745 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1746 	if (ret)
1747 		return ret;
1748 
1749 	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1750 		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
1751 			break;
1752 
1753 		msleep(100);
1754 		if (fatal_signal_pending(current))
1755 			return -EINTR;
1756 		if (time_after(jiffies, timeout)) {
1757 			dev_err(ctrl->device,
1758 				"Device shutdown incomplete; abort shutdown\n");
1759 			return -ENODEV;
1760 		}
1761 	}
1762 
1763 	return ret;
1764 }
1765 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
1766 
1767 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1768 		struct request_queue *q)
1769 {
1770 	bool vwc = false;
1771 
1772 	if (ctrl->max_hw_sectors) {
1773 		u32 max_segments =
1774 			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1775 
1776 		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1777 		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1778 	}
1779 	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1780 	    is_power_of_2(ctrl->max_hw_sectors))
1781 		blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
1782 	blk_queue_virt_boundary(q, ctrl->page_size - 1);
1783 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1784 		vwc = true;
1785 	blk_queue_write_cache(q, vwc, vwc);
1786 }
1787 
1788 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
1789 {
1790 	__le64 ts;
1791 	int ret;
1792 
1793 	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
1794 		return 0;
1795 
1796 	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
1797 	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
1798 			NULL);
1799 	if (ret)
1800 		dev_warn_once(ctrl->device,
1801 			"could not set timestamp (%d)\n", ret);
1802 	return ret;
1803 }
1804 
1805 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
1806 {
1807 	/*
1808 	 * APST (Autonomous Power State Transition) lets us program a
1809 	 * table of power state transitions that the controller will
1810 	 * perform automatically.  We configure it with a simple
1811 	 * heuristic: we are willing to spend at most 2% of the time
1812 	 * transitioning between power states.  Therefore, when running
1813 	 * in any given state, we will enter the next lower-power
1814 	 * non-operational state after waiting 50 * (enlat + exlat)
1815 	 * microseconds, as long as that state's exit latency is under
1816 	 * the requested maximum latency.
1817 	 *
1818 	 * We will not autonomously enter any non-operational state for
1819 	 * which the total latency exceeds ps_max_latency_us.  Users
1820 	 * can set ps_max_latency_us to zero to turn off APST.
1821 	 */
1822 
1823 	unsigned apste;
1824 	struct nvme_feat_auto_pst *table;
1825 	u64 max_lat_us = 0;
1826 	int max_ps = -1;
1827 	int ret;
1828 
1829 	/*
1830 	 * If APST isn't supported or if we haven't been initialized yet,
1831 	 * then don't do anything.
1832 	 */
1833 	if (!ctrl->apsta)
1834 		return 0;
1835 
1836 	if (ctrl->npss > 31) {
1837 		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
1838 		return 0;
1839 	}
1840 
1841 	table = kzalloc(sizeof(*table), GFP_KERNEL);
1842 	if (!table)
1843 		return 0;
1844 
1845 	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
1846 		/* Turn off APST. */
1847 		apste = 0;
1848 		dev_dbg(ctrl->device, "APST disabled\n");
1849 	} else {
1850 		__le64 target = cpu_to_le64(0);
1851 		int state;
1852 
1853 		/*
1854 		 * Walk through all states from lowest- to highest-power.
1855 		 * According to the spec, lower-numbered states use more
1856 		 * power.  NPSS, despite the name, is the index of the
1857 		 * lowest-power state, not the number of states.
1858 		 */
1859 		for (state = (int)ctrl->npss; state >= 0; state--) {
1860 			u64 total_latency_us, exit_latency_us, transition_ms;
1861 
1862 			if (target)
1863 				table->entries[state] = target;
1864 
1865 			/*
1866 			 * Don't allow transitions to the deepest state
1867 			 * if it's quirked off.
1868 			 */
1869 			if (state == ctrl->npss &&
1870 			    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
1871 				continue;
1872 
1873 			/*
1874 			 * Is this state a useful non-operational state for
1875 			 * higher-power states to autonomously transition to?
1876 			 */
1877 			if (!(ctrl->psd[state].flags &
1878 			      NVME_PS_FLAGS_NON_OP_STATE))
1879 				continue;
1880 
1881 			exit_latency_us =
1882 				(u64)le32_to_cpu(ctrl->psd[state].exit_lat);
1883 			if (exit_latency_us > ctrl->ps_max_latency_us)
1884 				continue;
1885 
1886 			total_latency_us =
1887 				exit_latency_us +
1888 				le32_to_cpu(ctrl->psd[state].entry_lat);
1889 
1890 			/*
1891 			 * This state is good.  Use it as the APST idle
1892 			 * target for higher power states.
1893 			 */
1894 			transition_ms = total_latency_us + 19;
1895 			do_div(transition_ms, 20);
1896 			if (transition_ms > (1 << 24) - 1)
1897 				transition_ms = (1 << 24) - 1;
1898 
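			/*
			 * APST table entry layout: bits 07:03 hold the idle
			 * transition power state (ITPS) and bits 31:08 the
			 * idle time prior to transition (ITPT) in
			 * milliseconds, which is what the shifts below encode.
			 */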
1899 			target = cpu_to_le64((state << 3) |
1900 					     (transition_ms << 8));
1901 
1902 			if (max_ps == -1)
1903 				max_ps = state;
1904 
1905 			if (total_latency_us > max_lat_us)
1906 				max_lat_us = total_latency_us;
1907 		}
1908 
1909 		apste = 1;
1910 
1911 		if (max_ps == -1) {
1912 			dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
1913 		} else {
1914 			dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
1915 				max_ps, max_lat_us, (int)sizeof(*table), table);
1916 		}
1917 	}
1918 
1919 	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
1920 				table, sizeof(*table), NULL);
1921 	if (ret)
1922 		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
1923 
1924 	kfree(table);
1925 	return ret;
1926 }
1927 
1928 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
1929 {
1930 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1931 	u64 latency;
1932 
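	/*
	 * PM QoS reports "no constraint" and "any" as special values; treat
	 * both as an unbounded latency budget.  Any other value is a latency
	 * tolerance in microseconds.
	 */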
1933 	switch (val) {
1934 	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
1935 	case PM_QOS_LATENCY_ANY:
1936 		latency = U64_MAX;
1937 		break;
1938 
1939 	default:
1940 		latency = val;
1941 	}
1942 
1943 	if (ctrl->ps_max_latency_us != latency) {
1944 		ctrl->ps_max_latency_us = latency;
1945 		nvme_configure_apst(ctrl);
1946 	}
1947 }
1948 
1949 struct nvme_core_quirk_entry {
1950 	/*
1951 	 * NVMe model and firmware strings are padded with spaces.  For
1952 	 * simplicity, strings in the quirk table are padded with NULLs
1953 	 * instead.
1954 	 */
1955 	u16 vid;
1956 	const char *mn;
1957 	const char *fr;
1958 	unsigned long quirks;
1959 };
1960 
1961 static const struct nvme_core_quirk_entry core_quirks[] = {
1962 	{
1963 		/*
1964 		 * This Toshiba device seems to die using any APST states.  See:
1965 		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
1966 		 */
1967 		.vid = 0x1179,
1968 		.mn = "THNSF5256GPUK TOSHIBA",
1969 		.quirks = NVME_QUIRK_NO_APST,
1970 	}
1971 };
1972 
1973 /* match is null-terminated but idstr is space-padded. */
1974 static bool string_matches(const char *idstr, const char *match, size_t len)
1975 {
1976 	size_t matchlen;
1977 
1978 	if (!match)
1979 		return true;
1980 
1981 	matchlen = strlen(match);
1982 	WARN_ON_ONCE(matchlen > len);
1983 
1984 	if (memcmp(idstr, match, matchlen))
1985 		return false;
1986 
1987 	for (; matchlen < len; matchlen++)
1988 		if (idstr[matchlen] != ' ')
1989 			return false;
1990 
1991 	return true;
1992 }
1993 
1994 static bool quirk_matches(const struct nvme_id_ctrl *id,
1995 			  const struct nvme_core_quirk_entry *q)
1996 {
1997 	return q->vid == le16_to_cpu(id->vid) &&
1998 		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
1999 		string_matches(id->fr, q->fr, sizeof(id->fr));
2000 }
2001 
2002 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2003 		struct nvme_id_ctrl *id)
2004 {
2005 	size_t nqnlen;
2006 	int off;
2007 
2008 	nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2009 	if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2010 		strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2011 		return;
2012 	}
2013 
2014 	if (ctrl->vs >= NVME_VS(1, 2, 1))
2015 		dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2016 
2017 	/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
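	/*
	 * The result looks like (illustrative shape, not a real device):
	 * "nqn.2014.08.org.nvmexpress:<vid><ssvid>" followed by the raw,
	 * space-padded serial number and model string from Identify.
	 */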
2018 	off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2019 			"nqn.2014.08.org.nvmexpress:%04x%04x",
2020 			le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2021 	memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2022 	off += sizeof(id->sn);
2023 	memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2024 	off += sizeof(id->mn);
2025 	memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2026 }
2027 
2028 static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
2029 {
2030 	ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
2031 	kfree(subsys);
2032 }
2033 
2034 static void nvme_release_subsystem(struct device *dev)
2035 {
2036 	__nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
2037 }
2038 
2039 static void nvme_destroy_subsystem(struct kref *ref)
2040 {
2041 	struct nvme_subsystem *subsys =
2042 			container_of(ref, struct nvme_subsystem, ref);
2043 
2044 	mutex_lock(&nvme_subsystems_lock);
2045 	list_del(&subsys->entry);
2046 	mutex_unlock(&nvme_subsystems_lock);
2047 
2048 	ida_destroy(&subsys->ns_ida);
2049 	device_del(&subsys->dev);
2050 	put_device(&subsys->dev);
2051 }
2052 
2053 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2054 {
2055 	kref_put(&subsys->ref, nvme_destroy_subsystem);
2056 }
2057 
2058 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2059 {
2060 	struct nvme_subsystem *subsys;
2061 
2062 	lockdep_assert_held(&nvme_subsystems_lock);
2063 
2064 	list_for_each_entry(subsys, &nvme_subsystems, entry) {
2065 		if (strcmp(subsys->subnqn, subsysnqn))
2066 			continue;
2067 		if (!kref_get_unless_zero(&subsys->ref))
2068 			continue;
2069 		return subsys;
2070 	}
2071 
2072 	return NULL;
2073 }
2074 
2075 #define SUBSYS_ATTR_RO(_name, _mode, _show)			\
2076 	struct device_attribute subsys_attr_##_name = \
2077 		__ATTR(_name, _mode, _show, NULL)
2078 
2079 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2080 				    struct device_attribute *attr,
2081 				    char *buf)
2082 {
2083 	struct nvme_subsystem *subsys =
2084 		container_of(dev, struct nvme_subsystem, dev);
2085 
2086 	return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
2087 }
2088 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2089 
2090 #define nvme_subsys_show_str_function(field)				\
2091 static ssize_t subsys_##field##_show(struct device *dev,		\
2092 			    struct device_attribute *attr, char *buf)	\
2093 {									\
2094 	struct nvme_subsystem *subsys =					\
2095 		container_of(dev, struct nvme_subsystem, dev);		\
2096 	return sprintf(buf, "%.*s\n",					\
2097 		       (int)sizeof(subsys->field), subsys->field);	\
2098 }									\
2099 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2100 
2101 nvme_subsys_show_str_function(model);
2102 nvme_subsys_show_str_function(serial);
2103 nvme_subsys_show_str_function(firmware_rev);
2104 
2105 static struct attribute *nvme_subsys_attrs[] = {
2106 	&subsys_attr_model.attr,
2107 	&subsys_attr_serial.attr,
2108 	&subsys_attr_firmware_rev.attr,
2109 	&subsys_attr_subsysnqn.attr,
2110 	NULL,
2111 };
2112 
2113 static struct attribute_group nvme_subsys_attrs_group = {
2114 	.attrs = nvme_subsys_attrs,
2115 };
2116 
2117 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2118 	&nvme_subsys_attrs_group,
2119 	NULL,
2120 };
2121 
2122 static int nvme_active_ctrls(struct nvme_subsystem *subsys)
2123 {
2124 	int count = 0;
2125 	struct nvme_ctrl *ctrl;
2126 
2127 	mutex_lock(&subsys->lock);
2128 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
2129 		if (ctrl->state != NVME_CTRL_DELETING &&
2130 		    ctrl->state != NVME_CTRL_DEAD)
2131 			count++;
2132 	}
2133 	mutex_unlock(&subsys->lock);
2134 
2135 	return count;
2136 }
2137 
2138 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2139 {
2140 	struct nvme_subsystem *subsys, *found;
2141 	int ret;
2142 
2143 	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2144 	if (!subsys)
2145 		return -ENOMEM;
2146 	ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
2147 	if (ret < 0) {
2148 		kfree(subsys);
2149 		return ret;
2150 	}
2151 	subsys->instance = ret;
2152 	mutex_init(&subsys->lock);
2153 	kref_init(&subsys->ref);
2154 	INIT_LIST_HEAD(&subsys->ctrls);
2155 	INIT_LIST_HEAD(&subsys->nsheads);
2156 	nvme_init_subnqn(subsys, ctrl, id);
2157 	memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2158 	memcpy(subsys->model, id->mn, sizeof(subsys->model));
2159 	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2160 	subsys->vendor_id = le16_to_cpu(id->vid);
2161 	subsys->cmic = id->cmic;
2162 
2163 	subsys->dev.class = nvme_subsys_class;
2164 	subsys->dev.release = nvme_release_subsystem;
2165 	subsys->dev.groups = nvme_subsys_attrs_groups;
2166 	dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
2167 	device_initialize(&subsys->dev);
2168 
2169 	mutex_lock(&nvme_subsystems_lock);
2170 	found = __nvme_find_get_subsystem(subsys->subnqn);
2171 	if (found) {
2172 		/*
2173 		 * Verify that the subsystem actually supports multiple
2174 		 * controllers, else bail out.
2175 		 */
2176 		if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
2177 			dev_err(ctrl->device,
2178 				"ignoring ctrl due to duplicate subnqn (%s).\n",
2179 				found->subnqn);
2180 			nvme_put_subsystem(found);
2181 			ret = -EINVAL;
2182 			goto out_unlock;
2183 		}
2184 
2185 		__nvme_release_subsystem(subsys);
2186 		subsys = found;
2187 	} else {
2188 		ret = device_add(&subsys->dev);
2189 		if (ret) {
2190 			dev_err(ctrl->device,
2191 				"failed to register subsystem device.\n");
2192 			goto out_unlock;
2193 		}
2194 		ida_init(&subsys->ns_ida);
2195 		list_add_tail(&subsys->entry, &nvme_subsystems);
2196 	}
2197 
2198 	ctrl->subsys = subsys;
2199 	mutex_unlock(&nvme_subsystems_lock);
2200 
2201 	if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2202 			dev_name(ctrl->device))) {
2203 		dev_err(ctrl->device,
2204 			"failed to create sysfs link from subsystem.\n");
2205 		/* the transport driver will eventually put the subsystem */
2206 		return -EINVAL;
2207 	}
2208 
2209 	mutex_lock(&subsys->lock);
2210 	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2211 	mutex_unlock(&subsys->lock);
2212 
2213 	return 0;
2214 
2215 out_unlock:
2216 	mutex_unlock(&nvme_subsystems_lock);
2217 	put_device(&subsys->dev);
2218 	return ret;
2219 }
2220 
2221 int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
2222 		     u8 log_page, void *log,
2223 		     size_t size, size_t offset)
2224 {
2225 	struct nvme_command c = { };
2226 	unsigned long dwlen = size / 4 - 1;
2227 
2228 	c.get_log_page.opcode = nvme_admin_get_log_page;
2229 
2230 	if (ns)
2231 		c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id);
2232 	else
2233 		c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL);
2234 
2235 	c.get_log_page.lid = log_page;
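	/*
	 * NUMDL/NUMDU carry the lower/upper 16 bits of the transfer length in
	 * dwords minus one; LPOL/LPOU carry the lower/upper 32 bits of the
	 * byte offset into the log page.
	 */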
2236 	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
2237 	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
2238 	c.get_log_page.lpol = cpu_to_le32(offset & ((1ULL << 32) - 1));
2239 	c.get_log_page.lpou = cpu_to_le32(offset >> 32ULL);
2240 
2241 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2242 }
2243 
2244 static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
2245 			size_t size)
2246 {
2247 	return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0);
2248 }
2249 
2250 static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
2251 {
2252 	int ret;
2253 
2254 	if (!ctrl->effects)
2255 		ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
2256 
2257 	if (!ctrl->effects)
2258 		return 0;
2259 
2260 	ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
2261 					sizeof(*ctrl->effects));
2262 	if (ret) {
2263 		kfree(ctrl->effects);
2264 		ctrl->effects = NULL;
2265 	}
2266 	return ret;
2267 }
2268 
2269 /*
2270  * Initialize the cached copies of the Identify data and various controller
2271  * registers in our nvme_ctrl structure.  This should be called as soon as
2272  * the admin queue is fully up and running.
2273  */
2274 int nvme_init_identify(struct nvme_ctrl *ctrl)
2275 {
2276 	struct nvme_id_ctrl *id;
2277 	u64 cap;
2278 	int ret, page_shift;
2279 	u32 max_hw_sectors;
2280 	bool prev_apst_enabled;
2281 
2282 	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
2283 	if (ret) {
2284 		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
2285 		return ret;
2286 	}
2287 
2288 	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
2289 	if (ret) {
2290 		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2291 		return ret;
2292 	}
2293 	page_shift = NVME_CAP_MPSMIN(cap) + 12;
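	/*
	 * CAP.MPSMIN encodes the minimum memory page size as 2^(12 + MPSMIN),
	 * so e.g. MPSMIN == 0 means 4 KiB pages and page_shift == 12.
	 */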
2294 
2295 	if (ctrl->vs >= NVME_VS(1, 1, 0))
2296 		ctrl->subsystem = NVME_CAP_NSSRC(cap);
2297 
2298 	ret = nvme_identify_ctrl(ctrl, &id);
2299 	if (ret) {
2300 		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
2301 		return -EIO;
2302 	}
2303 
2304 	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2305 		ret = nvme_get_effects_log(ctrl);
2306 		if (ret < 0)
2307 			return ret;
2308 	}
2309 
2310 	if (!ctrl->identified) {
2311 		int i;
2312 
2313 		ret = nvme_init_subsystem(ctrl, id);
2314 		if (ret)
2315 			goto out_free;
2316 
2317 		/*
2318 		 * Check for quirks.  Quirks can depend on the firmware version,
2319 		 * so, in principle, the set of quirks present can change
2320 		 * across a reset.  As a possible future enhancement, we
2321 		 * could re-scan for quirks every time we reinitialize
2322 		 * the device, but we'd have to make sure that the driver
2323 		 * behaves intelligently if the quirks change.
2324 		 */
2325 		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
2326 			if (quirk_matches(id, &core_quirks[i]))
2327 				ctrl->quirks |= core_quirks[i].quirks;
2328 		}
2329 	}
2330 
2331 	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
2332 		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
2333 		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
2334 	}
2335 
2336 	ctrl->oacs = le16_to_cpu(id->oacs);
2337 	ctrl->oncs = le16_to_cpup(&id->oncs);
2338 	atomic_set(&ctrl->abort_limit, id->acl + 1);
2339 	ctrl->vwc = id->vwc;
2340 	ctrl->cntlid = le16_to_cpup(&id->cntlid);
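	/*
	 * MDTS is a power of two in units of the minimum page size, e.g.
	 * (illustrative) mdts == 5 with 4 KiB pages caps transfers at
	 * 2^5 * 4 KiB == 128 KiB, i.e. 256 512-byte sectors.
	 */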
2341 	if (id->mdts)
2342 		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
2343 	else
2344 		max_hw_sectors = UINT_MAX;
2345 	ctrl->max_hw_sectors =
2346 		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
2347 
2348 	nvme_set_queue_limits(ctrl, ctrl->admin_q);
2349 	ctrl->sgls = le32_to_cpu(id->sgls);
2350 	ctrl->kas = le16_to_cpu(id->kas);
2351 
2352 	if (id->rtd3e) {
2353 		/* RTD3 Entry Latency is reported in microseconds; convert to seconds */
2354 		u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
2355 
2356 		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
2357 						 shutdown_timeout, 60);
2358 
2359 		if (ctrl->shutdown_timeout != shutdown_timeout)
2360 			dev_info(ctrl->device,
2361 				 "Shutdown timeout set to %u seconds\n",
2362 				 ctrl->shutdown_timeout);
2363 	} else {
2364 		ctrl->shutdown_timeout = shutdown_timeout;
	}
2365 
2366 	ctrl->npss = id->npss;
2367 	ctrl->apsta = id->apsta;
2368 	prev_apst_enabled = ctrl->apst_enabled;
2369 	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
2370 		if (force_apst && id->apsta) {
2371 			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
2372 			ctrl->apst_enabled = true;
2373 		} else {
2374 			ctrl->apst_enabled = false;
2375 		}
2376 	} else {
2377 		ctrl->apst_enabled = id->apsta;
2378 	}
2379 	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
2380 
2381 	if (ctrl->ops->flags & NVME_F_FABRICS) {
2382 		ctrl->icdoff = le16_to_cpu(id->icdoff);
2383 		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
2384 		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
2385 		ctrl->maxcmd = le16_to_cpu(id->maxcmd);
2386 
2387 		/*
2388 		 * In fabrics we need to verify that the cntlid matches the
2389 		 * one returned by the admin connect.
2390 		 */
2391 		if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
2392 			ret = -EINVAL;
2393 			goto out_free;
2394 		}
2395 
2396 		if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
2397 			dev_err(ctrl->device,
2398 				"keep-alive support is mandatory for fabrics\n");
2399 			ret = -EINVAL;
2400 			goto out_free;
2401 		}
2402 	} else {
2403 		ctrl->cntlid = le16_to_cpu(id->cntlid);
2404 		ctrl->hmpre = le32_to_cpu(id->hmpre);
2405 		ctrl->hmmin = le32_to_cpu(id->hmmin);
2406 		ctrl->hmminds = le32_to_cpu(id->hmminds);
2407 		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
2408 	}
2409 
2410 	kfree(id);
2411 
2412 	if (ctrl->apst_enabled && !prev_apst_enabled)
2413 		dev_pm_qos_expose_latency_tolerance(ctrl->device);
2414 	else if (!ctrl->apst_enabled && prev_apst_enabled)
2415 		dev_pm_qos_hide_latency_tolerance(ctrl->device);
2416 
2417 	ret = nvme_configure_apst(ctrl);
2418 	if (ret < 0)
2419 		return ret;
2420 
2421 	ret = nvme_configure_timestamp(ctrl);
2422 	if (ret < 0)
2423 		return ret;
2424 
2425 	ret = nvme_configure_directives(ctrl);
2426 	if (ret < 0)
2427 		return ret;
2428 
2429 	ctrl->identified = true;
2430 
2431 	return 0;
2432 
2433 out_free:
2434 	kfree(id);
2435 	return ret;
2436 }
2437 EXPORT_SYMBOL_GPL(nvme_init_identify);
2438 
2439 static int nvme_dev_open(struct inode *inode, struct file *file)
2440 {
2441 	struct nvme_ctrl *ctrl =
2442 		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
2443 
2444 	switch (ctrl->state) {
2445 	case NVME_CTRL_LIVE:
2446 	case NVME_CTRL_ADMIN_ONLY:
2447 		break;
2448 	default:
2449 		return -EWOULDBLOCK;
2450 	}
2451 
2452 	file->private_data = ctrl;
2453 	return 0;
2454 }
2455 
2456 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
2457 {
2458 	struct nvme_ns *ns;
2459 	int ret;
2460 
2461 	down_read(&ctrl->namespaces_rwsem);
2462 	if (list_empty(&ctrl->namespaces)) {
2463 		ret = -ENOTTY;
2464 		goto out_unlock;
2465 	}
2466 
2467 	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
2468 	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
2469 		dev_warn(ctrl->device,
2470 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
2471 		ret = -EINVAL;
2472 		goto out_unlock;
2473 	}
2474 
2475 	dev_warn(ctrl->device,
2476 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
2477 	kref_get(&ns->kref);
2478 	up_read(&ctrl->namespaces_rwsem);
2479 
2480 	ret = nvme_user_cmd(ctrl, ns, argp);
2481 	nvme_put_ns(ns);
2482 	return ret;
2483 
2484 out_unlock:
2485 	up_read(&ctrl->namespaces_rwsem);
2486 	return ret;
2487 }
2488 
2489 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
2490 		unsigned long arg)
2491 {
2492 	struct nvme_ctrl *ctrl = file->private_data;
2493 	void __user *argp = (void __user *)arg;
2494 
2495 	switch (cmd) {
2496 	case NVME_IOCTL_ADMIN_CMD:
2497 		return nvme_user_cmd(ctrl, NULL, argp);
2498 	case NVME_IOCTL_IO_CMD:
2499 		return nvme_dev_user_cmd(ctrl, argp);
2500 	case NVME_IOCTL_RESET:
2501 		dev_warn(ctrl->device, "resetting controller\n");
2502 		return nvme_reset_ctrl_sync(ctrl);
2503 	case NVME_IOCTL_SUBSYS_RESET:
2504 		return nvme_reset_subsystem(ctrl);
2505 	case NVME_IOCTL_RESCAN:
2506 		nvme_queue_scan(ctrl);
2507 		return 0;
2508 	default:
2509 		return -ENOTTY;
2510 	}
2511 }
2512 
2513 static const struct file_operations nvme_dev_fops = {
2514 	.owner		= THIS_MODULE,
2515 	.open		= nvme_dev_open,
2516 	.unlocked_ioctl	= nvme_dev_ioctl,
2517 	.compat_ioctl	= nvme_dev_ioctl,
2518 };
2519 
2520 static ssize_t nvme_sysfs_reset(struct device *dev,
2521 				struct device_attribute *attr, const char *buf,
2522 				size_t count)
2523 {
2524 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2525 	int ret;
2526 
2527 	ret = nvme_reset_ctrl_sync(ctrl);
2528 	if (ret < 0)
2529 		return ret;
2530 	return count;
2531 }
2532 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
2533 
2534 static ssize_t nvme_sysfs_rescan(struct device *dev,
2535 				struct device_attribute *attr, const char *buf,
2536 				size_t count)
2537 {
2538 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2539 
2540 	nvme_queue_scan(ctrl);
2541 	return count;
2542 }
2543 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
2544 
2545 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
2546 {
2547 	struct gendisk *disk = dev_to_disk(dev);
2548 
2549 	if (disk->fops == &nvme_fops)
2550 		return nvme_get_ns_from_dev(dev)->head;
2551 	else
2552 		return disk->private_data;
2553 }
2554 
2555 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
2556 		char *buf)
2557 {
2558 	struct nvme_ns_head *head = dev_to_ns_head(dev);
2559 	struct nvme_ns_ids *ids = &head->ids;
2560 	struct nvme_subsystem *subsys = head->subsys;
2561 	int serial_len = sizeof(subsys->serial);
2562 	int model_len = sizeof(subsys->model);
2563 
2564 	if (!uuid_is_null(&ids->uuid))
2565 		return sprintf(buf, "uuid.%pU\n", &ids->uuid);
2566 
2567 	if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2568 		return sprintf(buf, "eui.%16phN\n", ids->nguid);
2569 
2570 	if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2571 		return sprintf(buf, "eui.%8phN\n", ids->eui64);
2572 
2573 	while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
2574 				  subsys->serial[serial_len - 1] == '\0'))
2575 		serial_len--;
2576 	while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
2577 				 subsys->model[model_len - 1] == '\0'))
2578 		model_len--;
2579 
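	/*
	 * No UUID/NGUID/EUI-64 available: fall back to a synthetic WWID built
	 * from the vendor id, the trimmed serial and model strings dumped as
	 * hex, and the namespace id.
	 */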
2580 	return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
2581 		serial_len, subsys->serial, model_len, subsys->model,
2582 		head->ns_id);
2583 }
2584 static DEVICE_ATTR_RO(wwid);
2585 
2586 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
2587 		char *buf)
2588 {
2589 	return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
2590 }
2591 static DEVICE_ATTR_RO(nguid);
2592 
2593 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
2594 		char *buf)
2595 {
2596 	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2597 
2598 	/* For backward compatibility, expose the NGUID to userspace if
2599 	 * we have no UUID set.
2600 	 */
2601 	if (uuid_is_null(&ids->uuid)) {
2602 		printk_ratelimited(KERN_WARNING
2603 				   "No UUID available providing old NGUID\n");
2604 		return sprintf(buf, "%pU\n", ids->nguid);
2605 	}
2606 	return sprintf(buf, "%pU\n", &ids->uuid);
2607 }
2608 static DEVICE_ATTR_RO(uuid);
2609 
2610 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
2611 		char *buf)
2612 {
2613 	return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
2614 }
2615 static DEVICE_ATTR_RO(eui);
2616 
2617 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
2618 		char *buf)
2619 {
2620 	return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
2621 }
2622 static DEVICE_ATTR_RO(nsid);
2623 
2624 static struct attribute *nvme_ns_id_attrs[] = {
2625 	&dev_attr_wwid.attr,
2626 	&dev_attr_uuid.attr,
2627 	&dev_attr_nguid.attr,
2628 	&dev_attr_eui.attr,
2629 	&dev_attr_nsid.attr,
2630 	NULL,
2631 };
2632 
2633 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
2634 		struct attribute *a, int n)
2635 {
2636 	struct device *dev = container_of(kobj, struct device, kobj);
2637 	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2638 
2639 	if (a == &dev_attr_uuid.attr) {
2640 		if (uuid_is_null(&ids->uuid) &&
2641 		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2642 			return 0;
2643 	}
2644 	if (a == &dev_attr_nguid.attr) {
2645 		if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2646 			return 0;
2647 	}
2648 	if (a == &dev_attr_eui.attr) {
2649 		if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2650 			return 0;
2651 	}
2652 	return a->mode;
2653 }
2654 
2655 const struct attribute_group nvme_ns_id_attr_group = {
2656 	.attrs		= nvme_ns_id_attrs,
2657 	.is_visible	= nvme_ns_id_attrs_are_visible,
2658 };
2659 
2660 #define nvme_show_str_function(field)						\
2661 static ssize_t  field##_show(struct device *dev,				\
2662 			    struct device_attribute *attr, char *buf)		\
2663 {										\
2664         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
2665         return sprintf(buf, "%.*s\n",						\
2666 		(int)sizeof(ctrl->subsys->field), ctrl->subsys->field);		\
2667 }										\
2668 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2669 
2670 nvme_show_str_function(model);
2671 nvme_show_str_function(serial);
2672 nvme_show_str_function(firmware_rev);
2673 
2674 #define nvme_show_int_function(field)						\
2675 static ssize_t  field##_show(struct device *dev,				\
2676 			    struct device_attribute *attr, char *buf)		\
2677 {										\
2678         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
2679         return sprintf(buf, "%d\n", ctrl->field);	\
2680 }										\
2681 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2682 
2683 nvme_show_int_function(cntlid);
2684 
2685 static ssize_t nvme_sysfs_delete(struct device *dev,
2686 				struct device_attribute *attr, const char *buf,
2687 				size_t count)
2688 {
2689 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2690 
2691 	if (device_remove_file_self(dev, attr))
2692 		nvme_delete_ctrl_sync(ctrl);
2693 	return count;
2694 }
2695 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
2696 
2697 static ssize_t nvme_sysfs_show_transport(struct device *dev,
2698 					 struct device_attribute *attr,
2699 					 char *buf)
2700 {
2701 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2702 
2703 	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
2704 }
2705 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
2706 
2707 static ssize_t nvme_sysfs_show_state(struct device *dev,
2708 				     struct device_attribute *attr,
2709 				     char *buf)
2710 {
2711 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2712 	static const char *const state_name[] = {
2713 		[NVME_CTRL_NEW]		= "new",
2714 		[NVME_CTRL_LIVE]	= "live",
2715 		[NVME_CTRL_ADMIN_ONLY]	= "only-admin",
2716 		[NVME_CTRL_RESETTING]	= "resetting",
2717 		[NVME_CTRL_CONNECTING]	= "connecting",
2718 		[NVME_CTRL_DELETING]	= "deleting",
2719 		[NVME_CTRL_DEAD]	= "dead",
2720 	};
2721 
2722 	if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
2723 	    state_name[ctrl->state])
2724 		return sprintf(buf, "%s\n", state_name[ctrl->state]);
2725 
2726 	return sprintf(buf, "unknown state\n");
2727 }
2728 
2729 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
2730 
2731 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
2732 					 struct device_attribute *attr,
2733 					 char *buf)
2734 {
2735 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2736 
2737 	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
2738 }
2739 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
2740 
2741 static ssize_t nvme_sysfs_show_address(struct device *dev,
2742 					 struct device_attribute *attr,
2743 					 char *buf)
2744 {
2745 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2746 
2747 	return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
2748 }
2749 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
2750 
2751 static struct attribute *nvme_dev_attrs[] = {
2752 	&dev_attr_reset_controller.attr,
2753 	&dev_attr_rescan_controller.attr,
2754 	&dev_attr_model.attr,
2755 	&dev_attr_serial.attr,
2756 	&dev_attr_firmware_rev.attr,
2757 	&dev_attr_cntlid.attr,
2758 	&dev_attr_delete_controller.attr,
2759 	&dev_attr_transport.attr,
2760 	&dev_attr_subsysnqn.attr,
2761 	&dev_attr_address.attr,
2762 	&dev_attr_state.attr,
2763 	NULL
2764 };
2765 
2766 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
2767 		struct attribute *a, int n)
2768 {
2769 	struct device *dev = container_of(kobj, struct device, kobj);
2770 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2771 
2772 	if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
2773 		return 0;
2774 	if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
2775 		return 0;
2776 
2777 	return a->mode;
2778 }
2779 
2780 static struct attribute_group nvme_dev_attrs_group = {
2781 	.attrs		= nvme_dev_attrs,
2782 	.is_visible	= nvme_dev_attrs_are_visible,
2783 };
2784 
2785 static const struct attribute_group *nvme_dev_attr_groups[] = {
2786 	&nvme_dev_attrs_group,
2787 	NULL,
2788 };
2789 
2790 static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
2791 		unsigned nsid)
2792 {
2793 	struct nvme_ns_head *h;
2794 
2795 	lockdep_assert_held(&subsys->lock);
2796 
2797 	list_for_each_entry(h, &subsys->nsheads, entry) {
2798 		if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
2799 			return h;
2800 	}
2801 
2802 	return NULL;
2803 }
2804 
2805 static int __nvme_check_ids(struct nvme_subsystem *subsys,
2806 		struct nvme_ns_head *new)
2807 {
2808 	struct nvme_ns_head *h;
2809 
2810 	lockdep_assert_held(&subsys->lock);
2811 
2812 	list_for_each_entry(h, &subsys->nsheads, entry) {
2813 		if (nvme_ns_ids_valid(&new->ids) &&
2814 		    !list_empty(&h->list) &&
2815 		    nvme_ns_ids_equal(&new->ids, &h->ids))
2816 			return -EINVAL;
2817 	}
2818 
2819 	return 0;
2820 }
2821 
2822 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
2823 		unsigned nsid, struct nvme_id_ns *id)
2824 {
2825 	struct nvme_ns_head *head;
2826 	int ret = -ENOMEM;
2827 
2828 	head = kzalloc(sizeof(*head), GFP_KERNEL);
2829 	if (!head)
2830 		goto out;
2831 	ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
2832 	if (ret < 0)
2833 		goto out_free_head;
2834 	head->instance = ret;
2835 	INIT_LIST_HEAD(&head->list);
2836 	init_srcu_struct(&head->srcu);
2837 	head->subsys = ctrl->subsys;
2838 	head->ns_id = nsid;
2839 	kref_init(&head->ref);
2840 
2841 	nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
2842 
2843 	ret = __nvme_check_ids(ctrl->subsys, head);
2844 	if (ret) {
2845 		dev_err(ctrl->device,
2846 			"duplicate IDs for nsid %d\n", nsid);
2847 		goto out_cleanup_srcu;
2848 	}
2849 
2850 	ret = nvme_mpath_alloc_disk(ctrl, head);
2851 	if (ret)
2852 		goto out_cleanup_srcu;
2853 
2854 	list_add_tail(&head->entry, &ctrl->subsys->nsheads);
2855 	return head;
2856 out_cleanup_srcu:
2857 	cleanup_srcu_struct(&head->srcu);
2858 	ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
2859 out_free_head:
2860 	kfree(head);
2861 out:
2862 	return ERR_PTR(ret);
2863 }
2864 
2865 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
2866 		struct nvme_id_ns *id)
2867 {
2868 	struct nvme_ctrl *ctrl = ns->ctrl;
2869 	bool is_shared = id->nmic & (1 << 0);
2870 	struct nvme_ns_head *head = NULL;
2871 	int ret = 0;
2872 
2873 	mutex_lock(&ctrl->subsys->lock);
2874 	if (is_shared)
2875 		head = __nvme_find_ns_head(ctrl->subsys, nsid);
2876 	if (!head) {
2877 		head = nvme_alloc_ns_head(ctrl, nsid, id);
2878 		if (IS_ERR(head)) {
2879 			ret = PTR_ERR(head);
2880 			goto out_unlock;
2881 		}
2882 	} else {
2883 		struct nvme_ns_ids ids;
2884 
2885 		nvme_report_ns_ids(ctrl, nsid, id, &ids);
2886 		if (!nvme_ns_ids_equal(&head->ids, &ids)) {
2887 			dev_err(ctrl->device,
2888 				"IDs don't match for shared namespace %d\n",
2889 					nsid);
2890 			ret = -EINVAL;
2891 			goto out_unlock;
2892 		}
2893 	}
2894 
2895 	list_add_tail(&ns->siblings, &head->list);
2896 	ns->head = head;
2897 
2898 out_unlock:
2899 	mutex_unlock(&ctrl->subsys->lock);
2900 	return ret;
2901 }
2902 
2903 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
2904 {
2905 	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
2906 	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
2907 
2908 	return nsa->head->ns_id - nsb->head->ns_id;
2909 }
2910 
2911 static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2912 {
2913 	struct nvme_ns *ns, *ret = NULL;
2914 
2915 	down_read(&ctrl->namespaces_rwsem);
2916 	list_for_each_entry(ns, &ctrl->namespaces, list) {
2917 		if (ns->head->ns_id == nsid) {
2918 			if (!kref_get_unless_zero(&ns->kref))
2919 				continue;
2920 			ret = ns;
2921 			break;
2922 		}
2923 		if (ns->head->ns_id > nsid)
2924 			break;
2925 	}
2926 	up_read(&ctrl->namespaces_rwsem);
2927 	return ret;
2928 }
2929 
2930 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
2931 {
2932 	struct streams_directive_params s;
2933 	int ret;
2934 
2935 	if (!ctrl->nr_streams)
2936 		return 0;
2937 
2938 	ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
2939 	if (ret)
2940 		return ret;
2941 
2942 	ns->sws = le32_to_cpu(s.sws);
2943 	ns->sgs = le16_to_cpu(s.sgs);
2944 
2945 	if (ns->sws) {
2946 		unsigned int bs = 1 << ns->lba_shift;
2947 
2948 		blk_queue_io_min(ns->queue, bs * ns->sws);
2949 		if (ns->sgs)
2950 			blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
2951 	}
2952 
2953 	return 0;
2954 }
2955 
2956 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2957 {
2958 	struct nvme_ns *ns;
2959 	struct gendisk *disk;
2960 	struct nvme_id_ns *id;
2961 	char disk_name[DISK_NAME_LEN];
2962 	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
2963 
2964 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2965 	if (!ns)
2966 		return;
2967 
2968 	ns->queue = blk_mq_init_queue(ctrl->tagset);
2969 	if (IS_ERR(ns->queue))
2970 		goto out_free_ns;
2971 	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
2972 	ns->queue->queuedata = ns;
2973 	ns->ctrl = ctrl;
2974 
2975 	kref_init(&ns->kref);
2976 	ns->lba_shift = 9; /* default to 512-byte blocks until the disk is validated */
2977 
2978 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2979 	nvme_set_queue_limits(ctrl, ns->queue);
2980 
2981 	id = nvme_identify_ns(ctrl, nsid);
2982 	if (!id)
2983 		goto out_free_queue;
2984 
2985 	if (id->ncap == 0)
2986 		goto out_free_id;
2987 
2988 	if (nvme_init_ns_head(ns, nsid, id))
2989 		goto out_free_id;
2990 	nvme_setup_streams_ns(ctrl, ns);
2991 
2992 #ifdef CONFIG_NVME_MULTIPATH
2993 	/*
2994 	 * If multipathing is enabled we need to always use the subsystem
2995 	 * instance number for numbering our devices to avoid conflicts
2996 	 * between subsystems that have multiple controllers and thus use
2997 	 * the multipath-aware subsystem node and those that have a single
2998 	 * controller and use the controller node directly.
2999 	 */
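	/*
	 * E.g. (illustrative): the shared multipath node is named nvme0n1
	 * while each per-controller path is exposed as a hidden nvme0cXn1
	 * device, where X is the controller's cntlid.
	 */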
3000 	if (ns->head->disk) {
3001 		sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
3002 				ctrl->cntlid, ns->head->instance);
3003 		flags = GENHD_FL_HIDDEN;
3004 	} else {
3005 		sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
3006 				ns->head->instance);
3007 	}
3008 #else
3009 	/*
3010 	 * But without the multipath code enabled, multiple controllers per
3011 	 * subsystem are visible as separate devices and thus we cannot use the
3012 	 * subsystem instance.
3013 	 */
3014 	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
3015 #endif
3016 
3017 	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3018 		if (nvme_nvm_register(ns, disk_name, node)) {
3019 			dev_warn(ctrl->device, "LightNVM init failure\n");
3020 			goto out_unlink_ns;
3021 		}
3022 	}
3023 
3024 	disk = alloc_disk_node(0, node);
3025 	if (!disk)
3026 		goto out_unlink_ns;
3027 
3028 	disk->fops = &nvme_fops;
3029 	disk->private_data = ns;
3030 	disk->queue = ns->queue;
3031 	disk->flags = flags;
3032 	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
3033 	ns->disk = disk;
3034 
3035 	__nvme_revalidate_disk(disk, id);
3036 
3037 	down_write(&ctrl->namespaces_rwsem);
3038 	list_add_tail(&ns->list, &ctrl->namespaces);
3039 	up_write(&ctrl->namespaces_rwsem);
3040 
3041 	nvme_get_ctrl(ctrl);
3042 
3043 	kfree(id);
3044 
3045 	device_add_disk(ctrl->device, ns->disk);
3046 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
3047 					&nvme_ns_id_attr_group))
3048 		pr_warn("%s: failed to create sysfs group for identification\n",
3049 			ns->disk->disk_name);
3050 	if (ns->ndev && nvme_nvm_register_sysfs(ns))
3051 		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
3052 			ns->disk->disk_name);
3053 
3054 	nvme_mpath_add_disk(ns->head);
3055 	nvme_fault_inject_init(ns);
3056 	return;
3057  out_unlink_ns:
3058 	mutex_lock(&ctrl->subsys->lock);
3059 	list_del_rcu(&ns->siblings);
3060 	mutex_unlock(&ctrl->subsys->lock);
3061  out_free_id:
3062 	kfree(id);
3063  out_free_queue:
3064 	blk_cleanup_queue(ns->queue);
3065  out_free_ns:
3066 	kfree(ns);
3067 }
3068 
3069 static void nvme_ns_remove(struct nvme_ns *ns)
3070 {
3071 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3072 		return;
3073 
3074 	nvme_fault_inject_fini(ns);
3075 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
3076 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
3077 					&nvme_ns_id_attr_group);
3078 		if (ns->ndev)
3079 			nvme_nvm_unregister_sysfs(ns);
3080 		del_gendisk(ns->disk);
3081 		blk_cleanup_queue(ns->queue);
3082 		if (blk_get_integrity(ns->disk))
3083 			blk_integrity_unregister(ns->disk);
3084 	}
3085 
3086 	mutex_lock(&ns->ctrl->subsys->lock);
3087 	nvme_mpath_clear_current_path(ns);
3088 	list_del_rcu(&ns->siblings);
3089 	mutex_unlock(&ns->ctrl->subsys->lock);
3090 
3091 	down_write(&ns->ctrl->namespaces_rwsem);
3092 	list_del_init(&ns->list);
3093 	up_write(&ns->ctrl->namespaces_rwsem);
3094 
3095 	synchronize_srcu(&ns->head->srcu);
3096 	nvme_mpath_check_last_path(ns);
3097 	nvme_put_ns(ns);
3098 }
3099 
3100 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3101 {
3102 	struct nvme_ns *ns;
3103 
3104 	ns = nvme_find_get_ns(ctrl, nsid);
3105 	if (ns) {
3106 		if (ns->disk && revalidate_disk(ns->disk))
3107 			nvme_ns_remove(ns);
3108 		nvme_put_ns(ns);
3109 	} else {
3110 		nvme_alloc_ns(ctrl, nsid);
	}
3111 }
3112 
3113 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
3114 					unsigned nsid)
3115 {
3116 	struct nvme_ns *ns, *next;
3117 	LIST_HEAD(rm_list);
3118 
3119 	down_write(&ctrl->namespaces_rwsem);
3120 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
3121 		if (ns->head->ns_id > nsid)
3122 			list_move_tail(&ns->list, &rm_list);
3123 	}
3124 	up_write(&ctrl->namespaces_rwsem);
3125 
3126 	list_for_each_entry_safe(ns, next, &rm_list, list)
3127 		nvme_ns_remove(ns);
3128 
3129 }
3130 
3131 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
3132 {
3133 	struct nvme_ns *ns;
3134 	__le32 *ns_list;
3135 	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
3136 	int ret = 0;
3137 
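	/*
	 * Identify Namespace List (CNS 02h) returns up to 1024 active NSIDs
	 * greater than the NSID passed in, so walk the namespace space one
	 * 4 KiB list at a time and prune anything that no longer appears.
	 */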
3138 	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
3139 	if (!ns_list)
3140 		return -ENOMEM;
3141 
3142 	for (i = 0; i < num_lists; i++) {
3143 		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
3144 		if (ret)
3145 			goto free;
3146 
3147 		for (j = 0; j < min(nn, 1024U); j++) {
3148 			nsid = le32_to_cpu(ns_list[j]);
3149 			if (!nsid)
3150 				goto out;
3151 
3152 			nvme_validate_ns(ctrl, nsid);
3153 
3154 			while (++prev < nsid) {
3155 				ns = nvme_find_get_ns(ctrl, prev);
3156 				if (ns) {
3157 					nvme_ns_remove(ns);
3158 					nvme_put_ns(ns);
3159 				}
3160 			}
3161 		}
3162 		nn -= j;
3163 	}
3164  out:
3165 	nvme_remove_invalid_namespaces(ctrl, prev);
3166  free:
3167 	kfree(ns_list);
3168 	return ret;
3169 }
3170 
3171 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
3172 {
3173 	unsigned i;
3174 
3175 	for (i = 1; i <= nn; i++)
3176 		nvme_validate_ns(ctrl, i);
3177 
3178 	nvme_remove_invalid_namespaces(ctrl, nn);
3179 }
3180 
3181 static void nvme_scan_work(struct work_struct *work)
3182 {
3183 	struct nvme_ctrl *ctrl =
3184 		container_of(work, struct nvme_ctrl, scan_work);
3185 	struct nvme_id_ctrl *id;
3186 	unsigned nn;
3187 
3188 	if (ctrl->state != NVME_CTRL_LIVE)
3189 		return;
3190 
3191 	WARN_ON_ONCE(!ctrl->tagset);
3192 
3193 	if (nvme_identify_ctrl(ctrl, &id))
3194 		return;
3195 
3196 	nn = le32_to_cpu(id->nn);
3197 	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
3198 	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
3199 		if (!nvme_scan_ns_list(ctrl, nn))
3200 			goto done;
3201 	}
3202 	nvme_scan_ns_sequential(ctrl, nn);
3203  done:
3204 	down_write(&ctrl->namespaces_rwsem);
3205 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
3206 	up_write(&ctrl->namespaces_rwsem);
3207 	kfree(id);
3208 }
3209 
3210 void nvme_queue_scan(struct nvme_ctrl *ctrl)
3211 {
3212 	/*
3213 	 * Only queue new scan work when the admin and IO queues are both alive
3214 	 */
3215 	if (ctrl->state == NVME_CTRL_LIVE)
3216 		queue_work(nvme_wq, &ctrl->scan_work);
3217 }
3218 EXPORT_SYMBOL_GPL(nvme_queue_scan);
3219 
3220 /*
3221  * This function iterates the namespace list unlocked to allow recovery from
3222  * controller failure. It is up to the caller to ensure the namespace list is
3223  * not modified by scan work while this function is executing.
3224  */
3225 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
3226 {
3227 	struct nvme_ns *ns, *next;
3228 	LIST_HEAD(ns_list);
3229 
3230 	/*
3231 	 * The dead state indicates the controller was not gracefully
3232 	 * disconnected. In that case, we won't be able to flush any data while
3233 	 * removing the namespaces' disks; fail all the queues now to avoid
3234 	 * potentially having to clean up the failed sync later.
3235 	 */
3236 	if (ctrl->state == NVME_CTRL_DEAD)
3237 		nvme_kill_queues(ctrl);
3238 
3239 	down_write(&ctrl->namespaces_rwsem);
3240 	list_splice_init(&ctrl->namespaces, &ns_list);
3241 	up_write(&ctrl->namespaces_rwsem);
3242 
3243 	list_for_each_entry_safe(ns, next, &ns_list, list)
3244 		nvme_ns_remove(ns);
3245 }
3246 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
3247 
3248 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
3249 {
3250 	char *envp[2] = { NULL, NULL };
3251 	u32 aen_result = ctrl->aen_result;
3252 
3253 	ctrl->aen_result = 0;
3254 	if (!aen_result)
3255 		return;
3256 
3257 	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
3258 	if (!envp[0])
3259 		return;
3260 	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
3261 	kfree(envp[0]);
3262 }
3263 
3264 static void nvme_async_event_work(struct work_struct *work)
3265 {
3266 	struct nvme_ctrl *ctrl =
3267 		container_of(work, struct nvme_ctrl, async_event_work);
3268 
3269 	nvme_aen_uevent(ctrl);
3270 	ctrl->ops->submit_async_event(ctrl);
3271 }
3272 
3273 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
3274 {
3275 
3276 	u32 csts;
3277 
3278 	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
3279 		return false;
3280 
3281 	if (csts == ~0)
3282 		return false;
3283 
3284 	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
3285 }
3286 
3287 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
3288 {
3289 	struct nvme_fw_slot_info_log *log;
3290 
3291 	log = kmalloc(sizeof(*log), GFP_KERNEL);
3292 	if (!log)
3293 		return;
3294 
3295 	if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
3296 		dev_warn(ctrl->device,
3297 				"Get FW SLOT INFO log error\n");
3298 	kfree(log);
3299 }
3300 
3301 static void nvme_fw_act_work(struct work_struct *work)
3302 {
3303 	struct nvme_ctrl *ctrl = container_of(work,
3304 				struct nvme_ctrl, fw_act_work);
3305 	unsigned long fw_act_timeout;
3306 
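	/*
	 * MTFA (Maximum Time for Firmware Activation) is reported in units of
	 * 100 ms; if the controller does not report one, fall back to the
	 * admin command timeout.
	 */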
3307 	if (ctrl->mtfa)
3308 		fw_act_timeout = jiffies +
3309 				msecs_to_jiffies(ctrl->mtfa * 100);
3310 	else
3311 		fw_act_timeout = jiffies +
3312 				msecs_to_jiffies(admin_timeout * 1000);
3313 
3314 	nvme_stop_queues(ctrl);
3315 	while (nvme_ctrl_pp_status(ctrl)) {
3316 		if (time_after(jiffies, fw_act_timeout)) {
3317 			dev_warn(ctrl->device,
3318 				"Fw activation timeout, reset controller\n");
3319 			nvme_reset_ctrl(ctrl);
3320 			break;
3321 		}
3322 		msleep(100);
3323 	}
3324 
3325 	if (ctrl->state != NVME_CTRL_LIVE)
3326 		return;
3327 
3328 	nvme_start_queues(ctrl);
3329 	/* read FW slot information to clear the AER */
3330 	nvme_get_fw_slot_info(ctrl);
3331 }
3332 
3333 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
3334 		union nvme_result *res)
3335 {
3336 	u32 result = le32_to_cpu(res->u32);
3337 
3338 	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
3339 		return;
3340 
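	/*
	 * Completion dword 0 layout for async events: bits 02:00 are the
	 * event type and bits 15:08 the event information; the second switch
	 * below matches type and information together.
	 */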
3341 	switch (result & 0x7) {
3342 	case NVME_AER_ERROR:
3343 	case NVME_AER_SMART:
3344 	case NVME_AER_CSS:
3345 	case NVME_AER_VS:
3346 		ctrl->aen_result = result;
3347 		break;
3348 	default:
3349 		break;
3350 	}
3351 
3352 	switch (result & 0xff07) {
3353 	case NVME_AER_NOTICE_NS_CHANGED:
3354 		dev_info(ctrl->device, "rescanning\n");
3355 		nvme_queue_scan(ctrl);
3356 		break;
3357 	case NVME_AER_NOTICE_FW_ACT_STARTING:
3358 		queue_work(nvme_wq, &ctrl->fw_act_work);
3359 		break;
3360 	default:
3361 		dev_warn(ctrl->device, "async event result %08x\n", result);
3362 	}
3363 	queue_work(nvme_wq, &ctrl->async_event_work);
3364 }
3365 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
3366 
3367 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
3368 {
3369 	nvme_stop_keep_alive(ctrl);
3370 	flush_work(&ctrl->async_event_work);
3371 	flush_work(&ctrl->scan_work);
3372 	cancel_work_sync(&ctrl->fw_act_work);
3373 	if (ctrl->ops->stop_ctrl)
3374 		ctrl->ops->stop_ctrl(ctrl);
3375 }
3376 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
3377 
3378 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
3379 {
3380 	if (ctrl->kato)
3381 		nvme_start_keep_alive(ctrl);
3382 
3383 	if (ctrl->queue_count > 1) {
3384 		nvme_queue_scan(ctrl);
3385 		queue_work(nvme_wq, &ctrl->async_event_work);
3386 		nvme_start_queues(ctrl);
3387 	}
3388 }
3389 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
3390 
3391 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
3392 {
3393 	cdev_device_del(&ctrl->cdev, ctrl->device);
3394 }
3395 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
3396 
3397 static void nvme_free_ctrl(struct device *dev)
3398 {
3399 	struct nvme_ctrl *ctrl =
3400 		container_of(dev, struct nvme_ctrl, ctrl_device);
3401 	struct nvme_subsystem *subsys = ctrl->subsys;
3402 
3403 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3404 	kfree(ctrl->effects);
3405 
3406 	if (subsys) {
3407 		mutex_lock(&subsys->lock);
3408 		list_del(&ctrl->subsys_entry);
3409 		mutex_unlock(&subsys->lock);
3410 		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
3411 	}
3412 
3413 	ctrl->ops->free_ctrl(ctrl);
3414 
3415 	if (subsys)
3416 		nvme_put_subsystem(subsys);
3417 }
3418 
3419 /*
3420  * Initialize an NVMe controller structure.  This needs to be called during
3421  * the earliest initialization so that we have the initialized structure around
3422  * during probing.
3423  */
3424 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
3425 		const struct nvme_ctrl_ops *ops, unsigned long quirks)
3426 {
3427 	int ret;
3428 
3429 	ctrl->state = NVME_CTRL_NEW;
3430 	spin_lock_init(&ctrl->lock);
3431 	INIT_LIST_HEAD(&ctrl->namespaces);
3432 	init_rwsem(&ctrl->namespaces_rwsem);
3433 	ctrl->dev = dev;
3434 	ctrl->ops = ops;
3435 	ctrl->quirks = quirks;
3436 	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
3437 	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
3438 	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
3439 	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
3440 
3441 	ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
3442 	if (ret < 0)
3443 		goto out;
3444 	ctrl->instance = ret;
3445 
3446 	device_initialize(&ctrl->ctrl_device);
3447 	ctrl->device = &ctrl->ctrl_device;
3448 	ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
3449 	ctrl->device->class = nvme_class;
3450 	ctrl->device->parent = ctrl->dev;
3451 	ctrl->device->groups = nvme_dev_attr_groups;
3452 	ctrl->device->release = nvme_free_ctrl;
3453 	dev_set_drvdata(ctrl->device, ctrl);
3454 	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
3455 	if (ret)
3456 		goto out_release_instance;
3457 
3458 	cdev_init(&ctrl->cdev, &nvme_dev_fops);
3459 	ctrl->cdev.owner = ops->module;
3460 	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
3461 	if (ret)
3462 		goto out_free_name;
3463 
3464 	/*
3465 	 * Initialize latency tolerance controls.  The sysfs files won't
3466 	 * be visible to userspace unless the device actually supports APST.
3467 	 */
3468 	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
3469 	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
3470 		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
3471 
3472 	return 0;
3473 out_free_name:
3474 	kfree_const(dev->kobj.name);
3475 out_release_instance:
3476 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3477 out:
3478 	return ret;
3479 }
3480 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
3481 
3482 /**
3483  * nvme_kill_queues - Ends all namespace queues
3484  * @ctrl: the dead controller that needs to end
3485  *
3486  * Call this function when the driver determines it is unable to get the
3487  * controller in a state capable of servicing IO.
3488  */
3489 void nvme_kill_queues(struct nvme_ctrl *ctrl)
3490 {
3491 	struct nvme_ns *ns;
3492 
3493 	down_read(&ctrl->namespaces_rwsem);
3494 
3495 	/* Forcibly unquiesce queues to avoid blocking dispatch */
3496 	if (ctrl->admin_q)
3497 		blk_mq_unquiesce_queue(ctrl->admin_q);
3498 
3499 	list_for_each_entry(ns, &ctrl->namespaces, list) {
3500 		/*
3501 		 * Revalidating a dead namespace sets capacity to 0. This will
3502 		 * end buffered writers dirtying pages that can't be synced.
3503 		 */
3504 		if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
3505 			continue;
3506 		revalidate_disk(ns->disk);
3507 		blk_set_queue_dying(ns->queue);
3508 
3509 		/* Forcibly unquiesce queues to avoid blocking dispatch */
3510 		blk_mq_unquiesce_queue(ns->queue);
3511 	}
3512 	up_read(&ctrl->namespaces_rwsem);
3513 }
3514 EXPORT_SYMBOL_GPL(nvme_kill_queues);
3515 
3516 void nvme_unfreeze(struct nvme_ctrl *ctrl)
3517 {
3518 	struct nvme_ns *ns;
3519 
3520 	down_read(&ctrl->namespaces_rwsem);
3521 	list_for_each_entry(ns, &ctrl->namespaces, list)
3522 		blk_mq_unfreeze_queue(ns->queue);
3523 	up_read(&ctrl->namespaces_rwsem);
3524 }
3525 EXPORT_SYMBOL_GPL(nvme_unfreeze);
3526 
3527 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
3528 {
3529 	struct nvme_ns *ns;
3530 
3531 	down_read(&ctrl->namespaces_rwsem);
3532 	list_for_each_entry(ns, &ctrl->namespaces, list) {
3533 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
3534 		if (timeout <= 0)
3535 			break;
3536 	}
3537 	up_read(&ctrl->namespaces_rwsem);
3538 }
3539 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
3540 
3541 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
3542 {
3543 	struct nvme_ns *ns;
3544 
3545 	down_read(&ctrl->namespaces_rwsem);
3546 	list_for_each_entry(ns, &ctrl->namespaces, list)
3547 		blk_mq_freeze_queue_wait(ns->queue);
3548 	up_read(&ctrl->namespaces_rwsem);
3549 }
3550 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
3551 
3552 void nvme_start_freeze(struct nvme_ctrl *ctrl)
3553 {
3554 	struct nvme_ns *ns;
3555 
3556 	down_read(&ctrl->namespaces_rwsem);
3557 	list_for_each_entry(ns, &ctrl->namespaces, list)
3558 		blk_freeze_queue_start(ns->queue);
3559 	up_read(&ctrl->namespaces_rwsem);
3560 }
3561 EXPORT_SYMBOL_GPL(nvme_start_freeze);
3562 
3563 void nvme_stop_queues(struct nvme_ctrl *ctrl)
3564 {
3565 	struct nvme_ns *ns;
3566 
3567 	down_read(&ctrl->namespaces_rwsem);
3568 	list_for_each_entry(ns, &ctrl->namespaces, list)
3569 		blk_mq_quiesce_queue(ns->queue);
3570 	up_read(&ctrl->namespaces_rwsem);
3571 }
3572 EXPORT_SYMBOL_GPL(nvme_stop_queues);
3573 
3574 void nvme_start_queues(struct nvme_ctrl *ctrl)
3575 {
3576 	struct nvme_ns *ns;
3577 
3578 	down_read(&ctrl->namespaces_rwsem);
3579 	list_for_each_entry(ns, &ctrl->namespaces, list)
3580 		blk_mq_unquiesce_queue(ns->queue);
3581 	up_read(&ctrl->namespaces_rwsem);
3582 }
3583 EXPORT_SYMBOL_GPL(nvme_start_queues);
3584 
3585 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
3586 {
3587 	if (!ctrl->ops->reinit_request)
3588 		return 0;
3589 
3590 	return blk_mq_tagset_iter(set, set->driver_data,
3591 			ctrl->ops->reinit_request);
3592 }
3593 EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
3594 
3595 int __init nvme_core_init(void)
3596 {
3597 	int result = -ENOMEM;
3598 
3599 	nvme_wq = alloc_workqueue("nvme-wq",
3600 			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3601 	if (!nvme_wq)
3602 		goto out;
3603 
3604 	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
3605 			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3606 	if (!nvme_reset_wq)
3607 		goto destroy_wq;
3608 
3609 	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
3610 			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3611 	if (!nvme_delete_wq)
3612 		goto destroy_reset_wq;
3613 
3614 	result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
3615 	if (result < 0)
3616 		goto destroy_delete_wq;
3617 
3618 	nvme_class = class_create(THIS_MODULE, "nvme");
3619 	if (IS_ERR(nvme_class)) {
3620 		result = PTR_ERR(nvme_class);
3621 		goto unregister_chrdev;
3622 	}
3623 
3624 	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
3625 	if (IS_ERR(nvme_subsys_class)) {
3626 		result = PTR_ERR(nvme_subsys_class);
3627 		goto destroy_class;
3628 	}
3629 	return 0;
3630 
3631 destroy_class:
3632 	class_destroy(nvme_class);
3633 unregister_chrdev:
3634 	unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3635 destroy_delete_wq:
3636 	destroy_workqueue(nvme_delete_wq);
3637 destroy_reset_wq:
3638 	destroy_workqueue(nvme_reset_wq);
3639 destroy_wq:
3640 	destroy_workqueue(nvme_wq);
3641 out:
3642 	return result;
3643 }
3644 
3645 void nvme_core_exit(void)
3646 {
3647 	ida_destroy(&nvme_subsystems_ida);
3648 	class_destroy(nvme_subsys_class);
3649 	class_destroy(nvme_class);
3650 	unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3651 	destroy_workqueue(nvme_delete_wq);
3652 	destroy_workqueue(nvme_reset_wq);
3653 	destroy_workqueue(nvme_wq);
3654 }
3655 
3656 MODULE_LICENSE("GPL");
3657 MODULE_VERSION("1.0");
3658 module_init(nvme_core_init);
3659 module_exit(nvme_core_exit);
3660