xref: /openbmc/linux/drivers/nvme/host/multipath.c (revision ecc23d0a422a3118fcf6e4f0a46e17a6c2047b02)
1bc50ad75SChristoph Hellwig // SPDX-License-Identifier: GPL-2.0
232acab31SChristoph Hellwig /*
30d0b660fSChristoph Hellwig  * Copyright (c) 2017-2018 Christoph Hellwig.
432acab31SChristoph Hellwig  */
532acab31SChristoph Hellwig 
6b2ce4d90SKeith Busch #include <linux/backing-dev.h>
732acab31SChristoph Hellwig #include <linux/moduleparam.h>
85e6a7d1eSHannes Reinecke #include <linux/vmalloc.h>
92796b569SHannes Reinecke #include <trace/events/block.h>
1032acab31SChristoph Hellwig #include "nvme.h"
1132acab31SChristoph Hellwig 
12b739e137SChristoph Hellwig bool multipath = true;
135cadde80SKeith Busch module_param(multipath, bool, 0444);
1432acab31SChristoph Hellwig MODULE_PARM_DESC(multipath,
1532acab31SChristoph Hellwig 	"turn on native support for multiple controllers per subsystem");
1632acab31SChristoph Hellwig 
17e3d34794SHannes Reinecke static const char *nvme_iopolicy_names[] = {
18e3d34794SHannes Reinecke 	[NVME_IOPOLICY_NUMA]	= "numa",
19e3d34794SHannes Reinecke 	[NVME_IOPOLICY_RR]	= "round-robin",
2085b9f3e6SThomas Song 	[NVME_IOPOLICY_QD]      = "queue-depth",
21e3d34794SHannes Reinecke };
22e3d34794SHannes Reinecke 
23e3d34794SHannes Reinecke static int iopolicy = NVME_IOPOLICY_NUMA;
24e3d34794SHannes Reinecke 
25e3d34794SHannes Reinecke static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
26e3d34794SHannes Reinecke {
27e3d34794SHannes Reinecke 	if (!val)
28e3d34794SHannes Reinecke 		return -EINVAL;
29e3d34794SHannes Reinecke 	if (!strncmp(val, "numa", 4))
30e3d34794SHannes Reinecke 		iopolicy = NVME_IOPOLICY_NUMA;
31e3d34794SHannes Reinecke 	else if (!strncmp(val, "round-robin", 11))
32e3d34794SHannes Reinecke 		iopolicy = NVME_IOPOLICY_RR;
3385b9f3e6SThomas Song 	else if (!strncmp(val, "queue-depth", 11))
3485b9f3e6SThomas Song 		iopolicy = NVME_IOPOLICY_QD;
35e3d34794SHannes Reinecke 	else
36e3d34794SHannes Reinecke 		return -EINVAL;
37e3d34794SHannes Reinecke 
38e3d34794SHannes Reinecke 	return 0;
39e3d34794SHannes Reinecke }
40e3d34794SHannes Reinecke 
41e3d34794SHannes Reinecke static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
42e3d34794SHannes Reinecke {
43e3d34794SHannes Reinecke 	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
44e3d34794SHannes Reinecke }
45e3d34794SHannes Reinecke 
46e3d34794SHannes Reinecke module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
47e3d34794SHannes Reinecke 	&iopolicy, 0644);
48e3d34794SHannes Reinecke MODULE_PARM_DESC(iopolicy,
4985b9f3e6SThomas Song 	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
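/*
 * Illustrative usage (not part of this file): since multipath.c is built into
 * nvme-core, the default policy is typically changed via the module parameter,
 * e.g. "echo queue-depth > /sys/module/nvme_core/parameters/iopolicy", and per
 * subsystem through the iopolicy sysfs attribute defined further below.
 */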
50e3d34794SHannes Reinecke 
51e3d34794SHannes Reinecke void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
52e3d34794SHannes Reinecke {
53e3d34794SHannes Reinecke 	subsys->iopolicy = iopolicy;
54e3d34794SHannes Reinecke }
55e3d34794SHannes Reinecke 
56b9156daeSSagi Grimberg void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
57b9156daeSSagi Grimberg {
58b9156daeSSagi Grimberg 	struct nvme_ns_head *h;
59b9156daeSSagi Grimberg 
60b9156daeSSagi Grimberg 	lockdep_assert_held(&subsys->lock);
61b9156daeSSagi Grimberg 	list_for_each_entry(h, &subsys->nsheads, entry)
62b9156daeSSagi Grimberg 		if (h->disk)
63b9156daeSSagi Grimberg 			blk_mq_unfreeze_queue(h->disk->queue);
64b9156daeSSagi Grimberg }
65b9156daeSSagi Grimberg 
66b9156daeSSagi Grimberg void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
67b9156daeSSagi Grimberg {
68b9156daeSSagi Grimberg 	struct nvme_ns_head *h;
69b9156daeSSagi Grimberg 
70b9156daeSSagi Grimberg 	lockdep_assert_held(&subsys->lock);
71b9156daeSSagi Grimberg 	list_for_each_entry(h, &subsys->nsheads, entry)
72b9156daeSSagi Grimberg 		if (h->disk)
73b9156daeSSagi Grimberg 			blk_mq_freeze_queue_wait(h->disk->queue);
74b9156daeSSagi Grimberg }
75b9156daeSSagi Grimberg 
76b9156daeSSagi Grimberg void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
77b9156daeSSagi Grimberg {
78b9156daeSSagi Grimberg 	struct nvme_ns_head *h;
79b9156daeSSagi Grimberg 
80b9156daeSSagi Grimberg 	lockdep_assert_held(&subsys->lock);
81b9156daeSSagi Grimberg 	list_for_each_entry(h, &subsys->nsheads, entry)
82b9156daeSSagi Grimberg 		if (h->disk)
83b9156daeSSagi Grimberg 			blk_freeze_queue_start(h->disk->queue);
84b9156daeSSagi Grimberg }
85b9156daeSSagi Grimberg 
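/*
 * Fail over a request that completed with a path error: drop the cached
 * current path, kick off an ANA log re-read for ANA errors, move the bios
 * back onto the ns_head requeue list, complete the request itself and let
 * the requeue work resubmit the bios on another path.
 */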
865ddaabe8SChristoph Hellwig void nvme_failover_req(struct request *req)
8732acab31SChristoph Hellwig {
8832acab31SChristoph Hellwig 	struct nvme_ns *ns = req->q->queuedata;
895ddaabe8SChristoph Hellwig 	u16 status = nvme_req(req)->status & 0x7ff;
9032acab31SChristoph Hellwig 	unsigned long flags;
91ce86dad2SDaniel Wagner 	struct bio *bio;
9232acab31SChristoph Hellwig 
930d0b660fSChristoph Hellwig 	nvme_mpath_clear_current_path(ns);
945ddaabe8SChristoph Hellwig 
955ddaabe8SChristoph Hellwig 	/*
965ddaabe8SChristoph Hellwig 	 * If we got back an ANA error, we know the controller is alive but not
975ddaabe8SChristoph Hellwig 	 * ready to serve this namespace.  Kick off a re-read of the ANA
985ddaabe8SChristoph Hellwig 	 * information page, and just try any other available path for now.
995ddaabe8SChristoph Hellwig 	 */
1005ddaabe8SChristoph Hellwig 	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
1010d0b660fSChristoph Hellwig 		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
1020d0b660fSChristoph Hellwig 		queue_work(nvme_wq, &ns->ctrl->ana_work);
1030d0b660fSChristoph Hellwig 	}
1040d0b660fSChristoph Hellwig 
105764e9332SJohn Meneghini 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
106c712dcccSChristoph Hellwig 	for (bio = req->bio; bio; bio = bio->bi_next) {
107ce86dad2SDaniel Wagner 		bio_set_dev(bio, ns->head->disk->part0);
108c712dcccSChristoph Hellwig 		if (bio->bi_opf & REQ_POLLED) {
109c712dcccSChristoph Hellwig 			bio->bi_opf &= ~REQ_POLLED;
110c712dcccSChristoph Hellwig 			bio->bi_cookie = BLK_QC_T_NONE;
111c712dcccSChristoph Hellwig 		}
11299160af4SSagi Grimberg 		/*
11399160af4SSagi Grimberg 		 * The alternate request queue that we may end up submitting
11499160af4SSagi Grimberg 		 * the bio to may be frozen temporarily; in that case REQ_NOWAIT
11599160af4SSagi Grimberg 		 * would fail the I/O immediately with EAGAIN back to the issuer.
11699160af4SSagi Grimberg 		 * Unlike the issuer context, we are allowed to block here, so
11799160af4SSagi Grimberg 		 * clear the flag to avoid spurious EAGAIN I/O failures.
11899160af4SSagi Grimberg 		 */
11999160af4SSagi Grimberg 		bio->bi_opf &= ~REQ_NOWAIT;
120c712dcccSChristoph Hellwig 	}
121764e9332SJohn Meneghini 	blk_steal_bios(&ns->head->requeue_list, req);
122764e9332SJohn Meneghini 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
123764e9332SJohn Meneghini 
12436989c68SKeith Busch 	nvme_req(req)->status = 0;
12536989c68SKeith Busch 	nvme_end_req(req);
12632acab31SChristoph Hellwig 	kblockd_schedule_work(&ns->head->requeue_work);
12732acab31SChristoph Hellwig }
12832acab31SChristoph Hellwig 
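/*
 * Per-request accounting on the multipath node: bump the controller's active
 * request count when the queue-depth iopolicy is in effect and, for regular
 * (non-passthrough) I/O, start block layer statistics on the ns_head disk.
 */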
129d4d957b5SSagi Grimberg void nvme_mpath_start_request(struct request *rq)
130d4d957b5SSagi Grimberg {
131d4d957b5SSagi Grimberg 	struct nvme_ns *ns = rq->q->queuedata;
132d4d957b5SSagi Grimberg 	struct gendisk *disk = ns->head->disk;
133d4d957b5SSagi Grimberg 
13485b9f3e6SThomas Song 	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
13585b9f3e6SThomas Song 		atomic_inc(&ns->ctrl->nr_active);
13685b9f3e6SThomas Song 		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
13785b9f3e6SThomas Song 	}
13885b9f3e6SThomas Song 
139d4d957b5SSagi Grimberg 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
140d4d957b5SSagi Grimberg 		return;
141d4d957b5SSagi Grimberg 
142d4d957b5SSagi Grimberg 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
1435f275713SYu Kuai 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
1445f275713SYu Kuai 						      jiffies);
145d4d957b5SSagi Grimberg }
146d4d957b5SSagi Grimberg EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
147d4d957b5SSagi Grimberg 
148d4d957b5SSagi Grimberg void nvme_mpath_end_request(struct request *rq)
149d4d957b5SSagi Grimberg {
150d4d957b5SSagi Grimberg 	struct nvme_ns *ns = rq->q->queuedata;
151d4d957b5SSagi Grimberg 
15285b9f3e6SThomas Song 	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
15385b9f3e6SThomas Song 		atomic_dec_if_positive(&ns->ctrl->nr_active);
15485b9f3e6SThomas Song 
155d4d957b5SSagi Grimberg 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
156d4d957b5SSagi Grimberg 		return;
157d4d957b5SSagi Grimberg 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
1585f275713SYu Kuai 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
159d4d957b5SSagi Grimberg 			 nvme_req(rq)->start_time);
160d4d957b5SSagi Grimberg }
161d4d957b5SSagi Grimberg 
16232acab31SChristoph Hellwig void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
16332acab31SChristoph Hellwig {
16432acab31SChristoph Hellwig 	struct nvme_ns *ns;
16582f20194SKeith Busch 	int srcu_idx;
16632acab31SChristoph Hellwig 
16782f20194SKeith Busch 	srcu_idx = srcu_read_lock(&ctrl->srcu);
168*1e20e4ffSBreno Leitao 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
169*1e20e4ffSBreno Leitao 				 srcu_read_lock_held(&ctrl->srcu)) {
170f6f09c15SHannes Reinecke 		if (!ns->head->disk)
171f6f09c15SHannes Reinecke 			continue;
17232acab31SChristoph Hellwig 		kblockd_schedule_work(&ns->head->requeue_work);
173f6f09c15SHannes Reinecke 		if (ctrl->state == NVME_CTRL_LIVE)
174f6f09c15SHannes Reinecke 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
17532acab31SChristoph Hellwig 	}
17682f20194SKeith Busch 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
17732acab31SChristoph Hellwig }
17832acab31SChristoph Hellwig 
1790d0b660fSChristoph Hellwig static const char *nvme_ana_state_names[] = {
1800d0b660fSChristoph Hellwig 	[0]				= "invalid state",
1810d0b660fSChristoph Hellwig 	[NVME_ANA_OPTIMIZED]		= "optimized",
1820d0b660fSChristoph Hellwig 	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
1830d0b660fSChristoph Hellwig 	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
1840d0b660fSChristoph Hellwig 	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
1850d0b660fSChristoph Hellwig 	[NVME_ANA_CHANGE]		= "change",
1860d0b660fSChristoph Hellwig };
1870d0b660fSChristoph Hellwig 
1880157ec8dSSagi Grimberg bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
18932acab31SChristoph Hellwig {
190f3334447SChristoph Hellwig 	struct nvme_ns_head *head = ns->head;
1910157ec8dSSagi Grimberg 	bool changed = false;
192f3334447SChristoph Hellwig 	int node;
193f3334447SChristoph Hellwig 
194f3334447SChristoph Hellwig 	if (!head)
1950157ec8dSSagi Grimberg 		goto out;
196f3334447SChristoph Hellwig 
197f3334447SChristoph Hellwig 	for_each_node(node) {
1980157ec8dSSagi Grimberg 		if (ns == rcu_access_pointer(head->current_path[node])) {
199f3334447SChristoph Hellwig 			rcu_assign_pointer(head->current_path[node], NULL);
2000157ec8dSSagi Grimberg 			changed = true;
201f3334447SChristoph Hellwig 		}
202f3334447SChristoph Hellwig 	}
2030157ec8dSSagi Grimberg out:
2040157ec8dSSagi Grimberg 	return changed;
2050157ec8dSSagi Grimberg }
2060157ec8dSSagi Grimberg 
2070157ec8dSSagi Grimberg void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
2080157ec8dSSagi Grimberg {
2090157ec8dSSagi Grimberg 	struct nvme_ns *ns;
21082f20194SKeith Busch 	int srcu_idx;
2110157ec8dSSagi Grimberg 
21282f20194SKeith Busch 	srcu_idx = srcu_read_lock(&ctrl->srcu);
213*1e20e4ffSBreno Leitao 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
214*1e20e4ffSBreno Leitao 				 srcu_read_lock_held(&ctrl->srcu)) {
2152b81a5f0SHannes Reinecke 		nvme_mpath_clear_current_path(ns);
2160157ec8dSSagi Grimberg 		kblockd_schedule_work(&ns->head->requeue_work);
2172b81a5f0SHannes Reinecke 	}
21882f20194SKeith Busch 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
2190157ec8dSSagi Grimberg }
220f3334447SChristoph Hellwig 
221e7d65803SHannes Reinecke void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
222e7d65803SHannes Reinecke {
223e7d65803SHannes Reinecke 	struct nvme_ns_head *head = ns->head;
224e7d65803SHannes Reinecke 	sector_t capacity = get_capacity(head->disk);
225e7d65803SHannes Reinecke 	int node;
226899d2a05SCaleb Sander 	int srcu_idx;
227e7d65803SHannes Reinecke 
228899d2a05SCaleb Sander 	srcu_idx = srcu_read_lock(&head->srcu);
229*1e20e4ffSBreno Leitao 	list_for_each_entry_srcu(ns, &head->list, siblings,
230*1e20e4ffSBreno Leitao 				 srcu_read_lock_held(&head->srcu)) {
231e7d65803SHannes Reinecke 		if (capacity != get_capacity(ns->disk))
232e7d65803SHannes Reinecke 			clear_bit(NVME_NS_READY, &ns->flags);
233e7d65803SHannes Reinecke 	}
234899d2a05SCaleb Sander 	srcu_read_unlock(&head->srcu, srcu_idx);
235e7d65803SHannes Reinecke 
236e7d65803SHannes Reinecke 	for_each_node(node)
237e7d65803SHannes Reinecke 		rcu_assign_pointer(head->current_path[node], NULL);
23872e3b888SSagi Grimberg 	kblockd_schedule_work(&head->requeue_work);
239e7d65803SHannes Reinecke }
240e7d65803SHannes Reinecke 
241ca7ae5c9SHannes Reinecke static bool nvme_path_is_disabled(struct nvme_ns *ns)
242ca7ae5c9SHannes Reinecke {
243ecca390eSSagi Grimberg 	/*
244ecca390eSSagi Grimberg 	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
245ecca390eSSagi Grimberg 	 * still be able to complete assuming that the controller is connected.
246ecca390eSSagi Grimberg 	 * Otherwise it will fail immediately and return to the requeue list.
247ecca390eSSagi Grimberg 	 */
248ecca390eSSagi Grimberg 	if (ns->ctrl->state != NVME_CTRL_LIVE &&
249ecca390eSSagi Grimberg 	    ns->ctrl->state != NVME_CTRL_DELETING)
250ecca390eSSagi Grimberg 		return true;
251ecca390eSSagi Grimberg 	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
252e7d65803SHannes Reinecke 	    !test_bit(NVME_NS_READY, &ns->flags))
253ecca390eSSagi Grimberg 		return true;
254ecca390eSSagi Grimberg 	return false;
255ca7ae5c9SHannes Reinecke }
256ca7ae5c9SHannes Reinecke 
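/*
 * Scan all sibling paths and pick the ANA-optimized path closest to @node
 * (by NUMA distance when the numa iopolicy is in use), falling back to the
 * closest non-optimized path.  The winner is cached in current_path[node].
 */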
257f3334447SChristoph Hellwig static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
258f3334447SChristoph Hellwig {
259f3334447SChristoph Hellwig 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
260f3334447SChristoph Hellwig 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
26132acab31SChristoph Hellwig 
262*1e20e4ffSBreno Leitao 	list_for_each_entry_srcu(ns, &head->list, siblings,
263*1e20e4ffSBreno Leitao 				 srcu_read_lock_held(&head->srcu)) {
264ca7ae5c9SHannes Reinecke 		if (nvme_path_is_disabled(ns))
2650d0b660fSChristoph Hellwig 			continue;
266f3334447SChristoph Hellwig 
2678871cab4SNilay Shroff 		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
2688871cab4SNilay Shroff 		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
269103e515eSHannes Reinecke 			distance = node_distance(node, ns->ctrl->numa_node);
27075c10e73SHannes Reinecke 		else
27175c10e73SHannes Reinecke 			distance = LOCAL_DISTANCE;
272f3334447SChristoph Hellwig 
2730d0b660fSChristoph Hellwig 		switch (ns->ana_state) {
2740d0b660fSChristoph Hellwig 		case NVME_ANA_OPTIMIZED:
275f3334447SChristoph Hellwig 			if (distance < found_distance) {
276f3334447SChristoph Hellwig 				found_distance = distance;
277f3334447SChristoph Hellwig 				found = ns;
278f3334447SChristoph Hellwig 			}
279f3334447SChristoph Hellwig 			break;
2800d0b660fSChristoph Hellwig 		case NVME_ANA_NONOPTIMIZED:
281f3334447SChristoph Hellwig 			if (distance < fallback_distance) {
282f3334447SChristoph Hellwig 				fallback_distance = distance;
2830d0b660fSChristoph Hellwig 				fallback = ns;
284f3334447SChristoph Hellwig 			}
2850d0b660fSChristoph Hellwig 			break;
2860d0b660fSChristoph Hellwig 		default:
2870d0b660fSChristoph Hellwig 			break;
28832acab31SChristoph Hellwig 		}
28932acab31SChristoph Hellwig 	}
29032acab31SChristoph Hellwig 
291f3334447SChristoph Hellwig 	if (!found)
292f3334447SChristoph Hellwig 		found = fallback;
293f3334447SChristoph Hellwig 	if (found)
294f3334447SChristoph Hellwig 		rcu_assign_pointer(head->current_path[node], found);
295f3334447SChristoph Hellwig 	return found;
2960d0b660fSChristoph Hellwig }
2970d0b660fSChristoph Hellwig 
29875c10e73SHannes Reinecke static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
29975c10e73SHannes Reinecke 		struct nvme_ns *ns)
30075c10e73SHannes Reinecke {
30175c10e73SHannes Reinecke 	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
30275c10e73SHannes Reinecke 			siblings);
30375c10e73SHannes Reinecke 	if (ns)
30475c10e73SHannes Reinecke 		return ns;
30575c10e73SHannes Reinecke 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
30675c10e73SHannes Reinecke }
30775c10e73SHannes Reinecke 
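/*
 * Round-robin selection: continue from the path used last time and take the
 * next usable sibling, preferring ANA-optimized paths and falling back to
 * non-optimized ones, or to the old path itself if nothing better exists.
 */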
308a7071e2bSJohn Meneghini static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
30975c10e73SHannes Reinecke {
310e398863bSMartin Wilck 	struct nvme_ns *ns, *found = NULL;
311a7071e2bSJohn Meneghini 	int node = numa_node_id();
312a7071e2bSJohn Meneghini 	struct nvme_ns *old = srcu_dereference(head->current_path[node],
313a7071e2bSJohn Meneghini 					       &head->srcu);
314a7071e2bSJohn Meneghini 
315a7071e2bSJohn Meneghini 	if (unlikely(!old))
316a7071e2bSJohn Meneghini 		return __nvme_find_path(head, node);
31775c10e73SHannes Reinecke 
3182032d074SHannes Reinecke 	if (list_is_singular(&head->list)) {
3192032d074SHannes Reinecke 		if (nvme_path_is_disabled(old))
3202032d074SHannes Reinecke 			return NULL;
32175c10e73SHannes Reinecke 		return old;
3222032d074SHannes Reinecke 	}
32375c10e73SHannes Reinecke 
32475c10e73SHannes Reinecke 	for (ns = nvme_next_ns(head, old);
325d1bcf006SDaniel Wagner 	     ns && ns != old;
32675c10e73SHannes Reinecke 	     ns = nvme_next_ns(head, ns)) {
327ca7ae5c9SHannes Reinecke 		if (nvme_path_is_disabled(ns))
32875c10e73SHannes Reinecke 			continue;
32975c10e73SHannes Reinecke 
33075c10e73SHannes Reinecke 		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
33175c10e73SHannes Reinecke 			found = ns;
33275c10e73SHannes Reinecke 			goto out;
33375c10e73SHannes Reinecke 		}
33475c10e73SHannes Reinecke 		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
335e398863bSMartin Wilck 			found = ns;
33675c10e73SHannes Reinecke 	}
33775c10e73SHannes Reinecke 
33893eb0381SMartin Wilck 	/*
33993eb0381SMartin Wilck 	 * The loop above skips the current path for round-robin semantics.
34093eb0381SMartin Wilck 	 * Fall back to the current path if either:
34193eb0381SMartin Wilck 	 *  - no other optimized path found and current is optimized,
34293eb0381SMartin Wilck 	 *  - no other usable path found and current is usable.
34393eb0381SMartin Wilck 	 */
3443f6e3246SMartin Wilck 	if (!nvme_path_is_disabled(old) &&
34593eb0381SMartin Wilck 	    (old->ana_state == NVME_ANA_OPTIMIZED ||
346e398863bSMartin Wilck 	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
34793eb0381SMartin Wilck 		return old;
34893eb0381SMartin Wilck 
349e398863bSMartin Wilck 	if (!found)
35075c10e73SHannes Reinecke 		return NULL;
35175c10e73SHannes Reinecke out:
35275c10e73SHannes Reinecke 	rcu_assign_pointer(head->current_path[node], found);
35375c10e73SHannes Reinecke 	return found;
35475c10e73SHannes Reinecke }
35575c10e73SHannes Reinecke 
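/*
 * Queue-depth selection: pick the usable path whose controller currently has
 * the fewest active multipath requests, preferring ANA-optimized paths over
 * non-optimized ones.
 */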
35685b9f3e6SThomas Song static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
35785b9f3e6SThomas Song {
35885b9f3e6SThomas Song 	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
35985b9f3e6SThomas Song 	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
36085b9f3e6SThomas Song 	unsigned int depth;
36185b9f3e6SThomas Song 
362*1e20e4ffSBreno Leitao 	list_for_each_entry_srcu(ns, &head->list, siblings,
363*1e20e4ffSBreno Leitao 				 srcu_read_lock_held(&head->srcu)) {
36485b9f3e6SThomas Song 		if (nvme_path_is_disabled(ns))
36585b9f3e6SThomas Song 			continue;
36685b9f3e6SThomas Song 
36785b9f3e6SThomas Song 		depth = atomic_read(&ns->ctrl->nr_active);
36885b9f3e6SThomas Song 
36985b9f3e6SThomas Song 		switch (ns->ana_state) {
37085b9f3e6SThomas Song 		case NVME_ANA_OPTIMIZED:
37185b9f3e6SThomas Song 			if (depth < min_depth_opt) {
37285b9f3e6SThomas Song 				min_depth_opt = depth;
37385b9f3e6SThomas Song 				best_opt = ns;
37485b9f3e6SThomas Song 			}
37585b9f3e6SThomas Song 			break;
37685b9f3e6SThomas Song 		case NVME_ANA_NONOPTIMIZED:
37785b9f3e6SThomas Song 			if (depth < min_depth_nonopt) {
37885b9f3e6SThomas Song 				min_depth_nonopt = depth;
37985b9f3e6SThomas Song 				best_nonopt = ns;
38085b9f3e6SThomas Song 			}
38185b9f3e6SThomas Song 			break;
38285b9f3e6SThomas Song 		default:
38385b9f3e6SThomas Song 			break;
38485b9f3e6SThomas Song 		}
38585b9f3e6SThomas Song 
38685b9f3e6SThomas Song 		if (min_depth_opt == 0)
38785b9f3e6SThomas Song 			return best_opt;
38885b9f3e6SThomas Song 	}
38985b9f3e6SThomas Song 
39085b9f3e6SThomas Song 	return best_opt ? best_opt : best_nonopt;
39185b9f3e6SThomas Song }
39285b9f3e6SThomas Song 
3930d0b660fSChristoph Hellwig static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
3940d0b660fSChristoph Hellwig {
3950d0b660fSChristoph Hellwig 	return ns->ctrl->state == NVME_CTRL_LIVE &&
3960d0b660fSChristoph Hellwig 		ns->ana_state == NVME_ANA_OPTIMIZED;
39732acab31SChristoph Hellwig }
39832acab31SChristoph Hellwig 
399a7071e2bSJohn Meneghini static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
40032acab31SChristoph Hellwig {
401f3334447SChristoph Hellwig 	int node = numa_node_id();
402f3334447SChristoph Hellwig 	struct nvme_ns *ns;
40332acab31SChristoph Hellwig 
404f3334447SChristoph Hellwig 	ns = srcu_dereference(head->current_path[node], &head->srcu);
405fbd6a42dSHannes Reinecke 	if (unlikely(!ns))
406fbd6a42dSHannes Reinecke 		return __nvme_find_path(head, node);
407fbd6a42dSHannes Reinecke 	if (unlikely(!nvme_path_is_optimized(ns)))
408fbd6a42dSHannes Reinecke 		return __nvme_find_path(head, node);
40932acab31SChristoph Hellwig 	return ns;
41032acab31SChristoph Hellwig }
41132acab31SChristoph Hellwig 
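/*
 * Central path selector: dispatch to the queue-depth, round-robin or NUMA
 * strategy according to the subsystem's current iopolicy.
 */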
412a7071e2bSJohn Meneghini inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
413a7071e2bSJohn Meneghini {
41485b9f3e6SThomas Song 	switch (READ_ONCE(head->subsys->iopolicy)) {
41585b9f3e6SThomas Song 	case NVME_IOPOLICY_QD:
41685b9f3e6SThomas Song 		return nvme_queue_depth_path(head);
41785b9f3e6SThomas Song 	case NVME_IOPOLICY_RR:
418a7071e2bSJohn Meneghini 		return nvme_round_robin_path(head);
41985b9f3e6SThomas Song 	default:
420a7071e2bSJohn Meneghini 		return nvme_numa_path(head);
421a7071e2bSJohn Meneghini 	}
42285b9f3e6SThomas Song }
423a7071e2bSJohn Meneghini 
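/*
 * Decide whether requeueing makes sense: true if the multipath disk is still
 * live and at least one sibling controller is live, resetting or connecting
 * and has not had its failfast timeout expire.
 */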
4240157ec8dSSagi Grimberg static bool nvme_available_path(struct nvme_ns_head *head)
4250157ec8dSSagi Grimberg {
4260157ec8dSSagi Grimberg 	struct nvme_ns *ns;
4270157ec8dSSagi Grimberg 
428f0679539SHannes Reinecke 	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
429f0679539SHannes Reinecke 		return false;
430f0679539SHannes Reinecke 
431*1e20e4ffSBreno Leitao 	list_for_each_entry_srcu(ns, &head->list, siblings,
432*1e20e4ffSBreno Leitao 				 srcu_read_lock_held(&head->srcu)) {
4338c4dfea9SVictor Gladkov 		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
4348c4dfea9SVictor Gladkov 			continue;
4350157ec8dSSagi Grimberg 		switch (ns->ctrl->state) {
4360157ec8dSSagi Grimberg 		case NVME_CTRL_LIVE:
4370157ec8dSSagi Grimberg 		case NVME_CTRL_RESETTING:
4380157ec8dSSagi Grimberg 		case NVME_CTRL_CONNECTING:
4390157ec8dSSagi Grimberg 			/* fallthru */
4400157ec8dSSagi Grimberg 			return true;
4410157ec8dSSagi Grimberg 		default:
4420157ec8dSSagi Grimberg 			break;
4430157ec8dSSagi Grimberg 		}
4440157ec8dSSagi Grimberg 	}
4450157ec8dSSagi Grimberg 	return false;
4460157ec8dSSagi Grimberg }
4470157ec8dSSagi Grimberg 
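/*
 * Bio submission for the multipath node: pick a path under SRCU and resubmit
 * the bio on that path's queue; if no path is usable right now, park the bio
 * on the requeue list, or fail it when no path can become available.
 */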
4483e08773cSChristoph Hellwig static void nvme_ns_head_submit_bio(struct bio *bio)
44932acab31SChristoph Hellwig {
450309dca30SChristoph Hellwig 	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
45132acab31SChristoph Hellwig 	struct device *dev = disk_to_dev(head->disk);
45232acab31SChristoph Hellwig 	struct nvme_ns *ns;
45332acab31SChristoph Hellwig 	int srcu_idx;
45432acab31SChristoph Hellwig 
455525aa5a7SHannes Reinecke 	/*
456f695ca38SChristoph Hellwig 	 * The namespace might be going away and the bio might be moved to a
457f695ca38SChristoph Hellwig 	 * different queue via blk_steal_bios(), so we need to use the bio_split
458f695ca38SChristoph Hellwig 	 * pool from the original queue to allocate the bvecs from.
459525aa5a7SHannes Reinecke 	 */
4605a97806fSChristoph Hellwig 	bio = bio_split_to_limits(bio);
461613b1488SJens Axboe 	if (!bio)
462613b1488SJens Axboe 		return;
463525aa5a7SHannes Reinecke 
46432acab31SChristoph Hellwig 	srcu_idx = srcu_read_lock(&head->srcu);
46532acab31SChristoph Hellwig 	ns = nvme_find_path(head);
46632acab31SChristoph Hellwig 	if (likely(ns)) {
467a7c7f7b2SChristoph Hellwig 		bio_set_dev(bio, ns->disk->part0);
46832acab31SChristoph Hellwig 		bio->bi_opf |= REQ_NVME_MPATH;
4691c02fca6SChristoph Hellwig 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
4702796b569SHannes Reinecke 				      bio->bi_iter.bi_sector);
4713e08773cSChristoph Hellwig 		submit_bio_noacct(bio);
4720157ec8dSSagi Grimberg 	} else if (nvme_available_path(head)) {
4730157ec8dSSagi Grimberg 		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
47432acab31SChristoph Hellwig 
47532acab31SChristoph Hellwig 		spin_lock_irq(&head->requeue_lock);
47632acab31SChristoph Hellwig 		bio_list_add(&head->requeue_list, bio);
47732acab31SChristoph Hellwig 		spin_unlock_irq(&head->requeue_lock);
47832acab31SChristoph Hellwig 	} else {
4790157ec8dSSagi Grimberg 		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
48032acab31SChristoph Hellwig 
4818f31ddedSGuoqing Jiang 		bio_io_error(bio);
48232acab31SChristoph Hellwig 	}
48332acab31SChristoph Hellwig 
48432acab31SChristoph Hellwig 	srcu_read_unlock(&head->srcu, srcu_idx);
48532acab31SChristoph Hellwig }
48632acab31SChristoph Hellwig 
48705bdb996SChristoph Hellwig static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
4881496bd49SChristoph Hellwig {
489d32e2bf8SChristoph Hellwig 	if (!nvme_tryget_ns_head(disk->private_data))
4901496bd49SChristoph Hellwig 		return -ENXIO;
4911496bd49SChristoph Hellwig 	return 0;
4921496bd49SChristoph Hellwig }
4931496bd49SChristoph Hellwig 
494ae220766SChristoph Hellwig static void nvme_ns_head_release(struct gendisk *disk)
4951496bd49SChristoph Hellwig {
4961496bd49SChristoph Hellwig 	nvme_put_ns_head(disk->private_data);
4971496bd49SChristoph Hellwig }
4981496bd49SChristoph Hellwig 
4998b4fb0f9SChristoph Hellwig #ifdef CONFIG_BLK_DEV_ZONED
5008b4fb0f9SChristoph Hellwig static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
5018b4fb0f9SChristoph Hellwig 		unsigned int nr_zones, report_zones_cb cb, void *data)
5028b4fb0f9SChristoph Hellwig {
5038b4fb0f9SChristoph Hellwig 	struct nvme_ns_head *head = disk->private_data;
5048b4fb0f9SChristoph Hellwig 	struct nvme_ns *ns;
5058b4fb0f9SChristoph Hellwig 	int srcu_idx, ret = -EWOULDBLOCK;
5068b4fb0f9SChristoph Hellwig 
5078b4fb0f9SChristoph Hellwig 	srcu_idx = srcu_read_lock(&head->srcu);
5088b4fb0f9SChristoph Hellwig 	ns = nvme_find_path(head);
5098b4fb0f9SChristoph Hellwig 	if (ns)
5108b4fb0f9SChristoph Hellwig 		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
5118b4fb0f9SChristoph Hellwig 	srcu_read_unlock(&head->srcu, srcu_idx);
5128b4fb0f9SChristoph Hellwig 	return ret;
5138b4fb0f9SChristoph Hellwig }
5148b4fb0f9SChristoph Hellwig #else
5158b4fb0f9SChristoph Hellwig #define nvme_ns_head_report_zones	NULL
5168b4fb0f9SChristoph Hellwig #endif /* CONFIG_BLK_DEV_ZONED */
5178b4fb0f9SChristoph Hellwig 
5181496bd49SChristoph Hellwig const struct block_device_operations nvme_ns_head_ops = {
5191496bd49SChristoph Hellwig 	.owner		= THIS_MODULE,
5201496bd49SChristoph Hellwig 	.submit_bio	= nvme_ns_head_submit_bio,
5211496bd49SChristoph Hellwig 	.open		= nvme_ns_head_open,
5221496bd49SChristoph Hellwig 	.release	= nvme_ns_head_release,
5231496bd49SChristoph Hellwig 	.ioctl		= nvme_ns_head_ioctl,
524a25d4261SNick Bowler 	.compat_ioctl	= blkdev_compat_ptr_ioctl,
5251496bd49SChristoph Hellwig 	.getgeo		= nvme_getgeo,
5268b4fb0f9SChristoph Hellwig 	.report_zones	= nvme_ns_head_report_zones,
5271496bd49SChristoph Hellwig 	.pr_ops		= &nvme_pr_ops,
5281496bd49SChristoph Hellwig };
5291496bd49SChristoph Hellwig 
5302637baedSMinwoo Im static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
5312637baedSMinwoo Im {
5322637baedSMinwoo Im 	return container_of(cdev, struct nvme_ns_head, cdev);
5332637baedSMinwoo Im }
5342637baedSMinwoo Im 
5352637baedSMinwoo Im static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
5362637baedSMinwoo Im {
5372637baedSMinwoo Im 	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
5382637baedSMinwoo Im 		return -ENXIO;
5392637baedSMinwoo Im 	return 0;
5402637baedSMinwoo Im }
5412637baedSMinwoo Im 
5422637baedSMinwoo Im static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
5432637baedSMinwoo Im {
5442637baedSMinwoo Im 	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
5452637baedSMinwoo Im 	return 0;
5462637baedSMinwoo Im }
5472637baedSMinwoo Im 
5482637baedSMinwoo Im static const struct file_operations nvme_ns_head_chr_fops = {
5492637baedSMinwoo Im 	.owner		= THIS_MODULE,
5502637baedSMinwoo Im 	.open		= nvme_ns_head_chr_open,
5512637baedSMinwoo Im 	.release	= nvme_ns_head_chr_release,
5522637baedSMinwoo Im 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
5532637baedSMinwoo Im 	.compat_ioctl	= compat_ptr_ioctl,
554456cba38SKanchan Joshi 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
5559408d8a3SKeith Busch 	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
5562637baedSMinwoo Im };
5572637baedSMinwoo Im 
5582637baedSMinwoo Im static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
5592637baedSMinwoo Im {
5602637baedSMinwoo Im 	int ret;
5612637baedSMinwoo Im 
5622637baedSMinwoo Im 	head->cdev_device.parent = &head->subsys->dev;
5632637baedSMinwoo Im 	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
5642637baedSMinwoo Im 			   head->subsys->instance, head->instance);
5652637baedSMinwoo Im 	if (ret)
5662637baedSMinwoo Im 		return ret;
5672637baedSMinwoo Im 	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
5682637baedSMinwoo Im 			    &nvme_ns_head_chr_fops, THIS_MODULE);
5692637baedSMinwoo Im 	return ret;
5702637baedSMinwoo Im }
5712637baedSMinwoo Im 
5724a57f42eSKeith Busch static void nvme_partition_scan_work(struct work_struct *work)
5734a57f42eSKeith Busch {
5744a57f42eSKeith Busch 	struct nvme_ns_head *head =
5754a57f42eSKeith Busch 		container_of(work, struct nvme_ns_head, partition_scan_work);
5764a57f42eSKeith Busch 
5774a57f42eSKeith Busch 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
5784a57f42eSKeith Busch 					     &head->disk->state)))
5794a57f42eSKeith Busch 		return;
5804a57f42eSKeith Busch 
5814a57f42eSKeith Busch 	mutex_lock(&head->disk->open_mutex);
5824a57f42eSKeith Busch 	bdev_disk_changed(head->disk, false);
5834a57f42eSKeith Busch 	mutex_unlock(&head->disk->open_mutex);
5844a57f42eSKeith Busch }
5854a57f42eSKeith Busch 
58632acab31SChristoph Hellwig static void nvme_requeue_work(struct work_struct *work)
58732acab31SChristoph Hellwig {
58832acab31SChristoph Hellwig 	struct nvme_ns_head *head =
58932acab31SChristoph Hellwig 		container_of(work, struct nvme_ns_head, requeue_work);
59032acab31SChristoph Hellwig 	struct bio *bio, *next;
59132acab31SChristoph Hellwig 
59232acab31SChristoph Hellwig 	spin_lock_irq(&head->requeue_lock);
59332acab31SChristoph Hellwig 	next = bio_list_get(&head->requeue_list);
59432acab31SChristoph Hellwig 	spin_unlock_irq(&head->requeue_lock);
59532acab31SChristoph Hellwig 
59632acab31SChristoph Hellwig 	while ((bio = next) != NULL) {
59732acab31SChristoph Hellwig 		next = bio->bi_next;
59832acab31SChristoph Hellwig 		bio->bi_next = NULL;
59932acab31SChristoph Hellwig 
600ed00aabdSChristoph Hellwig 		submit_bio_noacct(bio);
60132acab31SChristoph Hellwig 	}
60232acab31SChristoph Hellwig }
60332acab31SChristoph Hellwig 
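/*
 * Set up the per-ns_head requeue machinery and, when the subsystem can expose
 * the namespace through multiple controllers, allocate the multipath gendisk
 * with stacking queue limits and a deferred partition scan.
 */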
60432acab31SChristoph Hellwig int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
60532acab31SChristoph Hellwig {
60632acab31SChristoph Hellwig 	bool vwc = false;
60732acab31SChristoph Hellwig 
6080d0b660fSChristoph Hellwig 	mutex_init(&head->lock);
60932acab31SChristoph Hellwig 	bio_list_init(&head->requeue_list);
61032acab31SChristoph Hellwig 	spin_lock_init(&head->requeue_lock);
61132acab31SChristoph Hellwig 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
6124a57f42eSKeith Busch 	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
61332acab31SChristoph Hellwig 
61432acab31SChristoph Hellwig 	/*
61532acab31SChristoph Hellwig 	 * Add a multipath node if the subsystem supports multiple controllers.
6165974ea7cSSungup Moon 	 * We also do this for private namespaces as the namespace sharing flag
6175974ea7cSSungup Moon 	 * could change after a rescan.
61832acab31SChristoph Hellwig 	 */
6195974ea7cSSungup Moon 	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
6205974ea7cSSungup Moon 	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
62132acab31SChristoph Hellwig 		return 0;
62232acab31SChristoph Hellwig 
623f165fb89SChristoph Hellwig 	head->disk = blk_alloc_disk(ctrl->numa_node);
624f165fb89SChristoph Hellwig 	if (!head->disk)
625f165fb89SChristoph Hellwig 		return -ENOMEM;
626f165fb89SChristoph Hellwig 	head->disk->fops = &nvme_ns_head_ops;
627f165fb89SChristoph Hellwig 	head->disk->private_data = head;
6284a57f42eSKeith Busch 
6294a57f42eSKeith Busch 	/*
6304a57f42eSKeith Busch 	 * We need to suppress the partition scan from occurring within the
6314a57f42eSKeith Busch 	 * controller's scan_work context. If a path error occurs here, the I/O
6324a57f42eSKeith Busch 	 * will wait until a path becomes available or all paths are torn down,
6334a57f42eSKeith Busch 	 * but that action also occurs within scan_work, so it would deadlock.
6344a57f42eSKeith Busch 	 * Defer the partition scan to a different context that does not block
6354a57f42eSKeith Busch 	 * scan_work.
6364a57f42eSKeith Busch 	 */
6374a57f42eSKeith Busch 	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
638f165fb89SChristoph Hellwig 	sprintf(head->disk->disk_name, "nvme%dn%d",
639f165fb89SChristoph Hellwig 			ctrl->subsys->instance, head->instance);
640f165fb89SChristoph Hellwig 
641f165fb89SChristoph Hellwig 	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
642d32d3d0bSChristoph Hellwig 	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
643d4d957b5SSagi Grimberg 	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
644c712dcccSChristoph Hellwig 	/*
645c712dcccSChristoph Hellwig 	 * This assumes all controllers that refer to a namespace either
646c712dcccSChristoph Hellwig 	 * support poll queues or not.  That is not a strict guarantee,
647c712dcccSChristoph Hellwig 	 * but if the assumption is wrong the effect is only suboptimal
648c712dcccSChristoph Hellwig 	 * performance but not correctness problem.
649c712dcccSChristoph Hellwig 	 */
650c712dcccSChristoph Hellwig 	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
651c712dcccSChristoph Hellwig 	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
652c712dcccSChristoph Hellwig 		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
653d32d3d0bSChristoph Hellwig 
654f165fb89SChristoph Hellwig 	/* set to a default value of 512 until the disk is validated */
655f165fb89SChristoph Hellwig 	blk_queue_logical_block_size(head->disk->queue, 512);
656f165fb89SChristoph Hellwig 	blk_set_stacking_limits(&head->disk->queue->limits);
657fe8714b0SKeith Busch 	blk_queue_dma_alignment(head->disk->queue, 3);
65832acab31SChristoph Hellwig 
65932acab31SChristoph Hellwig 	/* we need to propagate up the VWC settings */
66032acab31SChristoph Hellwig 	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
66132acab31SChristoph Hellwig 		vwc = true;
662f165fb89SChristoph Hellwig 	blk_queue_write_cache(head->disk->queue, vwc, vwc);
66332acab31SChristoph Hellwig 	return 0;
66432acab31SChristoph Hellwig }
66532acab31SChristoph Hellwig 
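/*
 * Make the multipath node usable via this path: register the gendisk and char
 * device on first use, pre-populate current_path for online nodes when this
 * path is optimized, and requeue any I/O waiting for a usable path.
 */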
6660d0b660fSChristoph Hellwig static void nvme_mpath_set_live(struct nvme_ns *ns)
66732acab31SChristoph Hellwig {
6680d0b660fSChristoph Hellwig 	struct nvme_ns_head *head = ns->head;
66911384580SLuis Chamberlain 	int rc;
6700d0b660fSChristoph Hellwig 
67132acab31SChristoph Hellwig 	if (!head->disk)
67232acab31SChristoph Hellwig 		return;
6739bd82b1aSBaegjae Sung 
67411384580SLuis Chamberlain 	/*
67511384580SLuis Chamberlain 	 * test_and_set_bit() is used because it is protecting against two nvme
67611384580SLuis Chamberlain 	 * paths simultaneously calling device_add_disk() on the same namespace
67711384580SLuis Chamberlain 	 * head.
67811384580SLuis Chamberlain 	 */
6792637baedSMinwoo Im 	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
68011384580SLuis Chamberlain 		rc = device_add_disk(&head->subsys->dev, head->disk,
68133b14f67SHannes Reinecke 				     nvme_ns_id_attr_groups);
68211384580SLuis Chamberlain 		if (rc) {
68330b9bf4bSHannes Reinecke 			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
68411384580SLuis Chamberlain 			return;
68511384580SLuis Chamberlain 		}
6862637baedSMinwoo Im 		nvme_add_ns_head_cdev(head);
6874a57f42eSKeith Busch 		kblockd_schedule_work(&head->partition_scan_work);
6882637baedSMinwoo Im 	}
6890d0b660fSChristoph Hellwig 
690d8a22f85SAnton Eidelman 	mutex_lock(&head->lock);
691886fabf6SKeith Busch 	if (nvme_path_is_optimized(ns)) {
692886fabf6SKeith Busch 		int node, srcu_idx;
693886fabf6SKeith Busch 
694886fabf6SKeith Busch 		srcu_idx = srcu_read_lock(&head->srcu);
695e6e1eda0SNilay Shroff 		for_each_online_node(node)
696886fabf6SKeith Busch 			__nvme_find_path(head, node);
697886fabf6SKeith Busch 		srcu_read_unlock(&head->srcu, srcu_idx);
698886fabf6SKeith Busch 	}
699e164471dSSagi Grimberg 	mutex_unlock(&head->lock);
700886fabf6SKeith Busch 
701e164471dSSagi Grimberg 	synchronize_srcu(&head->srcu);
702e164471dSSagi Grimberg 	kblockd_schedule_work(&head->requeue_work);
7030d0b660fSChristoph Hellwig }
7040d0b660fSChristoph Hellwig 
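/*
 * Walk the ANA log page: a response header followed by one group descriptor
 * per ANA group, each immediately followed by its array of namespace IDs.
 * The callback runs for every descriptor; a non-zero return aborts the walk.
 */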
7050d0b660fSChristoph Hellwig static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
7060d0b660fSChristoph Hellwig 		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
7070d0b660fSChristoph Hellwig 			void *))
7080d0b660fSChristoph Hellwig {
7090d0b660fSChristoph Hellwig 	void *base = ctrl->ana_log_buf;
7100d0b660fSChristoph Hellwig 	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
7110d0b660fSChristoph Hellwig 	int error, i;
7120d0b660fSChristoph Hellwig 
7130d0b660fSChristoph Hellwig 	lockdep_assert_held(&ctrl->ana_lock);
7140d0b660fSChristoph Hellwig 
7150d0b660fSChristoph Hellwig 	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
7160d0b660fSChristoph Hellwig 		struct nvme_ana_group_desc *desc = base + offset;
71764fab729SPrabhath Sajeepa 		u32 nr_nsids;
71864fab729SPrabhath Sajeepa 		size_t nsid_buf_size;
71964fab729SPrabhath Sajeepa 
72064fab729SPrabhath Sajeepa 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
72164fab729SPrabhath Sajeepa 			return -EINVAL;
72264fab729SPrabhath Sajeepa 
72364fab729SPrabhath Sajeepa 		nr_nsids = le32_to_cpu(desc->nnsids);
724d156cfcaSLen Baker 		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
7250d0b660fSChristoph Hellwig 
7260d0b660fSChristoph Hellwig 		if (WARN_ON_ONCE(desc->grpid == 0))
7270d0b660fSChristoph Hellwig 			return -EINVAL;
7280d0b660fSChristoph Hellwig 		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
7290d0b660fSChristoph Hellwig 			return -EINVAL;
7300d0b660fSChristoph Hellwig 		if (WARN_ON_ONCE(desc->state == 0))
7310d0b660fSChristoph Hellwig 			return -EINVAL;
7320d0b660fSChristoph Hellwig 		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
7330d0b660fSChristoph Hellwig 			return -EINVAL;
7340d0b660fSChristoph Hellwig 
7350d0b660fSChristoph Hellwig 		offset += sizeof(*desc);
7360d0b660fSChristoph Hellwig 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
7370d0b660fSChristoph Hellwig 			return -EINVAL;
7380d0b660fSChristoph Hellwig 
7390d0b660fSChristoph Hellwig 		error = cb(ctrl, desc, data);
7400d0b660fSChristoph Hellwig 		if (error)
7410d0b660fSChristoph Hellwig 			return error;
7420d0b660fSChristoph Hellwig 
7430d0b660fSChristoph Hellwig 		offset += nsid_buf_size;
7440d0b660fSChristoph Hellwig 	}
7450d0b660fSChristoph Hellwig 
7460d0b660fSChristoph Hellwig 	return 0;
7470d0b660fSChristoph Hellwig }
7480d0b660fSChristoph Hellwig 
7490d0b660fSChristoph Hellwig static inline bool nvme_state_is_live(enum nvme_ana_state state)
7500d0b660fSChristoph Hellwig {
7510d0b660fSChristoph Hellwig 	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
7520d0b660fSChristoph Hellwig }
7530d0b660fSChristoph Hellwig 
7540d0b660fSChristoph Hellwig static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
7550d0b660fSChristoph Hellwig 		struct nvme_ns *ns)
7560d0b660fSChristoph Hellwig {
7570d0b660fSChristoph Hellwig 	ns->ana_grpid = le32_to_cpu(desc->grpid);
7580d0b660fSChristoph Hellwig 	ns->ana_state = desc->state;
7590d0b660fSChristoph Hellwig 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
760a4a6f3c8SAnton Eidelman 	/*
761a4a6f3c8SAnton Eidelman 	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
762a4a6f3c8SAnton Eidelman 	 * and in turn to this path device.  However we cannot accept this I/O
763a4a6f3c8SAnton Eidelman 	 * if the controller is not live.  This may deadlock if called from
764a4a6f3c8SAnton Eidelman 	 * nvme_mpath_init_identify() and the ctrl will never complete
765a4a6f3c8SAnton Eidelman 	 * initialization, preventing I/O from completing.  For this case we
766a4a6f3c8SAnton Eidelman 	 * will reprocess the ANA log page in nvme_mpath_update() once the
767a4a6f3c8SAnton Eidelman 	 * controller is ready.
768a4a6f3c8SAnton Eidelman 	 */
769a4a6f3c8SAnton Eidelman 	if (nvme_state_is_live(ns->ana_state) &&
770a4a6f3c8SAnton Eidelman 	    ns->ctrl->state == NVME_CTRL_LIVE)
7710d0b660fSChristoph Hellwig 		nvme_mpath_set_live(ns);
7720d0b660fSChristoph Hellwig }
7730d0b660fSChristoph Hellwig 
7740d0b660fSChristoph Hellwig static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
7750d0b660fSChristoph Hellwig 		struct nvme_ana_group_desc *desc, void *data)
7760d0b660fSChristoph Hellwig {
7770d0b660fSChristoph Hellwig 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
7780d0b660fSChristoph Hellwig 	unsigned *nr_change_groups = data;
7790d0b660fSChristoph Hellwig 	struct nvme_ns *ns;
78082f20194SKeith Busch 	int srcu_idx;
7810d0b660fSChristoph Hellwig 
782592b6e7bSHannes Reinecke 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
7830d0b660fSChristoph Hellwig 			le32_to_cpu(desc->grpid),
7840d0b660fSChristoph Hellwig 			nvme_ana_state_names[desc->state]);
7850d0b660fSChristoph Hellwig 
7860d0b660fSChristoph Hellwig 	if (desc->state == NVME_ANA_CHANGE)
7870d0b660fSChristoph Hellwig 		(*nr_change_groups)++;
7880d0b660fSChristoph Hellwig 
7890d0b660fSChristoph Hellwig 	if (!nr_nsids)
7900d0b660fSChristoph Hellwig 		return 0;
7910d0b660fSChristoph Hellwig 
79282f20194SKeith Busch 	srcu_idx = srcu_read_lock(&ctrl->srcu);
793*1e20e4ffSBreno Leitao 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
794*1e20e4ffSBreno Leitao 				 srcu_read_lock_held(&ctrl->srcu)) {
79579f528afSAnton Eidelman 		unsigned nsid;
79679f528afSAnton Eidelman again:
79779f528afSAnton Eidelman 		nsid = le32_to_cpu(desc->nsids[n]);
798e01f91dfSAnton Eidelman 		if (ns->head->ns_id < nsid)
7990d0b660fSChristoph Hellwig 			continue;
800e01f91dfSAnton Eidelman 		if (ns->head->ns_id == nsid)
8010d0b660fSChristoph Hellwig 			nvme_update_ns_ana_state(desc, ns);
8020d0b660fSChristoph Hellwig 		if (++n == nr_nsids)
8030d0b660fSChristoph Hellwig 			break;
80479f528afSAnton Eidelman 		if (ns->head->ns_id > nsid)
80579f528afSAnton Eidelman 			goto again;
8060d0b660fSChristoph Hellwig 	}
80782f20194SKeith Busch 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
8080d0b660fSChristoph Hellwig 	return 0;
8090d0b660fSChristoph Hellwig }
8100d0b660fSChristoph Hellwig 
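/*
 * Fetch the ANA log page from the controller and apply it to all namespaces.
 * If any group is in the change state, (re)arm the ANATT timer so that a
 * transition which never completes eventually resets the controller.
 */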
81186cccfbfSAnton Eidelman static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
8120d0b660fSChristoph Hellwig {
8130d0b660fSChristoph Hellwig 	u32 nr_change_groups = 0;
8140d0b660fSChristoph Hellwig 	int error;
8150d0b660fSChristoph Hellwig 
8160d0b660fSChristoph Hellwig 	mutex_lock(&ctrl->ana_lock);
817be93e87eSKeith Busch 	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
8180d0b660fSChristoph Hellwig 			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
8190d0b660fSChristoph Hellwig 	if (error) {
8200d0b660fSChristoph Hellwig 		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
8210d0b660fSChristoph Hellwig 		goto out_unlock;
8220d0b660fSChristoph Hellwig 	}
8230d0b660fSChristoph Hellwig 
8240d0b660fSChristoph Hellwig 	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
8250d0b660fSChristoph Hellwig 			nvme_update_ana_state);
8260d0b660fSChristoph Hellwig 	if (error)
8270d0b660fSChristoph Hellwig 		goto out_unlock;
8280d0b660fSChristoph Hellwig 
8290d0b660fSChristoph Hellwig 	/*
8300d0b660fSChristoph Hellwig 	 * In theory we should have an ANATT timer per group as they might enter
8310d0b660fSChristoph Hellwig 	 * the change state at different times.  But that is a lot of overhead
8320d0b660fSChristoph Hellwig 	 * just to protect against a target that keeps entering new changes
8330d0b660fSChristoph Hellwig 	 * states while never finishing previous ones.  But we'll still
8340d0b660fSChristoph Hellwig 	 * eventually time out once all groups are in change state, so this
8350d0b660fSChristoph Hellwig 	 * isn't a big deal.
8360d0b660fSChristoph Hellwig 	 *
8370d0b660fSChristoph Hellwig 	 * We also double the ANATT value to provide some slack for transports
8380d0b660fSChristoph Hellwig 	 * or AEN processing overhead.
8390d0b660fSChristoph Hellwig 	 */
8400d0b660fSChristoph Hellwig 	if (nr_change_groups)
8410d0b660fSChristoph Hellwig 		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
8420d0b660fSChristoph Hellwig 	else
8430d0b660fSChristoph Hellwig 		del_timer_sync(&ctrl->anatt_timer);
8440d0b660fSChristoph Hellwig out_unlock:
8450d0b660fSChristoph Hellwig 	mutex_unlock(&ctrl->ana_lock);
8460d0b660fSChristoph Hellwig 	return error;
8470d0b660fSChristoph Hellwig }
8480d0b660fSChristoph Hellwig 
8490d0b660fSChristoph Hellwig static void nvme_ana_work(struct work_struct *work)
8500d0b660fSChristoph Hellwig {
8510d0b660fSChristoph Hellwig 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
8520d0b660fSChristoph Hellwig 
853ecca390eSSagi Grimberg 	if (ctrl->state != NVME_CTRL_LIVE)
854ecca390eSSagi Grimberg 		return;
855ecca390eSSagi Grimberg 
85686cccfbfSAnton Eidelman 	nvme_read_ana_log(ctrl);
8570d0b660fSChristoph Hellwig }
8580d0b660fSChristoph Hellwig 
859a4a6f3c8SAnton Eidelman void nvme_mpath_update(struct nvme_ctrl *ctrl)
860a4a6f3c8SAnton Eidelman {
861a4a6f3c8SAnton Eidelman 	u32 nr_change_groups = 0;
862a4a6f3c8SAnton Eidelman 
863a4a6f3c8SAnton Eidelman 	if (!ctrl->ana_log_buf)
864a4a6f3c8SAnton Eidelman 		return;
865a4a6f3c8SAnton Eidelman 
866a4a6f3c8SAnton Eidelman 	mutex_lock(&ctrl->ana_lock);
867a4a6f3c8SAnton Eidelman 	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
868a4a6f3c8SAnton Eidelman 	mutex_unlock(&ctrl->ana_lock);
869a4a6f3c8SAnton Eidelman }
870a4a6f3c8SAnton Eidelman 
8710d0b660fSChristoph Hellwig static void nvme_anatt_timeout(struct timer_list *t)
8720d0b660fSChristoph Hellwig {
8730d0b660fSChristoph Hellwig 	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
8740d0b660fSChristoph Hellwig 
8750d0b660fSChristoph Hellwig 	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
8760d0b660fSChristoph Hellwig 	nvme_reset_ctrl(ctrl);
8770d0b660fSChristoph Hellwig }
8780d0b660fSChristoph Hellwig 
8790d0b660fSChristoph Hellwig void nvme_mpath_stop(struct nvme_ctrl *ctrl)
8800d0b660fSChristoph Hellwig {
8810d0b660fSChristoph Hellwig 	if (!nvme_ctrl_use_ana(ctrl))
8820d0b660fSChristoph Hellwig 		return;
8830d0b660fSChristoph Hellwig 	del_timer_sync(&ctrl->anatt_timer);
8840d0b660fSChristoph Hellwig 	cancel_work_sync(&ctrl->ana_work);
8850d0b660fSChristoph Hellwig }
8860d0b660fSChristoph Hellwig 
88775c10e73SHannes Reinecke #define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
88875c10e73SHannes Reinecke 	struct device_attribute subsys_attr_##_name =	\
88975c10e73SHannes Reinecke 		__ATTR(_name, _mode, _show, _store)
89075c10e73SHannes Reinecke 
89175c10e73SHannes Reinecke static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
89275c10e73SHannes Reinecke 		struct device_attribute *attr, char *buf)
89375c10e73SHannes Reinecke {
89475c10e73SHannes Reinecke 	struct nvme_subsystem *subsys =
89575c10e73SHannes Reinecke 		container_of(dev, struct nvme_subsystem, dev);
89675c10e73SHannes Reinecke 
897bff4bcf3SDaniel Wagner 	return sysfs_emit(buf, "%s\n",
89875c10e73SHannes Reinecke 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
89975c10e73SHannes Reinecke }
90075c10e73SHannes Reinecke 
90185b9f3e6SThomas Song static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
90285b9f3e6SThomas Song 		int iopolicy)
90385b9f3e6SThomas Song {
90485b9f3e6SThomas Song 	struct nvme_ctrl *ctrl;
90585b9f3e6SThomas Song 	int old_iopolicy = READ_ONCE(subsys->iopolicy);
90685b9f3e6SThomas Song 
90785b9f3e6SThomas Song 	if (old_iopolicy == iopolicy)
90885b9f3e6SThomas Song 		return;
90985b9f3e6SThomas Song 
91085b9f3e6SThomas Song 	WRITE_ONCE(subsys->iopolicy, iopolicy);
91185b9f3e6SThomas Song 
91285b9f3e6SThomas Song 	/* iopolicy changes clear the mpath by design */
91385b9f3e6SThomas Song 	mutex_lock(&nvme_subsystems_lock);
91485b9f3e6SThomas Song 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
91585b9f3e6SThomas Song 		nvme_mpath_clear_ctrl_paths(ctrl);
91685b9f3e6SThomas Song 	mutex_unlock(&nvme_subsystems_lock);
91785b9f3e6SThomas Song 
91885b9f3e6SThomas Song 	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
91985b9f3e6SThomas Song 			subsys->subnqn,
92085b9f3e6SThomas Song 			nvme_iopolicy_names[old_iopolicy],
92185b9f3e6SThomas Song 			nvme_iopolicy_names[iopolicy]);
92285b9f3e6SThomas Song }
92385b9f3e6SThomas Song 
92475c10e73SHannes Reinecke static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
92575c10e73SHannes Reinecke 		struct device_attribute *attr, const char *buf, size_t count)
92675c10e73SHannes Reinecke {
92775c10e73SHannes Reinecke 	struct nvme_subsystem *subsys =
92875c10e73SHannes Reinecke 		container_of(dev, struct nvme_subsystem, dev);
92975c10e73SHannes Reinecke 	int i;
93075c10e73SHannes Reinecke 
93175c10e73SHannes Reinecke 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
93275c10e73SHannes Reinecke 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
93385b9f3e6SThomas Song 			nvme_subsys_iopolicy_update(subsys, i);
93475c10e73SHannes Reinecke 			return count;
93575c10e73SHannes Reinecke 		}
93675c10e73SHannes Reinecke 	}
93775c10e73SHannes Reinecke 
93875c10e73SHannes Reinecke 	return -EINVAL;
93975c10e73SHannes Reinecke }
94075c10e73SHannes Reinecke SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
94175c10e73SHannes Reinecke 		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
94275c10e73SHannes Reinecke 
9430d0b660fSChristoph Hellwig static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
9440d0b660fSChristoph Hellwig 		char *buf)
9450d0b660fSChristoph Hellwig {
946bff4bcf3SDaniel Wagner 	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
9470d0b660fSChristoph Hellwig }
9480d0b660fSChristoph Hellwig DEVICE_ATTR_RO(ana_grpid);
9490d0b660fSChristoph Hellwig 
9500d0b660fSChristoph Hellwig static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
9510d0b660fSChristoph Hellwig 		char *buf)
9520d0b660fSChristoph Hellwig {
9530d0b660fSChristoph Hellwig 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
9540d0b660fSChristoph Hellwig 
955bff4bcf3SDaniel Wagner 	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
9560d0b660fSChristoph Hellwig }
9570d0b660fSChristoph Hellwig DEVICE_ATTR_RO(ana_state);
9580d0b660fSChristoph Hellwig 
959489dd102SAnton Eidelman static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
9600d0b660fSChristoph Hellwig 		struct nvme_ana_group_desc *desc, void *data)
9610d0b660fSChristoph Hellwig {
962489dd102SAnton Eidelman 	struct nvme_ana_group_desc *dst = data;
9630d0b660fSChristoph Hellwig 
964489dd102SAnton Eidelman 	if (desc->grpid != dst->grpid)
9650d0b660fSChristoph Hellwig 		return 0;
966489dd102SAnton Eidelman 
967489dd102SAnton Eidelman 	*dst = *desc;
968489dd102SAnton Eidelman 	return -ENXIO; /* just break out of the loop */
9690d0b660fSChristoph Hellwig }
9700d0b660fSChristoph Hellwig 
971c13cf14fSJoel Granados void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
9720d0b660fSChristoph Hellwig {
9730d0b660fSChristoph Hellwig 	if (nvme_ctrl_use_ana(ns->ctrl)) {
974489dd102SAnton Eidelman 		struct nvme_ana_group_desc desc = {
975c13cf14fSJoel Granados 			.grpid = anagrpid,
976489dd102SAnton Eidelman 			.state = 0,
977489dd102SAnton Eidelman 		};
978489dd102SAnton Eidelman 
9790d0b660fSChristoph Hellwig 		mutex_lock(&ns->ctrl->ana_lock);
980c13cf14fSJoel Granados 		ns->ana_grpid = le32_to_cpu(anagrpid);
981489dd102SAnton Eidelman 		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
9820d0b660fSChristoph Hellwig 		mutex_unlock(&ns->ctrl->ana_lock);
983489dd102SAnton Eidelman 		if (desc.state) {
984489dd102SAnton Eidelman 			/* found the group desc: update */
985489dd102SAnton Eidelman 			nvme_update_ns_ana_state(&desc, ns);
986dd8f7fa9SHannes Reinecke 		} else {
987dd8f7fa9SHannes Reinecke 			/* group desc not found: trigger a re-read */
988dd8f7fa9SHannes Reinecke 			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
989dd8f7fa9SHannes Reinecke 			queue_work(nvme_wq, &ns->ctrl->ana_work);
990489dd102SAnton Eidelman 		}
9910d0b660fSChristoph Hellwig 	} else {
9920d0b660fSChristoph Hellwig 		ns->ana_state = NVME_ANA_OPTIMIZED;
9930d0b660fSChristoph Hellwig 		nvme_mpath_set_live(ns);
9940d0b660fSChristoph Hellwig 	}
995b2ce4d90SKeith Busch 
9961cb039f3SChristoph Hellwig 	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
9971cb039f3SChristoph Hellwig 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
9981cb039f3SChristoph Hellwig 				   ns->head->disk->queue);
99973a1a229SKeith Busch #ifdef CONFIG_BLK_DEV_ZONED
100073a1a229SKeith Busch 	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
1001d86e716aSChristoph Hellwig 		ns->head->disk->nr_zones = ns->disk->nr_zones;
100273a1a229SKeith Busch #endif
10039bd82b1aSBaegjae Sung }
100432acab31SChristoph Hellwig 
10055396fdacSHannes Reinecke void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
100632acab31SChristoph Hellwig {
100732acab31SChristoph Hellwig 	if (!head->disk)
100832acab31SChristoph Hellwig 		return;
1009f0679539SHannes Reinecke 	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
10102637baedSMinwoo Im 		nvme_cdev_del(&head->cdev, &head->cdev_device);
10114a57f42eSKeith Busch 		/*
10124a57f42eSKeith Busch 		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
10134a57f42eSKeith Busch 		 * to allow multipath to fail all I/O.
10144a57f42eSKeith Busch 		 */
10154a57f42eSKeith Busch 		synchronize_srcu(&head->srcu);
10164a57f42eSKeith Busch 		kblockd_schedule_work(&head->requeue_work);
101732acab31SChristoph Hellwig 		del_gendisk(head->disk);
10182637baedSMinwoo Im 	}
1019f0679539SHannes Reinecke 	/*
1020f0679539SHannes Reinecke 	 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
1021f0679539SHannes Reinecke 	 * to allow multipath to fail all I/O.
1022f0679539SHannes Reinecke 	 */
1023f0679539SHannes Reinecke 	synchronize_srcu(&head->srcu);
1024f0679539SHannes Reinecke 	kblockd_schedule_work(&head->requeue_work);
10255396fdacSHannes Reinecke }
10265396fdacSHannes Reinecke 
10275396fdacSHannes Reinecke void nvme_mpath_remove_disk(struct nvme_ns_head *head)
10285396fdacSHannes Reinecke {
10295396fdacSHannes Reinecke 	if (!head->disk)
10305396fdacSHannes Reinecke 		return;
103132acab31SChristoph Hellwig 	/* make sure all pending bios are cleaned up */
103232acab31SChristoph Hellwig 	kblockd_schedule_work(&head->requeue_work);
103332acab31SChristoph Hellwig 	flush_work(&head->requeue_work);
10344a57f42eSKeith Busch 	flush_work(&head->partition_scan_work);
10358b9ab626SChristoph Hellwig 	put_disk(head->disk);
103632acab31SChristoph Hellwig }
10370d0b660fSChristoph Hellwig 
10385e1f6899SChristoph Hellwig void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
10390d0b660fSChristoph Hellwig {
10405e1f6899SChristoph Hellwig 	mutex_init(&ctrl->ana_lock);
10415e1f6899SChristoph Hellwig 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
10425e1f6899SChristoph Hellwig 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
10435e1f6899SChristoph Hellwig }
10445e1f6899SChristoph Hellwig 
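/*
 * Size the ANA log buffer from the Identify Controller data (header plus one
 * descriptor per ANA group plus one NSID per namespace), allocate it, and do
 * an initial read of the log.  ANA support is disabled if the log page would
 * exceed the controller's MDTS.
 */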
10455e1f6899SChristoph Hellwig int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
10465e1f6899SChristoph Hellwig {
10475e1f6899SChristoph Hellwig 	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
10485e1f6899SChristoph Hellwig 	size_t ana_log_size;
10495e1f6899SChristoph Hellwig 	int error = 0;
10500d0b660fSChristoph Hellwig 
105166b20ac0SMarta Rybczynska 	/* check if multipath is enabled and we have the capability */
105292decf11SKeith Busch 	if (!multipath || !ctrl->subsys ||
105392decf11SKeith Busch 	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
10540d0b660fSChristoph Hellwig 		return 0;
10550d0b660fSChristoph Hellwig 
105685b9f3e6SThomas Song 	/* initialize this in the identify path to cover controller resets */
105785b9f3e6SThomas Song 	atomic_set(&ctrl->nr_active, 0);
105885b9f3e6SThomas Song 
1059120bb362SDaniel Wagner 	if (!ctrl->max_namespaces ||
1060120bb362SDaniel Wagner 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
1061120bb362SDaniel Wagner 		dev_err(ctrl->device,
1062120bb362SDaniel Wagner 			"Invalid MNAN value %u\n", ctrl->max_namespaces);
1063120bb362SDaniel Wagner 		return -EINVAL;
1064120bb362SDaniel Wagner 	}
1065120bb362SDaniel Wagner 
10660d0b660fSChristoph Hellwig 	ctrl->anacap = id->anacap;
10670d0b660fSChristoph Hellwig 	ctrl->anatt = id->anatt;
10680d0b660fSChristoph Hellwig 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
10690d0b660fSChristoph Hellwig 	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
10700d0b660fSChristoph Hellwig 
10715e1f6899SChristoph Hellwig 	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
10725e1f6899SChristoph Hellwig 		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
10735e1f6899SChristoph Hellwig 		ctrl->max_namespaces * sizeof(__le32);
10745e1f6899SChristoph Hellwig 	if (ana_log_size > max_transfer_size) {
10750d0b660fSChristoph Hellwig 		dev_err(ctrl->device,
10765e1f6899SChristoph Hellwig 			"ANA log page size (%zd) larger than MDTS (%zd).\n",
10775e1f6899SChristoph Hellwig 			ana_log_size, max_transfer_size);
10780d0b660fSChristoph Hellwig 		dev_err(ctrl->device, "disabling ANA support.\n");
10795e1f6899SChristoph Hellwig 		goto out_uninit;
10800d0b660fSChristoph Hellwig 	}
10815e1f6899SChristoph Hellwig 	if (ana_log_size > ctrl->ana_log_size) {
10825e1f6899SChristoph Hellwig 		nvme_mpath_stop(ctrl);
1083c7c15ae3SHou Tao 		nvme_mpath_uninit(ctrl);
10845e6a7d1eSHannes Reinecke 		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
10855e1f6899SChristoph Hellwig 		if (!ctrl->ana_log_buf)
10865e1f6899SChristoph Hellwig 			return -ENOMEM;
1087bb830addSSusobhan Dey 	}
10885e1f6899SChristoph Hellwig 	ctrl->ana_log_size = ana_log_size;
108986cccfbfSAnton Eidelman 	error = nvme_read_ana_log(ctrl);
10900d0b660fSChristoph Hellwig 	if (error)
10915e1f6899SChristoph Hellwig 		goto out_uninit;
10920d0b660fSChristoph Hellwig 	return 0;
10935e1f6899SChristoph Hellwig 
10945e1f6899SChristoph Hellwig out_uninit:
10955e1f6899SChristoph Hellwig 	nvme_mpath_uninit(ctrl);
1096bb830addSSusobhan Dey 	return error;
10970d0b660fSChristoph Hellwig }
10980d0b660fSChristoph Hellwig 
10990d0b660fSChristoph Hellwig void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
11000d0b660fSChristoph Hellwig {
11015e6a7d1eSHannes Reinecke 	kvfree(ctrl->ana_log_buf);
1102c7055fd1SHannes Reinecke 	ctrl->ana_log_buf = NULL;
1103c7c15ae3SHou Tao 	ctrl->ana_log_size = 0;
11040d0b660fSChristoph Hellwig }
1105