1 /* 2 * Copyright (c) 2017 Christoph Hellwig. 3 * 4 * This program is free software; you can redistribute it and/or modify it 5 * under the terms and conditions of the GNU General Public License, 6 * version 2, as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope it will be useful, but WITHOUT 9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 * more details. 12 */ 13 14 #include <linux/moduleparam.h> 15 #include "nvme.h" 16 17 static bool multipath = true; 18 module_param(multipath, bool, 0644); 19 MODULE_PARM_DESC(multipath, 20 "turn on native support for multiple controllers per subsystem"); 21 22 void nvme_failover_req(struct request *req) 23 { 24 struct nvme_ns *ns = req->q->queuedata; 25 unsigned long flags; 26 27 spin_lock_irqsave(&ns->head->requeue_lock, flags); 28 blk_steal_bios(&ns->head->requeue_list, req); 29 spin_unlock_irqrestore(&ns->head->requeue_lock, flags); 30 blk_mq_end_request(req, 0); 31 32 nvme_reset_ctrl(ns->ctrl); 33 kblockd_schedule_work(&ns->head->requeue_work); 34 } 35 36 bool nvme_req_needs_failover(struct request *req) 37 { 38 if (!(req->cmd_flags & REQ_NVME_MPATH)) 39 return false; 40 41 switch (nvme_req(req)->status & 0x7ff) { 42 /* 43 * Generic command status: 44 */ 45 case NVME_SC_INVALID_OPCODE: 46 case NVME_SC_INVALID_FIELD: 47 case NVME_SC_INVALID_NS: 48 case NVME_SC_LBA_RANGE: 49 case NVME_SC_CAP_EXCEEDED: 50 case NVME_SC_RESERVATION_CONFLICT: 51 return false; 52 53 /* 54 * I/O command set specific error. Unfortunately these values are 55 * reused for fabrics commands, but those should never get here. 56 */ 57 case NVME_SC_BAD_ATTRIBUTES: 58 case NVME_SC_INVALID_PI: 59 case NVME_SC_READ_ONLY: 60 case NVME_SC_ONCS_NOT_SUPPORTED: 61 WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == 62 nvme_fabrics_command); 63 return false; 64 65 /* 66 * Media and Data Integrity Errors: 67 */ 68 case NVME_SC_WRITE_FAULT: 69 case NVME_SC_READ_ERROR: 70 case NVME_SC_GUARD_CHECK: 71 case NVME_SC_APPTAG_CHECK: 72 case NVME_SC_REFTAG_CHECK: 73 case NVME_SC_COMPARE_FAILED: 74 case NVME_SC_ACCESS_DENIED: 75 case NVME_SC_UNWRITTEN_BLOCK: 76 return false; 77 } 78 79 /* Everything else could be a path failure, so should be retried */ 80 return true; 81 } 82 83 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 84 { 85 struct nvme_ns *ns; 86 87 mutex_lock(&ctrl->namespaces_mutex); 88 list_for_each_entry(ns, &ctrl->namespaces, list) { 89 if (ns->head->disk) 90 kblockd_schedule_work(&ns->head->requeue_work); 91 } 92 mutex_unlock(&ctrl->namespaces_mutex); 93 } 94 95 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) 96 { 97 struct nvme_ns *ns; 98 99 list_for_each_entry_rcu(ns, &head->list, siblings) { 100 if (ns->ctrl->state == NVME_CTRL_LIVE) { 101 rcu_assign_pointer(head->current_path, ns); 102 return ns; 103 } 104 } 105 106 return NULL; 107 } 108 109 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) 110 { 111 struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); 112 113 if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) 114 ns = __nvme_find_path(head); 115 return ns; 116 } 117 118 static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, 119 struct bio *bio) 120 { 121 struct nvme_ns_head *head = q->queuedata; 122 struct device *dev = disk_to_dev(head->disk); 123 struct nvme_ns *ns; 124 blk_qc_t ret = BLK_QC_T_NONE; 125 int srcu_idx; 126 127 srcu_idx = srcu_read_lock(&head->srcu); 128 ns = nvme_find_path(head); 129 if (likely(ns)) { 130 bio->bi_disk = ns->disk; 131 bio->bi_opf |= REQ_NVME_MPATH; 132 ret = direct_make_request(bio); 133 } else if (!list_empty_careful(&head->list)) { 134 dev_warn_ratelimited(dev, "no path available - requeuing I/O\n"); 135 136 spin_lock_irq(&head->requeue_lock); 137 bio_list_add(&head->requeue_list, bio); 138 spin_unlock_irq(&head->requeue_lock); 139 } else { 140 dev_warn_ratelimited(dev, "no path - failing I/O\n"); 141 142 bio->bi_status = BLK_STS_IOERR; 143 bio_endio(bio); 144 } 145 146 srcu_read_unlock(&head->srcu, srcu_idx); 147 return ret; 148 } 149 150 static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) 151 { 152 struct nvme_ns_head *head = q->queuedata; 153 struct nvme_ns *ns; 154 bool found = false; 155 int srcu_idx; 156 157 srcu_idx = srcu_read_lock(&head->srcu); 158 ns = srcu_dereference(head->current_path, &head->srcu); 159 if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) 160 found = ns->queue->poll_fn(q, qc); 161 srcu_read_unlock(&head->srcu, srcu_idx); 162 return found; 163 } 164 165 static void nvme_requeue_work(struct work_struct *work) 166 { 167 struct nvme_ns_head *head = 168 container_of(work, struct nvme_ns_head, requeue_work); 169 struct bio *bio, *next; 170 171 spin_lock_irq(&head->requeue_lock); 172 next = bio_list_get(&head->requeue_list); 173 spin_unlock_irq(&head->requeue_lock); 174 175 while ((bio = next) != NULL) { 176 next = bio->bi_next; 177 bio->bi_next = NULL; 178 179 /* 180 * Reset disk to the mpath node and resubmit to select a new 181 * path. 182 */ 183 bio->bi_disk = head->disk; 184 generic_make_request(bio); 185 } 186 } 187 188 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) 189 { 190 struct request_queue *q; 191 bool vwc = false; 192 193 bio_list_init(&head->requeue_list); 194 spin_lock_init(&head->requeue_lock); 195 INIT_WORK(&head->requeue_work, nvme_requeue_work); 196 197 /* 198 * Add a multipath node if the subsystems supports multiple controllers. 199 * We also do this for private namespaces as the namespace sharing data could 200 * change after a rescan. 201 */ 202 if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) 203 return 0; 204 205 q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); 206 if (!q) 207 goto out; 208 q->queuedata = head; 209 blk_queue_make_request(q, nvme_ns_head_make_request); 210 q->poll_fn = nvme_ns_head_poll; 211 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 212 /* set to a default value for 512 until disk is validated */ 213 blk_queue_logical_block_size(q, 512); 214 215 /* we need to propagate up the VMC settings */ 216 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 217 vwc = true; 218 blk_queue_write_cache(q, vwc, vwc); 219 220 head->disk = alloc_disk(0); 221 if (!head->disk) 222 goto out_cleanup_queue; 223 head->disk->fops = &nvme_ns_head_ops; 224 head->disk->private_data = head; 225 head->disk->queue = q; 226 head->disk->flags = GENHD_FL_EXT_DEVT; 227 sprintf(head->disk->disk_name, "nvme%dn%d", 228 ctrl->subsys->instance, head->instance); 229 return 0; 230 231 out_cleanup_queue: 232 blk_cleanup_queue(q); 233 out: 234 return -ENOMEM; 235 } 236 237 void nvme_mpath_add_disk(struct nvme_ns_head *head) 238 { 239 if (!head->disk) 240 return; 241 device_add_disk(&head->subsys->dev, head->disk); 242 if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, 243 &nvme_ns_id_attr_group)) 244 pr_warn("%s: failed to create sysfs group for identification\n", 245 head->disk->disk_name); 246 } 247 248 void nvme_mpath_add_disk_links(struct nvme_ns *ns) 249 { 250 struct kobject *slave_disk_kobj, *holder_disk_kobj; 251 252 if (!ns->head->disk) 253 return; 254 255 slave_disk_kobj = &disk_to_dev(ns->disk)->kobj; 256 if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj, 257 kobject_name(slave_disk_kobj))) 258 return; 259 260 holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj; 261 if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj, 262 kobject_name(holder_disk_kobj))) 263 sysfs_remove_link(ns->head->disk->slave_dir, 264 kobject_name(slave_disk_kobj)); 265 } 266 267 void nvme_mpath_remove_disk(struct nvme_ns_head *head) 268 { 269 if (!head->disk) 270 return; 271 sysfs_remove_group(&disk_to_dev(head->disk)->kobj, 272 &nvme_ns_id_attr_group); 273 del_gendisk(head->disk); 274 blk_set_queue_dying(head->disk->queue); 275 /* make sure all pending bios are cleaned up */ 276 kblockd_schedule_work(&head->requeue_work); 277 flush_work(&head->requeue_work); 278 blk_cleanup_queue(head->disk->queue); 279 put_disk(head->disk); 280 } 281 282 void nvme_mpath_remove_disk_links(struct nvme_ns *ns) 283 { 284 if (!ns->head->disk) 285 return; 286 287 sysfs_remove_link(ns->disk->part0.holder_dir, 288 kobject_name(&disk_to_dev(ns->head->disk)->kobj)); 289 sysfs_remove_link(ns->head->disk->slave_dir, 290 kobject_name(&disk_to_dev(ns->disk)->kobj)); 291 } 292