1 /* 2 * blkfront.c 3 * 4 * XenLinux virtual block device driver. 5 * 6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand 7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge 8 * Copyright (c) 2004, Christian Limpach 9 * Copyright (c) 2004, Andrew Warfield 10 * Copyright (c) 2005, Christopher Clark 11 * Copyright (c) 2005, XenSource Ltd 12 * 13 * This program is free software; you can redistribute it and/or 14 * modify it under the terms of the GNU General Public License version 2 15 * as published by the Free Software Foundation; or, when distributed 16 * separately from the Linux kernel or incorporated into other 17 * software packages, subject to the following license: 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a copy 20 * of this source file (the "Software"), to deal in the Software without 21 * restriction, including without limitation the rights to use, copy, modify, 22 * merge, publish, distribute, sublicense, and/or sell copies of the Software, 23 * and to permit persons to whom the Software is furnished to do so, subject to 24 * the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 35 * IN THE SOFTWARE. 36 */ 37 38 #include <linux/interrupt.h> 39 #include <linux/blkdev.h> 40 #include <linux/blk-mq.h> 41 #include <linux/hdreg.h> 42 #include <linux/cdrom.h> 43 #include <linux/module.h> 44 #include <linux/slab.h> 45 #include <linux/mutex.h> 46 #include <linux/scatterlist.h> 47 #include <linux/bitmap.h> 48 #include <linux/list.h> 49 50 #include <xen/xen.h> 51 #include <xen/xenbus.h> 52 #include <xen/grant_table.h> 53 #include <xen/events.h> 54 #include <xen/page.h> 55 #include <xen/platform_pci.h> 56 57 #include <xen/interface/grant_table.h> 58 #include <xen/interface/io/blkif.h> 59 #include <xen/interface/io/protocols.h> 60 61 #include <asm/xen/hypervisor.h> 62 63 enum blkif_state { 64 BLKIF_STATE_DISCONNECTED, 65 BLKIF_STATE_CONNECTED, 66 BLKIF_STATE_SUSPENDED, 67 }; 68 69 struct grant { 70 grant_ref_t gref; 71 struct page *page; 72 struct list_head node; 73 }; 74 75 struct blk_shadow { 76 struct blkif_request req; 77 struct request *request; 78 struct grant **grants_used; 79 struct grant **indirect_grants; 80 struct scatterlist *sg; 81 unsigned int num_sg; 82 }; 83 84 struct split_bio { 85 struct bio *bio; 86 atomic_t pending; 87 }; 88 89 static DEFINE_MUTEX(blkfront_mutex); 90 static const struct block_device_operations xlvbd_block_fops; 91 92 /* 93 * Maximum number of segments in indirect requests, the actual value used by 94 * the frontend driver is the minimum of this value and the value provided 95 * by the backend driver. 
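 * For example, with the default of 32 segments and 4KB grants, a single
 * indirect request can transfer up to 32 * 4KB = 128KB of data.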
 */

static unsigned int xen_blkif_max_segments = 32;
module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");

/*
 * Maximum order of pages to be used for the shared ring between front and
 * backend, 4KB page granularity is used.
 */
static unsigned int xen_blkif_max_ring_order;
module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");

#define BLK_RING_SIZE(info)	\
	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)

#define BLK_MAX_RING_SIZE	\
	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)

/*
 * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
 * characters are enough. Define to 20 to keep consistent with backend.
 */
#define RINGREF_NAME_LEN (20)

/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
	spinlock_t io_lock;
	struct mutex mutex;
	struct xenbus_device *xbdev;
	struct gendisk *gd;
	int vdevice;
	blkif_vdev_t handle;
	enum blkif_state connected;
	int ring_ref[XENBUS_MAX_RING_GRANTS];
	unsigned int nr_ring_pages;
	struct blkif_front_ring ring;
	unsigned int evtchn, irq;
	struct request_queue *rq;
	struct work_struct work;
	struct gnttab_free_callback callback;
	struct blk_shadow shadow[BLK_MAX_RING_SIZE];
	struct list_head grants;
	struct list_head indirect_pages;
	unsigned int persistent_gnts_c;
	unsigned long shadow_free;
	unsigned int feature_flush;
	unsigned int feature_discard:1;
	unsigned int feature_secdiscard:1;
	unsigned int discard_granularity;
	unsigned int discard_alignment;
	unsigned int feature_persistent:1;
	/* Number of 4KB segments handled */
	unsigned int max_indirect_segments;
	int is_ready;
	struct blk_mq_tag_set tag_set;
};

static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);

#define GRANT_INVALID_REF	0

#define PARTS_PER_DISK		16
#define PARTS_PER_EXT_DISK	256

#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
#define EMULATED_HD_DISK_MINOR_OFFSET (0)
#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
#define EMULATED_SD_DISK_MINOR_OFFSET (0)
#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)

#define DEV_NAME	"xvd"	/* name in /dev */

/*
 * Grants are always the same size as a Xen page (i.e. 4KB).
 * A physical segment is always the same size as a Linux page.
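 * On x86 both are 4KB, so one grant covers a whole physical segment; with
 * 64KB Linux pages (e.g. arm64) each segment needs 64KB / 4KB = 16 grants.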
 * Number of grants per physical segment
 */
#define GRANTS_PER_PSEG	(PAGE_SIZE / XEN_PAGE_SIZE)

#define GRANTS_PER_INDIRECT_FRAME \
	(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))

#define PSEGS_PER_INDIRECT_FRAME \
	(GRANTS_PER_INDIRECT_FRAME / GRANTS_PER_PSEG)

#define INDIRECT_GREFS(_grants)	\
	DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)

#define GREFS(_psegs)	((_psegs) * GRANTS_PER_PSEG)

static int blkfront_setup_indirect(struct blkfront_info *info);
static int blkfront_gather_backend_features(struct blkfront_info *info);

static int get_id_from_freelist(struct blkfront_info *info)
{
	unsigned long free = info->shadow_free;
	BUG_ON(free >= BLK_RING_SIZE(info));
	info->shadow_free = info->shadow[free].req.u.rw.id;
	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
	return free;
}

static int add_id_to_freelist(struct blkfront_info *info,
			      unsigned long id)
{
	if (info->shadow[id].req.u.rw.id != id)
		return -EINVAL;
	if (info->shadow[id].request == NULL)
		return -EINVAL;
	info->shadow[id].req.u.rw.id = info->shadow_free;
	info->shadow[id].request = NULL;
	info->shadow_free = id;
	return 0;
}

static int fill_grant_buffer(struct blkfront_info *info, int num)
{
	struct page *granted_page;
	struct grant *gnt_list_entry, *n;
	int i = 0;

	while (i < num) {
		gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
		if (!gnt_list_entry)
			goto out_of_memory;

		if (info->feature_persistent) {
			granted_page = alloc_page(GFP_NOIO);
			if (!granted_page) {
				kfree(gnt_list_entry);
				goto out_of_memory;
			}
			gnt_list_entry->page = granted_page;
		}

		gnt_list_entry->gref = GRANT_INVALID_REF;
		list_add(&gnt_list_entry->node, &info->grants);
		i++;
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(gnt_list_entry, n,
				 &info->grants, node) {
		list_del(&gnt_list_entry->node);
		if (info->feature_persistent)
			__free_page(gnt_list_entry->page);
		kfree(gnt_list_entry);
		i--;
	}
	BUG_ON(i != 0);
	return -ENOMEM;
}

static struct grant *get_free_grant(struct blkfront_info *info)
{
	struct grant *gnt_list_entry;

	BUG_ON(list_empty(&info->grants));
	gnt_list_entry = list_first_entry(&info->grants, struct grant,
					  node);
	list_del(&gnt_list_entry->node);

	if (gnt_list_entry->gref != GRANT_INVALID_REF)
		info->persistent_gnts_c--;

	return gnt_list_entry;
}

static inline void grant_foreign_access(const struct grant *gnt_list_entry,
					const struct blkfront_info *info)
{
	gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref,
						 info->xbdev->otherend_id,
						 gnt_list_entry->page,
						 0);
}

static struct grant *get_grant(grant_ref_t *gref_head,
			       unsigned long gfn,
			       struct blkfront_info *info)
{
	struct grant *gnt_list_entry = get_free_grant(info);

	if (gnt_list_entry->gref != GRANT_INVALID_REF)
		return gnt_list_entry;

	/* Assign a gref to this page */
	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
	BUG_ON(gnt_list_entry->gref == -ENOSPC);
	if (info->feature_persistent)
		grant_foreign_access(gnt_list_entry, info);
	else {
		/* Grant access to the GFN passed by the caller */
		gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
						info->xbdev->otherend_id,
						gfn, 0);
	}

	return
gnt_list_entry; 312 } 313 314 static struct grant *get_indirect_grant(grant_ref_t *gref_head, 315 struct blkfront_info *info) 316 { 317 struct grant *gnt_list_entry = get_free_grant(info); 318 319 if (gnt_list_entry->gref != GRANT_INVALID_REF) 320 return gnt_list_entry; 321 322 /* Assign a gref to this page */ 323 gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); 324 BUG_ON(gnt_list_entry->gref == -ENOSPC); 325 if (!info->feature_persistent) { 326 struct page *indirect_page; 327 328 /* Fetch a pre-allocated page to use for indirect grefs */ 329 BUG_ON(list_empty(&info->indirect_pages)); 330 indirect_page = list_first_entry(&info->indirect_pages, 331 struct page, lru); 332 list_del(&indirect_page->lru); 333 gnt_list_entry->page = indirect_page; 334 } 335 grant_foreign_access(gnt_list_entry, info); 336 337 return gnt_list_entry; 338 } 339 340 static const char *op_name(int op) 341 { 342 static const char *const names[] = { 343 [BLKIF_OP_READ] = "read", 344 [BLKIF_OP_WRITE] = "write", 345 [BLKIF_OP_WRITE_BARRIER] = "barrier", 346 [BLKIF_OP_FLUSH_DISKCACHE] = "flush", 347 [BLKIF_OP_DISCARD] = "discard" }; 348 349 if (op < 0 || op >= ARRAY_SIZE(names)) 350 return "unknown"; 351 352 if (!names[op]) 353 return "reserved"; 354 355 return names[op]; 356 } 357 static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) 358 { 359 unsigned int end = minor + nr; 360 int rc; 361 362 if (end > nr_minors) { 363 unsigned long *bitmap, *old; 364 365 bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap), 366 GFP_KERNEL); 367 if (bitmap == NULL) 368 return -ENOMEM; 369 370 spin_lock(&minor_lock); 371 if (end > nr_minors) { 372 old = minors; 373 memcpy(bitmap, minors, 374 BITS_TO_LONGS(nr_minors) * sizeof(*bitmap)); 375 minors = bitmap; 376 nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG; 377 } else 378 old = bitmap; 379 spin_unlock(&minor_lock); 380 kfree(old); 381 } 382 383 spin_lock(&minor_lock); 384 if (find_next_bit(minors, end, minor) >= end) { 385 bitmap_set(minors, minor, nr); 386 rc = 0; 387 } else 388 rc = -EBUSY; 389 spin_unlock(&minor_lock); 390 391 return rc; 392 } 393 394 static void xlbd_release_minors(unsigned int minor, unsigned int nr) 395 { 396 unsigned int end = minor + nr; 397 398 BUG_ON(end > nr_minors); 399 spin_lock(&minor_lock); 400 bitmap_clear(minors, minor, nr); 401 spin_unlock(&minor_lock); 402 } 403 404 static void blkif_restart_queue_callback(void *arg) 405 { 406 struct blkfront_info *info = (struct blkfront_info *)arg; 407 schedule_work(&info->work); 408 } 409 410 static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) 411 { 412 /* We don't have real geometry info, but let's at least return 413 values consistent with the size of the device */ 414 sector_t nsect = get_capacity(bd->bd_disk); 415 sector_t cylinders = nsect; 416 417 hg->heads = 0xff; 418 hg->sectors = 0x3f; 419 sector_div(cylinders, hg->heads * hg->sectors); 420 hg->cylinders = cylinders; 421 if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) 422 hg->cylinders = 0xffff; 423 return 0; 424 } 425 426 static int blkif_ioctl(struct block_device *bdev, fmode_t mode, 427 unsigned command, unsigned long argument) 428 { 429 struct blkfront_info *info = bdev->bd_disk->private_data; 430 int i; 431 432 dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n", 433 command, (long)argument); 434 435 switch (command) { 436 case CDROMMULTISESSION: 437 dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n"); 438 for (i = 0; i < sizeof(struct 
cdrom_multisession); i++) 439 if (put_user(0, (char __user *)(argument + i))) 440 return -EFAULT; 441 return 0; 442 443 case CDROM_GET_CAPABILITY: { 444 struct gendisk *gd = info->gd; 445 if (gd->flags & GENHD_FL_CD) 446 return 0; 447 return -EINVAL; 448 } 449 450 default: 451 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", 452 command);*/ 453 return -EINVAL; /* same return as native Linux */ 454 } 455 456 return 0; 457 } 458 459 static int blkif_queue_discard_req(struct request *req) 460 { 461 struct blkfront_info *info = req->rq_disk->private_data; 462 struct blkif_request *ring_req; 463 unsigned long id; 464 465 /* Fill out a communications ring structure. */ 466 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 467 id = get_id_from_freelist(info); 468 info->shadow[id].request = req; 469 470 ring_req->operation = BLKIF_OP_DISCARD; 471 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 472 ring_req->u.discard.id = id; 473 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); 474 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) 475 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; 476 else 477 ring_req->u.discard.flag = 0; 478 479 info->ring.req_prod_pvt++; 480 481 /* Keep a private copy so we can reissue requests when recovering. */ 482 info->shadow[id].req = *ring_req; 483 484 return 0; 485 } 486 487 struct setup_rw_req { 488 unsigned int grant_idx; 489 struct blkif_request_segment *segments; 490 struct blkfront_info *info; 491 struct blkif_request *ring_req; 492 grant_ref_t gref_head; 493 unsigned int id; 494 /* Only used when persistent grant is used and it's a read request */ 495 bool need_copy; 496 unsigned int bvec_off; 497 char *bvec_data; 498 }; 499 500 static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, 501 unsigned int len, void *data) 502 { 503 struct setup_rw_req *setup = data; 504 int n, ref; 505 struct grant *gnt_list_entry; 506 unsigned int fsect, lsect; 507 /* Convenient aliases */ 508 unsigned int grant_idx = setup->grant_idx; 509 struct blkif_request *ring_req = setup->ring_req; 510 struct blkfront_info *info = setup->info; 511 struct blk_shadow *shadow = &info->shadow[setup->id]; 512 513 if ((ring_req->operation == BLKIF_OP_INDIRECT) && 514 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { 515 if (setup->segments) 516 kunmap_atomic(setup->segments); 517 518 n = grant_idx / GRANTS_PER_INDIRECT_FRAME; 519 gnt_list_entry = get_indirect_grant(&setup->gref_head, info); 520 shadow->indirect_grants[n] = gnt_list_entry; 521 setup->segments = kmap_atomic(gnt_list_entry->page); 522 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; 523 } 524 525 gnt_list_entry = get_grant(&setup->gref_head, gfn, info); 526 ref = gnt_list_entry->gref; 527 shadow->grants_used[grant_idx] = gnt_list_entry; 528 529 if (setup->need_copy) { 530 void *shared_data; 531 532 shared_data = kmap_atomic(gnt_list_entry->page); 533 /* 534 * this does not wipe data stored outside the 535 * range sg->offset..sg->offset+sg->length. 536 * Therefore, blkback *could* see data from 537 * previous requests. This is OK as long as 538 * persistent grants are shared with just one 539 * domain. 
It may need refactoring if this 540 * changes 541 */ 542 memcpy(shared_data + offset, 543 setup->bvec_data + setup->bvec_off, 544 len); 545 546 kunmap_atomic(shared_data); 547 setup->bvec_off += len; 548 } 549 550 fsect = offset >> 9; 551 lsect = fsect + (len >> 9) - 1; 552 if (ring_req->operation != BLKIF_OP_INDIRECT) { 553 ring_req->u.rw.seg[grant_idx] = 554 (struct blkif_request_segment) { 555 .gref = ref, 556 .first_sect = fsect, 557 .last_sect = lsect }; 558 } else { 559 setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] = 560 (struct blkif_request_segment) { 561 .gref = ref, 562 .first_sect = fsect, 563 .last_sect = lsect }; 564 } 565 566 (setup->grant_idx)++; 567 } 568 569 static int blkif_queue_rw_req(struct request *req) 570 { 571 struct blkfront_info *info = req->rq_disk->private_data; 572 struct blkif_request *ring_req; 573 unsigned long id; 574 int i; 575 struct setup_rw_req setup = { 576 .grant_idx = 0, 577 .segments = NULL, 578 .info = info, 579 .need_copy = rq_data_dir(req) && info->feature_persistent, 580 }; 581 582 /* 583 * Used to store if we are able to queue the request by just using 584 * existing persistent grants, or if we have to get new grants, 585 * as there are not sufficiently many free. 586 */ 587 bool new_persistent_gnts; 588 struct scatterlist *sg; 589 int num_sg, max_grefs, num_grant; 590 591 max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG; 592 if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) 593 /* 594 * If we are using indirect segments we need to account 595 * for the indirect grefs used in the request. 596 */ 597 max_grefs += INDIRECT_GREFS(max_grefs); 598 599 /* Check if we have enough grants to allocate a requests */ 600 if (info->persistent_gnts_c < max_grefs) { 601 new_persistent_gnts = 1; 602 if (gnttab_alloc_grant_references( 603 max_grefs - info->persistent_gnts_c, 604 &setup.gref_head) < 0) { 605 gnttab_request_free_callback( 606 &info->callback, 607 blkif_restart_queue_callback, 608 info, 609 max_grefs); 610 return 1; 611 } 612 } else 613 new_persistent_gnts = 0; 614 615 /* Fill out a communications ring structure. */ 616 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 617 id = get_id_from_freelist(info); 618 info->shadow[id].request = req; 619 620 BUG_ON(info->max_indirect_segments == 0 && 621 GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); 622 BUG_ON(info->max_indirect_segments && 623 GREFS(req->nr_phys_segments) > info->max_indirect_segments); 624 625 num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); 626 num_grant = 0; 627 /* Calculate the number of grant used */ 628 for_each_sg(info->shadow[id].sg, sg, num_sg, i) 629 num_grant += gnttab_count_grant(sg->offset, sg->length); 630 631 ring_req->u.rw.id = id; 632 info->shadow[id].num_sg = num_sg; 633 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { 634 /* 635 * The indirect operation can only be a BLKIF_OP_READ or 636 * BLKIF_OP_WRITE 637 */ 638 BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); 639 ring_req->operation = BLKIF_OP_INDIRECT; 640 ring_req->u.indirect.indirect_op = rq_data_dir(req) ? 641 BLKIF_OP_WRITE : BLKIF_OP_READ; 642 ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); 643 ring_req->u.indirect.handle = info->handle; 644 ring_req->u.indirect.nr_segments = num_grant; 645 } else { 646 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); 647 ring_req->u.rw.handle = info->handle; 648 ring_req->operation = rq_data_dir(req) ? 
			BLKIF_OP_WRITE : BLKIF_OP_READ;
		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
			/*
			 * Ideally we can do an unordered flush-to-disk.
			 * In case the backend only supports barriers, use that.
			 * A barrier request is a superset of FUA, so we can
			 * implement it the same way.  (It's also a FLUSH+FUA,
			 * since it is guaranteed ordered WRT previous writes.)
			 */
			switch (info->feature_flush &
				((REQ_FLUSH|REQ_FUA))) {
			case REQ_FLUSH|REQ_FUA:
				ring_req->operation =
					BLKIF_OP_WRITE_BARRIER;
				break;
			case REQ_FLUSH:
				ring_req->operation =
					BLKIF_OP_FLUSH_DISKCACHE;
				break;
			default:
				ring_req->operation = 0;
			}
		}
		ring_req->u.rw.nr_segments = num_grant;
	}

	setup.ring_req = ring_req;
	setup.id = id;
	for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
		BUG_ON(sg->offset + sg->length > PAGE_SIZE);

		if (setup.need_copy) {
			setup.bvec_off = sg->offset;
			setup.bvec_data = kmap_atomic(sg_page(sg));
		}

		gnttab_foreach_grant_in_range(sg_page(sg),
					      sg->offset,
					      sg->length,
					      blkif_setup_rw_req_grant,
					      &setup);

		if (setup.need_copy)
			kunmap_atomic(setup.bvec_data);
	}
	if (setup.segments)
		kunmap_atomic(setup.segments);

	info->ring.req_prod_pvt++;

	/* Keep a private copy so we can reissue requests when recovering. */
	info->shadow[id].req = *ring_req;

	if (new_persistent_gnts)
		gnttab_free_grant_references(setup.gref_head);

	return 0;
}

/*
 * Generate a Xen blkfront IO request from a blk layer request.  Reads
 * and writes are handled as expected.
 *
 * @req: a request struct
 */
static int blkif_queue_request(struct request *req)
{
	struct blkfront_info *info = req->rq_disk->private_data;

	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
		return 1;

	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
		return blkif_queue_discard_req(req);
	else
		return blkif_queue_rw_req(req);
}

static inline void flush_requests(struct blkfront_info *info)
{
	int notify;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);

	if (notify)
		notify_remote_via_irq(info->irq);
}

static inline bool blkif_request_flush_invalid(struct request *req,
					       struct blkfront_info *info)
{
	return ((req->cmd_type != REQ_TYPE_FS) ||
		((req->cmd_flags & REQ_FLUSH) &&
		 !(info->feature_flush & REQ_FLUSH)) ||
		((req->cmd_flags & REQ_FUA) &&
		 !(info->feature_flush & REQ_FUA)));
}

static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
			  const struct blk_mq_queue_data *qd)
{
	struct blkfront_info *info = qd->rq->rq_disk->private_data;

	blk_mq_start_request(qd->rq);
	spin_lock_irq(&info->io_lock);
	if (RING_FULL(&info->ring))
		goto out_busy;

	if (blkif_request_flush_invalid(qd->rq, info))
		goto out_err;

	if (blkif_queue_request(qd->rq))
		goto out_busy;

	flush_requests(info);
	spin_unlock_irq(&info->io_lock);
	return BLK_MQ_RQ_QUEUE_OK;

out_err:
	spin_unlock_irq(&info->io_lock);
	return BLK_MQ_RQ_QUEUE_ERROR;

out_busy:
	spin_unlock_irq(&info->io_lock);
	blk_mq_stop_hw_queue(hctx);
	return BLK_MQ_RQ_QUEUE_BUSY;
}

static struct blk_mq_ops blkfront_mq_ops = {
	.queue_rq = blkif_queue_rq,
	.map_queue = blk_mq_map_queue,
};

static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
				unsigned int
physical_sector_size, 784 unsigned int segments) 785 { 786 struct request_queue *rq; 787 struct blkfront_info *info = gd->private_data; 788 789 memset(&info->tag_set, 0, sizeof(info->tag_set)); 790 info->tag_set.ops = &blkfront_mq_ops; 791 info->tag_set.nr_hw_queues = 1; 792 info->tag_set.queue_depth = BLK_RING_SIZE(info); 793 info->tag_set.numa_node = NUMA_NO_NODE; 794 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 795 info->tag_set.cmd_size = 0; 796 info->tag_set.driver_data = info; 797 798 if (blk_mq_alloc_tag_set(&info->tag_set)) 799 return -1; 800 rq = blk_mq_init_queue(&info->tag_set); 801 if (IS_ERR(rq)) { 802 blk_mq_free_tag_set(&info->tag_set); 803 return -1; 804 } 805 806 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); 807 808 if (info->feature_discard) { 809 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq); 810 blk_queue_max_discard_sectors(rq, get_capacity(gd)); 811 rq->limits.discard_granularity = info->discard_granularity; 812 rq->limits.discard_alignment = info->discard_alignment; 813 if (info->feature_secdiscard) 814 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); 815 } 816 817 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 818 blk_queue_logical_block_size(rq, sector_size); 819 blk_queue_physical_block_size(rq, physical_sector_size); 820 blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); 821 822 /* Each segment in a request is up to an aligned page in size. */ 823 blk_queue_segment_boundary(rq, PAGE_SIZE - 1); 824 blk_queue_max_segment_size(rq, PAGE_SIZE); 825 826 /* Ensure a merged request will fit in a single I/O ring slot. */ 827 blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); 828 829 /* Make sure buffer addresses are sector-aligned. */ 830 blk_queue_dma_alignment(rq, 511); 831 832 /* Make sure we don't use bounce buffers. */ 833 blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); 834 835 gd->queue = rq; 836 837 return 0; 838 } 839 840 static const char *flush_info(unsigned int feature_flush) 841 { 842 switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) { 843 case REQ_FLUSH|REQ_FUA: 844 return "barrier: enabled;"; 845 case REQ_FLUSH: 846 return "flush diskcache: enabled;"; 847 default: 848 return "barrier or flush: disabled;"; 849 } 850 } 851 852 static void xlvbd_flush(struct blkfront_info *info) 853 { 854 blk_queue_flush(info->rq, info->feature_flush); 855 pr_info("blkfront: %s: %s %s %s %s %s\n", 856 info->gd->disk_name, flush_info(info->feature_flush), 857 "persistent grants:", info->feature_persistent ? 858 "enabled;" : "disabled;", "indirect descriptors:", 859 info->max_indirect_segments ? 
"enabled;" : "disabled;"); 860 } 861 862 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) 863 { 864 int major; 865 major = BLKIF_MAJOR(vdevice); 866 *minor = BLKIF_MINOR(vdevice); 867 switch (major) { 868 case XEN_IDE0_MAJOR: 869 *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET; 870 *minor = ((*minor / 64) * PARTS_PER_DISK) + 871 EMULATED_HD_DISK_MINOR_OFFSET; 872 break; 873 case XEN_IDE1_MAJOR: 874 *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET; 875 *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) + 876 EMULATED_HD_DISK_MINOR_OFFSET; 877 break; 878 case XEN_SCSI_DISK0_MAJOR: 879 *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET; 880 *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET; 881 break; 882 case XEN_SCSI_DISK1_MAJOR: 883 case XEN_SCSI_DISK2_MAJOR: 884 case XEN_SCSI_DISK3_MAJOR: 885 case XEN_SCSI_DISK4_MAJOR: 886 case XEN_SCSI_DISK5_MAJOR: 887 case XEN_SCSI_DISK6_MAJOR: 888 case XEN_SCSI_DISK7_MAJOR: 889 *offset = (*minor / PARTS_PER_DISK) + 890 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) + 891 EMULATED_SD_DISK_NAME_OFFSET; 892 *minor = *minor + 893 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) + 894 EMULATED_SD_DISK_MINOR_OFFSET; 895 break; 896 case XEN_SCSI_DISK8_MAJOR: 897 case XEN_SCSI_DISK9_MAJOR: 898 case XEN_SCSI_DISK10_MAJOR: 899 case XEN_SCSI_DISK11_MAJOR: 900 case XEN_SCSI_DISK12_MAJOR: 901 case XEN_SCSI_DISK13_MAJOR: 902 case XEN_SCSI_DISK14_MAJOR: 903 case XEN_SCSI_DISK15_MAJOR: 904 *offset = (*minor / PARTS_PER_DISK) + 905 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) + 906 EMULATED_SD_DISK_NAME_OFFSET; 907 *minor = *minor + 908 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) + 909 EMULATED_SD_DISK_MINOR_OFFSET; 910 break; 911 case XENVBD_MAJOR: 912 *offset = *minor / PARTS_PER_DISK; 913 break; 914 default: 915 printk(KERN_WARNING "blkfront: your disk configuration is " 916 "incorrect, please use an xvd device instead\n"); 917 return -ENODEV; 918 } 919 return 0; 920 } 921 922 static char *encode_disk_name(char *ptr, unsigned int n) 923 { 924 if (n >= 26) 925 ptr = encode_disk_name(ptr, n / 26 - 1); 926 *ptr = 'a' + n % 26; 927 return ptr + 1; 928 } 929 930 static int xlvbd_alloc_gendisk(blkif_sector_t capacity, 931 struct blkfront_info *info, 932 u16 vdisk_info, u16 sector_size, 933 unsigned int physical_sector_size) 934 { 935 struct gendisk *gd; 936 int nr_minors = 1; 937 int err; 938 unsigned int offset; 939 int minor; 940 int nr_parts; 941 char *ptr; 942 943 BUG_ON(info->gd != NULL); 944 BUG_ON(info->rq != NULL); 945 946 if ((info->vdevice>>EXT_SHIFT) > 1) { 947 /* this is above the extended range; something is wrong */ 948 printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice); 949 return -ENODEV; 950 } 951 952 if (!VDEV_IS_EXTENDED(info->vdevice)) { 953 err = xen_translate_vdev(info->vdevice, &minor, &offset); 954 if (err) 955 return err; 956 nr_parts = PARTS_PER_DISK; 957 } else { 958 minor = BLKIF_MINOR_EXT(info->vdevice); 959 nr_parts = PARTS_PER_EXT_DISK; 960 offset = minor / nr_parts; 961 if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4) 962 printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with " 963 "emulated IDE disks,\n\t choose an xvd device name" 964 "from xvde on\n", info->vdevice); 965 } 966 if (minor >> MINORBITS) { 967 pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n", 968 info->vdevice, minor); 969 return -ENODEV; 970 } 971 972 if ((minor % nr_parts) == 0) 973 nr_minors = nr_parts; 974 975 
	/* Reserve the minor range for this disk (and its partitions) so it
	 * cannot be handed out twice. */
	err = xlbd_reserve_minors(minor, nr_minors);
	if (err)
		goto out;
	err = -ENODEV;

	gd = alloc_disk(nr_minors);
	if (gd == NULL)
		goto release;

	strcpy(gd->disk_name, DEV_NAME);
	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
	if (nr_minors > 1)
		*ptr = 0;
	else
		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
			 "%d", minor & (nr_parts - 1));

	gd->major = XENVBD_MAJOR;
	gd->first_minor = minor;
	gd->fops = &xlvbd_block_fops;
	gd->private_data = info;
	gd->driverfs_dev = &(info->xbdev->dev);
	set_capacity(gd, capacity);

	if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
				 info->max_indirect_segments ? :
				 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
		del_gendisk(gd);
		goto release;
	}

	info->rq = gd->queue;
	info->gd = gd;

	xlvbd_flush(info);

	if (vdisk_info & VDISK_READONLY)
		set_disk_ro(gd, 1);

	if (vdisk_info & VDISK_REMOVABLE)
		gd->flags |= GENHD_FL_REMOVABLE;

	if (vdisk_info & VDISK_CDROM)
		gd->flags |= GENHD_FL_CD;

	return 0;

 release:
	xlbd_release_minors(minor, nr_minors);
 out:
	return err;
}

static void xlvbd_release_gendisk(struct blkfront_info *info)
{
	unsigned int minor, nr_minors;

	if (info->rq == NULL)
		return;

	/* No more blkif_request(). */
	blk_mq_stop_hw_queues(info->rq);

	/* No more gnttab callback work. */
	gnttab_cancel_free_callback(&info->callback);

	/* Flush gnttab callback work. Must be done with no locks held. */
	flush_work(&info->work);

	del_gendisk(info->gd);

	minor = info->gd->first_minor;
	nr_minors = info->gd->minors;
	xlbd_release_minors(minor, nr_minors);

	blk_cleanup_queue(info->rq);
	blk_mq_free_tag_set(&info->tag_set);
	info->rq = NULL;

	put_disk(info->gd);
	info->gd = NULL;
}

/* Must be called with io_lock held */
static void kick_pending_request_queues(struct blkfront_info *info)
{
	if (!RING_FULL(&info->ring))
		blk_mq_start_stopped_hw_queues(info->rq, true);
}

static void blkif_restart_queue(struct work_struct *work)
{
	struct blkfront_info *info = container_of(work, struct blkfront_info, work);

	spin_lock_irq(&info->io_lock);
	if (info->connected == BLKIF_STATE_CONNECTED)
		kick_pending_request_queues(info);
	spin_unlock_irq(&info->io_lock);
}

static void blkif_free(struct blkfront_info *info, int suspend)
{
	struct grant *persistent_gnt;
	struct grant *n;
	int i, j, segs;

	/* Prevent new requests being issued until we fix things up. */
	spin_lock_irq(&info->io_lock);
	info->connected = suspend ?
		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
	/* No more blkif_request().
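	 * Stopping the hardware queues keeps blkif_queue_rq() from touching
	 * the shared ring while it is being torn down.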
*/ 1087 if (info->rq) 1088 blk_mq_stop_hw_queues(info->rq); 1089 1090 /* Remove all persistent grants */ 1091 if (!list_empty(&info->grants)) { 1092 list_for_each_entry_safe(persistent_gnt, n, 1093 &info->grants, node) { 1094 list_del(&persistent_gnt->node); 1095 if (persistent_gnt->gref != GRANT_INVALID_REF) { 1096 gnttab_end_foreign_access(persistent_gnt->gref, 1097 0, 0UL); 1098 info->persistent_gnts_c--; 1099 } 1100 if (info->feature_persistent) 1101 __free_page(persistent_gnt->page); 1102 kfree(persistent_gnt); 1103 } 1104 } 1105 BUG_ON(info->persistent_gnts_c != 0); 1106 1107 /* 1108 * Remove indirect pages, this only happens when using indirect 1109 * descriptors but not persistent grants 1110 */ 1111 if (!list_empty(&info->indirect_pages)) { 1112 struct page *indirect_page, *n; 1113 1114 BUG_ON(info->feature_persistent); 1115 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { 1116 list_del(&indirect_page->lru); 1117 __free_page(indirect_page); 1118 } 1119 } 1120 1121 for (i = 0; i < BLK_RING_SIZE(info); i++) { 1122 /* 1123 * Clear persistent grants present in requests already 1124 * on the shared ring 1125 */ 1126 if (!info->shadow[i].request) 1127 goto free_shadow; 1128 1129 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? 1130 info->shadow[i].req.u.indirect.nr_segments : 1131 info->shadow[i].req.u.rw.nr_segments; 1132 for (j = 0; j < segs; j++) { 1133 persistent_gnt = info->shadow[i].grants_used[j]; 1134 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1135 if (info->feature_persistent) 1136 __free_page(persistent_gnt->page); 1137 kfree(persistent_gnt); 1138 } 1139 1140 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) 1141 /* 1142 * If this is not an indirect operation don't try to 1143 * free indirect segments 1144 */ 1145 goto free_shadow; 1146 1147 for (j = 0; j < INDIRECT_GREFS(segs); j++) { 1148 persistent_gnt = info->shadow[i].indirect_grants[j]; 1149 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1150 __free_page(persistent_gnt->page); 1151 kfree(persistent_gnt); 1152 } 1153 1154 free_shadow: 1155 kfree(info->shadow[i].grants_used); 1156 info->shadow[i].grants_used = NULL; 1157 kfree(info->shadow[i].indirect_grants); 1158 info->shadow[i].indirect_grants = NULL; 1159 kfree(info->shadow[i].sg); 1160 info->shadow[i].sg = NULL; 1161 } 1162 1163 /* No more gnttab callback work. */ 1164 gnttab_cancel_free_callback(&info->callback); 1165 spin_unlock_irq(&info->io_lock); 1166 1167 /* Flush gnttab callback work. Must be done with no locks held. */ 1168 flush_work(&info->work); 1169 1170 /* Free resources associated with old device channel. 
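	 * The grants on the ring pages are revoked, the ring pages are freed
	 * and the event-channel irq is unbound.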
	 */
	for (i = 0; i < info->nr_ring_pages; i++) {
		if (info->ring_ref[i] != GRANT_INVALID_REF) {
			gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
			info->ring_ref[i] = GRANT_INVALID_REF;
		}
	}
	free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
	info->ring.sring = NULL;

	if (info->irq)
		unbind_from_irqhandler(info->irq, info);
	info->evtchn = info->irq = 0;

}

struct copy_from_grant {
	const struct blk_shadow *s;
	unsigned int grant_idx;
	unsigned int bvec_offset;
	char *bvec_data;
};

static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
				  unsigned int len, void *data)
{
	struct copy_from_grant *info = data;
	char *shared_data;
	/* Convenient aliases */
	const struct blk_shadow *s = info->s;

	shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page);

	memcpy(info->bvec_data + info->bvec_offset,
	       shared_data + offset, len);

	info->bvec_offset += len;
	info->grant_idx++;

	kunmap_atomic(shared_data);
}

static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
			     struct blkif_response *bret)
{
	int i = 0;
	struct scatterlist *sg;
	int num_sg, num_grant;
	struct copy_from_grant data = {
		.s = s,
		.grant_idx = 0,
	};

	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
	num_sg = s->num_sg;

	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
		for_each_sg(s->sg, sg, num_sg, i) {
			BUG_ON(sg->offset + sg->length > PAGE_SIZE);

			data.bvec_offset = sg->offset;
			data.bvec_data = kmap_atomic(sg_page(sg));

			gnttab_foreach_grant_in_range(sg_page(sg),
						      sg->offset,
						      sg->length,
						      blkif_copy_from_grant,
						      &data);

			kunmap_atomic(data.bvec_data);
		}
	}
	/* Add the persistent grant into the list of free grants */
	for (i = 0; i < num_grant; i++) {
		if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
			/*
			 * If the grant is still mapped by the backend (the
			 * backend has chosen to make this grant persistent)
			 * we add it at the head of the list, so it will be
			 * reused first.
			 */
			if (!info->feature_persistent)
				pr_alert_ratelimited("backend has not unmapped grant: %u\n",
						     s->grants_used[i]->gref);
			list_add(&s->grants_used[i]->node, &info->grants);
			info->persistent_gnts_c++;
		} else {
			/*
			 * If the grant is not mapped by the backend we end the
			 * foreign access and add it to the tail of the list,
			 * so it will not be picked again unless we run out of
			 * persistent grants.
			 */
			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
			s->grants_used[i]->gref = GRANT_INVALID_REF;
			list_add_tail(&s->grants_used[i]->node, &info->grants);
		}
	}
	if (s->req.operation == BLKIF_OP_INDIRECT) {
		for (i = 0; i < INDIRECT_GREFS(num_grant); i++) {
			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
				if (!info->feature_persistent)
					pr_alert_ratelimited("backend has not unmapped grant: %u\n",
							     s->indirect_grants[i]->gref);
				list_add(&s->indirect_grants[i]->node, &info->grants);
				info->persistent_gnts_c++;
			} else {
				struct page *indirect_page;

				gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
				/*
				 * Add the used indirect page back to the list of
				 * available pages for indirect grefs.
				 */
				if (!info->feature_persistent) {
					indirect_page = s->indirect_grants[i]->page;
					list_add(&indirect_page->lru, &info->indirect_pages);
				}
				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
			}
		}
	}
}

static irqreturn_t blkif_interrupt(int irq, void *dev_id)
{
	struct request *req;
	struct blkif_response *bret;
	RING_IDX i, rp;
	unsigned long flags;
	struct blkfront_info *info = (struct blkfront_info *)dev_id;
	int error;

	spin_lock_irqsave(&info->io_lock, flags);

	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
		spin_unlock_irqrestore(&info->io_lock, flags);
		return IRQ_HANDLED;
	}

 again:
	rp = info->ring.sring->rsp_prod;
	rmb(); /* Ensure we see queued responses up to 'rp'. */

	for (i = info->ring.rsp_cons; i != rp; i++) {
		unsigned long id;

		bret = RING_GET_RESPONSE(&info->ring, i);
		id = bret->id;
		/*
		 * The backend has messed up and given us an id that we would
		 * never have given to it (we stamp it up to BLK_RING_SIZE -
		 * look in get_id_from_freelist).
		 */
		if (id >= BLK_RING_SIZE(info)) {
			WARN(1, "%s: response to %s has incorrect id (%ld)\n",
			     info->gd->disk_name, op_name(bret->operation), id);
			/* We can't safely get the 'struct request' as
			 * the id is busted. */
			continue;
		}
		req = info->shadow[id].request;

		if (bret->operation != BLKIF_OP_DISCARD)
			blkif_completion(&info->shadow[id], info, bret);

		if (add_id_to_freelist(info, id)) {
			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
			     info->gd->disk_name, op_name(bret->operation), id);
			continue;
		}

		error = (bret->status == BLKIF_RSP_OKAY) ?
0 : -EIO; 1345 switch (bret->operation) { 1346 case BLKIF_OP_DISCARD: 1347 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { 1348 struct request_queue *rq = info->rq; 1349 printk(KERN_WARNING "blkfront: %s: %s op failed\n", 1350 info->gd->disk_name, op_name(bret->operation)); 1351 error = -EOPNOTSUPP; 1352 info->feature_discard = 0; 1353 info->feature_secdiscard = 0; 1354 queue_flag_clear(QUEUE_FLAG_DISCARD, rq); 1355 queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); 1356 } 1357 blk_mq_complete_request(req, error); 1358 break; 1359 case BLKIF_OP_FLUSH_DISKCACHE: 1360 case BLKIF_OP_WRITE_BARRIER: 1361 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { 1362 printk(KERN_WARNING "blkfront: %s: %s op failed\n", 1363 info->gd->disk_name, op_name(bret->operation)); 1364 error = -EOPNOTSUPP; 1365 } 1366 if (unlikely(bret->status == BLKIF_RSP_ERROR && 1367 info->shadow[id].req.u.rw.nr_segments == 0)) { 1368 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", 1369 info->gd->disk_name, op_name(bret->operation)); 1370 error = -EOPNOTSUPP; 1371 } 1372 if (unlikely(error)) { 1373 if (error == -EOPNOTSUPP) 1374 error = 0; 1375 info->feature_flush = 0; 1376 xlvbd_flush(info); 1377 } 1378 /* fall through */ 1379 case BLKIF_OP_READ: 1380 case BLKIF_OP_WRITE: 1381 if (unlikely(bret->status != BLKIF_RSP_OKAY)) 1382 dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " 1383 "request: %x\n", bret->status); 1384 1385 blk_mq_complete_request(req, error); 1386 break; 1387 default: 1388 BUG(); 1389 } 1390 } 1391 1392 info->ring.rsp_cons = i; 1393 1394 if (i != info->ring.req_prod_pvt) { 1395 int more_to_do; 1396 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); 1397 if (more_to_do) 1398 goto again; 1399 } else 1400 info->ring.sring->rsp_event = i + 1; 1401 1402 kick_pending_request_queues(info); 1403 1404 spin_unlock_irqrestore(&info->io_lock, flags); 1405 1406 return IRQ_HANDLED; 1407 } 1408 1409 1410 static int setup_blkring(struct xenbus_device *dev, 1411 struct blkfront_info *info) 1412 { 1413 struct blkif_sring *sring; 1414 int err, i; 1415 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; 1416 grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; 1417 1418 for (i = 0; i < info->nr_ring_pages; i++) 1419 info->ring_ref[i] = GRANT_INVALID_REF; 1420 1421 sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, 1422 get_order(ring_size)); 1423 if (!sring) { 1424 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); 1425 return -ENOMEM; 1426 } 1427 SHARED_RING_INIT(sring); 1428 FRONT_RING_INIT(&info->ring, sring, ring_size); 1429 1430 err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); 1431 if (err < 0) { 1432 free_pages((unsigned long)sring, get_order(ring_size)); 1433 info->ring.sring = NULL; 1434 goto fail; 1435 } 1436 for (i = 0; i < info->nr_ring_pages; i++) 1437 info->ring_ref[i] = gref[i]; 1438 1439 err = xenbus_alloc_evtchn(dev, &info->evtchn); 1440 if (err) 1441 goto fail; 1442 1443 err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, 1444 "blkif", info); 1445 if (err <= 0) { 1446 xenbus_dev_fatal(dev, err, 1447 "bind_evtchn_to_irqhandler failed"); 1448 goto fail; 1449 } 1450 info->irq = err; 1451 1452 return 0; 1453 fail: 1454 blkif_free(info, 0); 1455 return err; 1456 } 1457 1458 1459 /* Common code used when first setting up, and when resuming. 
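 * Publishes the ring reference(s), event channel, protocol and
 * persistent-grant support to xenstore in a single transaction.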
*/ 1460 static int talk_to_blkback(struct xenbus_device *dev, 1461 struct blkfront_info *info) 1462 { 1463 const char *message = NULL; 1464 struct xenbus_transaction xbt; 1465 int err, i; 1466 unsigned int max_page_order = 0; 1467 unsigned int ring_page_order = 0; 1468 1469 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, 1470 "max-ring-page-order", "%u", &max_page_order); 1471 if (err != 1) 1472 info->nr_ring_pages = 1; 1473 else { 1474 ring_page_order = min(xen_blkif_max_ring_order, max_page_order); 1475 info->nr_ring_pages = 1 << ring_page_order; 1476 } 1477 1478 /* Create shared ring, alloc event channel. */ 1479 err = setup_blkring(dev, info); 1480 if (err) 1481 goto out; 1482 1483 again: 1484 err = xenbus_transaction_start(&xbt); 1485 if (err) { 1486 xenbus_dev_fatal(dev, err, "starting transaction"); 1487 goto destroy_blkring; 1488 } 1489 1490 if (info->nr_ring_pages == 1) { 1491 err = xenbus_printf(xbt, dev->nodename, 1492 "ring-ref", "%u", info->ring_ref[0]); 1493 if (err) { 1494 message = "writing ring-ref"; 1495 goto abort_transaction; 1496 } 1497 } else { 1498 err = xenbus_printf(xbt, dev->nodename, 1499 "ring-page-order", "%u", ring_page_order); 1500 if (err) { 1501 message = "writing ring-page-order"; 1502 goto abort_transaction; 1503 } 1504 1505 for (i = 0; i < info->nr_ring_pages; i++) { 1506 char ring_ref_name[RINGREF_NAME_LEN]; 1507 1508 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); 1509 err = xenbus_printf(xbt, dev->nodename, ring_ref_name, 1510 "%u", info->ring_ref[i]); 1511 if (err) { 1512 message = "writing ring-ref"; 1513 goto abort_transaction; 1514 } 1515 } 1516 } 1517 err = xenbus_printf(xbt, dev->nodename, 1518 "event-channel", "%u", info->evtchn); 1519 if (err) { 1520 message = "writing event-channel"; 1521 goto abort_transaction; 1522 } 1523 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", 1524 XEN_IO_PROTO_ABI_NATIVE); 1525 if (err) { 1526 message = "writing protocol"; 1527 goto abort_transaction; 1528 } 1529 err = xenbus_printf(xbt, dev->nodename, 1530 "feature-persistent", "%u", 1); 1531 if (err) 1532 dev_warn(&dev->dev, 1533 "writing persistent grants feature to xenbus"); 1534 1535 err = xenbus_transaction_end(xbt, 0); 1536 if (err) { 1537 if (err == -EAGAIN) 1538 goto again; 1539 xenbus_dev_fatal(dev, err, "completing transaction"); 1540 goto destroy_blkring; 1541 } 1542 1543 for (i = 0; i < BLK_RING_SIZE(info); i++) 1544 info->shadow[i].req.u.rw.id = i+1; 1545 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; 1546 xenbus_switch_state(dev, XenbusStateInitialised); 1547 1548 return 0; 1549 1550 abort_transaction: 1551 xenbus_transaction_end(xbt, 1); 1552 if (message) 1553 xenbus_dev_fatal(dev, err, "%s", message); 1554 destroy_blkring: 1555 blkif_free(info, 0); 1556 out: 1557 return err; 1558 } 1559 1560 /** 1561 * Entry point to this code when a new device is created. Allocate the basic 1562 * structures and the ring buffer for communication with the backend, and 1563 * inform the backend of the appropriate details for those. Switch to 1564 * Initialised state. 1565 */ 1566 static int blkfront_probe(struct xenbus_device *dev, 1567 const struct xenbus_device_id *id) 1568 { 1569 int err, vdevice; 1570 struct blkfront_info *info; 1571 1572 /* FIXME: Use dynamic device id if this is not set. 
*/ 1573 err = xenbus_scanf(XBT_NIL, dev->nodename, 1574 "virtual-device", "%i", &vdevice); 1575 if (err != 1) { 1576 /* go looking in the extended area instead */ 1577 err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext", 1578 "%i", &vdevice); 1579 if (err != 1) { 1580 xenbus_dev_fatal(dev, err, "reading virtual-device"); 1581 return err; 1582 } 1583 } 1584 1585 if (xen_hvm_domain()) { 1586 char *type; 1587 int len; 1588 /* no unplug has been done: do not hook devices != xen vbds */ 1589 if (xen_has_pv_and_legacy_disk_devices()) { 1590 int major; 1591 1592 if (!VDEV_IS_EXTENDED(vdevice)) 1593 major = BLKIF_MAJOR(vdevice); 1594 else 1595 major = XENVBD_MAJOR; 1596 1597 if (major != XENVBD_MAJOR) { 1598 printk(KERN_INFO 1599 "%s: HVM does not support vbd %d as xen block device\n", 1600 __func__, vdevice); 1601 return -ENODEV; 1602 } 1603 } 1604 /* do not create a PV cdrom device if we are an HVM guest */ 1605 type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len); 1606 if (IS_ERR(type)) 1607 return -ENODEV; 1608 if (strncmp(type, "cdrom", 5) == 0) { 1609 kfree(type); 1610 return -ENODEV; 1611 } 1612 kfree(type); 1613 } 1614 info = kzalloc(sizeof(*info), GFP_KERNEL); 1615 if (!info) { 1616 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); 1617 return -ENOMEM; 1618 } 1619 1620 mutex_init(&info->mutex); 1621 spin_lock_init(&info->io_lock); 1622 info->xbdev = dev; 1623 info->vdevice = vdevice; 1624 INIT_LIST_HEAD(&info->grants); 1625 INIT_LIST_HEAD(&info->indirect_pages); 1626 info->persistent_gnts_c = 0; 1627 info->connected = BLKIF_STATE_DISCONNECTED; 1628 INIT_WORK(&info->work, blkif_restart_queue); 1629 1630 /* Front end dir is a number, which is used as the id. */ 1631 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); 1632 dev_set_drvdata(&dev->dev, info); 1633 1634 return 0; 1635 } 1636 1637 static void split_bio_end(struct bio *bio) 1638 { 1639 struct split_bio *split_bio = bio->bi_private; 1640 1641 if (atomic_dec_and_test(&split_bio->pending)) { 1642 split_bio->bio->bi_phys_segments = 0; 1643 split_bio->bio->bi_error = bio->bi_error; 1644 bio_endio(split_bio->bio); 1645 kfree(split_bio); 1646 } 1647 bio_put(bio); 1648 } 1649 1650 static int blkif_recover(struct blkfront_info *info) 1651 { 1652 int i; 1653 struct request *req, *n; 1654 struct blk_shadow *copy; 1655 int rc; 1656 struct bio *bio, *cloned_bio; 1657 struct bio_list bio_list, merge_bio; 1658 unsigned int segs, offset; 1659 int pending, size; 1660 struct split_bio *split_bio; 1661 struct list_head requests; 1662 1663 /* Stage 1: Make a safe copy of the shadow state. */ 1664 copy = kmemdup(info->shadow, sizeof(info->shadow), 1665 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); 1666 if (!copy) 1667 return -ENOMEM; 1668 1669 /* Stage 2: Set up free list. */ 1670 memset(&info->shadow, 0, sizeof(info->shadow)); 1671 for (i = 0; i < BLK_RING_SIZE(info); i++) 1672 info->shadow[i].req.u.rw.id = i+1; 1673 info->shadow_free = info->ring.req_prod_pvt; 1674 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; 1675 1676 rc = blkfront_gather_backend_features(info); 1677 if (rc) { 1678 kfree(copy); 1679 return rc; 1680 } 1681 1682 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; 1683 blk_queue_max_segments(info->rq, segs); 1684 bio_list_init(&bio_list); 1685 INIT_LIST_HEAD(&requests); 1686 for (i = 0; i < BLK_RING_SIZE(info); i++) { 1687 /* Not in use? */ 1688 if (!copy[i].request) 1689 continue; 1690 1691 /* 1692 * Get the bios in the request so we can re-queue them. 
1693 */ 1694 if (copy[i].request->cmd_flags & 1695 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { 1696 /* 1697 * Flush operations don't contain bios, so 1698 * we need to requeue the whole request 1699 */ 1700 list_add(©[i].request->queuelist, &requests); 1701 continue; 1702 } 1703 merge_bio.head = copy[i].request->bio; 1704 merge_bio.tail = copy[i].request->biotail; 1705 bio_list_merge(&bio_list, &merge_bio); 1706 copy[i].request->bio = NULL; 1707 blk_end_request_all(copy[i].request, 0); 1708 } 1709 1710 kfree(copy); 1711 1712 xenbus_switch_state(info->xbdev, XenbusStateConnected); 1713 1714 spin_lock_irq(&info->io_lock); 1715 1716 /* Now safe for us to use the shared ring */ 1717 info->connected = BLKIF_STATE_CONNECTED; 1718 1719 /* Kick any other new requests queued since we resumed */ 1720 kick_pending_request_queues(info); 1721 1722 list_for_each_entry_safe(req, n, &requests, queuelist) { 1723 /* Requeue pending requests (flush or discard) */ 1724 list_del_init(&req->queuelist); 1725 BUG_ON(req->nr_phys_segments > segs); 1726 blk_mq_requeue_request(req); 1727 } 1728 spin_unlock_irq(&info->io_lock); 1729 blk_mq_kick_requeue_list(info->rq); 1730 1731 while ((bio = bio_list_pop(&bio_list)) != NULL) { 1732 /* Traverse the list of pending bios and re-queue them */ 1733 if (bio_segments(bio) > segs) { 1734 /* 1735 * This bio has more segments than what we can 1736 * handle, we have to split it. 1737 */ 1738 pending = (bio_segments(bio) + segs - 1) / segs; 1739 split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); 1740 BUG_ON(split_bio == NULL); 1741 atomic_set(&split_bio->pending, pending); 1742 split_bio->bio = bio; 1743 for (i = 0; i < pending; i++) { 1744 offset = (i * segs * XEN_PAGE_SIZE) >> 9; 1745 size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9, 1746 (unsigned int)bio_sectors(bio) - offset); 1747 cloned_bio = bio_clone(bio, GFP_NOIO); 1748 BUG_ON(cloned_bio == NULL); 1749 bio_trim(cloned_bio, offset, size); 1750 cloned_bio->bi_private = split_bio; 1751 cloned_bio->bi_end_io = split_bio_end; 1752 submit_bio(cloned_bio->bi_rw, cloned_bio); 1753 } 1754 /* 1755 * Now we have to wait for all those smaller bios to 1756 * end, so we can also end the "parent" bio. 1757 */ 1758 continue; 1759 } 1760 /* We don't need to split this bio */ 1761 submit_bio(bio->bi_rw, bio); 1762 } 1763 1764 return 0; 1765 } 1766 1767 /** 1768 * We are reconnecting to the backend, due to a suspend/resume, or a backend 1769 * driver restart. We tear down our blkif structure and recreate it, but 1770 * leave the device-layer structures intact so that this is transparent to the 1771 * rest of the kernel. 1772 */ 1773 static int blkfront_resume(struct xenbus_device *dev) 1774 { 1775 struct blkfront_info *info = dev_get_drvdata(&dev->dev); 1776 int err; 1777 1778 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); 1779 1780 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); 1781 1782 err = talk_to_blkback(dev, info); 1783 1784 /* 1785 * We have to wait for the backend to switch to 1786 * connected state, since we want to read which 1787 * features it supports. 
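	 * The actual recovery (blkif_recover()) therefore runs later, from
	 * blkfront_connect(), once the backend reports XenbusStateConnected.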
1788 */ 1789 1790 return err; 1791 } 1792 1793 static void 1794 blkfront_closing(struct blkfront_info *info) 1795 { 1796 struct xenbus_device *xbdev = info->xbdev; 1797 struct block_device *bdev = NULL; 1798 1799 mutex_lock(&info->mutex); 1800 1801 if (xbdev->state == XenbusStateClosing) { 1802 mutex_unlock(&info->mutex); 1803 return; 1804 } 1805 1806 if (info->gd) 1807 bdev = bdget_disk(info->gd, 0); 1808 1809 mutex_unlock(&info->mutex); 1810 1811 if (!bdev) { 1812 xenbus_frontend_closed(xbdev); 1813 return; 1814 } 1815 1816 mutex_lock(&bdev->bd_mutex); 1817 1818 if (bdev->bd_openers) { 1819 xenbus_dev_error(xbdev, -EBUSY, 1820 "Device in use; refusing to close"); 1821 xenbus_switch_state(xbdev, XenbusStateClosing); 1822 } else { 1823 xlvbd_release_gendisk(info); 1824 xenbus_frontend_closed(xbdev); 1825 } 1826 1827 mutex_unlock(&bdev->bd_mutex); 1828 bdput(bdev); 1829 } 1830 1831 static void blkfront_setup_discard(struct blkfront_info *info) 1832 { 1833 int err; 1834 unsigned int discard_granularity; 1835 unsigned int discard_alignment; 1836 unsigned int discard_secure; 1837 1838 info->feature_discard = 1; 1839 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1840 "discard-granularity", "%u", &discard_granularity, 1841 "discard-alignment", "%u", &discard_alignment, 1842 NULL); 1843 if (!err) { 1844 info->discard_granularity = discard_granularity; 1845 info->discard_alignment = discard_alignment; 1846 } 1847 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1848 "discard-secure", "%d", &discard_secure, 1849 NULL); 1850 if (!err) 1851 info->feature_secdiscard = !!discard_secure; 1852 } 1853 1854 static int blkfront_setup_indirect(struct blkfront_info *info) 1855 { 1856 unsigned int psegs, grants; 1857 int err, i; 1858 1859 if (info->max_indirect_segments == 0) 1860 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; 1861 else 1862 grants = info->max_indirect_segments; 1863 psegs = grants / GRANTS_PER_PSEG; 1864 1865 err = fill_grant_buffer(info, 1866 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); 1867 if (err) 1868 goto out_of_memory; 1869 1870 if (!info->feature_persistent && info->max_indirect_segments) { 1871 /* 1872 * We are using indirect descriptors but not persistent 1873 * grants, we need to allocate a set of pages that can be 1874 * used for mapping indirect grefs 1875 */ 1876 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); 1877 1878 BUG_ON(!list_empty(&info->indirect_pages)); 1879 for (i = 0; i < num; i++) { 1880 struct page *indirect_page = alloc_page(GFP_NOIO); 1881 if (!indirect_page) 1882 goto out_of_memory; 1883 list_add(&indirect_page->lru, &info->indirect_pages); 1884 } 1885 } 1886 1887 for (i = 0; i < BLK_RING_SIZE(info); i++) { 1888 info->shadow[i].grants_used = kzalloc( 1889 sizeof(info->shadow[i].grants_used[0]) * grants, 1890 GFP_NOIO); 1891 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); 1892 if (info->max_indirect_segments) 1893 info->shadow[i].indirect_grants = kzalloc( 1894 sizeof(info->shadow[i].indirect_grants[0]) * 1895 INDIRECT_GREFS(grants), 1896 GFP_NOIO); 1897 if ((info->shadow[i].grants_used == NULL) || 1898 (info->shadow[i].sg == NULL) || 1899 (info->max_indirect_segments && 1900 (info->shadow[i].indirect_grants == NULL))) 1901 goto out_of_memory; 1902 sg_init_table(info->shadow[i].sg, psegs); 1903 } 1904 1905 1906 return 0; 1907 1908 out_of_memory: 1909 for (i = 0; i < BLK_RING_SIZE(info); i++) { 1910 kfree(info->shadow[i].grants_used); 1911 info->shadow[i].grants_used = NULL; 1912 kfree(info->shadow[i].sg); 
static int blkfront_setup_indirect(struct blkfront_info *info)
{
	unsigned int psegs, grants;
	int err, i;

	if (info->max_indirect_segments == 0)
		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
	else
		grants = info->max_indirect_segments;
	psegs = grants / GRANTS_PER_PSEG;

	err = fill_grant_buffer(info,
				(grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
	if (err)
		goto out_of_memory;

	if (!info->feature_persistent && info->max_indirect_segments) {
		/*
		 * We are using indirect descriptors but not persistent
		 * grants, we need to allocate a set of pages that can be
		 * used for mapping indirect grefs
		 */
		int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);

		BUG_ON(!list_empty(&info->indirect_pages));
		for (i = 0; i < num; i++) {
			struct page *indirect_page = alloc_page(GFP_NOIO);
			if (!indirect_page)
				goto out_of_memory;
			list_add(&indirect_page->lru, &info->indirect_pages);
		}
	}

	for (i = 0; i < BLK_RING_SIZE(info); i++) {
		info->shadow[i].grants_used = kzalloc(
			sizeof(info->shadow[i].grants_used[0]) * grants,
			GFP_NOIO);
		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO);
		if (info->max_indirect_segments)
			info->shadow[i].indirect_grants = kzalloc(
				sizeof(info->shadow[i].indirect_grants[0]) *
				INDIRECT_GREFS(grants),
				GFP_NOIO);
		if ((info->shadow[i].grants_used == NULL) ||
		    (info->shadow[i].sg == NULL) ||
		    (info->max_indirect_segments &&
		     (info->shadow[i].indirect_grants == NULL)))
			goto out_of_memory;
		sg_init_table(info->shadow[i].sg, psegs);
	}

	return 0;

out_of_memory:
	for (i = 0; i < BLK_RING_SIZE(info); i++) {
		kfree(info->shadow[i].grants_used);
		info->shadow[i].grants_used = NULL;
		kfree(info->shadow[i].sg);
		info->shadow[i].sg = NULL;
		kfree(info->shadow[i].indirect_grants);
		info->shadow[i].indirect_grants = NULL;
	}
	if (!list_empty(&info->indirect_pages)) {
		struct page *indirect_page, *n;
		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
			list_del(&indirect_page->lru);
			__free_page(indirect_page);
		}
	}
	return -ENOMEM;
}

/*
 * Gather all backend feature-*
 */
static int blkfront_gather_backend_features(struct blkfront_info *info)
{
	int err;
	int barrier, flush, discard, persistent;
	unsigned int indirect_segments;

	info->feature_flush = 0;

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			"feature-barrier", "%d", &barrier,
			NULL);

	/*
	 * If there's no "feature-barrier" defined, then it means
	 * we're dealing with a very old backend which writes
	 * synchronously; nothing to do.
	 *
	 * If there are barriers, then we use flush.
	 */
	if (!err && barrier)
		info->feature_flush = REQ_FLUSH | REQ_FUA;
	/*
	 * And if there is "feature-flush-cache" use that in preference
	 * to barriers.
	 */
	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			"feature-flush-cache", "%d", &flush,
			NULL);

	if (!err && flush)
		info->feature_flush = REQ_FLUSH;

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			"feature-discard", "%d", &discard,
			NULL);

	if (!err && discard)
		blkfront_setup_discard(info);

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			"feature-persistent", "%u", &persistent,
			NULL);
	if (err)
		info->feature_persistent = 0;
	else
		info->feature_persistent = persistent;

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "feature-max-indirect-segments", "%u", &indirect_segments,
			    NULL);
	if (err)
		info->max_indirect_segments = 0;
	else
		info->max_indirect_segments = min(indirect_segments,
						  xen_blkif_max_segments);

	return blkfront_setup_indirect(info);
}

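/*
 * Rough sketch of the backend nodes consulted while connecting (the values
 * shown are made-up examples, not captured from a real system):
 *
 *   <otherend>/sectors              = "20971520"
 *   <otherend>/info                 = "0"
 *   <otherend>/sector-size          = "512"
 *   <otherend>/physical-sector-size = "4096"   (optional, newer backends)
 */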
/*
 * Invoked when the backend is finally 'ready' (and has produced
 * the details about the physical device - #sectors, size, etc).
 */
static void blkfront_connect(struct blkfront_info *info)
{
	unsigned long long sectors;
	unsigned long sector_size;
	unsigned int physical_sector_size;
	unsigned int binfo;
	int err;

	switch (info->connected) {
	case BLKIF_STATE_CONNECTED:
		/*
		 * Potentially, the back-end may be signalling
		 * a capacity change; update the capacity.
		 */
		err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
				   "sectors", "%Lu", &sectors);
		if (XENBUS_EXIST_ERR(err))
			return;
		printk(KERN_INFO "Setting capacity to %Lu\n",
		       sectors);
		set_capacity(info->gd, sectors);
		revalidate_disk(info->gd);

		return;
	case BLKIF_STATE_SUSPENDED:
		/*
		 * If we are recovering from suspension, we need to wait
		 * for the backend to announce its features before
		 * reconnecting, at least we need to know if the backend
		 * supports indirect descriptors, and how many.
		 */
		blkif_recover(info);
		return;

	default:
		break;
	}

	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
		__func__, info->xbdev->otherend);

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "sectors", "%llu", &sectors,
			    "info", "%u", &binfo,
			    "sector-size", "%lu", &sector_size,
			    NULL);
	if (err) {
		xenbus_dev_fatal(info->xbdev, err,
				 "reading backend fields at %s",
				 info->xbdev->otherend);
		return;
	}

	/*
	 * physical-sector-size is a newer field, so old backends may not
	 * provide this. Assume physical sector size to be the same as
	 * sector_size in that case.
	 */
	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
			   "physical-sector-size", "%u", &physical_sector_size);
	if (err != 1)
		physical_sector_size = sector_size;

	err = blkfront_gather_backend_features(info);
	if (err) {
		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
				 info->xbdev->otherend);
		return;
	}

	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
				  physical_sector_size);
	if (err) {
		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
				 info->xbdev->otherend);
		return;
	}

	xenbus_switch_state(info->xbdev, XenbusStateConnected);

	/* Kick pending requests. */
	spin_lock_irq(&info->io_lock);
	info->connected = BLKIF_STATE_CONNECTED;
	kick_pending_request_queues(info);
	spin_unlock_irq(&info->io_lock);

	add_disk(info->gd);

	info->is_ready = 1;
}

/**
 * Callback received when the backend's state changes.
 */
static void blkback_changed(struct xenbus_device *dev,
			    enum xenbus_state backend_state)
{
	struct blkfront_info *info = dev_get_drvdata(&dev->dev);

	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);

	switch (backend_state) {
	case XenbusStateInitWait:
		if (dev->state != XenbusStateInitialising)
			break;
		if (talk_to_blkback(dev, info)) {
			kfree(info);
			dev_set_drvdata(&dev->dev, NULL);
			break;
		}
		/* fall through */
	case XenbusStateInitialising:
	case XenbusStateInitialised:
	case XenbusStateReconfiguring:
	case XenbusStateReconfigured:
	case XenbusStateUnknown:
		break;

	case XenbusStateConnected:
		blkfront_connect(info);
		break;

	case XenbusStateClosed:
		if (dev->state == XenbusStateClosed)
			break;
		/* Missed the backend's Closing state -- fallthrough */
	case XenbusStateClosing:
		if (info)
			blkfront_closing(info);
		break;
	}
}

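/*
 * blkfront_remove() tears down the connection to the backend first and
 * only releases the gendisk if nobody still holds the block device open;
 * otherwise the final cleanup is deferred to blkif_release().
 */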
static int blkfront_remove(struct xenbus_device *xbdev)
{
	struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
	struct block_device *bdev = NULL;
	struct gendisk *disk;

	dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);

	blkif_free(info, 0);

	mutex_lock(&info->mutex);

	disk = info->gd;
	if (disk)
		bdev = bdget_disk(disk, 0);

	info->xbdev = NULL;
	mutex_unlock(&info->mutex);

	if (!bdev) {
		kfree(info);
		return 0;
	}

	/*
	 * The xbdev was removed before we reached the Closed
	 * state. See if it's safe to remove the disk. If the bdev
	 * isn't closed yet, we let release take care of it.
	 */

	mutex_lock(&bdev->bd_mutex);
	info = disk->private_data;

	dev_warn(disk_to_dev(disk),
		 "%s was hot-unplugged, %d stale handles\n",
		 xbdev->nodename, bdev->bd_openers);

	if (info && !bdev->bd_openers) {
		xlvbd_release_gendisk(info);
		disk->private_data = NULL;
		kfree(info);
	}

	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);

	return 0;
}

static int blkfront_is_ready(struct xenbus_device *dev)
{
	struct blkfront_info *info = dev_get_drvdata(&dev->dev);

	return info->is_ready && info->xbdev;
}

static int blkif_open(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	struct blkfront_info *info;
	int err = 0;

	mutex_lock(&blkfront_mutex);

	info = disk->private_data;
	if (!info) {
		/* xbdev gone */
		err = -ERESTARTSYS;
		goto out;
	}

	mutex_lock(&info->mutex);

	if (!info->gd)
		/* xbdev is closed */
		err = -ERESTARTSYS;

	mutex_unlock(&info->mutex);

out:
	mutex_unlock(&blkfront_mutex);
	return err;
}

static void blkif_release(struct gendisk *disk, fmode_t mode)
{
	struct blkfront_info *info = disk->private_data;
	struct block_device *bdev;
	struct xenbus_device *xbdev;

	mutex_lock(&blkfront_mutex);

	bdev = bdget_disk(disk, 0);

	if (!bdev) {
		WARN(1, "Block device %s yanked out from us!\n", disk->disk_name);
		goto out_mutex;
	}
	if (bdev->bd_openers)
		goto out;

	/*
	 * Check if we have been instructed to close. We will have
	 * deferred this request, because the bdev was still open.
	 */

	mutex_lock(&info->mutex);
	xbdev = info->xbdev;

	if (xbdev && xbdev->state == XenbusStateClosing) {
		/* pending switch to state closed */
		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
		xlvbd_release_gendisk(info);
		xenbus_frontend_closed(info->xbdev);
	}

	mutex_unlock(&info->mutex);

	if (!xbdev) {
		/* sudden device removal */
		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
		xlvbd_release_gendisk(info);
		disk->private_data = NULL;
		kfree(info);
	}

out:
	bdput(bdev);
out_mutex:
	mutex_unlock(&blkfront_mutex);
}

static const struct block_device_operations xlvbd_block_fops =
{
	.owner = THIS_MODULE,
	.open = blkif_open,
	.release = blkif_release,
	.getgeo = blkif_getgeo,
	.ioctl = blkif_ioctl,
};


static const struct xenbus_device_id blkfront_ids[] = {
	{ "vbd" },
	{ "" }
};

static struct xenbus_driver blkfront_driver = {
	.ids = blkfront_ids,
	.probe = blkfront_probe,
	.remove = blkfront_remove,
	.resume = blkfront_resume,
	.otherend_changed = blkback_changed,
	.is_ready = blkfront_is_ready,
};

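/*
 * Module initialisation: bail out when not running on Xen or when no
 * paravirtualised disk devices are available, clamp an out-of-range
 * max_ring_page_order back to the default, then register the block major
 * and the xenbus frontend driver.
 */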
static int __init xlblk_init(void)
{
	int ret;

	if (!xen_domain())
		return -ENODEV;

	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
		xen_blkif_max_ring_order = 0;
	}

	if (!xen_has_pv_disk_devices())
		return -ENODEV;

	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
		       XENVBD_MAJOR, DEV_NAME);
		return -ENODEV;
	}

	ret = xenbus_register_frontend(&blkfront_driver);
	if (ret) {
		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
		return ret;
	}

	return 0;
}
module_init(xlblk_init);


static void __exit xlblk_exit(void)
{
	xenbus_unregister_driver(&blkfront_driver);
	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
	kfree(minors);
}
module_exit(xlblk_exit);

MODULE_DESCRIPTION("Xen virtual block device frontend");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
MODULE_ALIAS("xen:vbd");
MODULE_ALIAS("xenblk");