/*
 * blkfront.c
 *
 * XenLinux virtual block device driver.
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 * Copyright (c) 2005, XenSource Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/interrupt.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/bitmap.h>
#include <linux/list.h>

#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/platform_pci.h>

#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>

#include <asm/xen/hypervisor.h>

enum blkif_state {
	BLKIF_STATE_DISCONNECTED,
	BLKIF_STATE_CONNECTED,
	BLKIF_STATE_SUSPENDED,
};

struct grant {
	grant_ref_t gref;
	unsigned long pfn;
	struct list_head node;
};

struct blk_shadow {
	struct blkif_request req;
	struct request *request;
	struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};

static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;

#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)

/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
	spinlock_t io_lock;
	struct mutex mutex;
	struct xenbus_device *xbdev;
	struct gendisk *gd;
	int vdevice;
	blkif_vdev_t handle;
	enum blkif_state connected;
	int ring_ref;
	struct blkif_front_ring ring;
	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int evtchn, irq;
	struct request_queue *rq;
	struct work_struct work;
	struct gnttab_free_callback callback;
	struct blk_shadow shadow[BLK_RING_SIZE];
	struct list_head persistent_gnts;
	unsigned int persistent_gnts_c;
	unsigned long shadow_free;
	unsigned int feature_flush;
	unsigned int flush_op;
	unsigned int feature_discard:1;
	unsigned int feature_secdiscard:1;
	unsigned int discard_granularity;
	unsigned int discard_alignment;
	unsigned int feature_persistent:1;
	int is_ready;
};

static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);

#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
#define GRANT_INVALID_REF	0

#define PARTS_PER_DISK		16
#define PARTS_PER_EXT_DISK	256

#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
#define EMULATED_HD_DISK_MINOR_OFFSET (0)
#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
#define EMULATED_SD_DISK_MINOR_OFFSET (0)
#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)

#define DEV_NAME	"xvd"	/* name in /dev */

static int get_id_from_freelist(struct blkfront_info *info)
{
	unsigned long free = info->shadow_free;
	BUG_ON(free >= BLK_RING_SIZE);
	info->shadow_free = info->shadow[free].req.u.rw.id;
	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
	return free;
}

static int add_id_to_freelist(struct blkfront_info *info,
			      unsigned long id)
{
	if (info->shadow[id].req.u.rw.id != id)
		return -EINVAL;
	if (info->shadow[id].request == NULL)
		return -EINVAL;
	info->shadow[id].req.u.rw.id = info->shadow_free;
	info->shadow[id].request = NULL;
	info->shadow_free = id;
	return 0;
}

static int fill_grant_buffer(struct blkfront_info *info, int num)
{
	struct page *granted_page;
	struct grant *gnt_list_entry, *n;
	int i = 0;

	while (i < num) {
		gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
		if (!gnt_list_entry)
			goto out_of_memory;

		granted_page = alloc_page(GFP_NOIO);
		if (!granted_page) {
			kfree(gnt_list_entry);
			goto out_of_memory;
		}

		gnt_list_entry->pfn = page_to_pfn(granted_page);
		gnt_list_entry->gref = GRANT_INVALID_REF;
		list_add(&gnt_list_entry->node, &info->persistent_gnts);
		i++;
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(gnt_list_entry, n,
				 &info->persistent_gnts, node) {
		list_del(&gnt_list_entry->node);
		__free_page(pfn_to_page(gnt_list_entry->pfn));
		kfree(gnt_list_entry);
		i--;
	}
	BUG_ON(i != 0);
	return -ENOMEM;
}

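/*
 * Pick a grant from the pool of persistent grants. Entries recycled from
 * completed requests already carry a valid grant reference and are handed
 * out as-is; a freshly allocated entry (gref == GRANT_INVALID_REF) is first
 * given a reference from @gref_head and granted to the backend.
 */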
static struct grant *get_grant(grant_ref_t *gref_head,
			       struct blkfront_info *info)
{
	struct grant *gnt_list_entry;
	unsigned long buffer_mfn;

	BUG_ON(list_empty(&info->persistent_gnts));
	gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant,
					  node);
	list_del(&gnt_list_entry->node);

	if (gnt_list_entry->gref != GRANT_INVALID_REF) {
		info->persistent_gnts_c--;
		return gnt_list_entry;
	}

	/* Assign a gref to this page */
	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
	BUG_ON(gnt_list_entry->gref == -ENOSPC);
	buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
	gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
					info->xbdev->otherend_id,
					buffer_mfn, 0);
	return gnt_list_entry;
}

static const char *op_name(int op)
{
	static const char *const names[] = {
		[BLKIF_OP_READ] = "read",
		[BLKIF_OP_WRITE] = "write",
		[BLKIF_OP_WRITE_BARRIER] = "barrier",
		[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
		[BLKIF_OP_DISCARD] = "discard" };

	if (op < 0 || op >= ARRAY_SIZE(names))
		return "unknown";

	if (!names[op])
		return "reserved";

	return names[op];
}
static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
{
	unsigned int end = minor + nr;
	int rc;

	if (end > nr_minors) {
		unsigned long *bitmap, *old;

		bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
				 GFP_KERNEL);
		if (bitmap == NULL)
			return -ENOMEM;

		spin_lock(&minor_lock);
		if (end > nr_minors) {
			old = minors;
			memcpy(bitmap, minors,
			       BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
			minors = bitmap;
			nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
		} else
			old = bitmap;
		spin_unlock(&minor_lock);
		kfree(old);
	}

	spin_lock(&minor_lock);
	if (find_next_bit(minors, end, minor) >= end) {
		bitmap_set(minors, minor, nr);
		rc = 0;
	} else
		rc = -EBUSY;
	spin_unlock(&minor_lock);

	return rc;
}

static void xlbd_release_minors(unsigned int minor, unsigned int nr)
{
	unsigned int end = minor + nr;

	BUG_ON(end > nr_minors);
	spin_lock(&minor_lock);
	bitmap_clear(minors, minor, nr);
	spin_unlock(&minor_lock);
}

static void blkif_restart_queue_callback(void *arg)
{
	struct blkfront_info *info = (struct blkfront_info *)arg;
	schedule_work(&info->work);
}

static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
	/* We don't have real geometry info, but let's at least return
	   values consistent with the size of the device */
	sector_t nsect = get_capacity(bd->bd_disk);
	sector_t cylinders = nsect;

	hg->heads = 0xff;
	hg->sectors = 0x3f;
	sector_div(cylinders, hg->heads * hg->sectors);
	hg->cylinders = cylinders;
	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
		hg->cylinders = 0xffff;
	return 0;
}

static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
		       unsigned command, unsigned long argument)
{
	struct blkfront_info *info = bdev->bd_disk->private_data;
	int i;

	dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
		command, (long)argument);

	switch (command) {
	case CDROMMULTISESSION:
		dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
			if (put_user(0, (char __user *)(argument + i)))
				return -EFAULT;
		return 0;

	case CDROM_GET_CAPABILITY: {
		struct gendisk *gd = info->gd;
		if (gd->flags & GENHD_FL_CD)
			return 0;
		return -EINVAL;
	}

	default:
		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
		  command);*/
		return -EINVAL; /* same return as native Linux */
	}

	return 0;
}

/*
 * Generate a Xen blkfront IO request from a blk layer request.  Reads
 * and writes are handled as expected.
 *
 * @req: a request struct
 */
static int blkif_queue_request(struct request *req)
{
	struct blkfront_info *info = req->rq_disk->private_data;
	struct blkif_request *ring_req;
	unsigned long id;
	unsigned int fsect, lsect;
	int i, ref;

	/*
	 * Used to store if we are able to queue the request by just using
	 * existing persistent grants, or if we have to get new grants,
	 * as there are not sufficiently many free.
	 */
	bool new_persistent_gnts;
	grant_ref_t gref_head;
	struct grant *gnt_list_entry = NULL;
	struct scatterlist *sg;

	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
		return 1;

	/* Check if we have enough grants to allocate a request */
	if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
		new_persistent_gnts = 1;
		if (gnttab_alloc_grant_references(
		    BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
		    &gref_head) < 0) {
			gnttab_request_free_callback(
				&info->callback,
				blkif_restart_queue_callback,
				info,
				BLKIF_MAX_SEGMENTS_PER_REQUEST);
			return 1;
		}
	} else
		new_persistent_gnts = 0;

	/* Fill out a communications ring structure. */
	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
	id = get_id_from_freelist(info);
	info->shadow[id].request = req;

	ring_req->u.rw.id = id;
	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
	ring_req->u.rw.handle = info->handle;

	ring_req->operation = rq_data_dir(req) ?
		BLKIF_OP_WRITE : BLKIF_OP_READ;

	if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
		/*
		 * Ideally we can do an unordered flush-to-disk. In case the
		 * backend only supports barriers, use that. A barrier request
		 * is a superset of FUA, so we can implement it the same
		 * way.  (It's also a FLUSH+FUA, since it is
		 * guaranteed ordered WRT previous writes.)
		 */
		ring_req->operation = info->flush_op;
	}

	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
		/* id, sector_number and handle are set above. */
		ring_req->operation = BLKIF_OP_DISCARD;
		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
		else
			ring_req->u.discard.flag = 0;
	} else {
		ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
							   info->sg);
		BUG_ON(ring_req->u.rw.nr_segments >
		       BLKIF_MAX_SEGMENTS_PER_REQUEST);

		for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
			fsect = sg->offset >> 9;
			lsect = fsect + (sg->length >> 9) - 1;

			gnt_list_entry = get_grant(&gref_head, info);
			ref = gnt_list_entry->gref;

			info->shadow[id].grants_used[i] = gnt_list_entry;

			if (rq_data_dir(req)) {
				char *bvec_data;
				void *shared_data;

				BUG_ON(sg->offset + sg->length > PAGE_SIZE);

				shared_data = kmap_atomic(
					pfn_to_page(gnt_list_entry->pfn));
				bvec_data = kmap_atomic(sg_page(sg));

				/*
				 * this does not wipe data stored outside the
				 * range sg->offset..sg->offset+sg->length.
				 * Therefore, blkback *could* see data from
				 * previous requests. This is OK as long as
				 * persistent grants are shared with just one
				 * domain. It may need refactoring if this
				 * changes
				 */
				memcpy(shared_data + sg->offset,
				       bvec_data + sg->offset,
				       sg->length);

				kunmap_atomic(bvec_data);
				kunmap_atomic(shared_data);
			}

			ring_req->u.rw.seg[i] =
					(struct blkif_request_segment) {
						.gref       = ref,
						.first_sect = fsect,
						.last_sect  = lsect };
		}
	}

	info->ring.req_prod_pvt++;

	/* Keep a private copy so we can reissue requests when recovering. */
	info->shadow[id].req = *ring_req;

	if (new_persistent_gnts)
		gnttab_free_grant_references(gref_head);

	return 0;
}


static inline void flush_requests(struct blkfront_info *info)
{
	int notify;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);

	if (notify)
		notify_remote_via_irq(info->irq);
}

/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
static void do_blkif_request(struct request_queue *rq)
{
	struct blkfront_info *info = NULL;
	struct request *req;
	int queued;

	pr_debug("Entered do_blkif_request\n");

	queued = 0;

	while ((req = blk_peek_request(rq)) != NULL) {
		info = req->rq_disk->private_data;

		if (RING_FULL(&info->ring))
			goto wait;

		blk_start_request(req);

		if ((req->cmd_type != REQ_TYPE_FS) ||
		    ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
		    !info->flush_op)) {
			__blk_end_request_all(req, -EIO);
			continue;
		}

		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
			 "(%u/%u) buffer:%p [%s]\n",
			 req, req->cmd, (unsigned long)blk_rq_pos(req),
			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
			 req->buffer, rq_data_dir(req) ? "write" : "read");

		if (blkif_queue_request(req)) {
			blk_requeue_request(rq, req);
wait:
			/* Avoid pointless unplugs. */
			blk_stop_queue(rq);
			break;
		}

		queued++;
	}

	if (queued != 0)
		flush_requests(info);
}

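/*
 * Set up the request queue for a new vbd: install the request function,
 * apply any discard capabilities advertised by the backend, and constrain
 * segment count and size so that a merged request always fits into a
 * single ring slot.
 */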
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
{
	struct request_queue *rq;
	struct blkfront_info *info = gd->private_data;

	rq = blk_init_queue(do_blkif_request, &info->io_lock);
	if (rq == NULL)
		return -1;

	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);

	if (info->feature_discard) {
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
		blk_queue_max_discard_sectors(rq, get_capacity(gd));
		rq->limits.discard_granularity = info->discard_granularity;
		rq->limits.discard_alignment = info->discard_alignment;
		if (info->feature_secdiscard)
			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
	}

	/* Hard sector size and max sectors impersonate the equiv. hardware. */
	blk_queue_logical_block_size(rq, sector_size);
	blk_queue_max_hw_sectors(rq, 512);

	/* Each segment in a request is up to an aligned page in size. */
	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
	blk_queue_max_segment_size(rq, PAGE_SIZE);

	/* Ensure a merged request will fit in a single I/O ring slot. */
	blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);

	/* Make sure buffer addresses are sector-aligned. */
	blk_queue_dma_alignment(rq, 511);

	/* Make sure we don't use bounce buffers. */
	blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);

	gd->queue = rq;

	return 0;
}


static void xlvbd_flush(struct blkfront_info *info)
{
	blk_queue_flush(info->rq, info->feature_flush);
	printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
	       info->gd->disk_name,
	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
		"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
		"flush diskcache" : "barrier or flush"),
	       info->feature_flush ? "enabled" : "disabled",
	       info->feature_persistent ? "using persistent grants" : "");
}

static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
{
	int major;
	major = BLKIF_MAJOR(vdevice);
	*minor = BLKIF_MINOR(vdevice);
	switch (major) {
		case XEN_IDE0_MAJOR:
			*offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
			*minor = ((*minor / 64) * PARTS_PER_DISK) +
				EMULATED_HD_DISK_MINOR_OFFSET;
			break;
		case XEN_IDE1_MAJOR:
			*offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
			*minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
				EMULATED_HD_DISK_MINOR_OFFSET;
			break;
		case XEN_SCSI_DISK0_MAJOR:
			*offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
			*minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
			break;
		case XEN_SCSI_DISK1_MAJOR:
		case XEN_SCSI_DISK2_MAJOR:
		case XEN_SCSI_DISK3_MAJOR:
		case XEN_SCSI_DISK4_MAJOR:
		case XEN_SCSI_DISK5_MAJOR:
		case XEN_SCSI_DISK6_MAJOR:
		case XEN_SCSI_DISK7_MAJOR:
			*offset = (*minor / PARTS_PER_DISK) +
				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
				EMULATED_SD_DISK_NAME_OFFSET;
			*minor = *minor +
				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
				EMULATED_SD_DISK_MINOR_OFFSET;
			break;
		case XEN_SCSI_DISK8_MAJOR:
		case XEN_SCSI_DISK9_MAJOR:
		case XEN_SCSI_DISK10_MAJOR:
		case XEN_SCSI_DISK11_MAJOR:
		case XEN_SCSI_DISK12_MAJOR:
		case XEN_SCSI_DISK13_MAJOR:
		case XEN_SCSI_DISK14_MAJOR:
		case XEN_SCSI_DISK15_MAJOR:
			*offset = (*minor / PARTS_PER_DISK) +
				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
				EMULATED_SD_DISK_NAME_OFFSET;
			*minor = *minor +
				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
				EMULATED_SD_DISK_MINOR_OFFSET;
			break;
		case XENVBD_MAJOR:
			*offset = *minor / PARTS_PER_DISK;
			break;
		default:
			printk(KERN_WARNING "blkfront: your disk configuration is "
					"incorrect, please use an xvd device instead\n");
			return -ENODEV;
	}
	return 0;
}

static char *encode_disk_name(char *ptr, unsigned int n)
{
	if (n >= 26)
		ptr = encode_disk_name(ptr, n / 26 - 1);
	*ptr = 'a' + n % 26;
	return ptr + 1;
}

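/*
 * Allocate and initialise the gendisk for this vbd: derive the disk name
 * and minor range from the virtual device number, reserve the minors, and
 * hook up the request queue and feature flags.
 */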
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
			       struct blkfront_info *info,
			       u16 vdisk_info, u16 sector_size)
{
	struct gendisk *gd;
	int nr_minors = 1;
	int err;
	unsigned int offset;
	int minor;
	int nr_parts;
	char *ptr;

	BUG_ON(info->gd != NULL);
	BUG_ON(info->rq != NULL);

	if ((info->vdevice>>EXT_SHIFT) > 1) {
		/* this is above the extended range; something is wrong */
		printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
		return -ENODEV;
	}

	if (!VDEV_IS_EXTENDED(info->vdevice)) {
		err = xen_translate_vdev(info->vdevice, &minor, &offset);
		if (err)
			return err;
		nr_parts = PARTS_PER_DISK;
	} else {
		minor = BLKIF_MINOR_EXT(info->vdevice);
		nr_parts = PARTS_PER_EXT_DISK;
		offset = minor / nr_parts;
		if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
				"emulated IDE disks,\n\t choose an xvd device name "
				"from xvde on\n", info->vdevice);
	}
	if (minor >> MINORBITS) {
		pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
			info->vdevice, minor);
		return -ENODEV;
	}

	if ((minor % nr_parts) == 0)
		nr_minors = nr_parts;

	err = xlbd_reserve_minors(minor, nr_minors);
	if (err)
		goto out;
	err = -ENODEV;

	gd = alloc_disk(nr_minors);
	if (gd == NULL)
		goto release;

	strcpy(gd->disk_name, DEV_NAME);
	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
	if (nr_minors > 1)
		*ptr = 0;
	else
		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
			 "%d", minor & (nr_parts - 1));

	gd->major = XENVBD_MAJOR;
	gd->first_minor = minor;
	gd->fops = &xlvbd_block_fops;
	gd->private_data = info;
	gd->driverfs_dev = &(info->xbdev->dev);
	set_capacity(gd, capacity);

	if (xlvbd_init_blk_queue(gd, sector_size)) {
		del_gendisk(gd);
		goto release;
	}

	info->rq = gd->queue;
	info->gd = gd;

	xlvbd_flush(info);

	if (vdisk_info & VDISK_READONLY)
		set_disk_ro(gd, 1);

	if (vdisk_info & VDISK_REMOVABLE)
		gd->flags |= GENHD_FL_REMOVABLE;

	if (vdisk_info & VDISK_CDROM)
		gd->flags |= GENHD_FL_CD;

	return 0;

 release:
	xlbd_release_minors(minor, nr_minors);
 out:
	return err;
}

static void xlvbd_release_gendisk(struct blkfront_info *info)
{
	unsigned int minor, nr_minors;
	unsigned long flags;

	if (info->rq == NULL)
		return;

	spin_lock_irqsave(&info->io_lock, flags);

	/* No more blkif_request(). */
	blk_stop_queue(info->rq);

	/* No more gnttab callback work. */
	gnttab_cancel_free_callback(&info->callback);
	spin_unlock_irqrestore(&info->io_lock, flags);

	/* Flush gnttab callback work. Must be done with no locks held. */
	flush_work(&info->work);

	del_gendisk(info->gd);

	minor = info->gd->first_minor;
	nr_minors = info->gd->minors;
	xlbd_release_minors(minor, nr_minors);

	blk_cleanup_queue(info->rq);
	info->rq = NULL;

	put_disk(info->gd);
	info->gd = NULL;
}

static void kick_pending_request_queues(struct blkfront_info *info)
{
	if (!RING_FULL(&info->ring)) {
		/* Re-enable calldowns. */
		blk_start_queue(info->rq);
		/* Kick things off immediately. */
		do_blkif_request(info->rq);
	}
}

static void blkif_restart_queue(struct work_struct *work)
{
	struct blkfront_info *info = container_of(work, struct blkfront_info, work);

	spin_lock_irq(&info->io_lock);
	if (info->connected == BLKIF_STATE_CONNECTED)
		kick_pending_request_queues(info);
	spin_unlock_irq(&info->io_lock);
}

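/*
 * Tear down the connection to the backend: stop the request queue, drop
 * all persistent grants, revoke and free the shared ring, and unbind the
 * event channel. Used both on device teardown and when reconnecting
 * across a suspend/resume.
 */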
static void blkif_free(struct blkfront_info *info, int suspend)
{
	struct grant *persistent_gnt;
	struct grant *n;

	/* Prevent new requests being issued until we fix things up. */
	spin_lock_irq(&info->io_lock);
	info->connected = suspend ?
		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
	/* No more blkif_request(). */
	if (info->rq)
		blk_stop_queue(info->rq);

	/* Remove all persistent grants */
	if (!list_empty(&info->persistent_gnts)) {
		list_for_each_entry_safe(persistent_gnt, n,
					 &info->persistent_gnts, node) {
			list_del(&persistent_gnt->node);
			if (persistent_gnt->gref != GRANT_INVALID_REF) {
				gnttab_end_foreign_access(persistent_gnt->gref,
							  0, 0UL);
				info->persistent_gnts_c--;
			}
			__free_page(pfn_to_page(persistent_gnt->pfn));
			kfree(persistent_gnt);
		}
	}
	BUG_ON(info->persistent_gnts_c != 0);

	/* No more gnttab callback work. */
	gnttab_cancel_free_callback(&info->callback);
	spin_unlock_irq(&info->io_lock);

	/* Flush gnttab callback work. Must be done with no locks held. */
	flush_work(&info->work);

	/* Free resources associated with old device channel. */
	if (info->ring_ref != GRANT_INVALID_REF) {
		gnttab_end_foreign_access(info->ring_ref, 0,
					  (unsigned long)info->ring.sring);
		info->ring_ref = GRANT_INVALID_REF;
		info->ring.sring = NULL;
	}
	if (info->irq)
		unbind_from_irqhandler(info->irq, info);
	info->evtchn = info->irq = 0;

}

static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
			     struct blkif_response *bret)
{
	int i = 0;
	struct bio_vec *bvec;
	struct req_iterator iter;
	unsigned long flags;
	char *bvec_data;
	void *shared_data;
	unsigned int offset = 0;

	if (bret->operation == BLKIF_OP_READ) {
		/*
		 * Copy the data received from the backend into the bvec.
		 * Since bv_offset can be different than 0, and bv_len different
		 * than PAGE_SIZE, we have to keep track of the current offset,
		 * to be sure we are copying the data from the right shared page.
		 */
		rq_for_each_segment(bvec, s->request, iter) {
			BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
			if (bvec->bv_offset < offset)
				i++;
			BUG_ON(i >= s->req.u.rw.nr_segments);
			shared_data = kmap_atomic(
				pfn_to_page(s->grants_used[i]->pfn));
			bvec_data = bvec_kmap_irq(bvec, &flags);
			memcpy(bvec_data, shared_data + bvec->bv_offset,
				bvec->bv_len);
			bvec_kunmap_irq(bvec_data, &flags);
			kunmap_atomic(shared_data);
			offset = bvec->bv_offset + bvec->bv_len;
		}
	}
	/* Add the persistent grant into the list of free grants */
	for (i = 0; i < s->req.u.rw.nr_segments; i++) {
		list_add(&s->grants_used[i]->node, &info->persistent_gnts);
		info->persistent_gnts_c++;
	}
}

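/*
 * Event channel interrupt handler: walk the responses placed on the ring
 * by the backend, recycle the shadow entries and their persistent grants,
 * complete the corresponding block layer requests, and restart the queue
 * if there is room on the ring again.
 */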
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
{
	struct request *req;
	struct blkif_response *bret;
	RING_IDX i, rp;
	unsigned long flags;
	struct blkfront_info *info = (struct blkfront_info *)dev_id;
	int error;

	spin_lock_irqsave(&info->io_lock, flags);

	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
		spin_unlock_irqrestore(&info->io_lock, flags);
		return IRQ_HANDLED;
	}

 again:
	rp = info->ring.sring->rsp_prod;
	rmb(); /* Ensure we see queued responses up to 'rp'. */

	for (i = info->ring.rsp_cons; i != rp; i++) {
		unsigned long id;

		bret = RING_GET_RESPONSE(&info->ring, i);
		id   = bret->id;
		/*
		 * The backend has messed up and given us an id that we would
		 * never have given to it (we stamp it up to BLK_RING_SIZE -
		 * look in get_id_from_freelist).
		 */
		if (id >= BLK_RING_SIZE) {
			WARN(1, "%s: response to %s has incorrect id (%ld)\n",
			     info->gd->disk_name, op_name(bret->operation), id);
			/* We can't safely get the 'struct request' as
			 * the id is busted. */
			continue;
		}
		req  = info->shadow[id].request;

		if (bret->operation != BLKIF_OP_DISCARD)
			blkif_completion(&info->shadow[id], info, bret);

		if (add_id_to_freelist(info, id)) {
			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
			     info->gd->disk_name, op_name(bret->operation), id);
			continue;
		}

		error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
		switch (bret->operation) {
		case BLKIF_OP_DISCARD:
			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
				struct request_queue *rq = info->rq;
				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
					   info->gd->disk_name, op_name(bret->operation));
				error = -EOPNOTSUPP;
				info->feature_discard = 0;
				info->feature_secdiscard = 0;
				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
			}
			__blk_end_request_all(req, error);
			break;
		case BLKIF_OP_FLUSH_DISKCACHE:
		case BLKIF_OP_WRITE_BARRIER:
			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
				       info->gd->disk_name, op_name(bret->operation));
				error = -EOPNOTSUPP;
			}
			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
				     info->shadow[id].req.u.rw.nr_segments == 0)) {
				printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
				       info->gd->disk_name, op_name(bret->operation));
				error = -EOPNOTSUPP;
			}
			if (unlikely(error)) {
				if (error == -EOPNOTSUPP)
					error = 0;
				info->feature_flush = 0;
				info->flush_op = 0;
				xlvbd_flush(info);
			}
			/* fall through */
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			if (unlikely(bret->status != BLKIF_RSP_OKAY))
				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
					"request: %x\n", bret->status);

			__blk_end_request_all(req, error);
			break;
		default:
			BUG();
		}
	}

	info->ring.rsp_cons = i;

	if (i != info->ring.req_prod_pvt) {
		int more_to_do;
		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
		if (more_to_do)
			goto again;
	} else
		info->ring.sring->rsp_event = i + 1;

	kick_pending_request_queues(info);

	spin_unlock_irqrestore(&info->io_lock, flags);

	return IRQ_HANDLED;
}


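/*
 * Allocate the shared ring page and grant it to the backend, preallocate
 * the pool of persistent grant pages, and bind an event channel for ring
 * notifications.
 */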
static int setup_blkring(struct xenbus_device *dev,
			 struct blkfront_info *info)
{
	struct blkif_sring *sring;
	int err;

	info->ring_ref = GRANT_INVALID_REF;

	sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
	if (!sring) {
		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
		return -ENOMEM;
	}
	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);

	sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);

	/* Allocate memory for grants */
	err = fill_grant_buffer(info, BLK_RING_SIZE *
				      BLKIF_MAX_SEGMENTS_PER_REQUEST);
	if (err)
		goto fail;

	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
	if (err < 0) {
		free_page((unsigned long)sring);
		info->ring.sring = NULL;
		goto fail;
	}
	info->ring_ref = err;

	err = xenbus_alloc_evtchn(dev, &info->evtchn);
	if (err)
		goto fail;

	err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
					"blkif", info);
	if (err <= 0) {
		xenbus_dev_fatal(dev, err,
				 "bind_evtchn_to_irqhandler failed");
		goto fail;
	}
	info->irq = err;

	return 0;
fail:
	blkif_free(info, 0);
	return err;
}


/* Common code used when first setting up, and when resuming. */
static int talk_to_blkback(struct xenbus_device *dev,
			   struct blkfront_info *info)
{
	const char *message = NULL;
	struct xenbus_transaction xbt;
	int err;

	/* Create shared ring, alloc event channel. */
	err = setup_blkring(dev, info);
	if (err)
		goto out;

again:
	err = xenbus_transaction_start(&xbt);
	if (err) {
		xenbus_dev_fatal(dev, err, "starting transaction");
		goto destroy_blkring;
	}

	err = xenbus_printf(xbt, dev->nodename,
			    "ring-ref", "%u", info->ring_ref);
	if (err) {
		message = "writing ring-ref";
		goto abort_transaction;
	}
	err = xenbus_printf(xbt, dev->nodename,
			    "event-channel", "%u", info->evtchn);
	if (err) {
		message = "writing event-channel";
		goto abort_transaction;
	}
	err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
			    XEN_IO_PROTO_ABI_NATIVE);
	if (err) {
		message = "writing protocol";
		goto abort_transaction;
	}
	err = xenbus_printf(xbt, dev->nodename,
			    "feature-persistent", "%u", 1);
	if (err)
		dev_warn(&dev->dev,
			 "writing persistent grants feature to xenbus");

	err = xenbus_transaction_end(xbt, 0);
	if (err) {
		if (err == -EAGAIN)
			goto again;
		xenbus_dev_fatal(dev, err, "completing transaction");
		goto destroy_blkring;
	}

	xenbus_switch_state(dev, XenbusStateInitialised);

	return 0;

 abort_transaction:
	xenbus_transaction_end(xbt, 1);
	if (message)
		xenbus_dev_fatal(dev, err, "%s", message);
 destroy_blkring:
	blkif_free(info, 0);
 out:
	return err;
}

/**
 * Entry point to this code when a new device is created.  Allocate the basic
 * structures and the ring buffer for communication with the backend, and
 * inform the backend of the appropriate details for those.  Switch to
 * Initialised state.
 */
static int blkfront_probe(struct xenbus_device *dev,
			  const struct xenbus_device_id *id)
{
	int err, vdevice, i;
	struct blkfront_info *info;

	/* FIXME: Use dynamic device id if this is not set. */
	err = xenbus_scanf(XBT_NIL, dev->nodename,
			   "virtual-device", "%i", &vdevice);
	if (err != 1) {
		/* go looking in the extended area instead */
		err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
				   "%i", &vdevice);
		if (err != 1) {
			xenbus_dev_fatal(dev, err, "reading virtual-device");
			return err;
		}
	}

	if (xen_hvm_domain()) {
		char *type;
		int len;
		/* no unplug has been done: do not hook devices != xen vbds */
		if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
			int major;

			if (!VDEV_IS_EXTENDED(vdevice))
				major = BLKIF_MAJOR(vdevice);
			else
				major = XENVBD_MAJOR;

			if (major != XENVBD_MAJOR) {
				printk(KERN_INFO
						"%s: HVM does not support vbd %d as xen block device\n",
						__FUNCTION__, vdevice);
				return -ENODEV;
			}
		}
		/* do not create a PV cdrom device if we are an HVM guest */
		type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
		if (IS_ERR(type))
			return -ENODEV;
		if (strncmp(type, "cdrom", 5) == 0) {
			kfree(type);
			return -ENODEV;
		}
		kfree(type);
	}
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info) {
		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
		return -ENOMEM;
	}

	mutex_init(&info->mutex);
	spin_lock_init(&info->io_lock);
	info->xbdev = dev;
	info->vdevice = vdevice;
	INIT_LIST_HEAD(&info->persistent_gnts);
	info->persistent_gnts_c = 0;
	info->connected = BLKIF_STATE_DISCONNECTED;
	INIT_WORK(&info->work, blkif_restart_queue);

	for (i = 0; i < BLK_RING_SIZE; i++)
		info->shadow[i].req.u.rw.id = i+1;
	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;

	/* Front end dir is a number, which is used as the id. */
	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
	dev_set_drvdata(&dev->dev, info);

	err = talk_to_blkback(dev, info);
	if (err) {
		kfree(info);
		dev_set_drvdata(&dev->dev, NULL);
		return err;
	}

	return 0;
}


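/*
 * Replay the shadow ring after a resume: every shadow entry that still has
 * an in-flight request gets a fresh id, its grant references are
 * re-established, and the request is pushed onto the new ring.
 */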
static int blkif_recover(struct blkfront_info *info)
{
	int i;
	struct blkif_request *req;
	struct blk_shadow *copy;
	int j;

	/* Stage 1: Make a safe copy of the shadow state. */
	copy = kmemdup(info->shadow, sizeof(info->shadow),
		       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
	if (!copy)
		return -ENOMEM;

	/* Stage 2: Set up free list. */
	memset(&info->shadow, 0, sizeof(info->shadow));
	for (i = 0; i < BLK_RING_SIZE; i++)
		info->shadow[i].req.u.rw.id = i+1;
	info->shadow_free = info->ring.req_prod_pvt;
	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;

	/* Stage 3: Find pending requests and requeue them. */
	for (i = 0; i < BLK_RING_SIZE; i++) {
		/* Not in use? */
		if (!copy[i].request)
			continue;

		/* Grab a request slot and copy shadow state into it. */
		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
		*req = copy[i].req;

		/* We get a new request id, and must reset the shadow state. */
		req->u.rw.id = get_id_from_freelist(info);
		memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));

		if (req->operation != BLKIF_OP_DISCARD) {
		/* Rewrite any grant references invalidated by susp/resume. */
			for (j = 0; j < req->u.rw.nr_segments; j++)
				gnttab_grant_foreign_access_ref(
					req->u.rw.seg[j].gref,
					info->xbdev->otherend_id,
					pfn_to_mfn(copy[i].grants_used[j]->pfn),
					0);
		}
		info->shadow[req->u.rw.id].req = *req;

		info->ring.req_prod_pvt++;
	}

	kfree(copy);

	xenbus_switch_state(info->xbdev, XenbusStateConnected);

	spin_lock_irq(&info->io_lock);

	/* Now safe for us to use the shared ring */
	info->connected = BLKIF_STATE_CONNECTED;

	/* Send off requeued requests */
	flush_requests(info);

	/* Kick any other new requests queued since we resumed */
	kick_pending_request_queues(info);

	spin_unlock_irq(&info->io_lock);

	return 0;
}

/**
 * We are reconnecting to the backend, due to a suspend/resume, or a backend
 * driver restart.  We tear down our blkif structure and recreate it, but
 * leave the device-layer structures intact so that this is transparent to the
 * rest of the kernel.
 */
static int blkfront_resume(struct xenbus_device *dev)
{
	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
	int err;

	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);

	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);

	err = talk_to_blkback(dev, info);
	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
		err = blkif_recover(info);

	return err;
}

static void
blkfront_closing(struct blkfront_info *info)
{
	struct xenbus_device *xbdev = info->xbdev;
	struct block_device *bdev = NULL;

	mutex_lock(&info->mutex);

	if (xbdev->state == XenbusStateClosing) {
		mutex_unlock(&info->mutex);
		return;
	}

	if (info->gd)
		bdev = bdget_disk(info->gd, 0);

	mutex_unlock(&info->mutex);

	if (!bdev) {
		xenbus_frontend_closed(xbdev);
		return;
	}

	mutex_lock(&bdev->bd_mutex);

	if (bdev->bd_openers) {
		xenbus_dev_error(xbdev, -EBUSY,
				 "Device in use; refusing to close");
		xenbus_switch_state(xbdev, XenbusStateClosing);
	} else {
		xlvbd_release_gendisk(info);
		xenbus_frontend_closed(xbdev);
	}

	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
}

static void blkfront_setup_discard(struct blkfront_info *info)
{
	int err;
	char *type;
	unsigned int discard_granularity;
	unsigned int discard_alignment;
	unsigned int discard_secure;

	type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
	if (IS_ERR(type))
		return;

	info->feature_secdiscard = 0;
	if (strncmp(type, "phy", 3) == 0) {
		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			"discard-granularity", "%u", &discard_granularity,
			"discard-alignment", "%u", &discard_alignment,
			NULL);
		if (!err) {
			info->feature_discard = 1;
			info->discard_granularity = discard_granularity;
			info->discard_alignment = discard_alignment;
		}
		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "discard-secure", "%d", &discard_secure,
			    NULL);
		if (!err)
			info->feature_secdiscard = discard_secure;

	} else if (strncmp(type, "file", 4) == 0)
		info->feature_discard = 1;

	kfree(type);
}

/*
 * Invoked when the backend is finally 'ready' (and has produced
 * the details about the physical device - #sectors, size, etc).
 */
static void blkfront_connect(struct blkfront_info *info)
{
	unsigned long long sectors;
	unsigned long sector_size;
	unsigned int binfo;
	int err;
	int barrier, flush, discard, persistent;

	switch (info->connected) {
	case BLKIF_STATE_CONNECTED:
		/*
		 * Potentially, the back-end may be signalling
		 * a capacity change; update the capacity.
		 */
		err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
				   "sectors", "%Lu", &sectors);
		if (XENBUS_EXIST_ERR(err))
			return;
		printk(KERN_INFO "Setting capacity to %Lu\n",
		       sectors);
		set_capacity(info->gd, sectors);
		revalidate_disk(info->gd);

		/* fall through */
	case BLKIF_STATE_SUSPENDED:
		return;

	default:
		break;
	}

	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
		__func__, info->xbdev->otherend);

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "sectors", "%llu", &sectors,
			    "info", "%u", &binfo,
			    "sector-size", "%lu", &sector_size,
			    NULL);
	if (err) {
		xenbus_dev_fatal(info->xbdev, err,
				 "reading backend fields at %s",
				 info->xbdev->otherend);
		return;
	}

	info->feature_flush = 0;
	info->flush_op = 0;

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "feature-barrier", "%d", &barrier,
			    NULL);

	/*
	 * If there's no "feature-barrier" defined, then it means
	 * we're dealing with a very old backend which writes
	 * synchronously; nothing to do.
	 *
	 * If there are barriers, then we use flush.
	 */
	if (!err && barrier) {
		info->feature_flush = REQ_FLUSH | REQ_FUA;
		info->flush_op = BLKIF_OP_WRITE_BARRIER;
	}
	/*
	 * And if there is "feature-flush-cache" use that above
	 * barriers.
	 */
	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "feature-flush-cache", "%d", &flush,
			    NULL);

	if (!err && flush) {
		info->feature_flush = REQ_FLUSH;
		info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
	}

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "feature-discard", "%d", &discard,
			    NULL);

	if (!err && discard)
		blkfront_setup_discard(info);

	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
			    "feature-persistent", "%u", &persistent,
			    NULL);
	if (err)
		info->feature_persistent = 0;
	else
		info->feature_persistent = persistent;

	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
	if (err) {
		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
				 info->xbdev->otherend);
		return;
	}

	xenbus_switch_state(info->xbdev, XenbusStateConnected);

	/* Kick pending requests. */
	spin_lock_irq(&info->io_lock);
	info->connected = BLKIF_STATE_CONNECTED;
	kick_pending_request_queues(info);
	spin_unlock_irq(&info->io_lock);

	add_disk(info->gd);

	info->is_ready = 1;
}

/**
 * Callback received when the backend's state changes.
 */
static void blkback_changed(struct xenbus_device *dev,
			    enum xenbus_state backend_state)
{
	struct blkfront_info *info = dev_get_drvdata(&dev->dev);

	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);

	switch (backend_state) {
	case XenbusStateInitialising:
	case XenbusStateInitWait:
	case XenbusStateInitialised:
	case XenbusStateReconfiguring:
	case XenbusStateReconfigured:
	case XenbusStateUnknown:
	case XenbusStateClosed:
		break;

	case XenbusStateConnected:
		blkfront_connect(info);
		break;

	case XenbusStateClosing:
		blkfront_closing(info);
		break;
	}
}

static int blkfront_remove(struct xenbus_device *xbdev)
{
	struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
	struct block_device *bdev = NULL;
	struct gendisk *disk;

	dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);

	blkif_free(info, 0);

	mutex_lock(&info->mutex);

	disk = info->gd;
	if (disk)
		bdev = bdget_disk(disk, 0);

	info->xbdev = NULL;
	mutex_unlock(&info->mutex);

	if (!bdev) {
		kfree(info);
		return 0;
	}

	/*
	 * The xbdev was removed before we reached the Closed
	 * state. See if it's safe to remove the disk. If the bdev
	 * isn't closed yet, we let release take care of it.
	 */

	mutex_lock(&bdev->bd_mutex);
	info = disk->private_data;

	dev_warn(disk_to_dev(disk),
		 "%s was hot-unplugged, %d stale handles\n",
		 xbdev->nodename, bdev->bd_openers);

	if (info && !bdev->bd_openers) {
		xlvbd_release_gendisk(info);
		disk->private_data = NULL;
		kfree(info);
	}

	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);

	return 0;
}

static int blkfront_is_ready(struct xenbus_device *dev)
{
	struct blkfront_info *info = dev_get_drvdata(&dev->dev);

	return info->is_ready && info->xbdev;
}

static int blkif_open(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	struct blkfront_info *info;
	int err = 0;

	mutex_lock(&blkfront_mutex);

	info = disk->private_data;
	if (!info) {
		/* xbdev gone */
		err = -ERESTARTSYS;
		goto out;
	}

	mutex_lock(&info->mutex);

	if (!info->gd)
		/* xbdev is closed */
		err = -ERESTARTSYS;

	mutex_unlock(&info->mutex);

out:
	mutex_unlock(&blkfront_mutex);
	return err;
}

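/*
 * Called on close. If this was the last opener and the backend asked us to
 * close (or the device was hot-unplugged) while it was still open, finish
 * tearing down the gendisk here.
 */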
static void blkif_release(struct gendisk *disk, fmode_t mode)
{
	struct blkfront_info *info = disk->private_data;
	struct block_device *bdev;
	struct xenbus_device *xbdev;

	mutex_lock(&blkfront_mutex);

	bdev = bdget_disk(disk, 0);

	if (bdev->bd_openers)
		goto out;

	/*
	 * Check if we have been instructed to close. We will have
	 * deferred this request, because the bdev was still open.
	 */

	mutex_lock(&info->mutex);
	xbdev = info->xbdev;

	if (xbdev && xbdev->state == XenbusStateClosing) {
		/* pending switch to state closed */
		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
		xlvbd_release_gendisk(info);
		xenbus_frontend_closed(info->xbdev);
	}

	mutex_unlock(&info->mutex);

	if (!xbdev) {
		/* sudden device removal */
		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
		xlvbd_release_gendisk(info);
		disk->private_data = NULL;
		kfree(info);
	}

out:
	bdput(bdev);
	mutex_unlock(&blkfront_mutex);
}

static const struct block_device_operations xlvbd_block_fops =
{
	.owner = THIS_MODULE,
	.open = blkif_open,
	.release = blkif_release,
	.getgeo = blkif_getgeo,
	.ioctl = blkif_ioctl,
};


static const struct xenbus_device_id blkfront_ids[] = {
	{ "vbd" },
	{ "" }
};

static DEFINE_XENBUS_DRIVER(blkfront, ,
	.probe = blkfront_probe,
	.remove = blkfront_remove,
	.resume = blkfront_resume,
	.otherend_changed = blkback_changed,
	.is_ready = blkfront_is_ready,
);

static int __init xlblk_init(void)
{
	int ret;

	if (!xen_domain())
		return -ENODEV;

	if (xen_hvm_domain() && !xen_platform_pci_unplug)
		return -ENODEV;

	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
		       XENVBD_MAJOR, DEV_NAME);
		return -ENODEV;
	}

	ret = xenbus_register_frontend(&blkfront_driver);
	if (ret) {
		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
		return ret;
	}

	return 0;
}
module_init(xlblk_init);


static void __exit xlblk_exit(void)
{
	xenbus_unregister_driver(&blkfront_driver);
	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
	kfree(minors);
}
module_exit(xlblk_exit);

MODULE_DESCRIPTION("Xen virtual block device frontend");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
MODULE_ALIAS("xen:vbd");
MODULE_ALIAS("xenblk");