xref: /openbmc/linux/drivers/block/xen-blkfront.c (revision 384740dc)
1 /*
2  * blkfront.c
3  *
4  * XenLinux virtual block device driver.
5  *
6  * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7  * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8  * Copyright (c) 2004, Christian Limpach
9  * Copyright (c) 2004, Andrew Warfield
10  * Copyright (c) 2005, Christopher Clark
11  * Copyright (c) 2005, XenSource Ltd
12  *
13  * This program is free software; you can redistribute it and/or
14  * modify it under the terms of the GNU General Public License version 2
15  * as published by the Free Software Foundation; or, when distributed
16  * separately from the Linux kernel or incorporated into other
17  * software packages, subject to the following license:
18  *
19  * Permission is hereby granted, free of charge, to any person obtaining a copy
20  * of this source file (the "Software"), to deal in the Software without
21  * restriction, including without limitation the rights to use, copy, modify,
22  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23  * and to permit persons to whom the Software is furnished to do so, subject to
24  * the following conditions:
25  *
26  * The above copyright notice and this permission notice shall be included in
27  * all copies or substantial portions of the Software.
28  *
29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35  * IN THE SOFTWARE.
36  */
37 
38 #include <linux/interrupt.h>
39 #include <linux/blkdev.h>
40 #include <linux/hdreg.h>
41 #include <linux/cdrom.h>
42 #include <linux/module.h>
43 
44 #include <xen/xenbus.h>
45 #include <xen/grant_table.h>
46 #include <xen/events.h>
47 #include <xen/page.h>
48 
49 #include <xen/interface/grant_table.h>
50 #include <xen/interface/io/blkif.h>
51 #include <xen/interface/io/protocols.h>
52 
53 #include <asm/xen/hypervisor.h>
54 
/* Connection state of a frontend device with respect to its backend. */
enum blkif_state {
	BLKIF_STATE_DISCONNECTED,	/* set by blkif_free() when not suspending */
	BLKIF_STATE_CONNECTED,		/* set once requests may be placed on the ring */
	BLKIF_STATE_SUSPENDED,		/* set by blkif_free() when suspending */
};
60 
/*
 * Private per-slot record of an in-flight request, kept so that requests
 * can be reissued when recovering.
 */
struct blk_shadow {
	/* Copy of the request written to the ring; req.id doubles as the free-list link. */
	struct blkif_request req;
	/* The struct request pointer stored as an integer; 0 while the slot is free. */
	unsigned long request;
	/* Page frame number recorded for each segment of the request. */
	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
66 
/* Forward declaration: the table is defined near the end of the file. */
static struct block_device_operations xlvbd_block_fops;

/* Ring size computed by __RING_SIZE for a shared ring occupying PAGE_SIZE bytes. */
#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
70 
/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
	struct xenbus_device *xbdev;	/* owning xenbus device */
	struct gendisk *gd;		/* disk created by xlvbd_alloc_gendisk() */
	int vdevice;			/* "virtual-device" number read in blkfront_probe() */
	blkif_vdev_t handle;		/* id parsed from the nodename; copied into each request */
	enum blkif_state connected;
	int ring_ref;			/* grant reference of the shared ring, or GRANT_INVALID_REF */
	struct blkif_front_ring ring;
	unsigned int evtchn, irq;	/* event channel and the irq bound to it; 0 when unbound */
	struct request_queue *rq;
	struct work_struct work;	/* runs blkif_restart_queue() */
	struct gnttab_free_callback callback;	/* registered when grant references run out */
	struct blk_shadow shadow[BLK_RING_SIZE];
	unsigned long shadow_free;	/* index of the first free shadow[] slot */
	int feature_barrier;		/* backend's "feature-barrier" value; 0 if absent */
	int is_ready;			/* set to 1 at the end of blkfront_connect() */

	/**
	 * The number of people holding this device open.  We won't allow a
	 * hot-unplug unless this is 0.
	 */
	int users;
};
100 
/* Lock handed to blk_init_queue() for every queue this driver creates. */
static DEFINE_SPINLOCK(blkif_io_lock);

/* Segments per request multiplied by the number of ring slots. */
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
/* Value of info->ring_ref while no ring page is granted. */
#define GRANT_INVALID_REF	0

/* Minors reserved per disk for non-extended and extended vdevice numbers. */
#define PARTS_PER_DISK		16
#define PARTS_PER_EXT_DISK      256

/* Split a non-extended vdevice number into its major and minor bytes. */
#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

/* Bit 28 of the vdevice number marks the extended numbering scheme. */
#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))

#define DEV_NAME	"xvd"	/* name in /dev */
119 
120 static int get_id_from_freelist(struct blkfront_info *info)
121 {
122 	unsigned long free = info->shadow_free;
123 	BUG_ON(free > BLK_RING_SIZE);
124 	info->shadow_free = info->shadow[free].req.id;
125 	info->shadow[free].req.id = 0x0fffffee; /* debug */
126 	return free;
127 }
128 
129 static void add_id_to_freelist(struct blkfront_info *info,
130 			       unsigned long id)
131 {
132 	info->shadow[id].req.id  = info->shadow_free;
133 	info->shadow[id].request = 0;
134 	info->shadow_free = id;
135 }
136 
137 static void blkif_restart_queue_callback(void *arg)
138 {
139 	struct blkfront_info *info = (struct blkfront_info *)arg;
140 	schedule_work(&info->work);
141 }
142 
143 static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
144 {
145 	/* We don't have real geometry info, but let's at least return
146 	   values consistent with the size of the device */
147 	sector_t nsect = get_capacity(bd->bd_disk);
148 	sector_t cylinders = nsect;
149 
150 	hg->heads = 0xff;
151 	hg->sectors = 0x3f;
152 	sector_div(cylinders, hg->heads * hg->sectors);
153 	hg->cylinders = cylinders;
154 	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
155 		hg->cylinders = 0xffff;
156 	return 0;
157 }
158 
159 static int blkif_ioctl(struct inode *inode, struct file *filep,
160 		       unsigned command, unsigned long argument)
161 {
162 	struct blkfront_info *info =
163 		inode->i_bdev->bd_disk->private_data;
164 	int i;
165 
166 	dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
167 		command, (long)argument);
168 
169 	switch (command) {
170 	case CDROMMULTISESSION:
171 		dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
172 		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
173 			if (put_user(0, (char __user *)(argument + i)))
174 				return -EFAULT;
175 		return 0;
176 
177 	case CDROM_GET_CAPABILITY: {
178 		struct gendisk *gd = info->gd;
179 		if (gd->flags & GENHD_FL_CD)
180 			return 0;
181 		return -EINVAL;
182 	}
183 
184 	default:
185 		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
186 		  command);*/
187 		return -EINVAL; /* same return as native Linux */
188 	}
189 
190 	return 0;
191 }
192 
193 /*
194  * blkif_queue_request
195  *
196  * request block io
197  *
198  * id: for guest use only.
199  * operation: BLKIF_OP_{READ,WRITE,PROBE}
200  * buffer: buffer to read/write into. this should be a
201  *   virtual address in the guest os.
202  */
203 static int blkif_queue_request(struct request *req)
204 {
205 	struct blkfront_info *info = req->rq_disk->private_data;
206 	unsigned long buffer_mfn;
207 	struct blkif_request *ring_req;
208 	struct req_iterator iter;
209 	struct bio_vec *bvec;
210 	unsigned long id;
211 	unsigned int fsect, lsect;
212 	int ref;
213 	grant_ref_t gref_head;
214 
215 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
216 		return 1;
217 
218 	if (gnttab_alloc_grant_references(
219 		BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
220 		gnttab_request_free_callback(
221 			&info->callback,
222 			blkif_restart_queue_callback,
223 			info,
224 			BLKIF_MAX_SEGMENTS_PER_REQUEST);
225 		return 1;
226 	}
227 
228 	/* Fill out a communications ring structure. */
229 	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
230 	id = get_id_from_freelist(info);
231 	info->shadow[id].request = (unsigned long)req;
232 
233 	ring_req->id = id;
234 	ring_req->sector_number = (blkif_sector_t)req->sector;
235 	ring_req->handle = info->handle;
236 
237 	ring_req->operation = rq_data_dir(req) ?
238 		BLKIF_OP_WRITE : BLKIF_OP_READ;
239 	if (blk_barrier_rq(req))
240 		ring_req->operation = BLKIF_OP_WRITE_BARRIER;
241 
242 	ring_req->nr_segments = 0;
243 	rq_for_each_segment(bvec, req, iter) {
244 		BUG_ON(ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST);
245 		buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
246 		fsect = bvec->bv_offset >> 9;
247 		lsect = fsect + (bvec->bv_len >> 9) - 1;
248 		/* install a grant reference. */
249 		ref = gnttab_claim_grant_reference(&gref_head);
250 		BUG_ON(ref == -ENOSPC);
251 
252 		gnttab_grant_foreign_access_ref(
253 				ref,
254 				info->xbdev->otherend_id,
255 				buffer_mfn,
256 				rq_data_dir(req) );
257 
258 		info->shadow[id].frame[ring_req->nr_segments] =
259 				mfn_to_pfn(buffer_mfn);
260 
261 		ring_req->seg[ring_req->nr_segments] =
262 				(struct blkif_request_segment) {
263 					.gref       = ref,
264 					.first_sect = fsect,
265 					.last_sect  = lsect };
266 
267 		ring_req->nr_segments++;
268 	}
269 
270 	info->ring.req_prod_pvt++;
271 
272 	/* Keep a private copy so we can reissue requests when recovering. */
273 	info->shadow[id].req = *ring_req;
274 
275 	gnttab_free_grant_references(gref_head);
276 
277 	return 0;
278 }
279 
280 
281 static inline void flush_requests(struct blkfront_info *info)
282 {
283 	int notify;
284 
285 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
286 
287 	if (notify)
288 		notify_remote_via_irq(info->irq);
289 }
290 
291 /*
292  * do_blkif_request
293  *  read a block; request is in a request queue
294  */
295 static void do_blkif_request(struct request_queue *rq)
296 {
297 	struct blkfront_info *info = NULL;
298 	struct request *req;
299 	int queued;
300 
301 	pr_debug("Entered do_blkif_request\n");
302 
303 	queued = 0;
304 
305 	while ((req = elv_next_request(rq)) != NULL) {
306 		info = req->rq_disk->private_data;
307 		if (!blk_fs_request(req)) {
308 			end_request(req, 0);
309 			continue;
310 		}
311 
312 		if (RING_FULL(&info->ring))
313 			goto wait;
314 
315 		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
316 			 "(%u/%li) buffer:%p [%s]\n",
317 			 req, req->cmd, (unsigned long)req->sector,
318 			 req->current_nr_sectors,
319 			 req->nr_sectors, req->buffer,
320 			 rq_data_dir(req) ? "write" : "read");
321 
322 
323 		blkdev_dequeue_request(req);
324 		if (blkif_queue_request(req)) {
325 			blk_requeue_request(rq, req);
326 wait:
327 			/* Avoid pointless unplugs. */
328 			blk_stop_queue(rq);
329 			break;
330 		}
331 
332 		queued++;
333 	}
334 
335 	if (queued != 0)
336 		flush_requests(info);
337 }
338 
339 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
340 {
341 	struct request_queue *rq;
342 
343 	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
344 	if (rq == NULL)
345 		return -1;
346 
347 	elevator_init(rq, "noop");
348 
349 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
350 	blk_queue_hardsect_size(rq, sector_size);
351 	blk_queue_max_sectors(rq, 512);
352 
353 	/* Each segment in a request is up to an aligned page in size. */
354 	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
355 	blk_queue_max_segment_size(rq, PAGE_SIZE);
356 
357 	/* Ensure a merged request will fit in a single I/O ring slot. */
358 	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
359 	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
360 
361 	/* Make sure buffer addresses are sector-aligned. */
362 	blk_queue_dma_alignment(rq, 511);
363 
364 	/* Make sure we don't use bounce buffers. */
365 	blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
366 
367 	gd->queue = rq;
368 
369 	return 0;
370 }
371 
372 
373 static int xlvbd_barrier(struct blkfront_info *info)
374 {
375 	int err;
376 
377 	err = blk_queue_ordered(info->rq,
378 				info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
379 				NULL);
380 
381 	if (err)
382 		return err;
383 
384 	printk(KERN_INFO "blkfront: %s: barriers %s\n",
385 	       info->gd->disk_name,
386 	       info->feature_barrier ? "enabled" : "disabled");
387 	return 0;
388 }
389 
390 
391 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
392 			       struct blkfront_info *info,
393 			       u16 vdisk_info, u16 sector_size)
394 {
395 	struct gendisk *gd;
396 	int nr_minors = 1;
397 	int err = -ENODEV;
398 	unsigned int offset;
399 	int minor;
400 	int nr_parts;
401 
402 	BUG_ON(info->gd != NULL);
403 	BUG_ON(info->rq != NULL);
404 
405 	if ((info->vdevice>>EXT_SHIFT) > 1) {
406 		/* this is above the extended range; something is wrong */
407 		printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
408 		return -ENODEV;
409 	}
410 
411 	if (!VDEV_IS_EXTENDED(info->vdevice)) {
412 		minor = BLKIF_MINOR(info->vdevice);
413 		nr_parts = PARTS_PER_DISK;
414 	} else {
415 		minor = BLKIF_MINOR_EXT(info->vdevice);
416 		nr_parts = PARTS_PER_EXT_DISK;
417 	}
418 
419 	if ((minor % nr_parts) == 0)
420 		nr_minors = nr_parts;
421 
422 	gd = alloc_disk(nr_minors);
423 	if (gd == NULL)
424 		goto out;
425 
426 	offset = minor / nr_parts;
427 
428 	if (nr_minors > 1) {
429 		if (offset < 26)
430 			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
431 		else
432 			sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
433 				'a' + ((offset / 26)-1), 'a' + (offset % 26));
434 	} else {
435 		if (offset < 26)
436 			sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
437 				'a' + offset,
438 				minor & (nr_parts - 1));
439 		else
440 			sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
441 				'a' + ((offset / 26) - 1),
442 				'a' + (offset % 26),
443 				minor & (nr_parts - 1));
444 	}
445 
446 	gd->major = XENVBD_MAJOR;
447 	gd->first_minor = minor;
448 	gd->fops = &xlvbd_block_fops;
449 	gd->private_data = info;
450 	gd->driverfs_dev = &(info->xbdev->dev);
451 	set_capacity(gd, capacity);
452 
453 	if (xlvbd_init_blk_queue(gd, sector_size)) {
454 		del_gendisk(gd);
455 		goto out;
456 	}
457 
458 	info->rq = gd->queue;
459 	info->gd = gd;
460 
461 	if (info->feature_barrier)
462 		xlvbd_barrier(info);
463 
464 	if (vdisk_info & VDISK_READONLY)
465 		set_disk_ro(gd, 1);
466 
467 	if (vdisk_info & VDISK_REMOVABLE)
468 		gd->flags |= GENHD_FL_REMOVABLE;
469 
470 	if (vdisk_info & VDISK_CDROM)
471 		gd->flags |= GENHD_FL_CD;
472 
473 	return 0;
474 
475  out:
476 	return err;
477 }
478 
479 static void kick_pending_request_queues(struct blkfront_info *info)
480 {
481 	if (!RING_FULL(&info->ring)) {
482 		/* Re-enable calldowns. */
483 		blk_start_queue(info->rq);
484 		/* Kick things off immediately. */
485 		do_blkif_request(info->rq);
486 	}
487 }
488 
489 static void blkif_restart_queue(struct work_struct *work)
490 {
491 	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
492 
493 	spin_lock_irq(&blkif_io_lock);
494 	if (info->connected == BLKIF_STATE_CONNECTED)
495 		kick_pending_request_queues(info);
496 	spin_unlock_irq(&blkif_io_lock);
497 }
498 
499 static void blkif_free(struct blkfront_info *info, int suspend)
500 {
501 	/* Prevent new requests being issued until we fix things up. */
502 	spin_lock_irq(&blkif_io_lock);
503 	info->connected = suspend ?
504 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
505 	/* No more blkif_request(). */
506 	if (info->rq)
507 		blk_stop_queue(info->rq);
508 	/* No more gnttab callback work. */
509 	gnttab_cancel_free_callback(&info->callback);
510 	spin_unlock_irq(&blkif_io_lock);
511 
512 	/* Flush gnttab callback work. Must be done with no locks held. */
513 	flush_scheduled_work();
514 
515 	/* Free resources associated with old device channel. */
516 	if (info->ring_ref != GRANT_INVALID_REF) {
517 		gnttab_end_foreign_access(info->ring_ref, 0,
518 					  (unsigned long)info->ring.sring);
519 		info->ring_ref = GRANT_INVALID_REF;
520 		info->ring.sring = NULL;
521 	}
522 	if (info->irq)
523 		unbind_from_irqhandler(info->irq, info);
524 	info->evtchn = info->irq = 0;
525 
526 }
527 
528 static void blkif_completion(struct blk_shadow *s)
529 {
530 	int i;
531 	for (i = 0; i < s->req.nr_segments; i++)
532 		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
533 }
534 
/*
 * Interrupt handler bound to the device's event channel.
 *
 * Consumes every response the backend has produced on the ring: for each
 * one it revokes the grants of the matching shadow request, returns the
 * shadow slot to the free list and completes the block-layer request.
 * Afterwards it kicks the request queue.  All of this runs under
 * blkif_io_lock.
 *
 * @irq:    unused here
 * @dev_id: the struct blkfront_info passed when the handler was bound
 *
 * Always returns IRQ_HANDLED.
 */
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
{
	struct request *req;
	struct blkif_response *bret;
	RING_IDX i, rp;
	unsigned long flags;
	struct blkfront_info *info = (struct blkfront_info *)dev_id;
	int error;

	spin_lock_irqsave(&blkif_io_lock, flags);

	/* Ignore interrupts while the device is not connected. */
	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
		spin_unlock_irqrestore(&blkif_io_lock, flags);
		return IRQ_HANDLED;
	}

 again:
	rp = info->ring.sring->rsp_prod;
	rmb(); /* Ensure we see queued responses up to 'rp'. */

	for (i = info->ring.rsp_cons; i != rp; i++) {
		unsigned long id;
		int ret;

		bret = RING_GET_RESPONSE(&info->ring, i);
		/* The response id is the shadow[] index chosen when queueing. */
		id   = bret->id;
		req  = (struct request *)info->shadow[id].request;

		blkif_completion(&info->shadow[id]);

		add_id_to_freelist(info, id);

		error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
		switch (bret->operation) {
		case BLKIF_OP_WRITE_BARRIER:
			/* Backend rejected the barrier: stop using barriers. */
			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
				       info->gd->disk_name);
				error = -EOPNOTSUPP;
				info->feature_barrier = 0;
				xlvbd_barrier(info);
			}
			/* fall through */
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			if (unlikely(bret->status != BLKIF_RSP_OKAY))
				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
					"request: %x\n", bret->status);

			/*
			 * The whole request (blk_rq_bytes) is ended at once;
			 * a non-zero return is treated as a bug.
			 */
			ret = __blk_end_request(req, error, blk_rq_bytes(req));
			BUG_ON(ret);
			break;
		default:
			BUG();
		}
	}

	info->ring.rsp_cons = i;

	if (i != info->ring.req_prod_pvt) {
		/* Requests are still outstanding: re-check for late responses. */
		int more_to_do;
		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
		if (more_to_do)
			goto again;
	} else
		/* Nothing outstanding; presumably requests an event for the next response. */
		info->ring.sring->rsp_event = i + 1;

	kick_pending_request_queues(info);

	spin_unlock_irqrestore(&blkif_io_lock, flags);

	return IRQ_HANDLED;
}
608 
609 
610 static int setup_blkring(struct xenbus_device *dev,
611 			 struct blkfront_info *info)
612 {
613 	struct blkif_sring *sring;
614 	int err;
615 
616 	info->ring_ref = GRANT_INVALID_REF;
617 
618 	sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
619 	if (!sring) {
620 		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
621 		return -ENOMEM;
622 	}
623 	SHARED_RING_INIT(sring);
624 	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
625 
626 	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
627 	if (err < 0) {
628 		free_page((unsigned long)sring);
629 		info->ring.sring = NULL;
630 		goto fail;
631 	}
632 	info->ring_ref = err;
633 
634 	err = xenbus_alloc_evtchn(dev, &info->evtchn);
635 	if (err)
636 		goto fail;
637 
638 	err = bind_evtchn_to_irqhandler(info->evtchn,
639 					blkif_interrupt,
640 					IRQF_SAMPLE_RANDOM, "blkif", info);
641 	if (err <= 0) {
642 		xenbus_dev_fatal(dev, err,
643 				 "bind_evtchn_to_irqhandler failed");
644 		goto fail;
645 	}
646 	info->irq = err;
647 
648 	return 0;
649 fail:
650 	blkif_free(info, 0);
651 	return err;
652 }
653 
654 
655 /* Common code used when first setting up, and when resuming. */
656 static int talk_to_backend(struct xenbus_device *dev,
657 			   struct blkfront_info *info)
658 {
659 	const char *message = NULL;
660 	struct xenbus_transaction xbt;
661 	int err;
662 
663 	/* Create shared ring, alloc event channel. */
664 	err = setup_blkring(dev, info);
665 	if (err)
666 		goto out;
667 
668 again:
669 	err = xenbus_transaction_start(&xbt);
670 	if (err) {
671 		xenbus_dev_fatal(dev, err, "starting transaction");
672 		goto destroy_blkring;
673 	}
674 
675 	err = xenbus_printf(xbt, dev->nodename,
676 			    "ring-ref", "%u", info->ring_ref);
677 	if (err) {
678 		message = "writing ring-ref";
679 		goto abort_transaction;
680 	}
681 	err = xenbus_printf(xbt, dev->nodename,
682 			    "event-channel", "%u", info->evtchn);
683 	if (err) {
684 		message = "writing event-channel";
685 		goto abort_transaction;
686 	}
687 	err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
688 			    XEN_IO_PROTO_ABI_NATIVE);
689 	if (err) {
690 		message = "writing protocol";
691 		goto abort_transaction;
692 	}
693 
694 	err = xenbus_transaction_end(xbt, 0);
695 	if (err) {
696 		if (err == -EAGAIN)
697 			goto again;
698 		xenbus_dev_fatal(dev, err, "completing transaction");
699 		goto destroy_blkring;
700 	}
701 
702 	xenbus_switch_state(dev, XenbusStateInitialised);
703 
704 	return 0;
705 
706  abort_transaction:
707 	xenbus_transaction_end(xbt, 1);
708 	if (message)
709 		xenbus_dev_fatal(dev, err, "%s", message);
710  destroy_blkring:
711 	blkif_free(info, 0);
712  out:
713 	return err;
714 }
715 
716 
717 /**
718  * Entry point to this code when a new device is created.  Allocate the basic
719  * structures and the ring buffer for communication with the backend, and
720  * inform the backend of the appropriate details for those.  Switch to
721  * Initialised state.
722  */
723 static int blkfront_probe(struct xenbus_device *dev,
724 			  const struct xenbus_device_id *id)
725 {
726 	int err, vdevice, i;
727 	struct blkfront_info *info;
728 
729 	/* FIXME: Use dynamic device id if this is not set. */
730 	err = xenbus_scanf(XBT_NIL, dev->nodename,
731 			   "virtual-device", "%i", &vdevice);
732 	if (err != 1) {
733 		/* go looking in the extended area instead */
734 		err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
735 				   "%i", &vdevice);
736 		if (err != 1) {
737 			xenbus_dev_fatal(dev, err, "reading virtual-device");
738 			return err;
739 		}
740 	}
741 
742 	info = kzalloc(sizeof(*info), GFP_KERNEL);
743 	if (!info) {
744 		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
745 		return -ENOMEM;
746 	}
747 
748 	info->xbdev = dev;
749 	info->vdevice = vdevice;
750 	info->connected = BLKIF_STATE_DISCONNECTED;
751 	INIT_WORK(&info->work, blkif_restart_queue);
752 
753 	for (i = 0; i < BLK_RING_SIZE; i++)
754 		info->shadow[i].req.id = i+1;
755 	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
756 
757 	/* Front end dir is a number, which is used as the id. */
758 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
759 	dev->dev.driver_data = info;
760 
761 	err = talk_to_backend(dev, info);
762 	if (err) {
763 		kfree(info);
764 		dev->dev.driver_data = NULL;
765 		return err;
766 	}
767 
768 	return 0;
769 }
770 
771 
/*
 * Re-issue every request that was in flight before the connection to the
 * backend was torn down.
 *
 * The old shadow array is copied aside, the free list is rebuilt, and each
 * slot that still holds a request is written into the ring again under a
 * freshly allocated id with its grant references re-established.  The
 * device is then switched to Connected, the requeued requests are flushed
 * and the request queue is kicked.
 *
 * Returns 0 on success or -ENOMEM if the temporary copy cannot be
 * allocated.
 */
static int blkif_recover(struct blkfront_info *info)
{
	int i;
	struct blkif_request *req;
	struct blk_shadow *copy;
	int j;

	/* Stage 1: Make a safe copy of the shadow state. */
	copy = kmalloc(sizeof(info->shadow),
		       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
	if (!copy)
		return -ENOMEM;
	memcpy(copy, info->shadow, sizeof(info->shadow));

	/* Stage 2: Set up free list. */
	memset(&info->shadow, 0, sizeof(info->shadow));
	for (i = 0; i < BLK_RING_SIZE; i++)
		info->shadow[i].req.id = i+1;
	/* NOTE(review): presumably req_prod_pvt is 0 here after the ring was re-initialised -- confirm. */
	info->shadow_free = info->ring.req_prod_pvt;
	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

	/* Stage 3: Find pending requests and requeue them. */
	for (i = 0; i < BLK_RING_SIZE; i++) {
		/* Not in use? */
		if (copy[i].request == 0)
			continue;

		/* Grab a request slot and copy shadow state into it. */
		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
		*req = copy[i].req;

		/* We get a new request id, and must reset the shadow state. */
		req->id = get_id_from_freelist(info);
		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));

		/* Rewrite any grant references invalidated by susp/resume. */
		for (j = 0; j < req->nr_segments; j++)
			gnttab_grant_foreign_access_ref(
				req->seg[j].gref,
				info->xbdev->otherend_id,
				pfn_to_mfn(info->shadow[req->id].frame[j]),
				rq_data_dir(
					(struct request *)
					info->shadow[req->id].request));
		/* Store the request with its new id as the shadow copy. */
		info->shadow[req->id].req = *req;

		info->ring.req_prod_pvt++;
	}

	kfree(copy);

	xenbus_switch_state(info->xbdev, XenbusStateConnected);

	spin_lock_irq(&blkif_io_lock);

	/* Now safe for us to use the shared ring */
	info->connected = BLKIF_STATE_CONNECTED;

	/* Send off requeued requests */
	flush_requests(info);

	/* Kick any other new requests queued since we resumed */
	kick_pending_request_queues(info);

	spin_unlock_irq(&blkif_io_lock);

	return 0;
}
840 
841 /**
842  * We are reconnecting to the backend, due to a suspend/resume, or a backend
843  * driver restart.  We tear down our blkif structure and recreate it, but
844  * leave the device-layer structures intact so that this is transparent to the
845  * rest of the kernel.
846  */
847 static int blkfront_resume(struct xenbus_device *dev)
848 {
849 	struct blkfront_info *info = dev->dev.driver_data;
850 	int err;
851 
852 	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
853 
854 	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
855 
856 	err = talk_to_backend(dev, info);
857 	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
858 		err = blkif_recover(info);
859 
860 	return err;
861 }
862 
863 
864 /*
865  * Invoked when the backend is finally 'ready' (and has told produced
866  * the details about the physical device - #sectors, size, etc).
867  */
868 static void blkfront_connect(struct blkfront_info *info)
869 {
870 	unsigned long long sectors;
871 	unsigned long sector_size;
872 	unsigned int binfo;
873 	int err;
874 
875 	if ((info->connected == BLKIF_STATE_CONNECTED) ||
876 	    (info->connected == BLKIF_STATE_SUSPENDED) )
877 		return;
878 
879 	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
880 		__func__, info->xbdev->otherend);
881 
882 	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
883 			    "sectors", "%llu", &sectors,
884 			    "info", "%u", &binfo,
885 			    "sector-size", "%lu", &sector_size,
886 			    NULL);
887 	if (err) {
888 		xenbus_dev_fatal(info->xbdev, err,
889 				 "reading backend fields at %s",
890 				 info->xbdev->otherend);
891 		return;
892 	}
893 
894 	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
895 			    "feature-barrier", "%lu", &info->feature_barrier,
896 			    NULL);
897 	if (err)
898 		info->feature_barrier = 0;
899 
900 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
901 	if (err) {
902 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
903 				 info->xbdev->otherend);
904 		return;
905 	}
906 
907 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
908 
909 	/* Kick pending requests. */
910 	spin_lock_irq(&blkif_io_lock);
911 	info->connected = BLKIF_STATE_CONNECTED;
912 	kick_pending_request_queues(info);
913 	spin_unlock_irq(&blkif_io_lock);
914 
915 	add_disk(info->gd);
916 
917 	info->is_ready = 1;
918 }
919 
920 /**
921  * Handle the change of state of the backend to Closing.  We must delete our
922  * device-layer structures now, to ensure that writes are flushed through to
923  * the backend.  Once is this done, we can switch to Closed in
924  * acknowledgement.
925  */
926 static void blkfront_closing(struct xenbus_device *dev)
927 {
928 	struct blkfront_info *info = dev->dev.driver_data;
929 	unsigned long flags;
930 
931 	dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
932 
933 	if (info->rq == NULL)
934 		goto out;
935 
936 	spin_lock_irqsave(&blkif_io_lock, flags);
937 
938 	del_gendisk(info->gd);
939 
940 	/* No more blkif_request(). */
941 	blk_stop_queue(info->rq);
942 
943 	/* No more gnttab callback work. */
944 	gnttab_cancel_free_callback(&info->callback);
945 	spin_unlock_irqrestore(&blkif_io_lock, flags);
946 
947 	/* Flush gnttab callback work. Must be done with no locks held. */
948 	flush_scheduled_work();
949 
950 	blk_cleanup_queue(info->rq);
951 	info->rq = NULL;
952 
953  out:
954 	xenbus_frontend_closed(dev);
955 }
956 
957 /**
958  * Callback received when the backend's state changes.
959  */
960 static void backend_changed(struct xenbus_device *dev,
961 			    enum xenbus_state backend_state)
962 {
963 	struct blkfront_info *info = dev->dev.driver_data;
964 	struct block_device *bd;
965 
966 	dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
967 
968 	switch (backend_state) {
969 	case XenbusStateInitialising:
970 	case XenbusStateInitWait:
971 	case XenbusStateInitialised:
972 	case XenbusStateUnknown:
973 	case XenbusStateClosed:
974 		break;
975 
976 	case XenbusStateConnected:
977 		blkfront_connect(info);
978 		break;
979 
980 	case XenbusStateClosing:
981 		bd = bdget_disk(info->gd, 0);
982 		if (bd == NULL)
983 			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
984 
985 		mutex_lock(&bd->bd_mutex);
986 		if (info->users > 0)
987 			xenbus_dev_error(dev, -EBUSY,
988 					 "Device in use; refusing to close");
989 		else
990 			blkfront_closing(dev);
991 		mutex_unlock(&bd->bd_mutex);
992 		bdput(bd);
993 		break;
994 	}
995 }
996 
997 static int blkfront_remove(struct xenbus_device *dev)
998 {
999 	struct blkfront_info *info = dev->dev.driver_data;
1000 
1001 	dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
1002 
1003 	blkif_free(info, 0);
1004 
1005 	kfree(info);
1006 
1007 	return 0;
1008 }
1009 
1010 static int blkfront_is_ready(struct xenbus_device *dev)
1011 {
1012 	struct blkfront_info *info = dev->dev.driver_data;
1013 
1014 	return info->is_ready;
1015 }
1016 
1017 static int blkif_open(struct inode *inode, struct file *filep)
1018 {
1019 	struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
1020 	info->users++;
1021 	return 0;
1022 }
1023 
1024 static int blkif_release(struct inode *inode, struct file *filep)
1025 {
1026 	struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
1027 	info->users--;
1028 	if (info->users == 0) {
1029 		/* Check whether we have been instructed to close.  We will
1030 		   have ignored this request initially, as the device was
1031 		   still mounted. */
1032 		struct xenbus_device *dev = info->xbdev;
1033 		enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
1034 
1035 		if (state == XenbusStateClosing && info->is_ready)
1036 			blkfront_closing(dev);
1037 	}
1038 	return 0;
1039 }
1040 
/* Block device operations installed on every gendisk this driver creates. */
static struct block_device_operations xlvbd_block_fops =
{
	.owner = THIS_MODULE,
	.open = blkif_open,
	.release = blkif_release,
	.getgeo = blkif_getgeo,
	.ioctl = blkif_ioctl,
};
1049 
1050 
/* Xenbus device types handled by this driver; the empty entry ends the list. */
static struct xenbus_device_id blkfront_ids[] = {
	{ "vbd" },
	{ "" }
};
1055 
/* Xenbus frontend driver description, registered in xlblk_init(). */
static struct xenbus_driver blkfront = {
	.name = "vbd",
	.owner = THIS_MODULE,
	.ids = blkfront_ids,
	.probe = blkfront_probe,
	.remove = blkfront_remove,
	.resume = blkfront_resume,
	.otherend_changed = backend_changed,
	.is_ready = blkfront_is_ready,
};
1066 
1067 static int __init xlblk_init(void)
1068 {
1069 	if (!is_running_on_xen())
1070 		return -ENODEV;
1071 
1072 	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
1073 		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
1074 		       XENVBD_MAJOR, DEV_NAME);
1075 		return -ENODEV;
1076 	}
1077 
1078 	return xenbus_register_frontend(&blkfront);
1079 }
1080 module_init(xlblk_init);
1081 
1082 
1083 static void __exit xlblk_exit(void)
1084 {
1085 	return xenbus_unregister_driver(&blkfront);
1086 }
1087 module_exit(xlblk_exit);
1088 
/* Module metadata and the aliases under which the module can be requested. */
MODULE_DESCRIPTION("Xen virtual block device frontend");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
MODULE_ALIAS("xen:vbd");
MODULE_ALIAS("xenblk");
1094