/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/dm-io.h>

struct dm_io_client {
	mempool_t *pool;
	struct bio_set *bios;
};

/* FIXME: can we shrink this ? */
struct io {
	unsigned long error_bits;
	atomic_t count;
	struct task_struct *sleeper;
	struct dm_io_client *client;
	io_notify_fn callback;
	void *context;
};

/*
 * io contexts are only dynamically allocated for asynchronous
 * io. Since async io is likely to be the majority of io we'll
 * have the same number of io contexts as bios! (FIXME: must reduce this).
 */

static unsigned int pages_to_ios(unsigned int pages)
{
	return 4 * pages;	/* too many ? */
}

/*
 * Create a client with mempool and bioset.
 */
struct dm_io_client *dm_io_client_create(unsigned num_pages)
{
	unsigned ios = pages_to_ios(num_pages);
	struct dm_io_client *client;

	client = kmalloc(sizeof(*client), GFP_KERNEL);
	if (!client)
		return ERR_PTR(-ENOMEM);

	client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
	if (!client->pool)
		goto bad;

	client->bios = bioset_create(16, 16);
	if (!client->bios)
		goto bad;

	return client;

   bad:
	if (client->pool)
		mempool_destroy(client->pool);
	kfree(client);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(dm_io_client_create);

int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
{
	return mempool_resize(client->pool, pages_to_ios(num_pages),
			      GFP_KERNEL);
}
EXPORT_SYMBOL(dm_io_client_resize);

void dm_io_client_destroy(struct dm_io_client *client)
{
	mempool_destroy(client->pool);
	bioset_free(client->bios);
	kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * In order to save a memory allocation we store this in the last
 * bvec which we know is unused (blech).
 * XXX This is ugly and can OOPS with some configs... find another way.
 *---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
	bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}

static inline unsigned bio_get_region(struct bio *bio)
{
	return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
	if (error)
		set_bit(region, &io->error_bits);

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
			unsigned long r = io->error_bits;
			io_notify_fn fn = io->callback;
			void *context = io->context;

			mempool_free(io, io->client->pool);
			fn(r, context);
		}
	}
}

static void endio(struct bio *bio, int error)
{
	struct io *io;
	unsigned region;

	if (error && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	io = bio->bi_private;
	region = bio_get_region(bio);

	bio->bi_max_vecs++;
	bio_put(bio);

	dec_count(io, region, error);
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned *offset);
	void (*next_page)(struct dpages *dp);

	unsigned context_u;
	void *context_ptr;
};

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
			  struct page **p, unsigned long *len, unsigned *offset)
{
	unsigned o = dp->context_u;
	struct page_list *pl = (struct page_list *) dp->context_ptr;

	*p = pl->page;
	*len = PAGE_SIZE - o;
	*offset = o;
}

static void list_next_page(struct dpages *dp)
{
	struct page_list *pl = (struct page_list *) dp->context_ptr;
	dp->context_ptr = pl->next;
	dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
	dp->get_page = list_get_page;
	dp->next_page = list_next_page;
	dp->context_u = offset;
	dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bvec_get_page(struct dpages *dp,
			  struct page **p, unsigned long *len, unsigned *offset)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	*p = bvec->bv_page;
	*len = bvec->bv_len;
	*offset = bvec->bv_offset;
}

static void bvec_next_page(struct dpages *dp)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	dp->context_ptr = bvec + 1;
}

static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
	dp->get_page = bvec_get_page;
	dp->next_page = bvec_next_page;
	dp->context_ptr = bvec;
}

/*
 * Functions for getting the pages from a VMA.
 */
static void vm_get_page(struct dpages *dp,
			struct page **p, unsigned long *len, unsigned *offset)
{
	*p = vmalloc_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = vm_get_page;
	dp->next_page = vm_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

static void dm_bio_destructor(struct bio *bio)
{
	struct io *io = bio->bi_private;

	bio_free(bio, io->client->bios);
}

/*
 * Functions for getting the pages from kernel memory.
 */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
			unsigned *offset)
{
	*p = virt_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void km_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void km_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = km_get_page;
	dp->next_page = km_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	while (remaining) {
		/*
		 * Allocate a suitably sized bio: we add an extra
		 * bvec for bio_get/set_region() and decrement bi_max_vecs
		 * to hide it from bio_add_page().
		 */
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
				      num_bvecs);
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_private = io;
		bio->bi_destructor = dm_bio_destructor;
		bio->bi_max_vecs--;
		bio_set_region(bio, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	}
}

static void dispatch_io(int rw, unsigned int num_regions,
			struct dm_io_region *where, struct dpages *dp,
			struct io *io, int sync)
{
	int i;
	struct dpages old_pages = *dp;

	if (sync)
		rw |= (1 << BIO_RW_SYNC);

	/*
	 * For multiple regions we need to be careful to rewind
	 * the dp object for each call to do_region.
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		if (where[i].count)
			do_region(rw, i, where + i, dp, io);
	}

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}

static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct dm_io_region *where, int rw, struct dpages *dp,
		   unsigned long *error_bits)
{
	struct io io;

	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		return -EIO;
	}

	io.error_bits = 0;
	atomic_set(&io.count, 1); /* see dispatch_io() */
	io.sleeper = current;
	io.client = client;

	dispatch_io(rw, num_regions, where, dp, &io, 1);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io.count) || signal_pending(current))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	if (atomic_read(&io.count))
		return -EINTR;

	if (error_bits)
		*error_bits = io.error_bits;

	return io.error_bits ? -EIO : 0;
}

static int async_io(struct dm_io_client *client, unsigned int num_regions,
		    struct dm_io_region *where, int rw, struct dpages *dp,
		    io_notify_fn fn, void *context)
{
	struct io *io;

	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(client->pool, GFP_NOIO);
	io->error_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = NULL;
	io->client = client;
	io->callback = fn;
	io->context = context;

	dispatch_io(rw, num_regions, where, dp, io, 0);
	return 0;
}

static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
	/* Set up dpages based on memory type */
	switch (io_req->mem.type) {
	case DM_IO_PAGE_LIST:
		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
		break;

	case DM_IO_BVEC:
		bvec_dp_init(dp, io_req->mem.ptr.bvec);
		break;

	case DM_IO_VMA:
		vm_dp_init(dp, io_req->mem.ptr.vma);
		break;

	case DM_IO_KMEM:
		km_dp_init(dp, io_req->mem.ptr.addr);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * New collapsed (a)synchronous interface.
 *
 * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
 * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in
 * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
 * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
 */
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
	  struct dm_io_region *where, unsigned long *sync_error_bits)
{
	int r;
	struct dpages dp;

	r = dp_init(io_req, &dp);
	if (r)
		return r;

	if (!io_req->notify.fn)
		return sync_io(io_req->client, num_regions, where,
			       io_req->bi_rw, &dp, sync_error_bits);

	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
			&dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);
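
/*
 * Example usage (a minimal sketch, NOT part of the original file, kept out
 * of the build with #if 0): how a caller such as a device-mapper target
 * might issue a synchronous, single-region read of one page into kernel
 * memory using DM_IO_KMEM.  The function name example_read_page() and its
 * bdev/sector/buf parameters are hypothetical; a real caller would normally
 * keep the dm_io_client around rather than creating one per request.
 */
#if 0
static int example_read_page(struct block_device *bdev, sector_t sector,
			     void *buf)
{
	struct dm_io_client *client;
	struct dm_io_region where = {
		.bdev = bdev,
		.sector = sector,
		.count = PAGE_SIZE >> SECTOR_SHIFT,
	};
	struct dm_io_request req = {
		.bi_rw = READ,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = buf,	/* page-aligned kernel buffer */
		.notify.fn = NULL,	/* NULL notify.fn => synchronous dm_io() */
	};
	unsigned long error_bits = 0;
	int r;

	/* Pool sized for a single page of in-flight io. */
	client = dm_io_client_create(1);
	if (IS_ERR(client))
		return PTR_ERR(client);

	req.client = client;
	r = dm_io(&req, 1, &where, &error_bits);	/* -EIO if any region failed */

	dm_io_client_destroy(client);
	return r;
}
#endif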