/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
/*
 * aoedev.c
 * AoE device utility functions; maintains device list.
 */

#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/netdevice.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/kdev_t.h>
#include <linux/moduleparam.h>
#include "aoe.h"

static void dummy_timer(ulong);
static void freetgt(struct aoedev *d, struct aoetgt *t);
static void skbpoolfree(struct aoedev *d);

static int aoe_dyndevs = 1;
module_param(aoe_dyndevs, int, 0644);
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");

static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock);

/* Because some systems will have one, many, or no
 *   - partitions,
 *   - slots per shelf,
 *   - or shelves,
 * we need some flexibility in the way the minor numbers
 * are allocated.  So they are dynamic.
 */
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)

static DEFINE_SPINLOCK(used_minors_lock);
static DECLARE_BITMAP(used_minors, N_DEVS);

static int
minor_get_dyn(ulong *sysminor)
{
	ulong flags;
	ulong n;
	int error = 0;

	spin_lock_irqsave(&used_minors_lock, flags);
	n = find_first_zero_bit(used_minors, N_DEVS);
	if (n < N_DEVS)
		set_bit(n, used_minors);
	else
		error = -1;
	spin_unlock_irqrestore(&used_minors_lock, flags);

	*sysminor = n * AOE_PARTITIONS;
	return error;
}

static int
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
{
	ulong flags;
	ulong n;
	int error = 0;
	enum {
		/* for backwards compatibility when !aoe_dyndevs,
		 * a static number of supported slots per shelf */
		NPERSHELF = 16,
	};

	if (aoemin >= NPERSHELF) {
		pr_err("aoe: %s %d slots per shelf\n",
			"static minor device numbers support only",
			NPERSHELF);
		error = -1;
		goto out;
	}

	n = aoemaj * NPERSHELF + aoemin;
	if (n >= N_DEVS) {
		pr_err("aoe: %s with e%ld.%d\n",
			"cannot use static minor device numbers",
			aoemaj, aoemin);
		error = -1;
		goto out;
	}

	spin_lock_irqsave(&used_minors_lock, flags);
	if (test_bit(n, used_minors)) {
		pr_err("aoe: %s %lu\n",
			"existing device already has static minor number",
			n);
		error = -1;
	} else
		set_bit(n, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
	*sysminor = n * AOE_PARTITIONS;
out:
	return error;
}

static int
minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
{
	if (aoe_dyndevs)
		return minor_get_dyn(sysminor);
	else
		return minor_get_static(sysminor, aoemaj, aoemin);
}

static void
minor_free(ulong minor)
{
	ulong flags;

	minor /= AOE_PARTITIONS;
	BUG_ON(minor >= N_DEVS);

	spin_lock_irqsave(&used_minors_lock, flags);
	BUG_ON(!test_bit(minor, used_minors));
	clear_bit(minor, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
}
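
/* Worked example of the sysminor mapping above (a sketch; it assumes
 * AOE_PARTITIONS from aoe.h is 16, so adjust if the build differs).
 * Each whole aoe device claims AOE_PARTITIONS consecutive minors.
 *
 * Dynamic mode (aoe_dyndevs != 0): the first clear bit n in used_minors
 * is taken, and sysminor = n * AOE_PARTITIONS, independent of the AoE
 * shelf.slot address.
 *
 * Static mode (aoe_dyndevs == 0): n = aoemaj * NPERSHELF + aoemin, so
 * device e1.3 maps to n = 1 * 16 + 3 = 19 and
 * sysminor = 19 * AOE_PARTITIONS = 304.
 */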

/*
 * Users who grab a pointer to the device with aoedev_by_aoeaddr
 * automatically get a reference count and must be responsible
 * for performing an aoedev_put.  With the addition of async
 * kthread processing I'm no longer confident that we can
 * guarantee consistency in the face of device flushes.
 *
 * For the time being, we only bother to add extra references for
 * frames sitting on the iocq.  When the kthreads finish processing
 * these frames, they will aoedev_put the device.
 */

void
aoedev_put(struct aoedev *d)
{
	ulong flags;

	spin_lock_irqsave(&devlist_lock, flags);
	d->ref--;
	spin_unlock_irqrestore(&devlist_lock, flags);
}

static void
dummy_timer(ulong vp)
{
	struct aoedev *d;

	d = (struct aoedev *)vp;
	if (d->flags & DEVFL_TKILL)
		return;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
}

static void
aoe_failip(struct aoedev *d)
{
	struct request *rq;
	struct bio *bio;
	unsigned long n;

	aoe_failbuf(d, d->ip.buf);

	rq = d->ip.rq;
	if (rq == NULL)
		return;
	while ((bio = d->ip.nxbio)) {
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
		d->ip.nxbio = bio->bi_next;
		n = (unsigned long) rq->special;
		rq->special = (void *) --n;
	}
	if ((unsigned long) rq->special == 0)
		aoe_end_request(d, rq, 0);
}

static void
downdev_frame(struct list_head *pos)
{
	struct frame *f;

	f = list_entry(pos, struct frame, head);
	list_del(pos);
	if (f->buf) {
		f->buf->nframesout--;
		aoe_failbuf(f->t->d, f->buf);
	}
	aoe_freetframe(f);
}

void
aoedev_downdev(struct aoedev *d)
{
	struct aoetgt *t, **tt, **te;
	struct list_head *head, *pos, *nx;
	struct request *rq;
	int i;

	d->flags &= ~DEVFL_UP;

	/* clean out active and to-be-retransmitted buffers */
	for (i = 0; i < NFACTIVE; i++) {
		head = &d->factive[i];
		list_for_each_safe(pos, nx, head)
			downdev_frame(pos);
	}
	head = &d->rexmitq;
	list_for_each_safe(pos, nx, head)
		downdev_frame(pos);

	/* reset window dressings */
	tt = d->targets;
	te = tt + d->ntargets;
	for (; tt < te && (t = *tt); tt++) {
		aoecmd_wreset(t);
		t->nout = 0;
	}

	/* clean out the in-process request (if any) */
	aoe_failip(d);

	/* fast fail all pending I/O */
	if (d->blkq) {
		while ((rq = blk_peek_request(d->blkq))) {
			blk_start_request(rq);
			aoe_end_request(d, rq, 1);
		}
	}

	if (d->gd)
		set_capacity(d->gd, 0);
}

/* return whether the user asked for this particular
 * device to be flushed
 */
static int
user_req(char *s, size_t slen, struct aoedev *d)
{
	char *p;
	size_t lim;

	if (!d->gd)
		return 0;
	p = strrchr(d->gd->disk_name, '/');
	if (!p)
		p = d->gd->disk_name;
	else
		p += 1;
	lim = sizeof(d->gd->disk_name);
	lim -= p - d->gd->disk_name;
	if (slen < lim)
		lim = slen;

	return !strncmp(s, p, lim);
}

static void
freedev(struct aoedev *d)
{
	struct aoetgt **t, **e;
	int freeing = 0;
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	if (d->flags & DEVFL_TKILL
	&& !(d->flags & DEVFL_FREEING)) {
		d->flags |= DEVFL_FREEING;
		freeing = 1;
	}
	spin_unlock_irqrestore(&d->lock, flags);
	if (!freeing)
		return;

	del_timer_sync(&d->timer);
	if (d->gd) {
		aoedisk_rm_sysfs(d);
		del_gendisk(d->gd);
		put_disk(d->gd);
		blk_cleanup_queue(d->blkq);
	}
	t = d->targets;
	e = t + d->ntargets;
	for (; t < e && *t; t++)
		freetgt(d, *t);
	if (d->bufpool)
		mempool_destroy(d->bufpool);
	skbpoolfree(d);
	minor_free(d->sysminor);

	spin_lock_irqsave(&d->lock, flags);
	d->flags |= DEVFL_FREED;
	spin_unlock_irqrestore(&d->lock, flags);
}
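
/* Summary of the flush logic below: flush() makes three passes over
 * devlist (see the inline "pass" comments).  Pass one holds devlist_lock
 * and, without sleeping, runs aoedev_downdev on candidates and marks them
 * DEVFL_TKILL.  Pass two drops the locks before calling freedev, which may
 * sleep (del_timer_sync, del_gendisk), and restarts its scan after each
 * call.  Pass three unlinks devices marked DEVFL_FREED from devlist and
 * frees their memory.
 */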

enum flush_parms {
	NOT_EXITING = 0,
	EXITING = 1,
};

static int
flush(const char __user *str, size_t cnt, int exiting)
{
	ulong flags;
	struct aoedev *d, **dd;
	char buf[16];
	int all = 0;
	int specified = 0;	/* flush a specific device */
	unsigned int skipflags;

	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;

	if (!exiting && cnt >= 3) {
		if (cnt > sizeof buf)
			cnt = sizeof buf;
		if (copy_from_user(buf, str, cnt))
			return -EFAULT;
		all = !strncmp(buf, "all", 3);
		if (!all)
			specified = 1;
	}

	flush_scheduled_work();
	/* pass one: without sleeping, do aoedev_downdev */
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (exiting) {
			/* unconditionally take each device down */
		} else if (specified) {
			if (!user_req(buf, cnt, d))
				goto cont;
		} else if ((!all && (d->flags & DEVFL_UP))
		|| d->flags & skipflags
		|| d->nopen
		|| d->ref)
			goto cont;

		aoedev_downdev(d);
		d->flags |= DEVFL_TKILL;
cont:
		spin_unlock(&d->lock);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	/* pass two: call freedev, which might sleep,
	 * for aoedevs marked with DEVFL_TKILL
	 */
restart:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL
		&& !(d->flags & DEVFL_FREEING)) {
			spin_unlock(&d->lock);
			spin_unlock_irqrestore(&devlist_lock, flags);
			freedev(d);
			goto restart;
		}
		spin_unlock(&d->lock);
	}

	/* pass three: remove aoedevs marked with DEVFL_FREED */
	for (dd = &devlist, d = *dd; d; d = *dd) {
		struct aoedev *doomed = NULL;

		spin_lock(&d->lock);
		if (d->flags & DEVFL_FREED) {
			*dd = d->next;
			doomed = d;
		} else {
			dd = &d->next;
		}
		spin_unlock(&d->lock);
		if (doomed)
			kfree(doomed->targets);
		kfree(doomed);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	return 0;
}

int
aoedev_flush(const char __user *str, size_t cnt)
{
	return flush(str, cnt, NOT_EXITING);
}

/* This has been confirmed to occur once with Tms=3*1000 due to the
 * driver changing link and not processing its transmit ring.  The
 * problem is hard enough to solve by returning an error that I'm
 * still punting on "solving" this.
 */
static void
skbfree(struct sk_buff *skb)
{
	enum { Sms = 250, Tms = 30 * 1000};
	int i = Tms / Sms;

	if (skb == NULL)
		return;
	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
		msleep(Sms);
	if (i < 0) {
		printk(KERN_ERR
			"aoe: %s holds ref: %s\n",
			skb->dev ? skb->dev->name : "netif",
			"cannot free skb -- memory leaked.");
		return;
	}
	skb->truesize -= skb->data_len;
	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	skb_trim(skb, 0);
	dev_kfree_skb(skb);
}

static void
skbpoolfree(struct aoedev *d)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(&d->skbpool, skb, tmp)
		skbfree(skb);

	__skb_queue_head_init(&d->skbpool);
}
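
/* Sketch of the expected caller pattern (illustrative only): a successful
 * aoedev_by_aoeaddr lookup returns the device with its reference count
 * already raised, so the caller must balance it with aoedev_put when done:
 *
 *	d = aoedev_by_aoeaddr(maj, min, 0);
 *	if (d == NULL)
 *		return;
 *	... use d, taking d->lock as needed ...
 *	aoedev_put(d);
 */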

/* find it or allocate it */
struct aoedev *
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
{
	struct aoedev *d;
	int i;
	ulong flags;
	ulong sysminor = 0;

	spin_lock_irqsave(&devlist_lock, flags);

	for (d=devlist; d; d=d->next)
		if (d->aoemajor == maj && d->aoeminor == min) {
			spin_lock(&d->lock);
			if (d->flags & DEVFL_TKILL) {
				spin_unlock(&d->lock);
				d = NULL;
				goto out;
			}
			d->ref++;
			spin_unlock(&d->lock);
			break;
		}
	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
		goto out;
	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
	if (!d)
		goto out;
	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
	if (!d->targets) {
		kfree(d);
		d = NULL;
		goto out;
	}
	d->ntargets = NTARGETS;
	INIT_WORK(&d->work, aoecmd_sleepwork);
	spin_lock_init(&d->lock);
	skb_queue_head_init(&d->skbpool);
	init_timer(&d->timer);
	d->timer.data = (ulong) d;
	d->timer.function = dummy_timer;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
	d->tgt = d->targets;
	d->ref = 1;
	for (i = 0; i < NFACTIVE; i++)
		INIT_LIST_HEAD(&d->factive[i]);
	INIT_LIST_HEAD(&d->rexmitq);
	d->sysminor = sysminor;
	d->aoemajor = maj;
	d->aoeminor = min;
	d->rttavg = RTTAVG_INIT;
	d->rttdev = RTTDEV_INIT;
	d->next = devlist;
	devlist = d;
out:
	spin_unlock_irqrestore(&devlist_lock, flags);
	return d;
}

static void
freetgt(struct aoedev *d, struct aoetgt *t)
{
	struct frame *f;
	struct list_head *pos, *nx, *head;
	struct aoeif *ifp;

	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
		if (!ifp->nd)
			break;
		dev_put(ifp->nd);
	}

	head = &t->ffree;
	list_for_each_safe(pos, nx, head) {
		list_del(pos);
		f = list_entry(pos, struct frame, head);
		skbfree(f->skb);
		kfree(f);
	}
	kfree(t);
}

void
aoedev_exit(void)
{
	flush_scheduled_work();
	aoe_flush_iocq();
	flush(NULL, 0, EXITING);
}

int __init
aoedev_init(void)
{
	return 0;
}
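
/* Usage note (illustrative, assuming the driver is built as the "aoe"
 * module): aoe_dyndevs above is an int module parameter with mode 0644,
 * so static minor numbering can be selected at load time with
 * "modprobe aoe aoe_dyndevs=0", or toggled later by root through
 * /sys/module/aoe/parameters/aoe_dyndevs.
 */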