/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
/*
 * aoedev.c
 * AoE device utility functions; maintains device list.
 */

#include <linux/hdreg.h>
#include <linux/blk-mq.h>
#include <linux/netdevice.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/kdev_t.h>
#include <linux/moduleparam.h>
#include <linux/string.h>
#include "aoe.h"

static void freetgt(struct aoedev *d, struct aoetgt *t);
static void skbpoolfree(struct aoedev *d);

static int aoe_dyndevs = 1;
module_param(aoe_dyndevs, int, 0644);
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");

static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock);

/* Because some systems will have one, many, or no
 *   - partitions,
 *   - slots per shelf,
 *   - or shelves,
 * we need some flexibility in the way the minor numbers
 * are allocated.  So they are dynamic.
 */
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
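/* For example, assuming the usual MINORBITS of 20 and AOE_PARTITIONS of 16,
 * N_DEVS works out to (1 << 20) / 16 == 65536 addressable AoE devices.
 */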

static DEFINE_SPINLOCK(used_minors_lock);
static DECLARE_BITMAP(used_minors, N_DEVS);

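/* Allocate the first free dynamic minor by claiming a bit in used_minors.
 * Returns 0 on success or -1 when all N_DEVS slots are taken.
 */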
static int
minor_get_dyn(ulong *sysminor)
{
	ulong flags;
	ulong n;
	int error = 0;

	spin_lock_irqsave(&used_minors_lock, flags);
	n = find_first_zero_bit(used_minors, N_DEVS);
	if (n < N_DEVS)
		set_bit(n, used_minors);
	else
		error = -1;
	spin_unlock_irqrestore(&used_minors_lock, flags);

	*sysminor = n * AOE_PARTITIONS;
	return error;
}

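/* Map an AoE shelf/slot address to a fixed minor number, using a static
 * 16-slots-per-shelf layout for backwards compatibility.  Fails if the slot
 * number, the resulting minor, or an already-claimed bit rules it out.
 */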
static int
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
{
	ulong flags;
	ulong n;
	int error = 0;
	enum {
		/* for backwards compatibility when !aoe_dyndevs,
		 * a static number of supported slots per shelf */
		NPERSHELF = 16,
	};

	if (aoemin >= NPERSHELF) {
		pr_err("aoe: %s %d slots per shelf\n",
			"static minor device numbers support only",
			NPERSHELF);
		error = -1;
		goto out;
	}

	n = aoemaj * NPERSHELF + aoemin;
	if (n >= N_DEVS) {
		pr_err("aoe: %s with e%ld.%d\n",
			"cannot use static minor device numbers",
			aoemaj, aoemin);
		error = -1;
		goto out;
	}

	spin_lock_irqsave(&used_minors_lock, flags);
	if (test_bit(n, used_minors)) {
		pr_err("aoe: %s %lu\n",
			"existing device already has static minor number",
			n);
		error = -1;
	} else
		set_bit(n, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
	*sysminor = n * AOE_PARTITIONS;
out:
	return error;
}

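/* Pick a dynamic or static minor according to the aoe_dyndevs module parameter. */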
static int
minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
{
	if (aoe_dyndevs)
		return minor_get_dyn(sysminor);
	else
		return minor_get_static(sysminor, aoemaj, aoemin);
}

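/* Return a minor number to the pool; its bit in used_minors must be set. */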
static void
minor_free(ulong minor)
{
	ulong flags;

	minor /= AOE_PARTITIONS;
	BUG_ON(minor >= N_DEVS);

	spin_lock_irqsave(&used_minors_lock, flags);
	BUG_ON(!test_bit(minor, used_minors));
	clear_bit(minor, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
}

/*
 * Users who grab a pointer to the device with aoedev_by_aoeaddr
 * automatically get a reference count and are responsible for
 * performing an aoedev_put.  With the addition of async
 * kthread processing I'm no longer confident that we can
 * guarantee consistency in the face of device flushes.
 *
 * For the time being, we only bother to add extra references for
 * frames sitting on the iocq.  When the kthreads finish processing
 * these frames, they will aoedev_put the device.
 */

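/* Drop a reference taken via aoedev_by_aoeaddr or for a frame on the iocq. */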
void
aoedev_put(struct aoedev *d)
{
	ulong flags;

	spin_lock_irqsave(&devlist_lock, flags);
	d->ref--;
	spin_unlock_irqrestore(&devlist_lock, flags);
}

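/* Per-device timer callback; it does no work, just re-arms itself every
 * second until the device is marked DEVFL_TKILL.
 */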
static void
dummy_timer(struct timer_list *t)
{
	struct aoedev *d;

	d = from_timer(d, t, timer);
	if (d->flags & DEVFL_TKILL)
		return;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
}

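/* Fail the in-process request: error out the buffer being filled and every
 * bio still chained to it, then end the request once no bios remain.
 */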
static void
aoe_failip(struct aoedev *d)
{
	struct request *rq;
	struct aoe_req *req;
	struct bio *bio;

	aoe_failbuf(d, d->ip.buf);
	rq = d->ip.rq;
	if (rq == NULL)
		return;

	req = blk_mq_rq_to_pdu(rq);
	while ((bio = d->ip.nxbio)) {
		bio->bi_status = BLK_STS_IOERR;
		d->ip.nxbio = bio->bi_next;
		req->nr_bios--;
	}

	if (!req->nr_bios)
		aoe_end_request(d, rq, 0);
}

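/* Take one outstanding frame off its list, fail its buffer (if any), and
 * return the frame to the target's free list.
 */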
static void
downdev_frame(struct list_head *pos)
{
	struct frame *f;

	f = list_entry(pos, struct frame, head);
	list_del(pos);
	if (f->buf) {
		f->buf->nframesout--;
		aoe_failbuf(f->t->d, f->buf);
	}
	aoe_freetframe(f);
}

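/* Mark the device as down: discard all active and queued-for-retransmit
 * frames, reset each target's congestion window, fail the in-process
 * request, error out any pending I/O, and zero the disk capacity.
 */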
void
aoedev_downdev(struct aoedev *d)
{
	struct aoetgt *t, **tt, **te;
	struct list_head *head, *pos, *nx;
	int i;

	d->flags &= ~DEVFL_UP;

	/* clean out active and to-be-retransmitted buffers */
	for (i = 0; i < NFACTIVE; i++) {
		head = &d->factive[i];
		list_for_each_safe(pos, nx, head)
			downdev_frame(pos);
	}
	head = &d->rexmitq;
	list_for_each_safe(pos, nx, head)
		downdev_frame(pos);

	/* reset window dressings */
	tt = d->targets;
	te = tt + d->ntargets;
	for (; tt < te && (t = *tt); tt++) {
		aoecmd_wreset(t);
		t->nout = 0;
	}

	/* clean out the in-process request (if any) */
	aoe_failip(d);

	/* fast fail all pending I/O */
	if (d->blkq) {
		/* UP is cleared, freeze+quiesce to ensure all are errored */
		blk_mq_freeze_queue(d->blkq);
		blk_mq_quiesce_queue(d->blkq);
		blk_mq_unquiesce_queue(d->blkq);
		blk_mq_unfreeze_queue(d->blkq);
	}

	if (d->gd)
		set_capacity(d->gd, 0);
}

/* return whether the user asked for this particular
 * device to be flushed
 */
static int
user_req(char *s, size_t slen, struct aoedev *d)
{
	const char *p;
	size_t lim;

	if (!d->gd)
		return 0;
	p = kbasename(d->gd->disk_name);
	lim = sizeof(d->gd->disk_name);
	lim -= p - d->gd->disk_name;
	if (slen < lim)
		lim = slen;

	return !strncmp(s, p, lim);
}

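/* Tear down a device already marked DEVFL_TKILL: exactly one caller wins
 * the right to free it (DEVFL_FREEING), then the timer, gendisk, targets,
 * mempool, skb pool, and minor number are released and the device is
 * marked DEVFL_FREED for final removal from the list.
 */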
static void
freedev(struct aoedev *d)
{
	struct aoetgt **t, **e;
	int freeing = 0;
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	if (d->flags & DEVFL_TKILL
	&& !(d->flags & DEVFL_FREEING)) {
		d->flags |= DEVFL_FREEING;
		freeing = 1;
	}
	spin_unlock_irqrestore(&d->lock, flags);
	if (!freeing)
		return;

	del_timer_sync(&d->timer);
	if (d->gd) {
		aoedisk_rm_debugfs(d);
		del_gendisk(d->gd);
		put_disk(d->gd);
		blk_mq_free_tag_set(&d->tag_set);
	}
	t = d->targets;
	e = t + d->ntargets;
	for (; t < e && *t; t++)
		freetgt(d, *t);

	mempool_destroy(d->bufpool);
	skbpoolfree(d);
	minor_free(d->sysminor);

	spin_lock_irqsave(&d->lock, flags);
	d->flags |= DEVFL_FREED;
	spin_unlock_irqrestore(&d->lock, flags);
}

enum flush_parms {
	NOT_EXITING = 0,
	EXITING = 1,
};

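/* Flush (destroy) aoe devices in three passes: pass one marks candidate
 * devices with DEVFL_TKILL and takes them down via aoedev_downdev; pass
 * two calls freedev, which may sleep, for each marked device; pass three
 * unlinks the DEVFL_FREED devices from devlist and frees them.
 */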
static int
flush(const char __user *str, size_t cnt, int exiting)
{
	ulong flags;
	struct aoedev *d, **dd;
	char buf[16];
	int all = 0;
	int specified = 0;	/* flush a specific device */
	unsigned int skipflags;

	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;

	if (!exiting && cnt >= 3) {
		if (cnt > sizeof buf)
			cnt = sizeof buf;
		if (copy_from_user(buf, str, cnt))
			return -EFAULT;
		all = !strncmp(buf, "all", 3);
		if (!all)
			specified = 1;
	}

	flush_workqueue(aoe_wq);
	/* pass one: do aoedev_downdev, which might sleep */
restart1:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL)
			goto cont;

		if (exiting) {
			/* unconditionally take each device down */
		} else if (specified) {
			if (!user_req(buf, cnt, d))
				goto cont;
		} else if ((!all && (d->flags & DEVFL_UP))
		|| d->flags & skipflags
		|| d->nopen
		|| d->ref)
			goto cont;

		spin_unlock(&d->lock);
		spin_unlock_irqrestore(&devlist_lock, flags);
		aoedev_downdev(d);
		d->flags |= DEVFL_TKILL;
		goto restart1;
cont:
		spin_unlock(&d->lock);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	/* pass two: call freedev, which might sleep,
	 * for aoedevs marked with DEVFL_TKILL
	 */
restart2:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL
		&& !(d->flags & DEVFL_FREEING)) {
			spin_unlock(&d->lock);
			spin_unlock_irqrestore(&devlist_lock, flags);
			freedev(d);
			goto restart2;
		}
		spin_unlock(&d->lock);
	}

	/* pass three: remove aoedevs marked with DEVFL_FREED */
	for (dd = &devlist, d = *dd; d; d = *dd) {
		struct aoedev *doomed = NULL;

		spin_lock(&d->lock);
		if (d->flags & DEVFL_FREED) {
			*dd = d->next;
			doomed = d;
		} else {
			dd = &d->next;
		}
		spin_unlock(&d->lock);
		if (doomed)
			kfree(doomed->targets);
		kfree(doomed);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	return 0;
}

int
aoedev_flush(const char __user *str, size_t cnt)
{
	return flush(str, cnt, NOT_EXITING);
}

/* This (an skb still referenced after Tms) has been confirmed to occur
 * once with Tms=3*1000, due to the driver changing link and not
 * processing its transmit ring.  The problem is hard enough to solve
 * properly (by returning an error) that I'm still punting on "solving" it.
 */
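/* Wait (polling every Sms, for up to Tms) for external references to the
 * skb to be dropped, then strip its fragments and free it.  If something
 * still holds a reference after Tms, leak the skb rather than free it
 * while in use.
 */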
static void
skbfree(struct sk_buff *skb)
{
	enum { Sms = 250, Tms = 30 * 1000};
	int i = Tms / Sms;

	if (skb == NULL)
		return;
	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
		msleep(Sms);
	if (i < 0) {
		printk(KERN_ERR
			"aoe: %s holds ref: %s\n",
			skb->dev ? skb->dev->name : "netif",
			"cannot free skb -- memory leaked.");
		return;
	}
	skb->truesize -= skb->data_len;
	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	skb_trim(skb, 0);
	dev_kfree_skb(skb);
}

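/* Free every skb in the device's skb pool and reinitialize the queue head. */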
static void
skbpoolfree(struct aoedev *d)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(&d->skbpool, skb, tmp)
		skbfree(skb);

	__skb_queue_head_init(&d->skbpool);
}

/* find it or allocate it */
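/* Look up the device for AoE address maj.min, taking a reference that the
 * caller must drop with aoedev_put.  If it does not exist and do_alloc is
 * set, allocate and initialize a new device under devlist_lock (GFP_ATOMIC).
 * Returns NULL if the device is being killed or allocation fails.
 */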
struct aoedev *
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
{
	struct aoedev *d;
	int i;
	ulong flags;
	ulong sysminor = 0;

	spin_lock_irqsave(&devlist_lock, flags);

	for (d=devlist; d; d=d->next)
		if (d->aoemajor == maj && d->aoeminor == min) {
			spin_lock(&d->lock);
			if (d->flags & DEVFL_TKILL) {
				spin_unlock(&d->lock);
				d = NULL;
				goto out;
			}
			d->ref++;
			spin_unlock(&d->lock);
			break;
		}
	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
		goto out;
	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
	if (!d)
		goto out;
	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
	if (!d->targets) {
		kfree(d);
		d = NULL;
		goto out;
	}
	d->ntargets = NTARGETS;
	INIT_WORK(&d->work, aoecmd_sleepwork);
	spin_lock_init(&d->lock);
	INIT_LIST_HEAD(&d->rq_list);
	skb_queue_head_init(&d->skbpool);
	timer_setup(&d->timer, dummy_timer, 0);
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
	d->tgt = d->targets;
	d->ref = 1;
	for (i = 0; i < NFACTIVE; i++)
		INIT_LIST_HEAD(&d->factive[i]);
	INIT_LIST_HEAD(&d->rexmitq);
	d->sysminor = sysminor;
	d->aoemajor = maj;
	d->aoeminor = min;
	d->rttavg = RTTAVG_INIT;
	d->rttdev = RTTDEV_INIT;
	d->next = devlist;
	devlist = d;
 out:
	spin_unlock_irqrestore(&devlist_lock, flags);
	return d;
}

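/* Release a target: drop its network device references and free every frame
 * on its free-frame list along with the frames' skbs.
 */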
static void
freetgt(struct aoedev *d, struct aoetgt *t)
{
	struct frame *f;
	struct list_head *pos, *nx, *head;
	struct aoeif *ifp;

	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
		if (!ifp->nd)
			break;
		dev_put(ifp->nd);
	}

	head = &t->ffree;
	list_for_each_safe(pos, nx, head) {
		list_del(pos);
		f = list_entry(pos, struct frame, head);
		skbfree(f->skb);
		kfree(f);
	}
	kfree(t);
}

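/* Module-exit helper: flush pending work and destroy all devices. */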
void
aoedev_exit(void)
{
	flush_workqueue(aoe_wq);
	flush(NULL, 0, EXITING);
}

int __init
aoedev_init(void)
{
	return 0;
}