// SPDX-License-Identifier: GPL-2.0
/*
 * Bad block management
 *
 * - Heavily based on MD badblocks code from Neil Brown
 *
 * Copyright (c) 2015, Intel Corporation.
 */

#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>

/**
 * badblocks_check() - check a given range for bad sectors
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		sector (start) at which to check for badblocks
 * @sectors:	number of sectors to check for badblocks
 * @first_bad:	pointer to store location of the first badblock
 * @bad_sectors: pointer to store number of badblocks after @first_bad
 *
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64 bits wide.  This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 *  A 'shift' can be set so that larger blocks are tracked and
 *  consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit. - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so badblocks_check
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad.  So we binary-search
 * to the last range that starts at-or-before the given endpoint
 * (or "before the sector after the target range"),
 * then see if it ends after the given start.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata,
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
			sector_t *first_bad, int *bad_sectors)
{
	int hi;
	int lo;
	u64 *p = bb->page;
	int rv;
	sector_t target = s + sectors;
	unsigned seq;

	if (bb->shift > 0) {
		/* round the start down, and the end up */
		s >>= bb->shift;
		target += (1<<bb->shift) - 1;
		target >>= bb->shift;
	}
	/* 'target' is now the first block after the bad range */

retry:
	seq = read_seqbegin(&bb->lock);
	lo = 0;
	rv = 0;
	hi = bb->count;

	/* Binary search between lo and hi for 'target'
	 * i.e. for the last range that starts before 'target'
	 */
	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
	 * are known not to be the last range before target.
	 * VARIANT: hi-lo is the number of possible
	 * ranges, and decreases until it reaches 1
	 */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			/* This could still be the one, earlier ranges
			 * could not.
			 */
			lo = mid;
		else
			/* This and later ranges are definitely out. */
			hi = mid;
	}
	/* 'lo' might be the last that started before target, but 'hi' isn't */
	if (hi > lo) {
		/* need to check all ranges that end after 's' to see if
		 * any are unacknowledged.
		 */
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			if (BB_OFFSET(p[lo]) < target) {
				/* starts before the end, and finishes after
				 * the start, so they must overlap
				 */
				if (rv != -1 && BB_ACK(p[lo]))
					rv = 1;
				else
					rv = -1;
				*first_bad = BB_OFFSET(p[lo]);
				*bad_sectors = BB_LEN(p[lo]);
			}
			lo--;
		}
	}

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
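
/*
 * Example (an illustrative sketch, not part of the API): how the 64-bit
 * entries described above encode a range, and how a caller typically
 * consumes badblocks_check().  BB_MAKE()/BB_OFFSET()/BB_LEN()/BB_ACK()
 * come from <linux/badblocks.h>; 'bb' and all sector values here are
 * hypothetical.
 */
static void __maybe_unused badblocks_check_example(struct badblocks *bb)
{
	/* 8 bad sectors starting at sector 1024, already acknowledged */
	u64 entry = BB_MAKE(1024, 8, 1);
	sector_t first_bad;
	int bad_sectors;

	WARN_ON(BB_OFFSET(entry) != 1024);	/* 54-bit start offset */
	WARN_ON(BB_LEN(entry) != 8);		/* stored as len-1, so 1..512 */
	WARN_ON(!BB_ACK(entry));		/* the most significant bit */

	switch (badblocks_check(bb, 1000, 100, &first_bad, &bad_sectors)) {
	case 0:		/* no known bad blocks in 1000..1099 */
		break;
	case 1:		/* overlaps acknowledged bad blocks only */
	case -1:	/* overlaps unacknowledged bad blocks */
		/* first_bad/bad_sectors describe the first overlap */
		break;
	}
}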

static void badblocks_update_acked(struct badblocks *bb)
{
	u64 *p = bb->page;
	int i;
	bool unacked = false;

	if (!bb->unacked_exist)
		return;

	for (i = 0; i < bb->count ; i++) {
		if (!BB_ACK(p[i])) {
			unacked = true;
			break;
		}
	}

	if (!unacked)
		bb->unacked_exist = 0;
}

/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to mark as bad
 * @sectors:	number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged. We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space, or badblocks are disabled)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
			int acknowledged)
{
	u64 *p;
	int lo, hi;
	int rv = 0;
	unsigned long flags;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 1;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		s >>= bb->shift;
		next += (1<<bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts at-or-before 's' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a <= s)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo && BB_OFFSET(p[lo]) > s)
		hi = lo;

	if (hi > lo) {
		/* we found a range that might merge with the start
		 * of our new range
		 */
		sector_t a = BB_OFFSET(p[lo]);
		sector_t e = a + BB_LEN(p[lo]);
		int ack = BB_ACK(p[lo]);

		if (e >= s) {
			/* Yes, we can merge with a previous range */
			if (s == a && s + sectors >= e)
				/* new range covers old */
				ack = acknowledged;
			else
				ack = ack && acknowledged;

			if (e < s + sectors)
				e = s + sectors;
			if (e - a <= BB_MAX_LEN) {
				p[lo] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				/* does not all fit in one range,
				 * make p[lo] maximal
				 */
				if (BB_LEN(p[lo]) != BB_MAX_LEN)
					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
		}
	}
	if (sectors && hi < bb->count) {
		/* 'hi' points to the first range that starts after 's'.
		 * Maybe we can merge with the start of that range
		 */
		sector_t a = BB_OFFSET(p[hi]);
		sector_t e = a + BB_LEN(p[hi]);
		int ack = BB_ACK(p[hi]);

		if (a <= s + sectors) {
			/* merging is possible */
			if (e <= s + sectors) {
				/* full overlap */
				e = s + sectors;
				ack = acknowledged;
			} else
				ack = ack && acknowledged;

			a = s;
			if (e - a <= BB_MAX_LEN) {
				p[hi] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
			lo = hi;
			hi++;
		}
	}
	if (sectors == 0 && hi < bb->count) {
		/* we might be able to combine lo and hi */
		/* Note: 's' is at the end of 'lo' */
		sector_t a = BB_OFFSET(p[hi]);
		int lolen = BB_LEN(p[lo]);
		int hilen = BB_LEN(p[hi]);
		int newlen = lolen + hilen - (s - a);

		if (s >= a && newlen < BB_MAX_LEN) {
			/* yes, we can combine them */
			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);

			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
			memmove(p + hi, p + hi + 1,
				(bb->count - hi - 1) * 8);
			bb->count--;
		}
	}
	while (sectors) {
		/* didn't merge (it all).
		 * Need to add a range just before 'hi'
		 */
		if (bb->count >= MAX_BADBLOCKS) {
			/* No room for more */
			rv = 1;
			break;
		} else {
			int this_sectors = sectors;

			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	else
		badblocks_update_acked(bb);
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
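
/*
 * Example (an illustrative sketch with made-up sectors): touching and
 * overlapping ranges are merged into existing entries rather than stored
 * twice, so the table only grows when a genuinely new, disjoint range
 * appears (or a range longer than BB_MAX_LEN sectors has to be recorded
 * as several consecutive entries).
 */
static void __maybe_unused badblocks_set_example(struct badblocks *bb)
{
	/* two touching ranges collapse into a single 0..15 entry ... */
	if (badblocks_set(bb, 0, 8, 1))
		return;		/* table full, or badblocks disabled */
	if (badblocks_set(bb, 8, 8, 1))
		return;

	/* ... and re-marking an already-covered range does not grow it */
	badblocks_set(bb, 4, 4, 1);
}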

/**
 * badblocks_clear() - Remove a range of bad blocks from the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to mark as good
 * @sectors:	number of sectors to mark as good
 *
 * This may involve extending the table if we split a region,
 * but it must not fail.  So if the table becomes full, we just
 * drop the remove request.
 *
 * Return:
 *  0: success
 *  -ENOSPC: failed to clear badblocks (table full when a split was needed)
 */
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	u64 *p;
	int lo, hi;
	sector_t target = s + sectors;
	int rv = 0;

	if (bb->shift > 0) {
		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		s += (1<<bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
	}

	write_seqlock_irq(&bb->lock);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts before 'target' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* p[lo] is the last range that could overlap the
		 * current range.  Earlier ranges could also overlap,
		 * but only this one can overlap the end of the range.
		 */
		if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
		    (BB_OFFSET(p[lo]) < target)) {
			/* Partial overlap, leave the tail of this range */
			int ack = BB_ACK(p[lo]);
			sector_t a = BB_OFFSET(p[lo]);
			sector_t end = a + BB_LEN(p[lo]);

			if (a < s) {
				/* we need to split this range */
				if (bb->count >= MAX_BADBLOCKS) {
					rv = -ENOSPC;
					goto out;
				}
				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
				bb->count++;
				p[lo] = BB_MAKE(a, s-a, ack);
				lo++;
			}
			p[lo] = BB_MAKE(target, end - target, ack);
			/* there is no longer an overlap */
			hi = lo;
			lo--;
		}
		while (lo >= 0 &&
		       (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
		       (BB_OFFSET(p[lo]) < target)) {
			/* This range does overlap */
			if (BB_OFFSET(p[lo]) < s) {
				/* Keep the early parts of this range. */
				int ack = BB_ACK(p[lo]);
				sector_t start = BB_OFFSET(p[lo]);

				p[lo] = BB_MAKE(start, s - start, ack);
				/* now 'lo' doesn't overlap, so... */
				break;
			}
			lo--;
		}
		/* 'lo' is strictly before, 'hi' is strictly after,
		 * anything between needs to be discarded
		 */
		if (hi - lo > 1) {
			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
			bb->count -= (hi - lo - 1);
		}
	}

	badblocks_update_acked(bb);
	bb->changed = 1;
out:
	write_sequnlock_irq(&bb->lock);
	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_clear);
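
/*
 * Example (an illustrative sketch with made-up sectors): clearing the
 * middle of a recorded range splits it in two, so a clear can *grow*
 * the table.  If the table is already full the split is refused with
 * -ENOSPC and the sectors stay marked bad, erring on the safe side.
 */
static void __maybe_unused badblocks_clear_example(struct badblocks *bb)
{
	badblocks_set(bb, 0, 64, 1);	/* one entry:   sectors 0..63    */
	badblocks_clear(bb, 16, 32);	/* two entries: 0..15 and 48..63 */
}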

/**
 * ack_all_badblocks() - Acknowledge all bad blocks in a list.
 * @bb:		the badblocks structure that holds all badblock information
 *
 * This only succeeds if ->changed is clear.  It is used by
 * in-kernel metadata updates.
 */
void ack_all_badblocks(struct badblocks *bb)
{
	if (bb->page == NULL || bb->changed)
		/* no point even trying */
		return;
	write_seqlock_irq(&bb->lock);

	if (bb->changed == 0 && bb->unacked_exist) {
		u64 *p = bb->page;
		int i;

		for (i = 0; i < bb->count ; i++) {
			if (!BB_ACK(p[i])) {
				sector_t start = BB_OFFSET(p[i]);
				int len = BB_LEN(p[i]);

				p[i] = BB_MAKE(start, len, 1);
			}
		}
		bb->unacked_exist = 0;
	}
	write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(ack_all_badblocks);

/**
 * badblocks_show() - sysfs access to bad-blocks list
 * @bb:		the badblocks structure that holds all badblock information
 * @page:	buffer to fill for sysfs
 * @unack:	whether to show only unacknowledged badblocks
 *
 * Return:
 *  Length of returned data
 */
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
	size_t len;
	int i;
	u64 *p = bb->page;
	unsigned seq;

	if (bb->shift < 0)
		return 0;

retry:
	seq = read_seqbegin(&bb->lock);

	len = 0;
	i = 0;

	while (len < PAGE_SIZE && i < bb->count) {
		sector_t s = BB_OFFSET(p[i]);
		unsigned int length = BB_LEN(p[i]);
		int ack = BB_ACK(p[i]);

		i++;

		if (unack && ack)
			continue;

		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
				(unsigned long long)s << bb->shift,
				length << bb->shift);
	}
	if (unack && len == 0)
		bb->unacked_exist = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);
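
/*
 * Example output (illustrative values): badblocks_show() emits one
 * "<start sector> <length>" pair per line, scaled back up by ->shift,
 * e.g. for two recorded ranges:
 *
 *	1024 8
 *	4096 16
 */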

/**
 * badblocks_store() - sysfs access to bad-blocks list
 * @bb:		the badblocks structure that holds all badblock information
 * @page:	buffer received from sysfs
 * @len:	length of data received from sysfs
 * @unack:	whether to store the badblocks as unacknowledged
 *
 * Return:
 *  Length of the buffer processed or -ve error.
 */
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
			int unack)
{
	unsigned long long sector;
	int length;
	char newline;

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -EINVAL;
		fallthrough;
	case 2:
		if (length <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (badblocks_set(bb, sector, length, !unack))
		return -ENOSPC;
	else
		return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
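
/*
 * Example input (illustrative): badblocks_store() accepts exactly
 * "<sector> <length>" with an optional trailing newline, e.g. from
 * userspace (the sysfs path depends on the device exposing the table):
 *
 *	echo "1024 8" > /sys/block/<dev>/badblocks
 *
 * Anything else, a non-positive length included, is rejected with
 * -EINVAL; -ENOSPC is returned when the table is full.
 */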

static int __badblocks_init(struct device *dev, struct badblocks *bb,
		int enable)
{
	bb->dev = dev;
	bb->count = 0;
	if (enable)
		bb->shift = 0;
	else
		bb->shift = -1;
	if (dev)
		bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
	else
		bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!bb->page) {
		bb->shift = -1;
		return -ENOMEM;
	}
	seqlock_init(&bb->lock);

	return 0;
}

/**
 * badblocks_init() - initialize the badblocks structure
 * @bb:		the badblocks structure that holds all badblock information
 * @enable:	whether to enable badblocks accounting
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
	return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);

int devm_init_badblocks(struct device *dev, struct badblocks *bb)
{
	if (!bb)
		return -EINVAL;
	return __badblocks_init(dev, bb, 1);
}
EXPORT_SYMBOL_GPL(devm_init_badblocks);

/**
 * badblocks_exit() - free the badblocks structure
 * @bb:		the badblocks structure that holds all badblock information
 */
void badblocks_exit(struct badblocks *bb)
{
	if (!bb)
		return;
	if (bb->dev)
		devm_kfree(bb->dev, bb->page);
	else
		kfree(bb->page);
	bb->page = NULL;
}
EXPORT_SYMBOL_GPL(badblocks_exit);
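
/*
 * Example (an illustrative sketch, hypothetical caller): the expected
 * lifecycle for a driver-private table.  With devm_init_badblocks() the
 * backing page is device-managed and badblocks_exit() releases it via
 * devm_kfree().
 */
static int __maybe_unused badblocks_lifecycle_example(struct device *dev)
{
	struct badblocks bb;
	int rc;

	rc = devm_init_badblocks(dev, &bb);
	if (rc)
		return rc;

	badblocks_set(&bb, 2048, 16, 0);	/* record, unacknowledged */

	/* once the table has been written to stable metadata ... */
	bb.changed = 0;
	ack_all_badblocks(&bb);			/* ... acknowledge it all */

	badblocks_exit(&bb);
	return 0;
}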