blob: 2fe6c117ac962b3d235b9c0f2250f9e1b3c6daf4
1 | /* |
2 | * Bad block management |
3 | * |
4 | * - Heavily based on MD badblocks code from Neil Brown |
5 | * |
6 | * Copyright (c) 2015, Intel Corporation. |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify it |
9 | * under the terms and conditions of the GNU General Public License, |
10 | * version 2, as published by the Free Software Foundation. |
11 | * |
12 | * This program is distributed in the hope it will be useful, but WITHOUT |
13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
15 | * more details. |
16 | */ |
17 | |
18 | #include <linux/badblocks.h> |
19 | #include <linux/seqlock.h> |
20 | #include <linux/device.h> |
21 | #include <linux/kernel.h> |
22 | #include <linux/module.h> |
23 | #include <linux/stddef.h> |
24 | #include <linux/types.h> |
25 | #include <linux/slab.h> |
26 | |
27 | /** |
28 | * badblocks_check() - check a given range for bad sectors |
29 | * @bb: the badblocks structure that holds all badblock information |
30 | * @s: sector (start) at which to check for badblocks |
31 | * @sectors: number of sectors to check for badblocks |
32 | * @first_bad: pointer to store location of the first badblock |
33 | * @bad_sectors: pointer to store number of badblocks after @first_bad |
34 | * |
35 | * We can record which blocks on each device are 'bad' and so just |
36 | * fail those blocks, or that stripe, rather than the whole device. |
37 | * Entries in the bad-block table are 64bits wide. This comprises: |
38 | * Length of bad-range, in sectors: 0-511 for lengths 1-512 |
39 | * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) |
40 | * A 'shift' can be set so that larger blocks are tracked and |
41 | * consequently larger devices can be covered. |
42 | * 'Acknowledged' flag - 1 bit. - the most significant bit. |
43 | * |
44 | * Locking of the bad-block table uses a seqlock so badblocks_check |
45 | * might need to retry if it is very unlucky. |
46 | * We will sometimes want to check for bad blocks in a bi_end_io function, |
47 | * so we use the write_seqlock_irq variant. |
48 | * |
49 | * When looking for a bad block we specify a range and want to |
50 | * know if any block in the range is bad. So we binary-search |
51 | * to the last range that starts at-or-before the given endpoint, |
52 | * (or "before the sector after the target range") |
53 | * then see if it ends after the given start. |
54 | * |
55 | * Return: |
56 | * 0: there are no known bad blocks in the range |
57 | * 1: there are known bad block which are all acknowledged |
58 | * -1: there are bad blocks which have not yet been acknowledged in metadata. |
59 | * plus the start/length of the first bad section we overlap. |
60 | */ |
61 | int badblocks_check(struct badblocks *bb, sector_t s, int sectors, |
62 | sector_t *first_bad, int *bad_sectors) |
63 | { |
64 | int hi; |
65 | int lo; |
66 | u64 *p = bb->page; |
67 | int rv; |
68 | sector_t target = s + sectors; |
69 | unsigned seq; |
70 | |
71 | if (bb->shift > 0) { |
72 | /* round the start down, and the end up */ |
73 | s >>= bb->shift; |
74 | target += (1<<bb->shift) - 1; |
75 | target >>= bb->shift; |
76 | sectors = target - s; |
77 | } |
78 | /* 'target' is now the first block after the bad range */ |
79 | |
80 | retry: |
81 | seq = read_seqbegin(&bb->lock); |
82 | lo = 0; |
83 | rv = 0; |
84 | hi = bb->count; |
85 | |
86 | /* Binary search between lo and hi for 'target' |
87 | * i.e. for the last range that starts before 'target' |
88 | */ |
89 | /* INVARIANT: ranges before 'lo' and at-or-after 'hi' |
90 | * are known not to be the last range before target. |
91 | * VARIANT: hi-lo is the number of possible |
92 | * ranges, and decreases until it reaches 1 |
93 | */ |
94 | while (hi - lo > 1) { |
95 | int mid = (lo + hi) / 2; |
96 | sector_t a = BB_OFFSET(p[mid]); |
97 | |
98 | if (a < target) |
99 | /* This could still be the one, earlier ranges |
100 | * could not. |
101 | */ |
102 | lo = mid; |
103 | else |
104 | /* This and later ranges are definitely out. */ |
105 | hi = mid; |
106 | } |
107 | /* 'lo' might be the last that started before target, but 'hi' isn't */ |
108 | if (hi > lo) { |
109 | /* need to check all range that end after 's' to see if |
110 | * any are unacknowledged. |
111 | */ |
112 | while (lo >= 0 && |
113 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { |
114 | if (BB_OFFSET(p[lo]) < target) { |
115 | /* starts before the end, and finishes after |
116 | * the start, so they must overlap |
117 | */ |
118 | if (rv != -1 && BB_ACK(p[lo])) |
119 | rv = 1; |
120 | else |
121 | rv = -1; |
122 | *first_bad = BB_OFFSET(p[lo]); |
123 | *bad_sectors = BB_LEN(p[lo]); |
124 | } |
125 | lo--; |
126 | } |
127 | } |
128 | |
129 | if (read_seqretry(&bb->lock, seq)) |
130 | goto retry; |
131 | |
132 | return rv; |
133 | } |
134 | EXPORT_SYMBOL_GPL(badblocks_check); |
135 | |
136 | static void badblocks_update_acked(struct badblocks *bb) |
137 | { |
138 | u64 *p = bb->page; |
139 | int i; |
140 | bool unacked = false; |
141 | |
142 | if (!bb->unacked_exist) |
143 | return; |
144 | |
145 | for (i = 0; i < bb->count ; i++) { |
146 | if (!BB_ACK(p[i])) { |
147 | unacked = true; |
148 | break; |
149 | } |
150 | } |
151 | |
152 | if (!unacked) |
153 | bb->unacked_exist = 0; |
154 | } |
155 | |
/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to mark as bad
 * @sectors:	number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged.  We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * When a shift is set the range is first widened to whole blocks (start
 * rounded down, end rounded up).  The work is then done in four steps under
 * the write side of the seqlock: merge with the range that starts
 * at-or-before @s, merge with the range that follows, try to fuse those two
 * neighbours, and finally insert whatever is left as new entries of at most
 * BB_MAX_LEN each.
 *
 * If the table fills up part-way through, the entries already written stay
 * in place and 1 is returned.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (badblocks are disabled, or out of space)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
			int acknowledged)
{
	u64 *p;
	int lo, hi;
	int rv = 0;
	unsigned long flags;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 1;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		s >>= bb->shift;
		next += (1<<bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts at-or-before 's' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a <= s)
			lo = mid;
		else
			hi = mid;
	}
	/* p[lo] starts after 's', so there is no preceding range at all;
	 * hi == lo records that for the steps below.
	 */
	if (hi > lo && BB_OFFSET(p[lo]) > s)
		hi = lo;

	if (hi > lo) {
		/* we found a range that might merge with the start
		 * of our new range
		 */
		sector_t a = BB_OFFSET(p[lo]);
		sector_t e = a + BB_LEN(p[lo]);
		int ack = BB_ACK(p[lo]);

		if (e >= s) {
			/* Yes, we can merge with a previous range */
			if (s == a && s + sectors >= e)
				/* new range covers old */
				ack = acknowledged;
			else
				/* merged range is acknowledged only if
				 * both halves are
				 */
				ack = ack && acknowledged;

			if (e < s + sectors)
				e = s + sectors;
			if (e - a <= BB_MAX_LEN) {
				p[lo] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				/* does not all fit in one range,
				 * make p[lo] maximal
				 */
				if (BB_LEN(p[lo]) != BB_MAX_LEN)
					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			/* whatever p[lo] could not absorb is still pending */
			sectors = e - s;
		}
	}
	if (sectors && hi < bb->count) {
		/* 'hi' points to the first range that starts after 's'.
		 * Maybe we can merge with the start of that range
		 */
		sector_t a = BB_OFFSET(p[hi]);
		sector_t e = a + BB_LEN(p[hi]);
		int ack = BB_ACK(p[hi]);

		if (a <= s + sectors) {
			/* merging is possible */
			if (e <= s + sectors) {
				/* full overlap */
				e = s + sectors;
				ack = acknowledged;
			} else
				ack = ack && acknowledged;

			/* p[hi] now begins where the pending range begins */
			a = s;
			if (e - a <= BB_MAX_LEN) {
				p[hi] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
			lo = hi;
			hi++;
		}
	}
	if (sectors == 0 && hi < bb->count) {
		/* we might be able to combine lo and hi */
		/* Note: 's' is at the end of 'lo' */
		sector_t a = BB_OFFSET(p[hi]);
		int lolen = BB_LEN(p[lo]);
		int hilen = BB_LEN(p[hi]);
		/* (s - a) is the part of 'hi' already covered by 'lo' */
		int newlen = lolen + hilen - (s - a);

		if (s >= a && newlen < BB_MAX_LEN) {
			/* yes, we can combine them */
			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);

			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
			/* close the gap left by the absorbed 'hi' entry;
			 * each entry is 8 bytes
			 */
			memmove(p + hi, p + hi + 1,
				(bb->count - hi - 1) * 8);
			bb->count--;
		}
	}
	while (sectors) {
		/* didn't merge (it all).
		 * Need to add a range just before 'hi'
		 */
		/* NOTE(review): 'hi' is not advanced after an insert, so a
		 * second BB_MAX_LEN chunk would be placed in front of the
		 * first one - confirm the resulting order is intended.
		 */
		if (bb->count >= MAX_BADBLOCKS) {
			/* No room for more */
			rv = 1;
			break;
		} else {
			int this_sectors = sectors;

			/* open an 8-byte slot at 'hi' */
			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	else
		/* an acknowledged range may have replaced the last
		 * unacknowledged entry, so recompute the flag
		 */
		badblocks_update_acked(bb);
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
324 | |
325 | /** |
326 | * badblocks_clear() - Remove a range of bad blocks to the table. |
327 | * @bb: the badblocks structure that holds all badblock information |
328 | * @s: first sector to mark as bad |
329 | * @sectors: number of sectors to mark as bad |
330 | * |
331 | * This may involve extending the table if we spilt a region, |
332 | * but it must not fail. So if the table becomes full, we just |
333 | * drop the remove request. |
334 | * |
335 | * Return: |
336 | * 0: success |
337 | * 1: failed to clear badblocks |
338 | */ |
339 | int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) |
340 | { |
341 | u64 *p; |
342 | int lo, hi; |
343 | sector_t target = s + sectors; |
344 | int rv = 0; |
345 | |
346 | if (bb->shift > 0) { |
347 | /* When clearing we round the start up and the end down. |
348 | * This should not matter as the shift should align with |
349 | * the block size and no rounding should ever be needed. |
350 | * However it is better the think a block is bad when it |
351 | * isn't than to think a block is not bad when it is. |
352 | */ |
353 | s += (1<<bb->shift) - 1; |
354 | s >>= bb->shift; |
355 | target >>= bb->shift; |
356 | sectors = target - s; |
357 | } |
358 | |
359 | write_seqlock_irq(&bb->lock); |
360 | |
361 | p = bb->page; |
362 | lo = 0; |
363 | hi = bb->count; |
364 | /* Find the last range that starts before 'target' */ |
365 | while (hi - lo > 1) { |
366 | int mid = (lo + hi) / 2; |
367 | sector_t a = BB_OFFSET(p[mid]); |
368 | |
369 | if (a < target) |
370 | lo = mid; |
371 | else |
372 | hi = mid; |
373 | } |
374 | if (hi > lo) { |
375 | /* p[lo] is the last range that could overlap the |
376 | * current range. Earlier ranges could also overlap, |
377 | * but only this one can overlap the end of the range. |
378 | */ |
379 | if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && |
380 | (BB_OFFSET(p[lo]) < target)) { |
381 | /* Partial overlap, leave the tail of this range */ |
382 | int ack = BB_ACK(p[lo]); |
383 | sector_t a = BB_OFFSET(p[lo]); |
384 | sector_t end = a + BB_LEN(p[lo]); |
385 | |
386 | if (a < s) { |
387 | /* we need to split this range */ |
388 | if (bb->count >= MAX_BADBLOCKS) { |
389 | rv = -ENOSPC; |
390 | goto out; |
391 | } |
392 | memmove(p+lo+1, p+lo, (bb->count - lo) * 8); |
393 | bb->count++; |
394 | p[lo] = BB_MAKE(a, s-a, ack); |
395 | lo++; |
396 | } |
397 | p[lo] = BB_MAKE(target, end - target, ack); |
398 | /* there is no longer an overlap */ |
399 | hi = lo; |
400 | lo--; |
401 | } |
402 | while (lo >= 0 && |
403 | (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && |
404 | (BB_OFFSET(p[lo]) < target)) { |
405 | /* This range does overlap */ |
406 | if (BB_OFFSET(p[lo]) < s) { |
407 | /* Keep the early parts of this range. */ |
408 | int ack = BB_ACK(p[lo]); |
409 | sector_t start = BB_OFFSET(p[lo]); |
410 | |
411 | p[lo] = BB_MAKE(start, s - start, ack); |
412 | /* now low doesn't overlap, so.. */ |
413 | break; |
414 | } |
415 | lo--; |
416 | } |
417 | /* 'lo' is strictly before, 'hi' is strictly after, |
418 | * anything between needs to be discarded |
419 | */ |
420 | if (hi - lo > 1) { |
421 | memmove(p+lo+1, p+hi, (bb->count - hi) * 8); |
422 | bb->count -= (hi - lo - 1); |
423 | } |
424 | } |
425 | |
426 | badblocks_update_acked(bb); |
427 | bb->changed = 1; |
428 | out: |
429 | write_sequnlock_irq(&bb->lock); |
430 | return rv; |
431 | } |
432 | EXPORT_SYMBOL_GPL(badblocks_clear); |
433 | |
434 | /** |
435 | * ack_all_badblocks() - Acknowledge all bad blocks in a list. |
436 | * @bb: the badblocks structure that holds all badblock information |
437 | * |
438 | * This only succeeds if ->changed is clear. It is used by |
439 | * in-kernel metadata updates |
440 | */ |
441 | void ack_all_badblocks(struct badblocks *bb) |
442 | { |
443 | if (bb->page == NULL || bb->changed) |
444 | /* no point even trying */ |
445 | return; |
446 | write_seqlock_irq(&bb->lock); |
447 | |
448 | if (bb->changed == 0 && bb->unacked_exist) { |
449 | u64 *p = bb->page; |
450 | int i; |
451 | |
452 | for (i = 0; i < bb->count ; i++) { |
453 | if (!BB_ACK(p[i])) { |
454 | sector_t start = BB_OFFSET(p[i]); |
455 | int len = BB_LEN(p[i]); |
456 | |
457 | p[i] = BB_MAKE(start, len, 1); |
458 | } |
459 | } |
460 | bb->unacked_exist = 0; |
461 | } |
462 | write_sequnlock_irq(&bb->lock); |
463 | } |
464 | EXPORT_SYMBOL_GPL(ack_all_badblocks); |
465 | |
466 | /** |
467 | * badblocks_show() - sysfs access to bad-blocks list |
468 | * @bb: the badblocks structure that holds all badblock information |
469 | * @page: buffer received from sysfs |
470 | * @unack: weather to show unacknowledged badblocks |
471 | * |
472 | * Return: |
473 | * Length of returned data |
474 | */ |
475 | ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) |
476 | { |
477 | size_t len; |
478 | int i; |
479 | u64 *p = bb->page; |
480 | unsigned seq; |
481 | |
482 | if (bb->shift < 0) |
483 | return 0; |
484 | |
485 | retry: |
486 | seq = read_seqbegin(&bb->lock); |
487 | |
488 | len = 0; |
489 | i = 0; |
490 | |
491 | while (len < PAGE_SIZE && i < bb->count) { |
492 | sector_t s = BB_OFFSET(p[i]); |
493 | unsigned int length = BB_LEN(p[i]); |
494 | int ack = BB_ACK(p[i]); |
495 | |
496 | i++; |
497 | |
498 | if (unack && ack) |
499 | continue; |
500 | |
501 | len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", |
502 | (unsigned long long)s << bb->shift, |
503 | length << bb->shift); |
504 | } |
505 | if (unack && len == 0) |
506 | bb->unacked_exist = 0; |
507 | |
508 | if (read_seqretry(&bb->lock, seq)) |
509 | goto retry; |
510 | |
511 | return len; |
512 | } |
513 | EXPORT_SYMBOL_GPL(badblocks_show); |
514 | |
515 | /** |
516 | * badblocks_store() - sysfs access to bad-blocks list |
517 | * @bb: the badblocks structure that holds all badblock information |
518 | * @page: buffer received from sysfs |
519 | * @len: length of data received from sysfs |
520 | * @unack: weather to show unacknowledged badblocks |
521 | * |
522 | * Return: |
523 | * Length of the buffer processed or -ve error. |
524 | */ |
525 | ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, |
526 | int unack) |
527 | { |
528 | unsigned long long sector; |
529 | int length; |
530 | char newline; |
531 | |
532 | switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { |
533 | case 3: |
534 | if (newline != '\n') |
535 | return -EINVAL; |
536 | case 2: |
537 | if (length <= 0) |
538 | return -EINVAL; |
539 | break; |
540 | default: |
541 | return -EINVAL; |
542 | } |
543 | |
544 | if (badblocks_set(bb, sector, length, !unack)) |
545 | return -ENOSPC; |
546 | else |
547 | return len; |
548 | } |
549 | EXPORT_SYMBOL_GPL(badblocks_store); |
550 | |
551 | static int __badblocks_init(struct device *dev, struct badblocks *bb, |
552 | int enable) |
553 | { |
554 | bb->dev = dev; |
555 | bb->count = 0; |
556 | if (enable) |
557 | bb->shift = 0; |
558 | else |
559 | bb->shift = -1; |
560 | if (dev) |
561 | bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); |
562 | else |
563 | bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
564 | if (!bb->page) { |
565 | bb->shift = -1; |
566 | return -ENOMEM; |
567 | } |
568 | seqlock_init(&bb->lock); |
569 | |
570 | return 0; |
571 | } |
572 | |
/**
 * badblocks_init() - initialize the badblocks structure
 * @bb:		the badblocks structure that holds all badblock information
 * @enable:	whether to enable badblocks accounting
 *
 * Initializes @bb without an owning device, so the table page is
 * allocated with kzalloc() rather than devm_kzalloc().
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
	return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);
587 | |
588 | int devm_init_badblocks(struct device *dev, struct badblocks *bb) |
589 | { |
590 | if (!bb) |
591 | return -EINVAL; |
592 | return __badblocks_init(dev, bb, 1); |
593 | } |
594 | EXPORT_SYMBOL_GPL(devm_init_badblocks); |
595 | |
596 | /** |
597 | * badblocks_exit() - free the badblocks structure |
598 | * @bb: the badblocks structure that holds all badblock information |
599 | */ |
600 | void badblocks_exit(struct badblocks *bb) |
601 | { |
602 | if (!bb) |
603 | return; |
604 | if (bb->dev) |
605 | devm_kfree(bb->dev, bb->page); |
606 | else |
607 | kfree(bb->page); |
608 | bb->page = NULL; |
609 | } |
610 | EXPORT_SYMBOL_GPL(badblocks_exit); |
611 |