/*	$NetBSD: rf_paritymap.c,v 1.8 2011/04/27 07:55:15 mrg Exp $	*/

/*-
 * Copyright (c) 2009 Jed Davis.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.8 2011/04/27 07:55:15 mrg Exp $");

#include <sys/param.h>
#include <sys/callout.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/systm.h>
#include <sys/types.h>

#include <dev/raidframe/rf_paritymap.h>
#include <dev/raidframe/rf_stripelocks.h>
#include <dev/raidframe/rf_layout.h>
#include <dev/raidframe/rf_raid.h>
#include <dev/raidframe/rf_parityscan.h>
#include <dev/raidframe/rf_kintf.h>

/* Important parameters: */
#define REGION_MINSIZE	(25ULL << 20)
#define DFL_TICKMS	40000
#define DFL_COOLDOWN	8	/* 7-8 intervals of 40s = 5min +/- 20s */

/* Internal-use flag bits. */
#define TICKING	1
#define TICKED	2
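
/*
 * (Editorial note, inferred from the code below: TICKING means the
 * cooldown callout is armed; TICKED means it has fired and the raidio
 * thread has cooldown work pending.  See rf_paritymap_tick() and
 * rf_paritymap_checkwork().)
 */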

/* Prototypes! */
static void rf_paritymap_write_locked(struct rf_paritymap *);
static void rf_paritymap_tick(void *);
static u_int rf_paritymap_nreg(RF_Raid_t *);

/* Extract the current status of the parity map. */
void
rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
{
	memset(ps, 0, sizeof(*ps));
	if (pm == NULL)
		ps->enabled = 0;
	else {
		ps->enabled = 1;
		ps->region_size = pm->region_size;
		mutex_enter(&pm->lock);
		memcpy(&ps->params, &pm->params, sizeof(ps->params));
		memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
		memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
		mutex_exit(&pm->lock);
	}
}

/*
 * Test whether parity in a given sector is suspected of being inconsistent
 * on disk (assuming that any pending I/O to it is allowed to complete).
 * This may be of interest to future work on parity scrubbing.
 */
int
rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
{
	unsigned region = sector / pm->region_size;
	int retval;

	mutex_enter(&pm->lock);
	retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
	mutex_exit(&pm->lock);
	return retval;
}

/* To be called before a write to the RAID is submitted. */
void
rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
{
	unsigned i, b, e;

	b = offset / pm->region_size;
	e = (offset + size - 1) / pm->region_size;

	for (i = b; i <= e; i++)
		rf_paritymap_begin_region(pm, i);
}

/* To be called after a write to the RAID completes. */
void
rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
{
	unsigned i, b, e;

	b = offset / pm->region_size;
	e = (offset + size - 1) / pm->region_size;

	for (i = b; i <= e; i++)
		rf_paritymap_end_region(pm, i);
}
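
/*
 * A minimal sketch of the intended begin/end protocol, assuming a caller
 * that already holds a struct rf_paritymap pointer (the real call sites
 * live elsewhere in the RAIDframe I/O path, not shown here):
 *
 *	rf_paritymap_begin(pm, offset, size);
 *	...submit the write I/O and wait for it to complete...
 *	rf_paritymap_end(pm, offset, size);
 *
 * begin marks the affected regions dirty (flushing the map to disk first
 * if necessary); end starts their cooldown countdown.
 */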
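/*
 * (Editorial note on the per-region in-core state used below:
 * state[i] > 0 counts writes currently outstanding to region i;
 * state[i] == 0 means the region is idle; state[i] < 0 means the region
 * is cooling down, with -state[i] callout ticks remaining before it may
 * be marked clean.)
 */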
void
rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region)
{
	int needs_write;

	KASSERT(region < RF_PARITYMAP_NREG);
	pm->ctrs.nwrite++;

	/* If it was being kept warm, deal with that. */
	mutex_enter(&pm->lock);
	if (pm->current->state[region] < 0)
		pm->current->state[region] = 0;

	/* This shouldn't happen unless RAIDOUTSTANDING is set too high. */
	KASSERT(pm->current->state[region] < 127);
	pm->current->state[region]++;

	needs_write = isclr(pm->disk_now->bits, region);

	if (needs_write) {
		KASSERT(pm->current->state[region] == 1);
		rf_paritymap_write_locked(pm);
	}

	mutex_exit(&pm->lock);
}

void
rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region)
{
	KASSERT(region < RF_PARITYMAP_NREG);

	mutex_enter(&pm->lock);
	KASSERT(pm->current->state[region] > 0);
	--pm->current->state[region];

	if (pm->current->state[region] <= 0) {
		pm->current->state[region] = -pm->params.cooldown;
		KASSERT(pm->current->state[region] <= 0);
		mutex_enter(&pm->lk_flags);
		if (!(pm->flags & TICKING)) {
			pm->flags |= TICKING;
			mutex_exit(&pm->lk_flags);
			callout_schedule(&pm->ticker,
			    mstohz(pm->params.tickms));
		} else
			mutex_exit(&pm->lk_flags);
	}
	mutex_exit(&pm->lock);
}

/*
 * Updates the parity map to account for any changes in current activity
 * and/or an ongoing parity scan, then writes it to disk with appropriate
 * synchronization.
 */
void
rf_paritymap_write(struct rf_paritymap *pm)
{
	mutex_enter(&pm->lock);
	rf_paritymap_write_locked(pm);
	mutex_exit(&pm->lock);
}

/* As above, but to be used when pm->lock is already held. */
static void
rf_paritymap_write_locked(struct rf_paritymap *pm)
{
	char w, w0;
	int i, j, setting, clearing;

	setting = clearing = 0;
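	/*
	 * Recompute the on-disk map: a region is dirty if it was already
	 * dirty at boot (disk_boot), or if it has writes outstanding or
	 * is in cooldown (current->state != 0).  Note which bits this
	 * sets and clears relative to what is on disk now.
	 */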
	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
		w0 = pm->disk_now->bits[i];
		w = pm->disk_boot->bits[i];

		for (j = 0; j < NBBY; j++)
			if (pm->current->state[i * NBBY + j] != 0)
				w |= 1 << j;

		if (w & ~w0)
			setting = 1;
		if (w0 & ~w)
			clearing = 1;

		pm->disk_now->bits[i] = w;
	}
	pm->ctrs.ncachesync += setting + clearing;
	pm->ctrs.nclearing += clearing;

	/*
	 * If bits are being set in the parity map, then a sync is
	 * required afterwards, so that the regions are marked dirty
	 * on disk before any writes to them take place.  If bits are
	 * being cleared, then a sync is required before the write, so
	 * that any writes to those regions are processed before the
	 * region is marked clean.  (Synchronization is somewhat
	 * overkill; a write ordering barrier would suffice, but we
	 * currently have no way to express that directly.)
	 */
	if (clearing)
		rf_sync_component_caches(pm->raid);
	rf_paritymap_kern_write(pm->raid, pm->disk_now);
	if (setting)
		rf_sync_component_caches(pm->raid);
}

/* Mark all parity as being in need of rewrite. */
void
rf_paritymap_invalidate(struct rf_paritymap *pm)
{
	mutex_enter(&pm->lock);
	memset(pm->disk_boot, ~(unsigned char)0,
	    sizeof(struct rf_paritymap_ondisk));
	mutex_exit(&pm->lock);
}

/* Mark all parity as being correct. */
void
rf_paritymap_forceclean(struct rf_paritymap *pm)
{
	mutex_enter(&pm->lock);
	memset(pm->disk_boot, (unsigned char)0,
	    sizeof(struct rf_paritymap_ondisk));
	mutex_exit(&pm->lock);
}
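
/*
 * (Editorial note: invalidate and forceclean touch only the in-core
 * disk_boot map; the change reaches the disk via the merge in
 * rf_paritymap_write_locked() the next time the map is written, as
 * rf_paritymap_rewrite() below does with rf_paritymap_write().)
 */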

/*
 * The cooldown callout routine just defers its work to a thread; it can't do
 * the parity map write itself as it would block, and although mutex-induced
 * blocking is permitted it seems wise to avoid tying up the softint.
 */
static void
rf_paritymap_tick(void *arg)
{
	struct rf_paritymap *pm = arg;

	mutex_enter(&pm->lk_flags);
	pm->flags |= TICKED;
	mutex_exit(&pm->lk_flags);

	rf_lock_mutex2(pm->raid->iodone_lock);
	rf_signal_cond2(pm->raid->iodone_cv);	/* XXX */
	rf_unlock_mutex2(pm->raid->iodone_lock);
}

/*
 * This is where the parity cooling work (and rearming the callout if needed)
 * is done; the raidio thread calls it when woken up, as by the above.
 */
void
rf_paritymap_checkwork(struct rf_paritymap *pm)
{
	int i, zerop, progressp;

	mutex_enter(&pm->lk_flags);
	if (pm->flags & TICKED) {
		zerop = progressp = 0;

		pm->flags &= ~TICKED;
		mutex_exit(&pm->lk_flags);

		mutex_enter(&pm->lock);
		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
			if (pm->current->state[i] < 0) {
				progressp = 1;
				pm->current->state[i]++;
				if (pm->current->state[i] == 0)
					zerop = 1;
			}
		}

		if (progressp)
			callout_schedule(&pm->ticker,
			    mstohz(pm->params.tickms));
		else {
			mutex_enter(&pm->lk_flags);
			pm->flags &= ~TICKING;
			mutex_exit(&pm->lk_flags);
		}

		if (zerop)
			rf_paritymap_write_locked(pm);
		mutex_exit(&pm->lock);
	} else
		mutex_exit(&pm->lk_flags);
}

/*
 * Set parity map parameters; used both to alter parameters on the fly and to
 * establish their initial values.  Note that setting a parameter to 0 means
 * to leave the previous setting unchanged, and that if this is done for the
 * initial setting of "regions", then a default value will be computed based
 * on the RAID component size.
 */
int
rf_paritymap_set_params(struct rf_paritymap *pm,
    const struct rf_pmparams *params, int todisk)
{
	int cooldown, tickms;
	u_int regions;
	RF_RowCol_t col;
	RF_ComponentLabel_t *clabel;
	RF_Raid_t *raidPtr;

	cooldown = params->cooldown != 0
	    ? params->cooldown : pm->params.cooldown;
	tickms = params->tickms != 0
	    ? params->tickms : pm->params.tickms;
	regions = params->regions != 0
	    ? params->regions : pm->params.regions;

	if (cooldown < 1 || cooldown > 128) {
		printf("raid%d: cooldown %d out of range\n", pm->raid->raidid,
		    cooldown);
		return (-1);
	}
	if (tickms < 10) {
		printf("raid%d: tick time %dms out of range\n",
		    pm->raid->raidid, tickms);
		return (-1);
	}
	if (regions == 0) {
		regions = rf_paritymap_nreg(pm->raid);
	} else if (regions > RF_PARITYMAP_NREG) {
		printf("raid%d: region count %u too large (more than %u)\n",
		    pm->raid->raidid, regions, RF_PARITYMAP_NREG);
		return (-1);
	}

	/* XXX any currently warm parity will be used with the new tickms! */
	pm->params.cooldown = cooldown;
	pm->params.tickms = tickms;
	/* Apply the initial region count, but do not change it after that. */
	if (pm->params.regions == 0)
		pm->params.regions = regions;

	/* So that the newly set parameters can be tested: */
	pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0;

	if (todisk) {
		raidPtr = pm->raid;
		for (col = 0; col < raidPtr->numCol; col++) {
			if (RF_DEAD_DISK(raidPtr->Disks[col].status))
				continue;

			clabel = raidget_component_label(raidPtr, col);
			clabel->parity_map_ntick = cooldown;
			clabel->parity_map_tickms = tickms;
			clabel->parity_map_regions = regions;

			/* Don't touch the disk if it's been spared */
			if (clabel->status == rf_ds_spared)
				continue;

			raidflush_component_label(raidPtr, col);
		}

		/* handle the spares too... */
		for (col = 0; col < raidPtr->numSpare; col++) {
			if (raidPtr->Disks[raidPtr->numCol+col].status == rf_ds_used_spare) {
				clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
				clabel->parity_map_ntick = cooldown;
				clabel->parity_map_tickms = tickms;
				clabel->parity_map_regions = regions;
				raidflush_component_label(raidPtr, raidPtr->numCol+col);
			}
		}
	}
	return 0;
}
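
/*
 * An illustrative call (the values are examples only): keep the existing
 * region count, use the default 40-second tick, and lengthen the cooldown
 * to roughly 10 minutes (15 ticks of 40s), writing the result to the
 * component labels:
 *
 *	struct rf_pmparams p = { .cooldown = 15, .tickms = 40000,
 *	    .regions = 0 };
 *	if (rf_paritymap_set_params(pm, &p, 1) != 0)
 *		...a parameter was out of range; nothing was changed...
 */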

/*
 * The number of regions may not be as many as can fit into the map, because
 * when regions are too small, the overhead of setting parity map bits
 * becomes significant in comparison to the actual I/O, while the
 * corresponding gains in parity verification time become negligible.  Thus,
 * a minimum region size (defined above) is imposed.
 *
 * Note that, if the number of regions is less than the maximum, then some of
 * the regions will be "fictional", corresponding to no actual disk; some
 * parts of the code may process them as normal, but they can not ever be
 * written to.
 */
static u_int
rf_paritymap_nreg(RF_Raid_t *raid)
{
	daddr_t bytes_per_disk, nreg;

	bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector;
	nreg = bytes_per_disk / REGION_MINSIZE;
	if (nreg > RF_PARITYMAP_NREG)
		nreg = RF_PARITYMAP_NREG;
	if (nreg < 1)
		nreg = 1;

	return (u_int)nreg;
}
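
/*
 * Worked example (illustrative numbers): with the 25 MiB REGION_MINSIZE
 * above, a 100 GiB component yields 100 GiB / 25 MiB = 4096 candidate
 * regions, which is then clamped to at most RF_PARITYMAP_NREG and at
 * least 1.
 */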

/*
 * Initialize a parity map given specific parameters.  This neither reads nor
 * writes the parity map config in the component labels; for that, see below.
 */
int
rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid,
    const struct rf_pmparams *params)
{
	daddr_t rstripes;
	struct rf_pmparams safe;

	pm->raid = raid;
	pm->params.regions = 0;
	if (0 != rf_paritymap_set_params(pm, params, 0)) {
		/*
		 * If the parameters are out-of-range, then bring the
		 * parity map up with something reasonable, so that
		 * the admin can at least go and fix it (or ignore it
		 * entirely).
		 */
		safe.cooldown = DFL_COOLDOWN;
		safe.tickms = DFL_TICKMS;
		safe.regions = 0;

		if (0 != rf_paritymap_set_params(pm, &safe, 0))
			return (-1);
	}

	rstripes = howmany(raid->Layout.numStripe, pm->params.regions);
	pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe;

	callout_init(&pm->ticker, CALLOUT_MPSAFE);
	callout_setfunc(&pm->ticker, rf_paritymap_tick, pm);
	pm->flags = 0;

	pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
	    KM_SLEEP);
	pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
	    KM_SLEEP);
	pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current),
	    KM_SLEEP);

	rf_paritymap_kern_read(pm->raid, pm->disk_boot);
	memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now));

	mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK);
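
	/*
	 * (Lock note, inferred from usage in this file: pm->lock guards
	 * the map proper and is taken only from thread context, hence
	 * IPL_NONE; pm->lk_flags guards just the TICKING/TICKED flags and
	 * is also taken from the callout softint, hence IPL_SOFTCLOCK.)
	 */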

	return 0;
}

/*
 * Destroys a parity map; unless "force" is set, also cleans parity for any
 * regions which were still in cooldown (but are not dirty on disk).
 */
void
rf_paritymap_destroy(struct rf_paritymap *pm, int force)
{
	int i;

	callout_halt(&pm->ticker, NULL);	/* XXX stop? halt? */
	callout_destroy(&pm->ticker);

	if (!force) {
		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
			/* XXX check for > 0 ? */
			if (pm->current->state[i] < 0)
				pm->current->state[i] = 0;
		}

		rf_paritymap_write_locked(pm);
	}

	mutex_destroy(&pm->lock);
	mutex_destroy(&pm->lk_flags);

	kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk));
	kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk));
	kmem_free(pm->current, sizeof(struct rf_paritymap_current));
}
/*
 * Rewrite parity, taking the parity map into account; this is the equivalent
 * of the old rf_RewriteParity, and like it must be called from a suitable
 * kernel thread, with no more than one instance running at a time.
 *
 * Note that the fictional regions are "cleaned" in one shot, so that very
 * small RAIDs (useful for testing) will not experience potentially severe
 * regressions in rewrite time.
 */
int
rf_paritymap_rewrite(struct rf_paritymap *pm)
{
	int i, ret_val = 0;
	daddr_t reg_b, reg_e;

	/* Process only the actual regions. */
	for (i = 0; i < pm->params.regions; i++) {
		mutex_enter(&pm->lock);
		if (isset(pm->disk_boot->bits, i)) {
			mutex_exit(&pm->lock);

			reg_b = i * pm->region_size;
			reg_e = reg_b + pm->region_size;
			if (reg_e > pm->raid->totalSectors)
				reg_e = pm->raid->totalSectors;

			if (rf_RewriteParityRange(pm->raid, reg_b,
			    reg_e - reg_b)) {
				ret_val = 1;
				if (pm->raid->waitShutdown)
					return ret_val;
			} else {
				mutex_enter(&pm->lock);
				clrbit(pm->disk_boot->bits, i);
				rf_paritymap_write_locked(pm);
				mutex_exit(&pm->lock);
			}
		} else {
			mutex_exit(&pm->lock);
		}
	}

	/* Now, clear the fictional regions, if any. */
	rf_paritymap_forceclean(pm);
	rf_paritymap_write(pm);

	return ret_val;
}

/*
 * How to merge the on-disk parity maps when reading them in from the
 * various components; returns whether they differ.  In the case that
 * they do differ, sets *dst to the union of *dst and *src.
 *
 * In theory, it should be safe to take the intersection (or just pick
 * a single component arbitrarily), but the paranoid approach costs
 * little.
 *
 * Appropriate locking, if any, is the responsibility of the caller.
 */
int
rf_paritymap_merge(struct rf_paritymap_ondisk *dst,
    struct rf_paritymap_ondisk *src)
{
	int i, discrep = 0;

	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
		if (dst->bits[i] != src->bits[i])
			discrep = 1;
		dst->bits[i] |= src->bits[i];
	}

	return discrep;
}
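
/*
 * A sketch of the intended use of rf_paritymap_merge() during configure
 * (the real reader lives in the kernel interface code; "read_from" is a
 * hypothetical helper named only for illustration):
 *
 *	struct rf_paritymap_ondisk merged, one;
 *	read_from(first_live_component, &merged);
 *	for (each remaining live component c) {
 *		read_from(c, &one);
 *		if (rf_paritymap_merge(&merged, &one))
 *			...components disagreed; "merged" now holds
 *			   the (safe) union...
 *	}
 */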

/*
 * Detach a parity map from its RAID.  This is not meant to be applied except
 * when unconfiguring the RAID after all I/O has been resolved, as otherwise
 * an out-of-date parity map could be treated as current.
 */
void
rf_paritymap_detach(RF_Raid_t *raidPtr)
{
	if (raidPtr->parity_map == NULL)
		return;

	rf_lock_mutex2(raidPtr->iodone_lock);
	struct rf_paritymap *pm = raidPtr->parity_map;
	raidPtr->parity_map = NULL;
	rf_unlock_mutex2(raidPtr->iodone_lock);
	/* XXXjld is that enough locking?  Or too much? */
	rf_paritymap_destroy(pm, 0);
	kmem_free(pm, sizeof(*pm));
}

/*
 * Is this RAID set ineligible for parity-map use due to not actually
 * having any parity (e.g., RAID 0)?  (If so, rf_paritymap_attach is a
 * no-op, but rf_paritymap_{get,set}_disable will still pointlessly act
 * on the component labels.)
 */
int
rf_paritymap_ineligible(RF_Raid_t *raidPtr)
{
	return raidPtr->Layout.map->faultsTolerated == 0;
}

/*
 * Attach a parity map to a RAID set if appropriate.  Includes
 * configure-time processing of parity-map fields of component label.
 */
void
rf_paritymap_attach(RF_Raid_t *raidPtr, int force)
{
	RF_RowCol_t col;
	int pm_use, pm_zap;
	int g_tickms, g_ntick, g_regions;
	int good;
	RF_ComponentLabel_t *clabel;
	u_int flags, regions;
	struct rf_pmparams params;

	if (rf_paritymap_ineligible(raidPtr)) {
		/* There isn't any parity. */
		return;
	}

	pm_use = 1;
	pm_zap = 0;
	g_tickms = DFL_TICKMS;
	g_ntick = DFL_COOLDOWN;
	g_regions = 0;

	/*
	 * Collect opinions on the set config.  If this is the initial
	 * config (raidctl -C), treat all labels as invalid, since
	 * there may be random data present.
	 */
	if (!force) {
		for (col = 0; col < raidPtr->numCol; col++) {
			if (RF_DEAD_DISK(raidPtr->Disks[col].status))
				continue;
			clabel = raidget_component_label(raidPtr, col);
			flags = clabel->parity_map_flags;
			/* Check for use by non-parity-map kernel. */
			if (clabel->parity_map_modcount
			    != clabel->mod_counter) {
				flags &= ~RF_PMLABEL_WASUSED;
			}

			if (flags & RF_PMLABEL_VALID) {
				g_tickms = clabel->parity_map_tickms;
				g_ntick = clabel->parity_map_ntick;
				regions = clabel->parity_map_regions;
				if (g_regions == 0)
					g_regions = regions;
				else if (g_regions != regions) {
					pm_zap = 1; /* important! */
				}

				if (flags & RF_PMLABEL_DISABLE) {
					pm_use = 0;
				}
				if (!(flags & RF_PMLABEL_WASUSED)) {
					pm_zap = 1;
				}
			} else {
				pm_zap = 1;
			}
		}
	} else {
		pm_zap = 1;
	}
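
	/*
	 * (Summary of the above, for the reader: pm_use decides whether a
	 * map is attached at all; pm_zap forces the on-disk map to be
	 * reinitialized rather than trusted, and is set whenever any label
	 * is invalid, unused, or disagrees on the region count.)
	 */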

	/* Finally, create and attach the parity map. */
	if (pm_use) {
		params.cooldown = g_ntick;
		params.tickms = g_tickms;
		params.regions = g_regions;

		raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap),
		    KM_SLEEP);
		if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr,
		    &params)) {
			/* It failed; do without. */
			kmem_free(raidPtr->parity_map,
			    sizeof(struct rf_paritymap));
			raidPtr->parity_map = NULL;
			return;
		}

		if (g_regions == 0)
			/* Pick up the autoconfigured region count. */
			g_regions = raidPtr->parity_map->params.regions;

		if (pm_zap) {
			good = raidPtr->parity_good && !force;

			if (good)
				rf_paritymap_forceclean(raidPtr->parity_map);
			else
				rf_paritymap_invalidate(raidPtr->parity_map);
			/* This needs to be on disk before WASUSED is set. */
			rf_paritymap_write(raidPtr->parity_map);
		}
	}

	/* Alter labels in-core to reflect the current view of things. */
	for (col = 0; col < raidPtr->numCol; col++) {
		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
			continue;
		clabel = raidget_component_label(raidPtr, col);

		if (pm_use)
			flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
		else
			flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE;

		clabel->parity_map_flags = flags;
		clabel->parity_map_tickms = g_tickms;
		clabel->parity_map_ntick = g_ntick;
		clabel->parity_map_regions = g_regions;
		raidflush_component_label(raidPtr, col);
	}
	/*
	 * Note that we're just in 'attach' here, and there won't
	 * be any spare disks at this point.
	 */
}

/*
 * For initializing the parity-map fields of a component label, both on
 * initial creation and on reconstruct/copyback/etc.
 */
void
rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel)
{
	if (pm != NULL) {
		clabel->parity_map_flags =
		    RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
		clabel->parity_map_tickms = pm->params.tickms;
		clabel->parity_map_ntick = pm->params.cooldown;
		/*
		 * XXXjld: If the number of regions is changed on disk, and
		 * then a new component is labeled before the next configure,
		 * then it will get the old value and they will conflict on
		 * the next boot (and the default will be used instead).
		 */
		clabel->parity_map_regions = pm->params.regions;
	} else {
		/*
		 * XXXjld: if the map is disabled, and all the components are
		 * replaced without an intervening unconfigure/reconfigure,
		 * then it will become enabled on the next unconfig/reconfig.
		 */
	}
}


/* Will the parity map be disabled next time? */
int
rf_paritymap_get_disable(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	RF_RowCol_t col;
	int dis;

	dis = 0;
	for (col = 0; col < raidPtr->numCol; col++) {
		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
			continue;
		clabel = raidget_component_label(raidPtr, col);
		if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
			dis = 1;
	}
	for (col = 0; col < raidPtr->numSpare; col++) {
		if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
			continue;
		clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
		if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
			dis = 1;
	}

	return dis;
}

/* Set whether the parity map will be disabled next time. */
void
rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
{
	RF_ComponentLabel_t *clabel;
	RF_RowCol_t col;

	for (col = 0; col < raidPtr->numCol; col++) {
		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
			continue;
		clabel = raidget_component_label(raidPtr, col);
		if (dis)
			clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
		else
			clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
		raidflush_component_label(raidPtr, col);
	}

	/* update any used spares as well */
	for (col = 0; col < raidPtr->numSpare; col++) {
		if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
			continue;

		clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
		if (dis)
			clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
		else
			clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
		raidflush_component_label(raidPtr, raidPtr->numCol+col);
	}
}