1 | /* $NetBSD: rf_map.c,v 1.47 2016/10/15 20:31:15 oster Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /************************************************************************** |
30 | * |
31 | * map.c -- main code for mapping RAID addresses to physical disk addresses |
32 | * |
33 | **************************************************************************/ |
34 | |
35 | #include <sys/cdefs.h> |
36 | __KERNEL_RCSID(0, "$NetBSD: rf_map.c,v 1.47 2016/10/15 20:31:15 oster Exp $" ); |
37 | |
38 | #include <dev/raidframe/raidframevar.h> |
39 | |
40 | #include "rf_threadstuff.h" |
41 | #include "rf_raid.h" |
42 | #include "rf_general.h" |
43 | #include "rf_map.h" |
44 | #include "rf_shutdown.h" |
45 | |
46 | static void rf_FreePDAList(RF_PhysDiskAddr_t *pda_list); |
47 | static void rf_FreeASMList(RF_AccessStripeMap_t *asm_list); |
48 | |
49 | /*************************************************************************** |
50 | * |
51 | * MapAccess -- main 1st order mapping routine. Maps an access in the |
52 | * RAID address space to the corresponding set of physical disk |
53 | * addresses. The result is returned as a list of AccessStripeMap |
54 | * structures, one per stripe accessed. Each ASM structure contains a |
55 | * pointer to a list of PhysDiskAddr structures, which describe the |
56 | * physical locations touched by the user access. Note that this |
57 | * routine returns only static mapping information, i.e. the list of |
58 | * physical addresses returned does not necessarily identify the set |
59 | * of physical locations that will actually be read or written. The |
60 | * routine also maps the parity. The physical disk location returned |
61 | * always indicates the entire parity unit, even when only a subset of |
62 | * it is being accessed. This is because an access that is not stripe |
63 | * unit aligned but that spans a stripe unit boundary may require |
 * access to two distinct portions of the parity unit, and we can't yet
65 | * tell which portion(s) we'll actually need. We leave it up to the |
66 | * algorithm selection code to decide what subset of the parity unit |
67 | * to access. Note that addresses in the RAID address space must |
68 | * always be maintained as longs, instead of ints. |
69 | * |
70 | * This routine returns NULL if numBlocks is 0 |
71 | * |
72 | * raidAddress - starting address in RAID address space |
73 | * numBlocks - number of blocks in RAID address space to access |
 * buffer - buffer to supply/receive data
75 | * remap - 1 => remap address to spare space |
76 | ***************************************************************************/ |
77 | |
78 | RF_AccessStripeMapHeader_t * |
79 | rf_MapAccess(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddress, |
80 | RF_SectorCount_t numBlocks, void *buffer, int remap) |
81 | { |
82 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
83 | RF_AccessStripeMapHeader_t *asm_hdr = NULL; |
84 | RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL; |
85 | int faultsTolerated = layoutPtr->map->faultsTolerated; |
86 | /* we'll change raidAddress along the way */ |
87 | RF_RaidAddr_t startAddress = raidAddress; |
88 | RF_RaidAddr_t endAddress = raidAddress + numBlocks; |
89 | RF_RaidDisk_t *disks = raidPtr->Disks; |
90 | RF_PhysDiskAddr_t *pda_p; |
91 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
92 | RF_PhysDiskAddr_t *pda_q; |
93 | #endif |
94 | RF_StripeCount_t numStripes = 0; |
95 | RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress, |
96 | nextStripeUnitAddress; |
97 | RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr; |
98 | RF_StripeCount_t totStripes; |
99 | RF_StripeNum_t stripeID, lastSID, SUID, lastSUID; |
100 | RF_AccessStripeMap_t *asmList, *t_asm; |
101 | RF_PhysDiskAddr_t *pdaList, *t_pda; |
102 | |
103 | /* allocate all the ASMs and PDAs up front */ |
104 | lastRaidAddr = raidAddress + numBlocks - 1; |
105 | stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress); |
106 | lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr); |
107 | totStripes = lastSID - stripeID + 1; |
108 | SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress); |
109 | lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr); |
110 | |
111 | asmList = rf_AllocASMList(totStripes); |
112 | |
113 | /* may also need pda(s) per stripe for parity */ |
114 | pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + |
115 | faultsTolerated * totStripes); |
116 | |
117 | |
118 | if (raidAddress + numBlocks > raidPtr->totalSectors) { |
119 | RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n" , |
120 | (int) raidAddress); |
121 | return (NULL); |
122 | } |
123 | #if RF_DEBUG_MAP |
124 | if (rf_mapDebug) |
125 | rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks); |
126 | #endif |
127 | for (; raidAddress < endAddress;) { |
128 | /* make the next stripe structure */ |
129 | RF_ASSERT(asmList); |
130 | t_asm = asmList; |
131 | asmList = asmList->next; |
132 | memset((char *) t_asm, 0, sizeof(RF_AccessStripeMap_t)); |
133 | if (!asm_p) |
134 | asm_list = asm_p = t_asm; |
135 | else { |
136 | asm_p->next = t_asm; |
137 | asm_p = asm_p->next; |
138 | } |
139 | numStripes++; |
140 | |
141 | /* map SUs from current location to the end of the stripe */ |
142 | asm_p->stripeID = /* rf_RaidAddressToStripeID(layoutPtr, |
143 | raidAddress) */ stripeID++; |
144 | stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress); |
145 | stripeEndAddress = RF_MIN(endAddress, stripeRealEndAddress); |
146 | asm_p->raidAddress = raidAddress; |
147 | asm_p->endRaidAddress = stripeEndAddress; |
148 | |
149 | /* map each stripe unit in the stripe */ |
150 | pda_p = NULL; |
151 | |
152 | /* Raid addr of start of portion of access that is |
153 | within this stripe */ |
154 | startAddrWithinStripe = raidAddress; |
155 | |
156 | for (; raidAddress < stripeEndAddress;) { |
157 | RF_ASSERT(pdaList); |
158 | t_pda = pdaList; |
159 | pdaList = pdaList->next; |
160 | memset((char *) t_pda, 0, sizeof(RF_PhysDiskAddr_t)); |
161 | if (!pda_p) |
162 | asm_p->physInfo = pda_p = t_pda; |
163 | else { |
164 | pda_p->next = t_pda; |
165 | pda_p = pda_p->next; |
166 | } |
167 | |
168 | pda_p->type = RF_PDA_TYPE_DATA; |
169 | (layoutPtr->map->MapSector) (raidPtr, raidAddress, |
170 | &(pda_p->col), |
171 | &(pda_p->startSector), |
172 | remap); |
173 | |
174 | /* mark any failures we find. failedPDA is |
175 | * don't-care if there is more than one |
176 | * failure */ |
177 | |
178 | /* the RAID address corresponding to this |
179 | physical diskaddress */ |
180 | pda_p->raidAddress = raidAddress; |
181 | nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress); |
182 | pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress; |
183 | RF_ASSERT(pda_p->numSector != 0); |
184 | rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 0); |
185 | pda_p->bufPtr = (char *)buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress)); |
186 | asm_p->totalSectorsAccessed += pda_p->numSector; |
187 | asm_p->numStripeUnitsAccessed++; |
188 | |
189 | raidAddress = RF_MIN(endAddress, nextStripeUnitAddress); |
190 | } |
191 | |
192 | /* Map the parity. At this stage, the startSector and |
193 | * numSector fields for the parity unit are always set |
194 | * to indicate the entire parity unit. We may modify |
195 | * this after mapping the data portion. */ |
196 | switch (faultsTolerated) { |
197 | case 0: |
198 | break; |
199 | case 1: /* single fault tolerant */ |
200 | RF_ASSERT(pdaList); |
201 | t_pda = pdaList; |
202 | pdaList = pdaList->next; |
203 | memset((char *) t_pda, 0, sizeof(RF_PhysDiskAddr_t)); |
204 | pda_p = asm_p->parityInfo = t_pda; |
205 | pda_p->type = RF_PDA_TYPE_PARITY; |
206 | (layoutPtr->map->MapParity) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), |
207 | &(pda_p->col), &(pda_p->startSector), remap); |
208 | pda_p->numSector = layoutPtr->sectorsPerStripeUnit; |
209 | /* raidAddr may be needed to find unit to redirect to */ |
210 | pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); |
211 | rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1); |
212 | rf_ASMParityAdjust(asm_p->parityInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p); |
213 | |
214 | break; |
215 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
216 | case 2: /* two fault tolerant */ |
217 | RF_ASSERT(pdaList && pdaList->next); |
218 | t_pda = pdaList; |
219 | pdaList = pdaList->next; |
220 | memset((char *) t_pda, 0, sizeof(RF_PhysDiskAddr_t)); |
221 | pda_p = asm_p->parityInfo = t_pda; |
222 | pda_p->type = RF_PDA_TYPE_PARITY; |
223 | t_pda = pdaList; |
224 | pdaList = pdaList->next; |
225 | memset((char *) t_pda, 0, sizeof(RF_PhysDiskAddr_t)); |
226 | pda_q = asm_p->qInfo = t_pda; |
227 | pda_q->type = RF_PDA_TYPE_Q; |
228 | (layoutPtr->map->MapParity) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), |
229 | &(pda_p->col), &(pda_p->startSector), remap); |
230 | (layoutPtr->map->MapQ) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), |
231 | &(pda_q->col), &(pda_q->startSector), remap); |
232 | pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit; |
233 | /* raidAddr may be needed to find unit to redirect to */ |
234 | pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); |
235 | pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); |
236 | /* failure mode stuff */ |
237 | rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1); |
238 | rf_ASMCheckStatus(raidPtr, pda_q, asm_p, disks, 1); |
239 | rf_ASMParityAdjust(asm_p->parityInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p); |
240 | rf_ASMParityAdjust(asm_p->qInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p); |
241 | break; |
242 | #endif |
243 | } |
244 | } |
245 | RF_ASSERT(asmList == NULL && pdaList == NULL); |
246 | /* make the header structure */ |
247 | asm_hdr = rf_AllocAccessStripeMapHeader(); |
248 | RF_ASSERT(numStripes == totStripes); |
249 | asm_hdr->numStripes = numStripes; |
250 | asm_hdr->stripeMap = asm_list; |
251 | |
252 | #if RF_DEBUG_MAP |
253 | if (rf_mapDebug) |
254 | rf_PrintAccessStripeMap(asm_hdr); |
255 | #endif |
256 | return (asm_hdr); |
257 | } |
258 | |
259 | /*************************************************************************** |
260 | * This routine walks through an ASM list and marks the PDAs that have |
261 | * failed. It's called only when a disk failure causes an in-flight |
262 | * DAG to fail. The parity may consist of two components, but we want |
263 | * to use only one failedPDA pointer. Thus we set failedPDA to point |
264 | * to the first parity component, and rely on the rest of the code to |
265 | * do the right thing with this. |
266 | ***************************************************************************/ |
267 | |
268 | void |
269 | rf_MarkFailuresInASMList(RF_Raid_t *raidPtr, |
270 | RF_AccessStripeMapHeader_t *asm_h) |
271 | { |
272 | RF_RaidDisk_t *disks = raidPtr->Disks; |
273 | RF_AccessStripeMap_t *asmap; |
274 | RF_PhysDiskAddr_t *pda; |
275 | |
276 | for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) { |
277 | asmap->numDataFailed = 0; |
278 | asmap->numParityFailed = 0; |
279 | asmap->numQFailed = 0; |
280 | asmap->numFailedPDAs = 0; |
281 | memset((char *) asmap->failedPDAs, 0, |
282 | RF_MAX_FAILED_PDA * sizeof(RF_PhysDiskAddr_t *)); |
283 | for (pda = asmap->physInfo; pda; pda = pda->next) { |
284 | if (RF_DEAD_DISK(disks[pda->col].status)) { |
285 | asmap->numDataFailed++; |
286 | asmap->failedPDAs[asmap->numFailedPDAs] = pda; |
287 | asmap->numFailedPDAs++; |
288 | } |
289 | } |
290 | pda = asmap->parityInfo; |
291 | if (pda && RF_DEAD_DISK(disks[pda->col].status)) { |
292 | asmap->numParityFailed++; |
293 | asmap->failedPDAs[asmap->numFailedPDAs] = pda; |
294 | asmap->numFailedPDAs++; |
295 | } |
296 | pda = asmap->qInfo; |
297 | if (pda && RF_DEAD_DISK(disks[pda->col].status)) { |
298 | asmap->numQFailed++; |
299 | asmap->failedPDAs[asmap->numFailedPDAs] = pda; |
300 | asmap->numFailedPDAs++; |
301 | } |
302 | } |
303 | } |
304 | |
305 | /*************************************************************************** |
306 | * |
307 | * routines to allocate and free list elements. All allocation |
308 | * routines zero the structure before returning it. |
309 | * |
310 | * FreePhysDiskAddr is static. It should never be called directly, |
311 | * because FreeAccessStripeMap takes care of freeing the PhysDiskAddr |
312 | * list. |
313 | * |
314 | ***************************************************************************/ |
315 | |
316 | #define RF_MAX_FREE_ASMHDR 128 |
317 | #define RF_MIN_FREE_ASMHDR 32 |
318 | |
319 | #define RF_MAX_FREE_ASM 192 |
320 | #define RF_MIN_FREE_ASM 64 |
321 | |
322 | #define RF_MAX_FREE_PDA 192 |
323 | #define RF_MIN_FREE_PDA 64 |
324 | |
325 | #define RF_MAX_FREE_ASMHLE 64 |
326 | #define RF_MIN_FREE_ASMHLE 16 |
327 | |
328 | #define RF_MAX_FREE_FSS 128 |
329 | #define RF_MIN_FREE_FSS 32 |
330 | |
331 | #define RF_MAX_FREE_VFPLE 128 |
332 | #define RF_MIN_FREE_VFPLE 32 |
333 | |
334 | #define RF_MAX_FREE_VPLE 128 |
335 | #define RF_MIN_FREE_VPLE 32 |
336 | |
337 | |
338 | /* called at shutdown time. So far, all that is necessary is to |
339 | release all the free lists */ |
static void rf_ShutdownMapModule(void *);
static void
rf_ShutdownMapModule(void *ignored)
{
	/* Destroy every pool created in rf_ConfigureMapModule(). */
	pool_destroy(&rf_pools.asm_hdr);
	pool_destroy(&rf_pools.asmap);
	pool_destroy(&rf_pools.asmhle);
	pool_destroy(&rf_pools.pda);
	pool_destroy(&rf_pools.fss);
	pool_destroy(&rf_pools.vfple);
	pool_destroy(&rf_pools.vple);
}
352 | |
/*
 * Initialize all memory pools used by the mapping module and register
 * rf_ShutdownMapModule() on the shutdown list so the pools are torn
 * down at unconfigure time.  Always returns 0.
 */
int
rf_ConfigureMapModule(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.asm_hdr, sizeof(RF_AccessStripeMapHeader_t),
		     "rf_asmhdr_pl" , RF_MIN_FREE_ASMHDR, RF_MAX_FREE_ASMHDR);
	rf_pool_init(&rf_pools.asmap, sizeof(RF_AccessStripeMap_t),
		     "rf_asm_pl" , RF_MIN_FREE_ASM, RF_MAX_FREE_ASM);
	rf_pool_init(&rf_pools.asmhle, sizeof(RF_ASMHeaderListElem_t),
		     "rf_asmhle_pl" , RF_MIN_FREE_ASMHLE, RF_MAX_FREE_ASMHLE);
	rf_pool_init(&rf_pools.pda, sizeof(RF_PhysDiskAddr_t),
		     "rf_pda_pl" , RF_MIN_FREE_PDA, RF_MAX_FREE_PDA);
	rf_pool_init(&rf_pools.fss, sizeof(RF_FailedStripe_t),
		     "rf_fss_pl" , RF_MIN_FREE_FSS, RF_MAX_FREE_FSS);
	rf_pool_init(&rf_pools.vfple, sizeof(RF_VoidFunctionPointerListElem_t),
		     "rf_vfple_pl" , RF_MIN_FREE_VFPLE, RF_MAX_FREE_VFPLE);
	rf_pool_init(&rf_pools.vple, sizeof(RF_VoidPointerListElem_t),
		     "rf_vple_pl" , RF_MIN_FREE_VPLE, RF_MAX_FREE_VPLE);
	rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);

	return (0);
}
375 | |
376 | RF_AccessStripeMapHeader_t * |
377 | (void) |
378 | { |
379 | RF_AccessStripeMapHeader_t *p; |
380 | |
381 | p = pool_get(&rf_pools.asm_hdr, PR_WAITOK); |
382 | memset((char *) p, 0, sizeof(RF_AccessStripeMapHeader_t)); |
383 | |
384 | return (p); |
385 | } |
386 | |
387 | void |
388 | (RF_AccessStripeMapHeader_t *p) |
389 | { |
390 | pool_put(&rf_pools.asm_hdr, p); |
391 | } |
392 | |
393 | |
394 | RF_VoidFunctionPointerListElem_t * |
395 | rf_AllocVFPListElem(void) |
396 | { |
397 | RF_VoidFunctionPointerListElem_t *p; |
398 | |
399 | p = pool_get(&rf_pools.vfple, PR_WAITOK); |
400 | memset((char *) p, 0, sizeof(RF_VoidFunctionPointerListElem_t)); |
401 | |
402 | return (p); |
403 | } |
404 | |
405 | void |
406 | rf_FreeVFPListElem(RF_VoidFunctionPointerListElem_t *p) |
407 | { |
408 | |
409 | pool_put(&rf_pools.vfple, p); |
410 | } |
411 | |
412 | |
413 | RF_VoidPointerListElem_t * |
414 | rf_AllocVPListElem(void) |
415 | { |
416 | RF_VoidPointerListElem_t *p; |
417 | |
418 | p = pool_get(&rf_pools.vple, PR_WAITOK); |
419 | memset((char *) p, 0, sizeof(RF_VoidPointerListElem_t)); |
420 | |
421 | return (p); |
422 | } |
423 | |
424 | void |
425 | rf_FreeVPListElem(RF_VoidPointerListElem_t *p) |
426 | { |
427 | |
428 | pool_put(&rf_pools.vple, p); |
429 | } |
430 | |
431 | RF_ASMHeaderListElem_t * |
432 | (void) |
433 | { |
434 | RF_ASMHeaderListElem_t *p; |
435 | |
436 | p = pool_get(&rf_pools.asmhle, PR_WAITOK); |
437 | memset((char *) p, 0, sizeof(RF_ASMHeaderListElem_t)); |
438 | |
439 | return (p); |
440 | } |
441 | |
442 | void |
443 | (RF_ASMHeaderListElem_t *p) |
444 | { |
445 | |
446 | pool_put(&rf_pools.asmhle, p); |
447 | } |
448 | |
449 | RF_FailedStripe_t * |
450 | rf_AllocFailedStripeStruct(void) |
451 | { |
452 | RF_FailedStripe_t *p; |
453 | |
454 | p = pool_get(&rf_pools.fss, PR_WAITOK); |
455 | memset((char *) p, 0, sizeof(RF_FailedStripe_t)); |
456 | |
457 | return (p); |
458 | } |
459 | |
460 | void |
461 | rf_FreeFailedStripeStruct(RF_FailedStripe_t *p) |
462 | { |
463 | pool_put(&rf_pools.fss, p); |
464 | } |
465 | |
466 | |
467 | |
468 | |
469 | |
470 | RF_PhysDiskAddr_t * |
471 | rf_AllocPhysDiskAddr(void) |
472 | { |
473 | RF_PhysDiskAddr_t *p; |
474 | |
475 | p = pool_get(&rf_pools.pda, PR_WAITOK); |
476 | memset((char *) p, 0, sizeof(RF_PhysDiskAddr_t)); |
477 | |
478 | return (p); |
479 | } |
/* Allocates a list of PDAs.  We take elements from the pool one at a
 * time to simplify the process of freeing the list at program
 * shutdown.  This should not be much of a performance hit, because it
 * should be very infrequently executed. */
485 | RF_PhysDiskAddr_t * |
486 | rf_AllocPDAList(int count) |
487 | { |
488 | RF_PhysDiskAddr_t *p, *prev; |
489 | int i; |
490 | |
491 | p = NULL; |
492 | prev = NULL; |
493 | for (i = 0; i < count; i++) { |
494 | p = pool_get(&rf_pools.pda, PR_WAITOK); |
495 | p->next = prev; |
496 | prev = p; |
497 | } |
498 | |
499 | return (p); |
500 | } |
501 | |
502 | void |
503 | rf_FreePhysDiskAddr(RF_PhysDiskAddr_t *p) |
504 | { |
505 | pool_put(&rf_pools.pda, p); |
506 | } |
507 | |
508 | static void |
509 | rf_FreePDAList(RF_PhysDiskAddr_t *pda_list) |
510 | { |
511 | RF_PhysDiskAddr_t *p, *tmp; |
512 | |
513 | p=pda_list; |
514 | while (p) { |
515 | tmp = p->next; |
516 | pool_put(&rf_pools.pda, p); |
517 | p = tmp; |
518 | } |
519 | } |
520 | |
/* This is essentially identical to rf_AllocPDAList (the two should
 * perhaps be combined).  We take elements from the pool one at a time
 * to simplify the process of freeing the list at program shutdown.
 * This should not be much of a performance hit, because it should be
 * very infrequently executed. */
526 | RF_AccessStripeMap_t * |
527 | rf_AllocASMList(int count) |
528 | { |
529 | RF_AccessStripeMap_t *p, *prev; |
530 | int i; |
531 | |
532 | p = NULL; |
533 | prev = NULL; |
534 | for (i = 0; i < count; i++) { |
535 | p = pool_get(&rf_pools.asmap, PR_WAITOK); |
536 | p->next = prev; |
537 | prev = p; |
538 | } |
539 | return (p); |
540 | } |
541 | |
542 | static void |
543 | rf_FreeASMList(RF_AccessStripeMap_t *asm_list) |
544 | { |
545 | RF_AccessStripeMap_t *p, *tmp; |
546 | |
547 | p=asm_list; |
548 | while (p) { |
549 | tmp = p->next; |
550 | pool_put(&rf_pools.asmap, p); |
551 | p = tmp; |
552 | } |
553 | } |
554 | |
555 | void |
556 | rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t *hdr) |
557 | { |
558 | RF_AccessStripeMap_t *p; |
559 | RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL; |
560 | int count = 0, t; |
561 | |
562 | for (p = hdr->stripeMap; p; p = p->next) { |
563 | |
564 | /* link the 3 pda lists into the accumulating pda list */ |
565 | |
566 | if (!pdaList) |
567 | pdaList = p->qInfo; |
568 | else |
569 | pdaEnd->next = p->qInfo; |
570 | for (trailer = NULL, pdp = p->qInfo; pdp;) { |
571 | trailer = pdp; |
572 | pdp = pdp->next; |
573 | count++; |
574 | } |
575 | if (trailer) |
576 | pdaEnd = trailer; |
577 | |
578 | if (!pdaList) |
579 | pdaList = p->parityInfo; |
580 | else |
581 | pdaEnd->next = p->parityInfo; |
582 | for (trailer = NULL, pdp = p->parityInfo; pdp;) { |
583 | trailer = pdp; |
584 | pdp = pdp->next; |
585 | count++; |
586 | } |
587 | if (trailer) |
588 | pdaEnd = trailer; |
589 | |
590 | if (!pdaList) |
591 | pdaList = p->physInfo; |
592 | else |
593 | pdaEnd->next = p->physInfo; |
594 | for (trailer = NULL, pdp = p->physInfo; pdp;) { |
595 | trailer = pdp; |
596 | pdp = pdp->next; |
597 | count++; |
598 | } |
599 | if (trailer) |
600 | pdaEnd = trailer; |
601 | } |
602 | |
603 | /* debug only */ |
604 | for (t = 0, pdp = pdaList; pdp; pdp = pdp->next) |
605 | t++; |
606 | RF_ASSERT(t == count); |
607 | |
608 | if (pdaList) |
609 | rf_FreePDAList(pdaList); |
610 | rf_FreeASMList(hdr->stripeMap); |
611 | rf_FreeAccessStripeMapHeader(hdr); |
612 | } |
613 | /* We can't use the large write optimization if there are any failures |
614 | * in the stripe. In the declustered layout, there is no way to |
615 | * immediately determine what disks constitute a stripe, so we |
616 | * actually have to hunt through the stripe looking for failures. The |
617 | * reason we map the parity instead of just using asm->parityInfo->col |
618 | * is because the latter may have been already redirected to a spare |
619 | * drive, which would mess up the computation of the stripe offset. |
620 | * |
621 | * ASSUMES AT MOST ONE FAILURE IN THE STRIPE. */ |
int
rf_CheckStripeForFailures(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
{
	RF_RowCol_t tcol, pcol, *diskids, i;
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_StripeCount_t stripeOffset;
	int numFailures;
	RF_RaidAddr_t sosAddr;
	RF_SectorNum_t diskOffset, poffset;

	/* quick out in the fault-free case. */
	rf_lock_mutex2(raidPtr->mutex);
	numFailures = raidPtr->numFailures;
	rf_unlock_mutex2(raidPtr->mutex);
	if (numFailures == 0)
		return (0);

	/* start-of-stripe address and the identity of the parity column;
	 * we map the parity (rather than trusting parityInfo->col) because
	 * the latter may already be redirected to a spare -- see the
	 * comment above this function. */
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
	    asmap->raidAddress);
	(layoutPtr->map->IdentifyStripe) (raidPtr, asmap->raidAddress,
	    &diskids);
	(layoutPtr->map->MapParity) (raidPtr, asmap->raidAddress,
	    &pcol, &poffset, 0);	/* get pcol */

	/* this need not be true if we've redirected the access to a
	 * spare in another row RF_ASSERT(row == testrow); */
	stripeOffset = 0;
	/* walk every non-parity column of the stripe looking for a dead
	 * disk; stripeOffset counts data columns seen so far, which lets
	 * us re-map the corresponding stripe unit below. */
	for (i = 0; i < layoutPtr->numDataCol + layoutPtr->numParityCol; i++) {
		if (diskids[i] != pcol) {
			if (RF_DEAD_DISK(raidPtr->Disks[diskids[i]].status)) {
				/* dead disk and no reconstruction in
				 * progress: the stripe is simply failed. */
				if (raidPtr->status != rf_rs_reconstructing)
					return (1);
				RF_ASSERT(raidPtr->reconControl->fcol == diskids[i]);
				layoutPtr->map->MapSector(raidPtr,
				    sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit,
				    &tcol, &diskOffset, 0);
				RF_ASSERT(tcol == diskids[i]);
				/* if the affected reconstruction unit has
				 * not been rebuilt yet, treat as failed;
				 * otherwise the large write can proceed
				 * with the access redirected. */
				if (!rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, diskOffset))
					return (1);
				asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
				return (0);
			}
			stripeOffset++;
		}
	}
	return (0);
}
669 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD >0) |
670 | /* |
671 | return the number of failed data units in the stripe. |
672 | */ |
673 | |
674 | int |
675 | rf_NumFailedDataUnitsInStripe(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap) |
676 | { |
677 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
678 | RF_RowCol_t tcol, i; |
679 | RF_SectorNum_t diskOffset; |
680 | RF_RaidAddr_t sosAddr; |
681 | int numFailures; |
682 | |
683 | /* quick out in the fault-free case. */ |
684 | rf_lock_mutex2(raidPtr->mutex); |
685 | numFailures = raidPtr->numFailures; |
686 | rf_unlock_mutex2(raidPtr->mutex); |
687 | if (numFailures == 0) |
688 | return (0); |
689 | numFailures = 0; |
690 | |
691 | sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, |
692 | asmap->raidAddress); |
693 | for (i = 0; i < layoutPtr->numDataCol; i++) { |
694 | (layoutPtr->map->MapSector) (raidPtr, sosAddr + i * layoutPtr->sectorsPerStripeUnit, |
695 | &tcol, &diskOffset, 0); |
696 | if (RF_DEAD_DISK(raidPtr->Disks[tcol].status)) |
697 | numFailures++; |
698 | } |
699 | |
700 | return numFailures; |
701 | } |
702 | #endif |
703 | |
704 | /**************************************************************************** |
705 | * |
706 | * debug routines |
707 | * |
708 | ***************************************************************************/ |
709 | #if RF_DEBUG_MAP |
void
rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h)
{
	/* convenience wrapper: print the map without buffer pointers */
	rf_PrintFullAccessStripeMap(asm_h, 0);
}
715 | #endif |
716 | |
717 | /* prbuf - flag to print buffer pointers */ |
718 | void |
719 | rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h, int prbuf) |
720 | { |
721 | int i; |
722 | RF_AccessStripeMap_t *asmap = asm_h->stripeMap; |
723 | RF_PhysDiskAddr_t *p; |
724 | printf("%d stripes total\n" , (int) asm_h->numStripes); |
725 | for (; asmap; asmap = asmap->next) { |
726 | /* printf("Num failures: %d\n",asmap->numDataFailed); */ |
727 | /* printf("Num sectors: |
728 | * %d\n",(int)asmap->totalSectorsAccessed); */ |
729 | printf("Stripe %d (%d sectors), failures: %d data, %d parity: " , |
730 | (int) asmap->stripeID, |
731 | (int) asmap->totalSectorsAccessed, |
732 | (int) asmap->numDataFailed, |
733 | (int) asmap->numParityFailed); |
734 | if (asmap->parityInfo) { |
735 | printf("Parity [c%d s%d-%d" , asmap->parityInfo->col, |
736 | (int) asmap->parityInfo->startSector, |
737 | (int) (asmap->parityInfo->startSector + |
738 | asmap->parityInfo->numSector - 1)); |
739 | if (prbuf) |
740 | printf(" b0x%lx" , (unsigned long) asmap->parityInfo->bufPtr); |
741 | if (asmap->parityInfo->next) { |
742 | printf(", c%d s%d-%d" , asmap->parityInfo->next->col, |
743 | (int) asmap->parityInfo->next->startSector, |
744 | (int) (asmap->parityInfo->next->startSector + |
745 | asmap->parityInfo->next->numSector - 1)); |
746 | if (prbuf) |
747 | printf(" b0x%lx" , (unsigned long) asmap->parityInfo->next->bufPtr); |
748 | RF_ASSERT(asmap->parityInfo->next->next == NULL); |
749 | } |
750 | printf("]\n\t" ); |
751 | } |
752 | for (i = 0, p = asmap->physInfo; p; p = p->next, i++) { |
753 | printf("SU c%d s%d-%d " , p->col, (int) p->startSector, |
754 | (int) (p->startSector + p->numSector - 1)); |
755 | if (prbuf) |
756 | printf("b0x%lx " , (unsigned long) p->bufPtr); |
757 | if (i && !(i & 1)) |
758 | printf("\n\t" ); |
759 | } |
760 | printf("\n" ); |
761 | p = asm_h->stripeMap->failedPDAs[0]; |
762 | if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 1) |
763 | printf("[multiple failures]\n" ); |
764 | else |
765 | if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 0) |
766 | printf("\t[Failed PDA: c%d s%d-%d]\n" , p->col, |
767 | (int) p->startSector, (int) (p->startSector + p->numSector - 1)); |
768 | } |
769 | } |
770 | |
#if RF_DEBUG_MAP
/*
 * Debug helper: dump the RAID addresses of stripe-unit boundaries
 * covering the access, plus the offset within the first stripe unit.
 *
 * The guard macro was RF_MAP_DEBUG, but the call site in rf_MapAccess()
 * and the sibling rf_PrintAccessStripeMap() are guarded by
 * RF_DEBUG_MAP; the mismatch made the call compile while this
 * definition was preprocessed away, causing a link failure whenever
 * RF_DEBUG_MAP was enabled alone.
 */
void
rf_PrintRaidAddressInfo(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
			RF_SectorCount_t numBlocks)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RaidAddr_t ra, sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);

	printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t");
	for (ra = sosAddr; ra <= raidAddr + numBlocks; ra += layoutPtr->sectorsPerStripeUnit) {
		printf("%d (0x%x), ", (int) ra, (int) ra);
	}
	printf("\n");
	printf("Offset into stripe unit: %d (0x%x)\n",
	    (int) (raidAddr % layoutPtr->sectorsPerStripeUnit),
	    (int) (raidAddr % layoutPtr->sectorsPerStripeUnit));
}
#endif
789 | /* given a parity descriptor and the starting address within a stripe, |
790 | * range restrict the parity descriptor to touch only the correct |
791 | * stuff. */ |
void
rf_ASMParityAdjust(RF_PhysDiskAddr_t *toAdjust,
		   RF_StripeNum_t startAddrWithinStripe,
		   RF_SectorNum_t endAddress,
		   RF_RaidLayout_t *layoutPtr,
		   RF_AccessStripeMap_t *asm_p)
{
	RF_PhysDiskAddr_t *new_pda;

	/* when we're accessing only a portion of one stripe unit, we
	 * want the parity descriptor to identify only the chunk of
	 * parity associated with the data. When the access spans
	 * exactly one stripe unit boundary and is less than a stripe
	 * unit in size, it uses two disjoint regions of the parity
	 * unit. When an access spans more than one stripe unit
	 * boundary, it uses all of the parity unit.
	 *
	 * To better handle the case where stripe units are small, we
	 * may eventually want to change the 2nd case so that if the
	 * SU size is below some threshold, we just read/write the
	 * whole thing instead of breaking it up into two accesses. */
	if (asm_p->numStripeUnitsAccessed == 1) {
		/* single SU: shrink the parity descriptor to cover
		 * exactly the sectors the data access covers */
		int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
		toAdjust->startSector += x;
		toAdjust->raidAddress += x;
		toAdjust->numSector = asm_p->physInfo->numSector;
		RF_ASSERT(toAdjust->numSector != 0);
	} else
		if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit) {
			/* two SUs, but less than one SU of data total:
			 * the access touches two disjoint regions of the
			 * parity unit, so split the descriptor in two */
			int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);

			/* create a second pda and copy the parity map info
			 * into it */
			RF_ASSERT(toAdjust->next == NULL);
			/* the following will get freed in rf_FreeAccessStripeMap() via
			   rf_FreePDAList() */
			new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
			*new_pda = *toAdjust;	/* structure assignment */
			new_pda->next = NULL;

			/* adjust the start sector & number of blocks for the
			 * first parity pda: from the access start to the end
			 * of its stripe unit */
			toAdjust->startSector += x;
			toAdjust->raidAddress += x;
			toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe;
			RF_ASSERT(toAdjust->numSector != 0);

			/* adjust the second pda: from its stripe unit start
			 * to the end of the access */
			new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress);
			/* new_pda->raidAddress =
			 * rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
			 * toAdjust->raidAddress); */
			RF_ASSERT(new_pda->numSector != 0);
		}
	/* otherwise (3+ SUs, or 2 SUs covering >= a full SU of data)
	 * the whole parity unit is used and no adjustment is needed */
}
847 | |
848 | /* Check if a disk has been spared or failed. If spared, redirect the |
849 | * I/O. If it has been failed, record it in the asm pointer. Fifth |
850 | * arg is whether data or parity. */ |
void
rf_ASMCheckStatus(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda_p,
		  RF_AccessStripeMap_t *asm_p, RF_RaidDisk_t *disks,
		  int parity)
{
	RF_DiskStatus_t dstatus;
	RF_RowCol_t fcol;

	dstatus = disks[pda_p->col].status;

	if (dstatus == rf_ds_spared) {
		/* if the disk has been spared, redirect access to the spare */
		fcol = pda_p->col;
		pda_p->col = disks[fcol].spareCol;
	} else
		if (dstatus == rf_ds_dist_spared) {
			/* ditto if disk has been spared to dist spare space */
#if RF_DEBUG_MAP
			RF_RowCol_t oc = pda_p->col;
			RF_SectorNum_t oo = pda_p->startSector;
#endif
			/* re-run the layout mapping with RF_REMAP so the
			 * PDA points at the distributed spare location */
			if (pda_p->type == RF_PDA_TYPE_DATA)
				raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->col, &pda_p->startSector, RF_REMAP);
			else
				raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->col, &pda_p->startSector, RF_REMAP);

#if RF_DEBUG_MAP
			if (rf_mapDebug) {
				printf("Redirected c %d o %d -> c %d o %d\n" , oc, (int) oo,
				       pda_p->col, (int) pda_p->startSector);
			}
#endif
		} else
			if (RF_DEAD_DISK(dstatus)) {
				/* if the disk is inaccessible, mark the
				 * failure */
				if (parity)
					asm_p->numParityFailed++;
				else {
					asm_p->numDataFailed++;
				}
				asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
				asm_p->numFailedPDAs++;
#if 0
				switch (asm_p->numParityFailed + asm_p->numDataFailed) {
				case 1:
					asm_p->failedPDAs[0] = pda_p;
					break;
				case 2:
					asm_p->failedPDAs[1] = pda_p;
				default:
					break;
				}
#endif
			}
	/* the redirected access should never span a stripe unit boundary */
	RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress) ==
		  rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress + pda_p->numSector - 1));
	RF_ASSERT(pda_p->col != -1);
}
911 | |