1 | /* $NetBSD: rf_raid5.c,v 1.19 2006/11/16 01:33:23 christos Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /****************************************************************************** |
30 | * |
31 | * rf_raid5.c -- implements RAID Level 5 |
32 | * |
33 | *****************************************************************************/ |
34 | |
35 | #include <sys/cdefs.h> |
36 | __KERNEL_RCSID(0, "$NetBSD: rf_raid5.c,v 1.19 2006/11/16 01:33:23 christos Exp $" ); |
37 | |
38 | #include <dev/raidframe/raidframevar.h> |
39 | |
40 | #include "rf_raid.h" |
41 | #include "rf_raid5.h" |
42 | #include "rf_dag.h" |
43 | #include "rf_dagffrd.h" |
44 | #include "rf_dagffwr.h" |
45 | #include "rf_dagdegrd.h" |
46 | #include "rf_dagdegwr.h" |
47 | #include "rf_dagutils.h" |
48 | #include "rf_general.h" |
49 | #include "rf_map.h" |
50 | #include "rf_utils.h" |
51 | |
/*
 * Layout-private state for a RAID level 5 array.  rf_ConfigureRAID5()
 * allocates one of these and stores it in Layout.layoutSpecificInfo.
 */
typedef struct RF_Raid5ConfigInfo_s {
	RF_RowCol_t **stripeIdentifier;	/* filled in at config time and used
					 * by IdentifyStripe: a numCol x
					 * numCol table whose row
					 * (stripeID % numCol) lists the
					 * columns of that stripe in the
					 * order they appear in the stripe */
}       RF_Raid5ConfigInfo_t;
56 | |
57 | int |
58 | rf_ConfigureRAID5(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, |
59 | RF_Config_t *cfgPtr) |
60 | { |
61 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
62 | RF_Raid5ConfigInfo_t *info; |
63 | RF_RowCol_t i, j, startdisk; |
64 | |
65 | /* create a RAID level 5 configuration structure */ |
66 | RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t), (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList); |
67 | if (info == NULL) |
68 | return (ENOMEM); |
69 | layoutPtr->layoutSpecificInfo = (void *) info; |
70 | |
71 | /* the stripe identifier must identify the disks in each stripe, IN |
72 | * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ |
73 | info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList); |
74 | if (info->stripeIdentifier == NULL) |
75 | return (ENOMEM); |
76 | startdisk = 0; |
77 | for (i = 0; i < raidPtr->numCol; i++) { |
78 | for (j = 0; j < raidPtr->numCol; j++) { |
79 | info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol; |
80 | } |
81 | if ((--startdisk) < 0) |
82 | startdisk = raidPtr->numCol - 1; |
83 | } |
84 | |
85 | /* fill in the remaining layout parameters */ |
86 | layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; |
87 | layoutPtr->numDataCol = raidPtr->numCol - 1; |
88 | layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; |
89 | layoutPtr->numParityCol = 1; |
90 | layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; |
91 | |
92 | raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; |
93 | |
94 | return (0); |
95 | } |
96 | |
97 | int |
98 | rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr) |
99 | { |
100 | return (20); |
101 | } |
102 | |
103 | RF_HeadSepLimit_t |
104 | rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr) |
105 | { |
106 | return (10); |
107 | } |
108 | #if !defined(__NetBSD__) && !defined(_KERNEL) |
109 | /* not currently used */ |
110 | int |
111 | rf_ShutdownRAID5(RF_Raid_t *raidPtr) |
112 | { |
113 | return (0); |
114 | } |
115 | #endif |
116 | |
117 | void |
118 | rf_MapSectorRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, |
119 | RF_RowCol_t *col, RF_SectorNum_t *diskSector, |
120 | int remap) |
121 | { |
122 | RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; |
123 | *col = (SUID % raidPtr->numCol); |
124 | *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + |
125 | (raidSector % raidPtr->Layout.sectorsPerStripeUnit); |
126 | } |
127 | |
128 | void |
129 | rf_MapParityRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, |
130 | RF_RowCol_t *col, RF_SectorNum_t *diskSector, |
131 | int remap) |
132 | { |
133 | RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; |
134 | |
135 | *col = raidPtr->Layout.numDataCol - (SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol; |
136 | *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + |
137 | (raidSector % raidPtr->Layout.sectorsPerStripeUnit); |
138 | } |
139 | |
140 | void |
141 | rf_IdentifyStripeRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, |
142 | RF_RowCol_t **diskids) |
143 | { |
144 | RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); |
145 | RF_Raid5ConfigInfo_t *info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
146 | |
147 | *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; |
148 | } |
149 | |
150 | void |
151 | rf_MapSIDToPSIDRAID5(RF_RaidLayout_t *layoutPtr, |
152 | RF_StripeNum_t stripeID, |
153 | RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru) |
154 | { |
155 | *which_ru = 0; |
156 | *psID = stripeID; |
157 | } |
/*
 * Select an algorithm for performing an access.
 *
 * The result is returned through *createFunc: a pointer to the function
 * that will create the DAG for this access, or NULL when the access cannot
 * be serviced (more than one failure, or a degraded write shape that is
 * not handled).  Only *createFunc is written; no separate "DAG info"
 * function pointer is returned by this routine.
 *
 *   raidPtr    - the array being accessed
 *   type       - read or write (asserted via RF_IO_IS_R_OR_W)
 *   asmap      - access stripe map for this access; when the failed unit
 *                has already been reconstructed, its failed PDA is
 *                redirected in place and its failure counts are cleared
 *   createFunc - out: DAG creation function, or NULL
 */
void
rf_RaidFiveDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
		     RF_AccessStripeMap_t *asmap,
		     RF_VoidFuncPtr *createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = NULL;	/* stays NULL unless exactly
						 * one unit has failed */
	RF_RowCol_t fcol;	/* column of the failed unit before any
				 * redirection */
	RF_RowStatus_t rstat;
	int     prior_recon;	/* nonzero if the failed unit is treated as
				 * already reconstructed */

	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	/* more than one failure, either within this access or array-wide,
	 * cannot be handled: report "no DAG" to the caller */
	if ((asmap->numDataFailed + asmap->numParityFailed > 1) ||
	    (raidPtr->numFailures > 1)){
#if RF_DEBUG_DAG
		if (rf_dagDebug)
			RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
#endif
		*createFunc = NULL;
		return;
	}

	if (asmap->numDataFailed + asmap->numParityFailed == 1) {

		/* if under recon & already reconstructed, redirect
		 * the access to the spare drive and eliminate the
		 * failure indication */
		failedPDA = asmap->failedPDAs[0];
		fcol = failedPDA->col;
		rstat = raidPtr->status;
		/* true when the array status is rf_rs_reconfigured, or when
		 * it is rf_rs_reconstructing and rf_CheckRUReconstructed()
		 * returns nonzero for the failed unit's start sector */
		prior_recon = (rstat == rf_rs_reconfigured) || (
		    (rstat == rf_rs_reconstructing) ?
		    rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0
		    );
		if (prior_recon) {
#if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0
			/* remember the original location for the debug
			 * message printed below */
			RF_RowCol_t oc = failedPDA->col;
			RF_SectorNum_t oo = failedPDA->startSector;
#endif
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
			if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {	/* redirect to
										 * distributed spare
										 * space */

				if (failedPDA == asmap->parityInfo) {

					/* parity has failed: remap through
					 * the layout's MapParity routine */
					(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress,
					    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

					if (asmap->parityInfo->next) {	/* redirect the 2nd
									 * parity component,
									 * if any */
						RF_PhysDiskAddr_t *p = asmap->parityInfo->next;
						RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
						p->col = failedPDA->col;
						p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
						    SUoffs;	/* cheating:
								 * startSector is not
								 * really a RAID address */
					}
				} else
					if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) {
						RF_ASSERT(0);	/* should not ever
								 * happen */
					} else {

						/* data has failed: remap
						 * through the layout's
						 * MapSector routine */
						(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress,
						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

					}

			} else {
#endif
				/* redirect to dedicated spare space: only
				 * the column changes, the sector offset is
				 * left as it was */

				failedPDA->col = raidPtr->Disks[fcol].spareCol;

				/* the parity may have two distinct
				 * components, both of which may need
				 * to be redirected */
				if (asmap->parityInfo->next) {
					if (failedPDA == asmap->parityInfo) {
						failedPDA->next->col = failedPDA->col;
					} else
						if (failedPDA == asmap->parityInfo->next) {	/* paranoid: should
												 * never occur */
							asmap->parityInfo->col = failedPDA->col;
						}
				}
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
			}
#endif
			RF_ASSERT(failedPDA->col != -1);

#if RF_DEBUG_DAG > 0 || RF_DEBUG_MAP > 0
			if (rf_dagDebug || rf_mapDebug) {
				printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n",
				    raidPtr->raidid, type, oc,
				    (long) oo, failedPDA->col,
				    (long) failedPDA->startSector);
			}
#endif
			/* the access now targets the spare, so from here on
			 * it is treated as fault-free */
			asmap->numDataFailed = asmap->numParityFailed = 0;
		}
	}
	/* all dags begin/end with block/unblock node therefore, hdrSucc &
	 * termAnt counts should always be 1 also, these counts should not be
	 * visible outside dag creation routines - manipulating the counts
	 * here should be removed.  NOTE(review): no such count manipulation
	 * is visible in this function any more; this remark looks
	 * historical. */
	if (type == RF_IO_TYPE_READ) {
		/* only a failed data unit forces a degraded read; the
		 * parity failure count is not consulted for reads */
		if (asmap->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
	} else {


		/* if mirroring, always use large writes.  If the access
		 * requires two distinct parity updates, always do a small
		 * write.  If the stripe contains a failure but the access
		 * does not, do a small write. The first conditional
		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
		 * less-than-or-equal rather than just a less-than because
		 * when G is 3 or 4, numDataCol/2 is 1, and I want
		 * single-stripe-unit updates to use just one disk. */
		if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
			if (rf_suppressLocksAndLargeWrites ||
			    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
				(asmap->parityInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
				*createFunc = (RF_VoidFuncPtr) rf_CreateSmallWriteDAG;
			} else
				*createFunc = (RF_VoidFuncPtr) rf_CreateLargeWriteDAG;
		} else {
			/* the access itself touches the failed unit */
			if (asmap->numParityFailed == 1)
				/* parity is the failed unit */
				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
			else
				/* a data unit has failed: a write touching
				 * more than one stripe unit is rejected
				 * (NULL) unless the failed unit is written
				 * in full (numSector equals
				 * sectorsPerStripeUnit) */
				if (asmap->numStripeUnitsAccessed != 1 && (failedPDA == NULL || failedPDA->numSector != layoutPtr->sectorsPerStripeUnit))
					*createFunc = NULL;
				else
					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
		}
	}
}
306 | |