1 | /* $NetBSD: rf_pqdegdags.c,v 1.13 2011/08/01 12:28:53 mbalmer Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Daniel Stodolsky |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /* |
30 | * rf_pqdegdags.c |
31 | * Degraded mode dags for double fault cases. |
32 | */ |
33 | |
34 | |
35 | #include <sys/cdefs.h> |
36 | __KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.13 2011/08/01 12:28:53 mbalmer Exp $" ); |
37 | |
38 | #include "rf_archs.h" |
39 | |
40 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
41 | |
42 | #include <dev/raidframe/raidframevar.h> |
43 | |
44 | #include "rf_raid.h" |
45 | #include "rf_dag.h" |
46 | #include "rf_dagdegrd.h" |
47 | #include "rf_dagdegwr.h" |
48 | #include "rf_dagfuncs.h" |
49 | #include "rf_dagutils.h" |
50 | #include "rf_etimer.h" |
51 | #include "rf_acctrace.h" |
52 | #include "rf_general.h" |
53 | #include "rf_pqdegdags.h" |
54 | #include "rf_pq.h" |
55 | |
/*
 * Forward declaration; the definition appears later in this file.
 * applyPDA() folds the part of one data pda that overlaps the region
 * described by ppda into the P buffer (via rf_bxor) and into the Q buffer
 * (via rf_IncQ).
 */
static void
applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
    RF_PhysDiskAddr_t * qpda, void *bp);
59 | |
60 | /* |
61 | Two data drives have failed, and we are doing a read that covers one of them. |
62 | We may also be reading some of the surviving drives. |
63 | |
64 | |
65 | ***************************************************************************************** |
66 | * |
67 | * creates a DAG to perform a degraded-mode read of data within one stripe. |
68 | * This DAG is as follows: |
69 | * |
70 | * Hdr |
71 | * | |
72 | * Block |
73 | * / / \ \ \ \ |
74 | * Rud ... Rud Rrd ... Rrd Rp Rq |
75 | * | \ | \ | \ | \ | \ | \ |
76 | * |
77 | * | | |
78 | * Unblock X |
79 | * \ / |
80 | * ------ T ------ |
81 | * |
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to Unblock, and the other to X.
84 | * There is one Rud for each chunk of surviving user data requested by the user, |
85 | * and one Rrd for each chunk of surviving user data _not_ being read by the user |
86 | * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata |
87 | * X = pq recovery node, T = terminate |
88 | * |
89 | * The block & unblock nodes are leftovers from a previous version. They |
90 | * do nothing, but I haven't deleted them because it would be a tremendous |
91 | * effort to put them back in. |
92 | * |
93 | * Note: The target buffer for the XOR node is set to the actual user buffer where the |
94 | * failed data is supposed to end up. This buffer is zero'd by the code here. Thus, |
95 | * if you create a degraded read dag, use it, and then re-use, you have to be sure to |
96 | * zero the target buffer prior to the re-use. |
97 | * |
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out
 * what's needed and what's not.
100 | ****************************************************************************************/ |
/*
 * init a disk node with 2 successors and one predecessor
 *
 * Expands to several statements that are NOT wrapped in do { } while (0),
 * so it must be used as a complete statement inside a braced block (never
 * as the unbraced body of an if/else/for).
 *
 * Besides its arguments, the expansion uses these names from the scope it
 * is expanded in: dag_h, allocList, unblockNode, recoveryNode, blockNode.
 * Successor 0 is wired to unblockNode, successor 1 to recoveryNode, and the
 * single antecedent to blockNode with an rf_control arc.
 *
 * NOTE(review): the "2,1,4,0" arguments are presumably the successor,
 * antecedent, parameter and result counts -- confirm against rf_InitNode().
 */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

/*
 * Fill the four parameter slots of a disk node:
 *   params[0].p = the pda, params[1].p = the pda's buffer,
 *   params[2].v = parityStripeID, params[3].v = priority / which_ru word.
 *
 * _node_ is accessed with '.', i.e. it is a node object (lvalue), not a
 * pointer as in INIT_DISK_NODE.  parityStripeID and which_ru must be in
 * scope at the expansion site.  Like INIT_DISK_NODE this is a bare
 * multi-statement expansion, and _p_ is not parenthesized in its first use,
 * so pass a simple expression.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
(_node_).params[0].p = _p_ ; \
(_node_).params[1].p = (_p_)->bufPtr; \
(_node_).params[2].v = parityStripeID; \
(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)

/* Yields params[0].p of a node pointer: the slot DISK_NODE_PARAMS stores the pda in. */
#define DISK_NODE_PDA(node) ((node)->params[0].p)
116 | |
/*
 * DAG-creation entry point for a P+Q double-degraded read.
 *
 * All of the work is delegated to rf_DoubleDegRead(); this wrapper only
 * supplies the PQ-specific pieces: the node names "Rq" and "PQ Recovery",
 * and rf_PQDoubleRecoveryFunc as the recovery function.
 * The arguments (raidPtr, asmap, dag_h, bp, flags, allocList) are
 * presumably the parameters declared by RF_CREATE_DAG_FUNC_DECL.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq" , "PQ Recovery" , rf_PQDoubleRecoveryFunc);
}
122 | |
123 | static void |
124 | applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp) |
125 | { |
126 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
127 | RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector); |
128 | RF_SectorCount_t s0len = ppda->numSector, len; |
129 | RF_SectorNum_t suoffset; |
130 | unsigned coeff; |
131 | char *pbuf = ppda->bufPtr; |
132 | char *qbuf = qpda->bufPtr; |
133 | char *buf; |
134 | int delta; |
135 | |
136 | suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); |
137 | len = pda->numSector; |
138 | /* see if pda intersects a recovery pda */ |
139 | if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) { |
140 | buf = pda->bufPtr; |
141 | coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress); |
142 | coeff = (coeff % raidPtr->Layout.numDataCol); |
143 | |
144 | if (suoffset < s0off) { |
145 | delta = s0off - suoffset; |
146 | buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); |
147 | suoffset = s0off; |
148 | len -= delta; |
149 | } |
150 | if (suoffset > s0off) { |
151 | delta = suoffset - s0off; |
152 | pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); |
153 | qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); |
154 | } |
155 | if ((suoffset + len) > (s0len + s0off)) |
156 | len = s0len + s0off - suoffset; |
157 | |
158 | /* src, dest, len */ |
159 | rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp); |
160 | |
161 | /* dest, src, len, coeff */ |
162 | rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff); |
163 | } |
164 | } |
165 | /* |
166 | Recover data in the case of a double failure. There can be two |
167 | result buffers, one for each chunk of data trying to be recovered. |
168 | The params are pda's that have not been range restricted or otherwise |
169 | politely massaged - this should be done here. The last params are the |
170 | pdas of P and Q, followed by the raidPtr. The list can look like |
171 | |
172 | pda, pda, ... , p pda, q pda, raidptr, asm |
173 | |
174 | or |
175 | |
176 | pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm |
177 | |
178 | depending on whether two chunks of recovery data were required. |
179 | |
   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
182 | */ |
183 | |
184 | |
185 | int |
186 | rf_PQDoubleRecoveryFunc(RF_DagNode_t *node) |
187 | { |
188 | int np = node->numParams; |
189 | RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; |
190 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; |
191 | RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); |
192 | int d, i; |
193 | unsigned coeff; |
194 | RF_RaidAddr_t sosAddr, suoffset; |
195 | RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit; |
196 | int two = 0; |
197 | RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda; |
198 | char *buf; |
199 | int numDataCol = layoutPtr->numDataCol; |
200 | RF_Etimer_t timer; |
201 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
202 | |
203 | RF_ETIMER_START(timer); |
204 | |
205 | if (asmap->failedPDAs[1] && |
206 | (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) { |
207 | RF_ASSERT(0); |
208 | ppda = node->params[np - 6].p; |
209 | ppda2 = node->params[np - 5].p; |
210 | qpda = node->params[np - 4].p; |
211 | qpda2 = node->params[np - 3].p; |
212 | d = (np - 6); |
213 | two = 1; |
214 | } else { |
215 | ppda = node->params[np - 4].p; |
216 | qpda = node->params[np - 3].p; |
217 | d = (np - 4); |
218 | } |
219 | |
220 | for (i = 0; i < d; i++) { |
221 | pda = node->params[i].p; |
222 | buf = pda->bufPtr; |
223 | suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); |
224 | len = pda->numSector; |
225 | coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); |
226 | /* compute the data unit offset within the column */ |
227 | coeff = (coeff % raidPtr->Layout.numDataCol); |
228 | /* see if pda intersects a recovery pda */ |
229 | applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp); |
230 | if (two) |
231 | applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp); |
232 | } |
233 | |
234 | /* ok, we got the parity back to the point where we can recover. We |
235 | * now need to determine the coeff of the columns that need to be |
236 | * recovered. We can also only need to recover a single stripe unit. */ |
237 | |
238 | if (asmap->failedPDAs[1] == NULL) { /* only a single stripe unit |
239 | * to recover. */ |
240 | pda = asmap->failedPDAs[0]; |
241 | sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); |
242 | /* need to determine the column of the other failed disk */ |
243 | coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); |
244 | /* compute the data unit offset within the column */ |
245 | coeff = (coeff % raidPtr->Layout.numDataCol); |
246 | for (i = 0; i < numDataCol; i++) { |
247 | npda.raidAddress = sosAddr + (i * secPerSU); |
248 | (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); |
249 | /* skip over dead disks */ |
250 | if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) |
251 | if (i != coeff) |
252 | break; |
253 | } |
254 | RF_ASSERT(i < numDataCol); |
255 | RF_ASSERT(two == 0); |
256 | /* recover the data. Since we need only want to recover one |
257 | * column, we overwrite the parity with the other one. */ |
258 | if (coeff < i) /* recovering 'a' */ |
259 | rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i); |
260 | else /* recovering 'b' */ |
261 | rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff); |
262 | } else |
263 | RF_PANIC(); |
264 | |
265 | RF_ETIMER_STOP(timer); |
266 | RF_ETIMER_EVAL(timer); |
267 | if (tracerec) |
268 | tracerec->q_us += RF_ETIMER_VAL_US(timer); |
269 | rf_GenericWakeupFunc(node, 0); |
270 | return (0); |
271 | } |
272 | |
/*
 * rf_PQWriteDoubleRecoveryFunc --
 *	DAG node function used on a double-degraded small write: rebuilds the
 *	P and Q buffers (node->results[0] and node->results[1]) so that they
 *	reflect the data being written to the one failed unit that the write
 *	touches.
 *
 *	Asserts that the node has exactly two results, that only one failed
 *	pda is involved, and that exactly one stripe unit is accessed.
 *	Adds the elapsed time to tracerec->q_us when a trace record is
 *	present, completes the node with rf_GenericWakeupFunc(node, 0) and
 *	returns 0.
 */
int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/* The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and q. Use P
	 * & Q to recover the second failed data unit in P. Zero fill Q, then
	 * apply the recovered data to p. Then apply the data being written to
	 * the failed drive. Then walk through the surviving drives, applying
	 * new data when it exists, otherwise the recovery data. Quite a mess.
	 *
	 *
	 * The params
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
	 * raidPtr, asmap */

	int np = node->numParams;
	/* the last two params are always asmap and raidPtr */
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	/* P and Q pdas are handed in as the node's results, not as params */
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	/* the first numDataCol - 2 params are the read (surviving data) pdas */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	/* walk the data columns of this stripe; stop at the first dead disk
	 * whose column is not the one being written (coeff) */
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	/* i is now the column of the other failed data unit */
	RF_ASSERT(i < numDataCol);
	/* recover the data. The column we want to recover we write over the
	 * parity. The column we don't care about we dump in q. */
	/* NOTE(review): assuming rf_PQ_recover() takes (p, q, a_out, b_out,
	 * len, coeff_a, coeff_b), both branches below pair ppda->bufPtr with
	 * column `coeff` and qpda->bufPtr with column `i`, while the comments
	 * here and the rf_IncQ(..., i) below treat P as holding column `i`.
	 * Confirm against rf_PQ_recover() which column ends up in P. */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is only thing we are
	 * writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	/* src, dest, len: fold the new data for the failed unit into P */
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* now apply all the recovery data */
	/* same read pdas as the first pass, folded into the rebuilt P and Q */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
/*
 * DAG-creation entry point for the double-degraded large write case.
 * Not implemented: the body panics unconditionally.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
372 | /* |
373 | Two lost data unit write case. |
374 | |
375 | There are really two cases here: |
376 | |
377 | (1) The write completely covers the two lost data units. |
378 | In that case, a reconstruct write that doesn't write the |
379 | failed data units will do the correct thing. So in this case, |
380 | the dag looks like |
381 | |
	full stripe read of surviving data units (not being overwritten)
383 | write new data (ignoring failed units) compute P&Q |
384 | write P&Q |
385 | |
386 | |
387 | (2) The write does not completely cover both failed data units |
388 | (but touches at least one of them). Then we need to do the |
389 | equivalent of a reconstruct read to recover the missing data |
390 | unit from the other stripe. |
391 | |
392 | For any data we are writing that is not in the "shadow" |
393 | of the failed units, we need to do a four cycle update. |
	PANIC on this case for now.
395 | |
396 | */ |
397 | |
398 | RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG) |
399 | { |
400 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
401 | RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit; |
402 | int sum; |
403 | int nf = asmap->numDataFailed; |
404 | |
405 | sum = asmap->failedPDAs[0]->numSector; |
406 | if (nf == 2) |
407 | sum += asmap->failedPDAs[1]->numSector; |
408 | |
409 | if ((nf == 2) && (sum == (2 * sectorsPerSU))) { |
410 | /* large write case */ |
411 | rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList); |
412 | return; |
413 | } |
414 | if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) { |
415 | /* small write case, no user data not in shadow */ |
416 | rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList); |
417 | return; |
418 | } |
419 | RF_PANIC(); |
420 | } |
/*
 * DAG-creation entry point for the double-degraded simple small write.
 *
 * Delegates to rf_DoubleDegSmallWrite(), supplying the PQ-specific node
 * names ("Rq", "Wq", "PQ Recovery") and rf_PQWriteDoubleRecoveryFunc as the
 * recovery function.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq" , "Wq" , "PQ Recovery" , rf_PQWriteDoubleRecoveryFunc);
}
425 | #endif /* (RF_INCLUDE_DECL_PQ > 0) || |
426 | * (RF_INCLUDE_RAID6 > 0) */ |
427 | |