1 | /* $NetBSD: rf_dagdegwr.c,v 1.33 2014/03/23 03:42:39 christos Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /* |
30 | * rf_dagdegwr.c |
31 | * |
32 | * code for creating degraded write DAGs |
33 | * |
34 | */ |
35 | |
36 | #include <sys/cdefs.h> |
37 | __KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.33 2014/03/23 03:42:39 christos Exp $" ); |
38 | |
39 | #include <dev/raidframe/raidframevar.h> |
40 | |
41 | #include "rf_raid.h" |
42 | #include "rf_dag.h" |
43 | #include "rf_dagutils.h" |
44 | #include "rf_dagfuncs.h" |
45 | #include "rf_debugMem.h" |
46 | #include "rf_general.h" |
47 | #include "rf_dagdegwr.h" |
48 | #include "rf_map.h" |
49 | |
50 | |
51 | /****************************************************************************** |
52 | * |
53 | * General comments on DAG creation: |
54 | * |
55 | * All DAGs in this file use roll-away error recovery. Each DAG has a single |
56 | * commit node, usually called "Cmt." If an error occurs before the Cmt node |
57 | * is reached, the execution engine will halt forward execution and work |
 * backward through the graph, executing the undo functions.  Provided that
 * every node in the graph prior to the Cmt node is either undoable and
 * atomic, or makes no changes to permanent state, the graph will fail
 * atomically.
61 | * If an error occurs after the Cmt node executes, the engine will roll-forward |
62 | * through the graph, blindly executing nodes until it reaches the end. |
63 | * If a graph reaches the end, it is assumed to have completed successfully. |
64 | * |
 * A graph has exactly one Cmt node.
66 | * |
67 | */ |
68 | |
69 | |
70 | /****************************************************************************** |
71 | * |
72 | * The following wrappers map the standard DAG creation interface to the |
73 | * DAG creation routines. Additionally, these wrappers enable experimentation |
74 | * with new DAG structures by providing an extra level of indirection, allowing |
75 | * the DAG creation routines to be replaced at this single point. |
76 | */ |
77 | |
78 | static |
79 | RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG) |
80 | { |
81 | rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, |
82 | flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE); |
83 | } |
84 | |
85 | void |
86 | rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, |
87 | RF_DagHeader_t *dag_h, void *bp, |
88 | RF_RaidAccessFlags_t flags, |
89 | RF_AllocListElem_t *allocList) |
90 | { |
91 | |
92 | RF_ASSERT(asmap->numDataFailed == 1); |
93 | dag_h->creator = "DegradedWriteDAG" ; |
94 | |
95 | /* |
96 | * if the access writes only a portion of the failed unit, and also |
97 | * writes some portion of at least one surviving unit, we create two |
98 | * DAGs, one for the failed component and one for the non-failed |
99 | * component, and do them sequentially. Note that the fact that we're |
100 | * accessing only a portion of the failed unit indicates that the |
101 | * access either starts or ends in the failed unit, and hence we need |
102 | * create only two dags. This is inefficient in that the same data or |
103 | * parity can get read and written twice using this structure. I need |
104 | * to fix this to do the access all at once. |
105 | */ |
106 | RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 && |
107 | asmap->failedPDAs[0]->numSector != |
108 | raidPtr->Layout.sectorsPerStripeUnit)); |
109 | rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, |
110 | allocList); |
111 | } |
112 | |
113 | |
114 | |
115 | /****************************************************************************** |
116 | * |
117 | * DAG creation code begins here |
118 | */ |
119 | |
120 | |
121 | |
122 | /****************************************************************************** |
123 | * |
124 | * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode |
125 | * write, which is as follows |
126 | * |
127 | * / {Wnq} --\ |
128 | * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term |
129 | * \ {Rod} / \ Wnd ---/ |
130 | * \ {Wnd} -/ |
131 | * |
 * commit node: Cmt
133 | * |
134 | * IMPORTANT: |
135 | * This DAG generator does not work for double-degraded archs since it does not |
136 | * generate Q |
137 | * |
138 | * This dag is essentially identical to the large-write dag, except that the |
139 | * write to the failed data unit is suppressed. |
140 | * |
141 | * IMPORTANT: this dag does not work in the case where the access writes only |
142 | * a portion of the failed unit, and also writes some portion of at least one |
 * surviving SU.  This case is handled in rf_CreateDegradedWriteDAG above.
144 | * |
145 | * The block & unblock nodes are leftovers from a previous version. They |
146 | * do nothing, but I haven't deleted them because it would be a tremendous |
147 | * effort to put them back in. |
148 | * |
 * This dag is used whenever one of the data units in a write has failed.
150 | * If it is the parity unit that failed, the nonredundant write dag (below) |
151 | * is used. |
152 | *****************************************************************************/ |
153 | |
154 | void |
155 | rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr, |
156 | RF_AccessStripeMap_t *asmap, |
157 | RF_DagHeader_t *dag_h, void *bp, |
158 | RF_RaidAccessFlags_t flags, |
159 | RF_AllocListElem_t *allocList, |
160 | int nfaults, |
161 | int (*redFunc) (RF_DagNode_t *), |
162 | int allowBufferRecycle) |
163 | { |
164 | int nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum, |
165 | rdnodesFaked; |
166 | RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *termNode; |
167 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
168 | RF_DagNode_t *wnqNode; |
169 | #endif |
170 | RF_DagNode_t *wndNodes, *rrdNodes, *xorNode, *commitNode; |
171 | RF_DagNode_t *tmpNode, *tmpwndNode, *tmprrdNode; |
172 | RF_SectorCount_t sectorsPerSU; |
173 | RF_ReconUnitNum_t which_ru; |
174 | char *xorTargetBuf = NULL; /* the target buffer for the XOR |
175 | * operation */ |
176 | char overlappingPDAs[RF_MAXCOL];/* a temporary array of flags */ |
177 | RF_AccessStripeMapHeader_t *new_asm_h[2]; |
178 | RF_PhysDiskAddr_t *pda, *parityPDA; |
179 | RF_StripeNum_t parityStripeID; |
180 | RF_PhysDiskAddr_t *failedPDA; |
181 | RF_RaidLayout_t *layoutPtr; |
182 | |
183 | layoutPtr = &(raidPtr->Layout); |
184 | parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, |
185 | &which_ru); |
186 | sectorsPerSU = layoutPtr->sectorsPerStripeUnit; |
187 | /* failedPDA points to the pda within the asm that targets the failed |
188 | * disk */ |
189 | failedPDA = asmap->failedPDAs[0]; |
190 | |
191 | #if RF_DEBUG_DAG |
192 | if (rf_dagDebug) |
193 | printf("[Creating degraded-write DAG]\n" ); |
194 | #endif |
195 | |
196 | RF_ASSERT(asmap->numDataFailed == 1); |
197 | dag_h->creator = "SimpleDegradedWriteDAG" ; |
198 | |
199 | /* |
200 | * Generate two ASMs identifying the surviving data |
201 | * we need in order to recover the lost data. |
202 | */ |
203 | /* overlappingPDAs array must be zero'd */ |
204 | memset(overlappingPDAs, 0, RF_MAXCOL); |
205 | rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, |
206 | &nXorBufs, NULL, overlappingPDAs, allocList); |
207 | |
208 | /* create all the nodes at once */ |
209 | nWndNodes = asmap->numStripeUnitsAccessed - 1; /* no access is |
210 | * generated for the |
211 | * failed pda */ |
212 | |
213 | nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) + |
214 | ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0); |
215 | /* |
216 | * XXX |
217 | * |
218 | * There's a bug with a complete stripe overwrite- that means 0 reads |
219 | * of old data, and the rest of the DAG generation code doesn't like |
220 | * that. A release is coming, and I don't wanna risk breaking a critical |
221 | * DAG generator, so here's what I'm gonna do- if there's no read nodes, |
222 | * I'm gonna fake there being a read node, and I'm gonna swap in a |
223 | * no-op node in its place (to make all the link-up code happy). |
224 | * This should be fixed at some point. --jimz |
225 | */ |
226 | if (nRrdNodes == 0) { |
227 | nRrdNodes = 1; |
228 | rdnodesFaked = 1; |
229 | } else { |
230 | rdnodesFaked = 0; |
231 | } |
232 | |
233 | blockNode = rf_AllocDAGNode(); |
234 | blockNode->list_next = dag_h->nodes; |
235 | dag_h->nodes = blockNode; |
236 | |
237 | commitNode = rf_AllocDAGNode(); |
238 | commitNode->list_next = dag_h->nodes; |
239 | dag_h->nodes = commitNode; |
240 | |
241 | unblockNode = rf_AllocDAGNode(); |
242 | unblockNode->list_next = dag_h->nodes; |
243 | dag_h->nodes = unblockNode; |
244 | |
245 | termNode = rf_AllocDAGNode(); |
246 | termNode->list_next = dag_h->nodes; |
247 | dag_h->nodes = termNode; |
248 | |
249 | xorNode = rf_AllocDAGNode(); |
250 | xorNode->list_next = dag_h->nodes; |
251 | dag_h->nodes = xorNode; |
252 | |
253 | wnpNode = rf_AllocDAGNode(); |
254 | wnpNode->list_next = dag_h->nodes; |
255 | dag_h->nodes = wnpNode; |
256 | |
257 | for (i = 0; i < nWndNodes; i++) { |
258 | tmpNode = rf_AllocDAGNode(); |
259 | tmpNode->list_next = dag_h->nodes; |
260 | dag_h->nodes = tmpNode; |
261 | } |
262 | wndNodes = dag_h->nodes; |
263 | |
264 | for (i = 0; i < nRrdNodes; i++) { |
265 | tmpNode = rf_AllocDAGNode(); |
266 | tmpNode->list_next = dag_h->nodes; |
267 | dag_h->nodes = tmpNode; |
268 | } |
269 | rrdNodes = dag_h->nodes; |
270 | |
271 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
272 | if (nfaults == 2) { |
273 | wnqNode = rf_AllocDAGNode(); |
274 | wnqNode->list_next = dag_h->nodes; |
275 | dag_h->nodes = wnqNode; |
276 | } else { |
277 | wnqNode = NULL; |
278 | } |
279 | #endif |
280 | |
	/* this dag cannot commit until all rrd and xor nodes have completed */
282 | dag_h->numCommitNodes = 1; |
283 | dag_h->numCommits = 0; |
284 | dag_h->numSuccedents = 1; |
285 | |
286 | RF_ASSERT(nRrdNodes > 0); |
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
	    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
	    nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
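
	/*
	 * Note on rf_InitNode(): the four integer counts that follow the
	 * wakeup function are, in order, the numbers of successors,
	 * antecedents, parameters, and results.  E.g. the Cmt node above
	 * fires nWndNodes + nfaults write successors (Wnd, Wnp, and Wnq when
	 * nfaults == 2) and waits on a single antecedent (the Xor node);
	 * the link-up asserts below double-check these counts.
	 */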
297 | |
298 | /* |
299 | * Fill in the Rrd nodes. If any of the rrd buffers are the same size as |
300 | * the failed buffer, save a pointer to it so we can use it as the target |
301 | * of the XOR. The pdas in the rrd nodes have been range-restricted, so if |
302 | * a buffer is the same size as the failed buffer, it must also be at the |
303 | * same alignment within the SU. |
304 | */ |
305 | i = 0; |
306 | tmprrdNode = rrdNodes; |
307 | if (new_asm_h[0]) { |
308 | for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo; |
309 | i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed; |
310 | i++, pda = pda->next) { |
			rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			tmprrdNode->params[0].p = pda;
			tmprrdNode->params[1].p = pda->bufPtr;
			tmprrdNode->params[2].v = parityStripeID;
			tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
				xorTargetBuf = pda->bufPtr;
			tmprrdNode = tmprrdNode->list_next;
319 | } |
320 | } |
	/* i now equals the number of stripe units accessed in new_asm_h[0] */
	/* Note that tmprrdNode simply continues from the loop above, so there
	 * is no need to reassign it. */
324 | if (new_asm_h[1]) { |
325 | for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo; |
326 | j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed; |
327 | j++, pda = pda->next) { |
			rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
330 | RF_ASSERT(pda); |
331 | tmprrdNode->params[0].p = pda; |
332 | tmprrdNode->params[1].p = pda->bufPtr; |
333 | tmprrdNode->params[2].v = parityStripeID; |
334 | tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
335 | if (allowBufferRecycle && (pda->numSector == failedPDA->numSector)) |
336 | xorTargetBuf = pda->bufPtr; |
337 | tmprrdNode = tmprrdNode->list_next; |
338 | } |
339 | } |
340 | if (rdnodesFaked) { |
341 | /* |
342 | * This is where we'll init that fake noop read node |
343 | * (XXX should the wakeup func be different?) |
344 | */ |
		/* Note that rrdNodes will just be a single node... */
		rf_InitNode(rrdNodes, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
348 | } |
349 | /* |
350 | * Make a PDA for the parity unit. The parity PDA should start at |
351 | * the same offset into the SU as the failed PDA. |
352 | */ |
353 | /* Danner comment: I don't think this copy is really necessary. We are |
354 | * in one of two cases here. (1) The entire failed unit is written. |
355 | * Then asmap->parityInfo will describe the entire parity. (2) We are |
356 | * only writing a subset of the failed unit and nothing else. Then the |
357 | * asmap->parityInfo describes the failed unit and the copy can also |
358 | * be avoided. */ |
359 | |
360 | parityPDA = rf_AllocPhysDiskAddr(); |
361 | parityPDA->next = dag_h->pda_cleanup_list; |
362 | dag_h->pda_cleanup_list = parityPDA; |
363 | parityPDA->col = asmap->parityInfo->col; |
364 | parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU) |
365 | * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU); |
366 | parityPDA->numSector = failedPDA->numSector; |
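	/*
	 * Worked example, with hypothetical numbers: if sectorsPerSU == 64,
	 * asmap->parityInfo->startSector == 130, and the failed PDA starts
	 * 10 sectors into its SU, then (130 / 64) * 64 == 128, and the
	 * parity PDA starts at sector 138 and covers failedPDA->numSector
	 * sectors.
	 */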
367 | |
368 | if (!xorTargetBuf) { |
369 | xorTargetBuf = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); |
370 | } |
371 | /* init the Wnp node */ |
	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
	    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
374 | wnpNode->params[0].p = parityPDA; |
375 | wnpNode->params[1].p = xorTargetBuf; |
376 | wnpNode->params[2].v = parityStripeID; |
377 | wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
378 | |
379 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
380 | /* fill in the Wnq Node */ |
381 | if (nfaults == 2) { |
382 | { |
383 | RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), |
384 | (RF_PhysDiskAddr_t *), allocList); |
385 | parityPDA->col = asmap->qInfo->col; |
386 | parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU) |
387 | * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU); |
388 | parityPDA->numSector = failedPDA->numSector; |
389 | |
			rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
392 | wnqNode->params[0].p = parityPDA; |
393 | RF_MallocAndAdd(xorNode->results[1], |
394 | rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList); |
395 | wnqNode->params[1].p = xorNode->results[1]; |
396 | wnqNode->params[2].v = parityStripeID; |
397 | wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
398 | } |
399 | } |
400 | #endif |
401 | /* fill in the Wnd nodes */ |
402 | tmpwndNode = wndNodes; |
403 | for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) { |
404 | if (pda == failedPDA) { |
405 | i--; |
406 | continue; |
407 | } |
		rf_InitNode(tmpwndNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
410 | RF_ASSERT(pda); |
411 | tmpwndNode->params[0].p = pda; |
412 | tmpwndNode->params[1].p = pda->bufPtr; |
413 | tmpwndNode->params[2].v = parityStripeID; |
414 | tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
415 | tmpwndNode = tmpwndNode->list_next; |
416 | } |
417 | |
418 | /* fill in the results of the xor node */ |
419 | xorNode->results[0] = xorTargetBuf; |
420 | |
421 | /* fill in the params of the xor node */ |
422 | |
423 | paramNum = 0; |
424 | if (rdnodesFaked == 0) { |
425 | tmprrdNode = rrdNodes; |
426 | for (i = 0; i < nRrdNodes; i++) { |
427 | /* all the Rrd nodes need to be xored together */ |
428 | xorNode->params[paramNum++] = tmprrdNode->params[0]; |
429 | xorNode->params[paramNum++] = tmprrdNode->params[1]; |
430 | tmprrdNode = tmprrdNode->list_next; |
431 | } |
432 | } |
433 | tmpwndNode = wndNodes; |
434 | for (i = 0; i < nWndNodes; i++) { |
435 | /* any Wnd nodes that overlap the failed access need to be |
436 | * xored in */ |
437 | if (overlappingPDAs[i]) { |
438 | pda = rf_AllocPhysDiskAddr(); |
439 | memcpy((char *) pda, (char *) tmpwndNode->params[0].p, sizeof(RF_PhysDiskAddr_t)); |
440 | /* add it into the pda_cleanup_list *after* the copy, TYVM */ |
441 | pda->next = dag_h->pda_cleanup_list; |
442 | dag_h->pda_cleanup_list = pda; |
443 | rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0); |
444 | xorNode->params[paramNum++].p = pda; |
445 | xorNode->params[paramNum++].p = pda->bufPtr; |
446 | } |
447 | tmpwndNode = tmpwndNode->list_next; |
448 | } |
449 | |
450 | /* |
451 | * Install the failed PDA into the xor param list so that the |
452 | * new data gets xor'd in. |
453 | */ |
454 | xorNode->params[paramNum++].p = failedPDA; |
455 | xorNode->params[paramNum++].p = failedPDA->bufPtr; |
456 | |
457 | /* |
458 | * The last 2 params to the recovery xor node are always the failed |
459 | * PDA and the raidPtr. install the failedPDA even though we have just |
460 | * done so above. This allows us to use the same XOR function for both |
461 | * degraded reads and degraded writes. |
462 | */ |
463 | xorNode->params[paramNum++].p = failedPDA; |
464 | xorNode->params[paramNum++].p = raidPtr; |
465 | RF_ASSERT(paramNum == 2 * nXorBufs + 2); |
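
	/*
	 * In the normal (non-faked-read) case the xor node's parameter list
	 * is laid out as:
	 *   params[0 .. 2*nXorBufs-1]   (pda, buffer) pairs for each Rrd
	 *                               read, each overlapping Wnd write,
	 *                               and the failed PDA's new data
	 *   params[2*nXorBufs]          the failed PDA (again)
	 *   params[2*nXorBufs + 1]      raidPtr
	 * which is exactly what the assertion above verifies.
	 */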
466 | |
467 | /* |
468 | * Code to link nodes begins here |
469 | */ |
470 | |
471 | /* link header to block node */ |
472 | RF_ASSERT(blockNode->numAntecedents == 0); |
473 | dag_h->succedents[0] = blockNode; |
474 | |
475 | /* link block node to rd nodes */ |
476 | RF_ASSERT(blockNode->numSuccedents == nRrdNodes); |
477 | tmprrdNode = rrdNodes; |
478 | for (i = 0; i < nRrdNodes; i++) { |
479 | RF_ASSERT(tmprrdNode->numAntecedents == 1); |
480 | blockNode->succedents[i] = tmprrdNode; |
481 | tmprrdNode->antecedents[0] = blockNode; |
482 | tmprrdNode->antType[0] = rf_control; |
483 | tmprrdNode = tmprrdNode->list_next; |
484 | } |
485 | |
486 | /* link read nodes to xor node */ |
487 | RF_ASSERT(xorNode->numAntecedents == nRrdNodes); |
488 | tmprrdNode = rrdNodes; |
489 | for (i = 0; i < nRrdNodes; i++) { |
490 | RF_ASSERT(tmprrdNode->numSuccedents == 1); |
491 | tmprrdNode->succedents[0] = xorNode; |
492 | xorNode->antecedents[i] = tmprrdNode; |
493 | xorNode->antType[i] = rf_trueData; |
494 | tmprrdNode = tmprrdNode->list_next; |
495 | } |
496 | |
497 | /* link xor node to commit node */ |
498 | RF_ASSERT(xorNode->numSuccedents == 1); |
499 | RF_ASSERT(commitNode->numAntecedents == 1); |
500 | xorNode->succedents[0] = commitNode; |
501 | commitNode->antecedents[0] = xorNode; |
502 | commitNode->antType[0] = rf_control; |
503 | |
504 | /* link commit node to wnd nodes */ |
505 | RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes); |
506 | tmpwndNode = wndNodes; |
507 | for (i = 0; i < nWndNodes; i++) { |
508 | RF_ASSERT(tmpwndNode->numAntecedents == 1); |
509 | commitNode->succedents[i] = tmpwndNode; |
510 | tmpwndNode->antecedents[0] = commitNode; |
511 | tmpwndNode->antType[0] = rf_control; |
512 | tmpwndNode = tmpwndNode->list_next; |
513 | } |
514 | |
515 | /* link the commit node to wnp, wnq nodes */ |
516 | RF_ASSERT(wnpNode->numAntecedents == 1); |
517 | commitNode->succedents[nWndNodes] = wnpNode; |
518 | wnpNode->antecedents[0] = commitNode; |
519 | wnpNode->antType[0] = rf_control; |
520 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
521 | if (nfaults == 2) { |
522 | RF_ASSERT(wnqNode->numAntecedents == 1); |
523 | commitNode->succedents[nWndNodes + 1] = wnqNode; |
524 | wnqNode->antecedents[0] = commitNode; |
525 | wnqNode->antType[0] = rf_control; |
526 | } |
527 | #endif |
528 | /* link write new data nodes to unblock node */ |
529 | RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults)); |
530 | tmpwndNode = wndNodes; |
531 | for (i = 0; i < nWndNodes; i++) { |
532 | RF_ASSERT(tmpwndNode->numSuccedents == 1); |
533 | tmpwndNode->succedents[0] = unblockNode; |
534 | unblockNode->antecedents[i] = tmpwndNode; |
535 | unblockNode->antType[i] = rf_control; |
536 | tmpwndNode = tmpwndNode->list_next; |
537 | } |
538 | |
539 | /* link write new parity node to unblock node */ |
540 | RF_ASSERT(wnpNode->numSuccedents == 1); |
541 | wnpNode->succedents[0] = unblockNode; |
542 | unblockNode->antecedents[nWndNodes] = wnpNode; |
543 | unblockNode->antType[nWndNodes] = rf_control; |
544 | |
545 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
546 | /* link write new q node to unblock node */ |
547 | if (nfaults == 2) { |
548 | RF_ASSERT(wnqNode->numSuccedents == 1); |
549 | wnqNode->succedents[0] = unblockNode; |
550 | unblockNode->antecedents[nWndNodes + 1] = wnqNode; |
551 | unblockNode->antType[nWndNodes + 1] = rf_control; |
552 | } |
553 | #endif |
554 | /* link unblock node to term node */ |
555 | RF_ASSERT(unblockNode->numSuccedents == 1); |
556 | RF_ASSERT(termNode->numAntecedents == 1); |
557 | RF_ASSERT(termNode->numSuccedents == 0); |
558 | unblockNode->succedents[0] = termNode; |
559 | termNode->antecedents[0] = unblockNode; |
560 | termNode->antType[0] = rf_control; |
561 | } |
562 | #define CONS_PDA(if,start,num) \ |
563 | pda_p->col = asmap->if->col; \ |
564 | pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \ |
565 | pda_p->numSector = num; \ |
566 | pda_p->next = NULL; \ |
567 | RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList) |
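
/*
 * For example, CONS_PDA(parityInfo, fone_start, fone->numSector) fills in
 * pda_p from asmap->parityInfo: same column, the failed region's offset
 * rebased to the start of the parity stripe unit, and a freshly allocated
 * buffer of matching size.  The first macro argument is literally the asmap
 * field name to read from (hence the otherwise-alarming parameter name "if").
 */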
568 | #if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) |
569 | void |
570 | rf_WriteGenerateFailedAccessASMs( |
571 | RF_Raid_t * raidPtr, |
572 | RF_AccessStripeMap_t * asmap, |
573 | RF_PhysDiskAddr_t ** pdap, |
574 | int *nNodep, |
575 | RF_PhysDiskAddr_t ** pqpdap, |
576 | int *nPQNodep, |
577 | RF_AllocListElem_t * allocList) |
578 | { |
579 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
580 | int PDAPerDisk, i; |
581 | RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; |
582 | int numDataCol = layoutPtr->numDataCol; |
583 | int state; |
584 | unsigned napdas; |
585 | RF_SectorNum_t fone_start, ftwo_start = 0; |
586 | RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1]; |
587 | RF_PhysDiskAddr_t *pda_p; |
588 | RF_RaidAddr_t sosAddr; |
589 | |
	/* Determine how many PDAs we will have to generate per unaccessed
	 * stripe unit.  If there is only one failed data unit, it is one; if
	 * two, possibly two, depending on whether they overlap. */
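
	/* The state variable set below encodes which of three cases applies:
	 *   1 - one failed data unit: one P and one Q PDA covering just the
	 *       failed region
	 *   2 - two failed units whose combined size exceeds an SU: collapse
	 *       to one full-SU P PDA and one full-SU Q PDA
	 *   3 - two failed units with smaller ranges: two (P,Q) PDA pairs,
	 *       one per failed range */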
593 | |
594 | fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector); |
595 | |
596 | if (asmap->numDataFailed == 1) { |
597 | PDAPerDisk = 1; |
598 | state = 1; |
599 | RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); |
600 | pda_p = *pqpdap; |
601 | /* build p */ |
602 | CONS_PDA(parityInfo, fone_start, fone->numSector); |
603 | pda_p->type = RF_PDA_TYPE_PARITY; |
604 | pda_p++; |
605 | /* build q */ |
606 | CONS_PDA(qInfo, fone_start, fone->numSector); |
607 | pda_p->type = RF_PDA_TYPE_Q; |
608 | } else { |
609 | ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector); |
610 | if (fone->numSector + ftwo->numSector > secPerSU) { |
611 | PDAPerDisk = 1; |
612 | state = 2; |
613 | RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); |
614 | pda_p = *pqpdap; |
615 | CONS_PDA(parityInfo, 0, secPerSU); |
616 | pda_p->type = RF_PDA_TYPE_PARITY; |
617 | pda_p++; |
618 | CONS_PDA(qInfo, 0, secPerSU); |
619 | pda_p->type = RF_PDA_TYPE_Q; |
620 | } else { |
621 | PDAPerDisk = 2; |
622 | state = 3; |
623 | /* four of them, fone, then ftwo */ |
624 | RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); |
625 | pda_p = *pqpdap; |
626 | CONS_PDA(parityInfo, fone_start, fone->numSector); |
627 | pda_p->type = RF_PDA_TYPE_PARITY; |
628 | pda_p++; |
629 | CONS_PDA(qInfo, fone_start, fone->numSector); |
630 | pda_p->type = RF_PDA_TYPE_Q; |
631 | pda_p++; |
632 | CONS_PDA(parityInfo, ftwo_start, ftwo->numSector); |
633 | pda_p->type = RF_PDA_TYPE_PARITY; |
634 | pda_p++; |
635 | CONS_PDA(qInfo, ftwo_start, ftwo->numSector); |
636 | pda_p->type = RF_PDA_TYPE_Q; |
637 | } |
638 | } |
	/* figure out the number of non-accessed PDAs */
640 | napdas = PDAPerDisk * (numDataCol - 2); |
641 | *nPQNodep = PDAPerDisk; |
642 | |
643 | *nNodep = napdas; |
644 | if (napdas == 0) |
645 | return; /* short circuit */ |
646 | |
	/* allocate our list of PDAs */
648 | |
649 | RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t), |
650 | (RF_PhysDiskAddr_t *), allocList); |
651 | *pdap = pda_p; |
652 | |
	/* link them together */
654 | for (i = 0; i < (napdas - 1); i++) |
655 | pda_p[i].next = pda_p + (i + 1); |
656 | |
657 | sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); |
658 | for (i = 0; i < numDataCol; i++) { |
659 | if ((pda_p - (*pdap)) == napdas) |
660 | continue; |
661 | pda_p->type = RF_PDA_TYPE_DATA; |
662 | pda_p->raidAddress = sosAddr + (i * secPerSU); |
663 | (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); |
664 | /* skip over dead disks */ |
665 | if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status)) |
666 | continue; |
667 | switch (state) { |
668 | case 1: /* fone */ |
669 | pda_p->numSector = fone->numSector; |
670 | pda_p->raidAddress += fone_start; |
671 | pda_p->startSector += fone_start; |
672 | RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList); |
673 | break; |
674 | case 2: /* full stripe */ |
675 | pda_p->numSector = secPerSU; |
676 | RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList); |
677 | break; |
678 | case 3: /* two slabs */ |
679 | pda_p->numSector = fone->numSector; |
680 | pda_p->raidAddress += fone_start; |
681 | pda_p->startSector += fone_start; |
682 | RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList); |
683 | pda_p++; |
684 | pda_p->type = RF_PDA_TYPE_DATA; |
685 | pda_p->raidAddress = sosAddr + (i * secPerSU); |
686 | (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); |
687 | pda_p->numSector = ftwo->numSector; |
688 | pda_p->raidAddress += ftwo_start; |
689 | pda_p->startSector += ftwo_start; |
690 | RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList); |
691 | break; |
692 | default: |
693 | RF_PANIC(); |
694 | } |
695 | pda_p++; |
696 | } |
697 | |
698 | RF_ASSERT(pda_p - *pdap == napdas); |
699 | return; |
700 | } |
701 | #define DISK_NODE_PDA(node) ((node)->params[0].p) |
702 | |
703 | #define DISK_NODE_PARAMS(_node_,_p_) \ |
704 | (_node_).params[0].p = _p_ ; \ |
705 | (_node_).params[1].p = (_p_)->bufPtr; \ |
706 | (_node_).params[2].v = parityStripeID; \ |
707 | (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru) |
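
/*
 * DISK_NODE_PARAMS() fills in the standard four-slot parameter layout shared
 * by every disk read/write node in this file: the PDA, its buffer, the
 * parity stripe ID, and the priority/reconstruction-unit word built by
 * RF_CREATE_PARAM3().
 */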
708 | |
709 | void |
710 | rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, |
711 | RF_DagHeader_t *dag_h, void *bp, |
712 | RF_RaidAccessFlags_t flags, |
713 | RF_AllocListElem_t *allocList, |
714 | const char *redundantReadNodeName, |
715 | const char *redundantWriteNodeName, |
716 | const char *recoveryNodeName, |
717 | int (*recovFunc) (RF_DagNode_t *)) |
718 | { |
719 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
720 | RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode, |
721 | *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode; |
722 | RF_PhysDiskAddr_t *pda, *pqPDAs; |
723 | RF_PhysDiskAddr_t *npdas; |
724 | int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i; |
725 | RF_ReconUnitNum_t which_ru; |
726 | int nPQNodes; |
727 | RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru); |
728 | |
	/* Simple small write case.  The first part looks like a
	 * reconstruct-read of the failed data units.  Then comes a write of
	 * all data units that did not fail. */
732 | |
733 | |
	/*
	 *                    Hdr
	 *                     |
	 *                   Block
	 *              /  /  ...  \   \
	 *           Rrd Rrd ... Rrd Rp Rq
	 *              \  \   |   /   /
	 *                    PQ
	 *               /    |    \
	 *             Wud    Wp    Wq
	 *               \    |    /
	 *               --Unblock--
	 *                    |
	 *                    T
	 *
	 * Rrd = read recovery data (potentially none)
	 * Wud = write user data (not incl. failed disks)
	 * Wp  = write P (could be two)
	 * Wq  = write Q (could be two)
	 */
743 | |
744 | rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList); |
745 | |
746 | RF_ASSERT(asmap->numDataFailed == 1); |
747 | |
748 | nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed); |
749 | nReadNodes = nRrdNodes + 2 * nPQNodes; |
750 | nWriteNodes = nWudNodes + 2 * nPQNodes; |
751 | nNodes = 4 + nReadNodes + nWriteNodes; |
752 | |
753 | RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); |
754 | blockNode = nodes; |
755 | unblockNode = blockNode + 1; |
756 | termNode = unblockNode + 1; |
757 | recoveryNode = termNode + 1; |
758 | rrdNodes = recoveryNode + 1; |
759 | rpNodes = rrdNodes + nRrdNodes; |
760 | rqNodes = rpNodes + nPQNodes; |
761 | wudNodes = rqNodes + nPQNodes; |
762 | wpNodes = wudNodes + nWudNodes; |
763 | wqNodes = wpNodes + nPQNodes; |
764 | |
765 | dag_h->creator = "PQ_DDSimpleSmallWrite" ; |
766 | dag_h->numSuccedents = 1; |
767 | dag_h->succedents[0] = blockNode; |
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
769 | termNode->antecedents[0] = unblockNode; |
770 | termNode->antType[0] = rf_control; |
771 | |
772 | /* init the block and unblock nodes */ |
773 | /* The block node has all the read nodes as successors */ |
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
775 | for (i = 0; i < nReadNodes; i++) |
776 | blockNode->succedents[i] = rrdNodes + i; |
777 | |
778 | /* The unblock node has all the writes as successors */ |
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
780 | for (i = 0; i < nWriteNodes; i++) { |
781 | unblockNode->antecedents[i] = wudNodes + i; |
782 | unblockNode->antType[i] = rf_control; |
783 | } |
784 | unblockNode->succedents[0] = termNode; |
785 | |
786 | #define INIT_READ_NODE(node,name) \ |
787 | rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \ |
788 | (node)->succedents[0] = recoveryNode; \ |
789 | (node)->antecedents[0] = blockNode; \ |
790 | (node)->antType[0] = rf_control; |
791 | |
792 | /* build the read nodes */ |
793 | pda = npdas; |
794 | for (i = 0; i < nRrdNodes; i++, pda = pda->next) { |
795 | INIT_READ_NODE(rrdNodes + i, "rrd" ); |
796 | DISK_NODE_PARAMS(rrdNodes[i], pda); |
797 | } |
798 | |
	/* read redundancy pdas */
	pda = pqPDAs;
	INIT_READ_NODE(rpNodes, "Rp");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rpNodes[0], pda);
	pda++;
	INIT_READ_NODE(rqNodes, redundantReadNodeName);
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rqNodes[0], pda);
	if (nPQNodes == 2) {
		pda++;
		INIT_READ_NODE(rpNodes + 1, "Rp");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rpNodes[1], pda);
		pda++;
		INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rqNodes[1], pda);
	}
	/* The recovery node has all reads as predecessors and all writes as
	 * successors.  It generates a result for every write P or write Q
	 * node.  As parameters, it takes a pda per read and a pda per stripe
	 * of user data written.  It also takes as the last params the raidPtr
	 * and asm.  For results, it takes PDA for P & Q. */
823 | |
824 | |
825 | rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL, |
	    nWriteNodes, /* successors */
827 | nReadNodes, /* preds */ |
828 | nReadNodes + nWudNodes + 3, /* params */ |
829 | 2 * nPQNodes, /* results */ |
830 | dag_h, recoveryNodeName, allocList); |
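
	/*
	 * Parameter and result layout for the recovery node, matching the
	 * counts above:
	 *   params[0 .. nReadNodes-1]                     read PDAs
	 *   params[nReadNodes .. nReadNodes+nWudNodes-1]  user-data write PDAs
	 *   params[nReadNodes+nWudNodes]                  failedPDAs[0]
	 *   params[nReadNodes+nWudNodes+1]                raidPtr
	 *   params[nReadNodes+nWudNodes+2]                asmap
	 *   results[0 .. 2*nPQNodes-1]                    the P and Q PDAs
	 */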
831 | |
832 | |
833 | |
834 | for (i = 0; i < nReadNodes; i++) { |
835 | recoveryNode->antecedents[i] = rrdNodes + i; |
836 | recoveryNode->antType[i] = rf_control; |
837 | recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i); |
838 | } |
839 | for (i = 0; i < nWudNodes; i++) { |
840 | recoveryNode->succedents[i] = wudNodes + i; |
841 | } |
842 | recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0]; |
843 | recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr; |
844 | recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap; |
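
	/* wpNodes and wqNodes immediately follow wudNodes in the nodes
	 * array, so the loop below also links the P and Q write nodes. */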
845 | |
846 | for (; i < nWriteNodes; i++) |
847 | recoveryNode->succedents[i] = wudNodes + i; |
848 | |
849 | pda = pqPDAs; |
850 | recoveryNode->results[0] = pda; |
851 | pda++; |
852 | recoveryNode->results[1] = pda; |
853 | if (nPQNodes == 2) { |
854 | pda++; |
855 | recoveryNode->results[2] = pda; |
856 | pda++; |
857 | recoveryNode->results[3] = pda; |
858 | } |
859 | /* fill writes */ |
860 | #define INIT_WRITE_NODE(node,name) \ |
861 | rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \ |
862 | (node)->succedents[0] = unblockNode; \ |
863 | (node)->antecedents[0] = recoveryNode; \ |
864 | (node)->antType[0] = rf_control; |
865 | |
866 | pda = asmap->physInfo; |
867 | for (i = 0; i < nWudNodes; i++) { |
868 | INIT_WRITE_NODE(wudNodes + i, "Wd" ); |
869 | DISK_NODE_PARAMS(wudNodes[i], pda); |
870 | recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i); |
871 | pda = pda->next; |
872 | } |
873 | /* write redundancy pdas */ |
874 | pda = pqPDAs; |
875 | INIT_WRITE_NODE(wpNodes, "Wp" ); |
876 | RF_ASSERT(pda); |
877 | DISK_NODE_PARAMS(wpNodes[0], pda); |
878 | pda++; |
879 | INIT_WRITE_NODE(wqNodes, "Wq" ); |
880 | RF_ASSERT(pda); |
881 | DISK_NODE_PARAMS(wqNodes[0], pda); |
882 | if (nPQNodes == 2) { |
883 | pda++; |
884 | INIT_WRITE_NODE(wpNodes + 1, "Wp" ); |
885 | RF_ASSERT(pda); |
886 | DISK_NODE_PARAMS(wpNodes[1], pda); |
887 | pda++; |
888 | INIT_WRITE_NODE(wqNodes + 1, "Wq" ); |
889 | RF_ASSERT(pda); |
890 | DISK_NODE_PARAMS(wqNodes[1], pda); |
891 | } |
892 | } |
893 | #endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */ |
894 | |