1 | /* $NetBSD: rf_dagffwr.c,v 1.34 2013/09/15 12:41:17 martin Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /* |
30 | * rf_dagff.c |
31 | * |
32 | * code for creating fault-free DAGs |
33 | * |
34 | */ |
35 | |
36 | #include <sys/cdefs.h> |
37 | __KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.34 2013/09/15 12:41:17 martin Exp $" ); |
38 | |
39 | #include <dev/raidframe/raidframevar.h> |
40 | |
41 | #include "rf_raid.h" |
42 | #include "rf_dag.h" |
43 | #include "rf_dagutils.h" |
44 | #include "rf_dagfuncs.h" |
45 | #include "rf_debugMem.h" |
46 | #include "rf_dagffrd.h" |
47 | #include "rf_general.h" |
48 | #include "rf_dagffwr.h" |
49 | #include "rf_map.h" |
50 | |
51 | /****************************************************************************** |
52 | * |
53 | * General comments on DAG creation: |
54 | * |
55 | * All DAGs in this file use roll-away error recovery. Each DAG has a single |
56 | * commit node, usually called "Cmt." If an error occurs before the Cmt node |
57 | * is reached, the execution engine will halt forward execution and work |
58 | * backward through the graph, executing the undo functions. Assuming that |
59 | * each node in the graph prior to the Cmt node are undoable and atomic - or - |
60 | * does not make changes to permanent state, the graph will fail atomically. |
61 | * If an error occurs after the Cmt node executes, the engine will roll-forward |
62 | * through the graph, blindly executing nodes until it reaches the end. |
63 | * If a graph reaches the end, it is assumed to have completed successfully. |
64 | * |
65 | * A graph has only 1 Cmt node. |
66 | * |
67 | */ |
68 | |
69 | |
70 | /****************************************************************************** |
71 | * |
72 | * The following wrappers map the standard DAG creation interface to the |
73 | * DAG creation routines. Additionally, these wrappers enable experimentation |
74 | * with new DAG structures by providing an extra level of indirection, allowing |
75 | * the DAG creation routines to be replaced at this single point. |
76 | */ |
77 | |
78 | |
/*
 * Wrapper: build a write DAG for a non-redundant (no parity) array.
 * Delegates to the generic non-redundant DAG builder with the I/O type
 * forced to RF_IO_TYPE_WRITE; the 'type' parameter is ignored here.
 */
void
rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
			      RF_DagHeader_t *dag_h, void *bp,
			      RF_RaidAccessFlags_t flags,
			      RF_AllocListElem_t *allocList,
			      RF_IoType_t type)
{
	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
				 RF_IO_TYPE_WRITE);
}
89 | |
/*
 * Wrapper: build a write DAG for RAID level 0.  Identical to the
 * non-redundant case — delegates to the generic non-redundant DAG
 * builder with the I/O type forced to RF_IO_TYPE_WRITE; the 'type'
 * parameter is ignored here.
 */
void
rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList,
		       RF_IoType_t type)
{
	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
				 RF_IO_TYPE_WRITE);
}
100 | |
/*
 * Wrapper: build a small-write (read-modify-write) DAG using the
 * standard XOR function table and no Q functions (a NULL qfuncs makes
 * the common routine treat the array as single-fault tolerant).
 */
void
rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList)
{
	/* "normal" rollaway */
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
				     allocList, &rf_xorFuncs, NULL);
}
111 | |
/*
 * Wrapper: build a large-write (reconstruct-write) DAG for a
 * single-fault-tolerant array: nfaults = 1, the regular XOR function,
 * and buffer recycling enabled (RF_TRUE).
 */
void
rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList)
{
	/* "normal" rollaway */
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
				     allocList, 1, rf_RegularXorFunc, RF_TRUE);
}
122 | |
123 | |
124 | /****************************************************************************** |
125 | * |
126 | * DAG creation code begins here |
127 | */ |
128 | |
129 | |
130 | /****************************************************************************** |
131 | * |
132 | * creates a DAG to perform a large-write operation: |
133 | * |
134 | * / Rod \ / Wnd \ |
135 | * H -- block- Rod - Xor - Cmt - Wnd --- T |
136 | * \ Rod / \ Wnp / |
137 | * \[Wnq]/ |
138 | * |
139 | * The XOR node also does the Q calculation in the P+Q architecture. |
 * All nodes before the commit node (Cmt) are assumed to be atomic and
 * undoable - or - they make no changes to permanent state.
142 | * |
143 | * Rod = read old data |
144 | * Cmt = commit node |
145 | * Wnp = write new parity |
146 | * Wnd = write new data |
147 | * Wnq = write new "q" |
148 | * [] denotes optional segments in the graph |
149 | * |
150 | * Parameters: raidPtr - description of the physical array |
151 | * asmap - logical & physical addresses for this access |
152 | * bp - buffer ptr (holds write data) |
153 | * flags - general flags (e.g. disk locking) |
154 | * allocList - list of memory allocated in DAG creation |
155 | * nfaults - number of faults array can tolerate |
156 | * (equal to # redundancy units in stripe) |
157 | * redfuncs - list of redundancy generating functions |
158 | * |
159 | *****************************************************************************/ |
160 | |
161 | void |
162 | rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, |
163 | RF_DagHeader_t *dag_h, void *bp, |
164 | RF_RaidAccessFlags_t flags, |
165 | RF_AllocListElem_t *allocList, |
166 | int nfaults, int (*redFunc) (RF_DagNode_t *), |
167 | int allowBufferRecycle) |
168 | { |
169 | RF_DagNode_t *wndNodes, *rodNodes, *xorNode, *wnpNode, *tmpNode; |
170 | RF_DagNode_t *blockNode, *commitNode, *termNode; |
171 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
172 | RF_DagNode_t *wnqNode; |
173 | #endif |
174 | int nWndNodes, nRodNodes, i, nodeNum, asmNum; |
175 | RF_AccessStripeMapHeader_t *new_asm_h[2]; |
176 | RF_StripeNum_t parityStripeID; |
177 | char *sosBuffer, *eosBuffer; |
178 | RF_ReconUnitNum_t which_ru; |
179 | RF_RaidLayout_t *layoutPtr; |
180 | RF_PhysDiskAddr_t *pda; |
181 | |
182 | layoutPtr = &(raidPtr->Layout); |
183 | parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, |
184 | asmap->raidAddress, |
185 | &which_ru); |
186 | |
187 | #if RF_DEBUG_DAG |
188 | if (rf_dagDebug) { |
189 | printf("[Creating large-write DAG]\n" ); |
190 | } |
191 | #endif |
192 | dag_h->creator = "LargeWriteDAG" ; |
193 | |
194 | dag_h->numCommitNodes = 1; |
195 | dag_h->numCommits = 0; |
196 | dag_h->numSuccedents = 1; |
197 | |
198 | /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */ |
199 | nWndNodes = asmap->numStripeUnitsAccessed; |
200 | |
201 | for (i = 0; i < nWndNodes; i++) { |
202 | tmpNode = rf_AllocDAGNode(); |
203 | tmpNode->list_next = dag_h->nodes; |
204 | dag_h->nodes = tmpNode; |
205 | } |
206 | wndNodes = dag_h->nodes; |
207 | |
208 | xorNode = rf_AllocDAGNode(); |
209 | xorNode->list_next = dag_h->nodes; |
210 | dag_h->nodes = xorNode; |
211 | |
212 | wnpNode = rf_AllocDAGNode(); |
213 | wnpNode->list_next = dag_h->nodes; |
214 | dag_h->nodes = wnpNode; |
215 | |
216 | blockNode = rf_AllocDAGNode(); |
217 | blockNode->list_next = dag_h->nodes; |
218 | dag_h->nodes = blockNode; |
219 | |
220 | commitNode = rf_AllocDAGNode(); |
221 | commitNode->list_next = dag_h->nodes; |
222 | dag_h->nodes = commitNode; |
223 | |
224 | termNode = rf_AllocDAGNode(); |
225 | termNode->list_next = dag_h->nodes; |
226 | dag_h->nodes = termNode; |
227 | |
228 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
229 | if (nfaults == 2) { |
230 | wnqNode = rf_AllocDAGNode(); |
231 | } else { |
232 | wnqNode = NULL; |
233 | } |
234 | #endif |
235 | rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, |
236 | new_asm_h, &nRodNodes, &sosBuffer, |
237 | &eosBuffer, allocList); |
238 | if (nRodNodes > 0) { |
239 | for (i = 0; i < nRodNodes; i++) { |
240 | tmpNode = rf_AllocDAGNode(); |
241 | tmpNode->list_next = dag_h->nodes; |
242 | dag_h->nodes = tmpNode; |
243 | } |
244 | rodNodes = dag_h->nodes; |
245 | } else { |
246 | rodNodes = NULL; |
247 | } |
248 | |
249 | /* begin node initialization */ |
250 | if (nRodNodes > 0) { |
251 | rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, |
252 | rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0, |
253 | dag_h, "Nil" , allocList); |
254 | } else { |
255 | rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, |
256 | rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, |
257 | dag_h, "Nil" , allocList); |
258 | } |
259 | |
260 | rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, |
261 | rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0, |
262 | dag_h, "Cmt" , allocList); |
263 | rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, |
264 | rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0, |
265 | dag_h, "Trm" , allocList); |
266 | |
267 | /* initialize the Rod nodes */ |
268 | tmpNode = rodNodes; |
269 | for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) { |
270 | if (new_asm_h[asmNum]) { |
271 | pda = new_asm_h[asmNum]->stripeMap->physInfo; |
272 | while (pda) { |
273 | rf_InitNode(tmpNode, rf_wait, |
274 | RF_FALSE, rf_DiskReadFunc, |
275 | rf_DiskReadUndoFunc, |
276 | rf_GenericWakeupFunc, |
277 | 1, 1, 4, 0, dag_h, |
278 | "Rod" , allocList); |
279 | tmpNode->params[0].p = pda; |
280 | tmpNode->params[1].p = pda->bufPtr; |
281 | tmpNode->params[2].v = parityStripeID; |
282 | tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, |
283 | which_ru); |
284 | nodeNum++; |
285 | pda = pda->next; |
286 | tmpNode = tmpNode->list_next; |
287 | } |
288 | } |
289 | } |
290 | RF_ASSERT(nodeNum == nRodNodes); |
291 | |
292 | /* initialize the wnd nodes */ |
293 | pda = asmap->physInfo; |
294 | tmpNode = wndNodes; |
295 | for (i = 0; i < nWndNodes; i++) { |
296 | rf_InitNode(tmpNode, rf_wait, RF_FALSE, |
297 | rf_DiskWriteFunc, rf_DiskWriteUndoFunc, |
298 | rf_GenericWakeupFunc, 1, 1, 4, 0, |
299 | dag_h, "Wnd" , allocList); |
300 | RF_ASSERT(pda != NULL); |
301 | tmpNode->params[0].p = pda; |
302 | tmpNode->params[1].p = pda->bufPtr; |
303 | tmpNode->params[2].v = parityStripeID; |
304 | tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
305 | pda = pda->next; |
306 | tmpNode = tmpNode->list_next; |
307 | } |
308 | |
309 | /* initialize the redundancy node */ |
310 | if (nRodNodes > 0) { |
311 | rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, |
312 | rf_NullNodeUndoFunc, NULL, 1, |
313 | nRodNodes, 2 * (nWndNodes + nRodNodes) + 1, |
314 | nfaults, dag_h, "Xr " , allocList); |
315 | } else { |
316 | rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, |
317 | rf_NullNodeUndoFunc, NULL, 1, |
318 | 1, 2 * (nWndNodes + nRodNodes) + 1, |
319 | nfaults, dag_h, "Xr " , allocList); |
320 | } |
321 | xorNode->flags |= RF_DAGNODE_FLAG_YIELD; |
322 | tmpNode = wndNodes; |
323 | for (i = 0; i < nWndNodes; i++) { |
324 | /* pda */ |
325 | xorNode->params[2 * i + 0] = tmpNode->params[0]; |
326 | /* buf ptr */ |
327 | xorNode->params[2 * i + 1] = tmpNode->params[1]; |
328 | tmpNode = tmpNode->list_next; |
329 | } |
330 | tmpNode = rodNodes; |
331 | for (i = 0; i < nRodNodes; i++) { |
332 | /* pda */ |
333 | xorNode->params[2 * (nWndNodes + i) + 0] = tmpNode->params[0]; |
334 | /* buf ptr */ |
335 | xorNode->params[2 * (nWndNodes + i) + 1] = tmpNode->params[1]; |
336 | tmpNode = tmpNode->list_next; |
337 | } |
338 | /* xor node needs to get at RAID information */ |
339 | xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr; |
340 | |
341 | /* |
342 | * Look for an Rod node that reads a complete SU. If none, |
343 | * alloc a buffer to receive the parity info. Note that we |
344 | * can't use a new data buffer because it will not have gotten |
345 | * written when the xor occurs. */ |
346 | if (allowBufferRecycle) { |
347 | tmpNode = rodNodes; |
348 | for (i = 0; i < nRodNodes; i++) { |
349 | if (((RF_PhysDiskAddr_t *) tmpNode->params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit) |
350 | break; |
351 | tmpNode = tmpNode->list_next; |
352 | } |
353 | } |
354 | if ((!allowBufferRecycle) || (i == nRodNodes)) { |
355 | xorNode->results[0] = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit)); |
356 | } else { |
357 | /* this works because the only way we get here is if |
358 | allowBufferRecycle is true and we went through the |
359 | above for loop, and exited via the break before |
360 | i==nRodNodes was true. That means tmpNode will |
361 | still point to a valid node -- the one we want for |
362 | here! */ |
363 | xorNode->results[0] = tmpNode->params[1].p; |
364 | } |
365 | |
366 | /* initialize the Wnp node */ |
367 | rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, |
368 | rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, |
369 | dag_h, "Wnp" , allocList); |
370 | wnpNode->params[0].p = asmap->parityInfo; |
371 | wnpNode->params[1].p = xorNode->results[0]; |
372 | wnpNode->params[2].v = parityStripeID; |
373 | wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
374 | /* parityInfo must describe entire parity unit */ |
375 | RF_ASSERT(asmap->parityInfo->next == NULL); |
376 | |
377 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
378 | if (nfaults == 2) { |
379 | /* |
380 | * We never try to recycle a buffer for the Q calcuation |
381 | * in addition to the parity. This would cause two buffers |
382 | * to get smashed during the P and Q calculation, guaranteeing |
383 | * one would be wrong. |
384 | */ |
385 | RF_MallocAndAdd(xorNode->results[1], |
386 | rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), |
387 | (void *), allocList); |
388 | rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, |
389 | rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, |
390 | 1, 1, 4, 0, dag_h, "Wnq" , allocList); |
391 | wnqNode->params[0].p = asmap->qInfo; |
392 | wnqNode->params[1].p = xorNode->results[1]; |
393 | wnqNode->params[2].v = parityStripeID; |
394 | wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); |
395 | /* parityInfo must describe entire parity unit */ |
396 | RF_ASSERT(asmap->parityInfo->next == NULL); |
397 | } |
398 | #endif |
399 | /* |
400 | * Connect nodes to form graph. |
401 | */ |
402 | |
403 | /* connect dag header to block node */ |
404 | RF_ASSERT(blockNode->numAntecedents == 0); |
405 | dag_h->succedents[0] = blockNode; |
406 | |
407 | if (nRodNodes > 0) { |
408 | /* connect the block node to the Rod nodes */ |
409 | RF_ASSERT(blockNode->numSuccedents == nRodNodes); |
410 | RF_ASSERT(xorNode->numAntecedents == nRodNodes); |
411 | tmpNode = rodNodes; |
412 | for (i = 0; i < nRodNodes; i++) { |
413 | RF_ASSERT(tmpNode->numAntecedents == 1); |
414 | blockNode->succedents[i] = tmpNode; |
415 | tmpNode->antecedents[0] = blockNode; |
416 | tmpNode->antType[0] = rf_control; |
417 | |
418 | /* connect the Rod nodes to the Xor node */ |
419 | RF_ASSERT(tmpNode->numSuccedents == 1); |
420 | tmpNode->succedents[0] = xorNode; |
421 | xorNode->antecedents[i] = tmpNode; |
422 | xorNode->antType[i] = rf_trueData; |
423 | tmpNode = tmpNode->list_next; |
424 | } |
425 | } else { |
426 | /* connect the block node to the Xor node */ |
427 | RF_ASSERT(blockNode->numSuccedents == 1); |
428 | RF_ASSERT(xorNode->numAntecedents == 1); |
429 | blockNode->succedents[0] = xorNode; |
430 | xorNode->antecedents[0] = blockNode; |
431 | xorNode->antType[0] = rf_control; |
432 | } |
433 | |
434 | /* connect the xor node to the commit node */ |
435 | RF_ASSERT(xorNode->numSuccedents == 1); |
436 | RF_ASSERT(commitNode->numAntecedents == 1); |
437 | xorNode->succedents[0] = commitNode; |
438 | commitNode->antecedents[0] = xorNode; |
439 | commitNode->antType[0] = rf_control; |
440 | |
441 | /* connect the commit node to the write nodes */ |
442 | RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults); |
443 | tmpNode = wndNodes; |
444 | for (i = 0; i < nWndNodes; i++) { |
445 | RF_ASSERT(wndNodes->numAntecedents == 1); |
446 | commitNode->succedents[i] = tmpNode; |
447 | tmpNode->antecedents[0] = commitNode; |
448 | tmpNode->antType[0] = rf_control; |
449 | tmpNode = tmpNode->list_next; |
450 | } |
451 | RF_ASSERT(wnpNode->numAntecedents == 1); |
452 | commitNode->succedents[nWndNodes] = wnpNode; |
453 | wnpNode->antecedents[0] = commitNode; |
454 | wnpNode->antType[0] = rf_trueData; |
455 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
456 | if (nfaults == 2) { |
457 | RF_ASSERT(wnqNode->numAntecedents == 1); |
458 | commitNode->succedents[nWndNodes + 1] = wnqNode; |
459 | wnqNode->antecedents[0] = commitNode; |
460 | wnqNode->antType[0] = rf_trueData; |
461 | } |
462 | #endif |
463 | /* connect the write nodes to the term node */ |
464 | RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults); |
465 | RF_ASSERT(termNode->numSuccedents == 0); |
466 | tmpNode = wndNodes; |
467 | for (i = 0; i < nWndNodes; i++) { |
468 | RF_ASSERT(wndNodes->numSuccedents == 1); |
469 | tmpNode->succedents[0] = termNode; |
470 | termNode->antecedents[i] = tmpNode; |
471 | termNode->antType[i] = rf_control; |
472 | tmpNode = tmpNode->list_next; |
473 | } |
474 | RF_ASSERT(wnpNode->numSuccedents == 1); |
475 | wnpNode->succedents[0] = termNode; |
476 | termNode->antecedents[nWndNodes] = wnpNode; |
477 | termNode->antType[nWndNodes] = rf_control; |
478 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
479 | if (nfaults == 2) { |
480 | RF_ASSERT(wnqNode->numSuccedents == 1); |
481 | wnqNode->succedents[0] = termNode; |
482 | termNode->antecedents[nWndNodes + 1] = wnqNode; |
483 | termNode->antType[nWndNodes + 1] = rf_control; |
484 | } |
485 | #endif |
486 | } |
487 | /****************************************************************************** |
488 | * |
489 | * creates a DAG to perform a small-write operation (either raid 5 or pq), |
490 | * which is as follows: |
491 | * |
492 | * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm |
493 | * \- Rod X / \----> Wnd [Und]-/ |
494 | * [\- Rod X / \---> Wnd [Und]-/] |
495 | * [\- Roq -> Q / \--> Wnq [Unq]-/] |
496 | * |
497 | * Rop = read old parity |
498 | * Rod = read old data |
499 | * Roq = read old "q" |
500 | * Cmt = commit node |
501 | * Und = unlock data disk |
502 | * Unp = unlock parity disk |
503 | * Unq = unlock q disk |
504 | * Wnp = write new parity |
505 | * Wnd = write new data |
506 | * Wnq = write new "q" |
507 | * [ ] denotes optional segments in the graph |
508 | * |
509 | * Parameters: raidPtr - description of the physical array |
510 | * asmap - logical & physical addresses for this access |
511 | * bp - buffer ptr (holds write data) |
512 | * flags - general flags (e.g. disk locking) |
513 | * allocList - list of memory allocated in DAG creation |
514 | * pfuncs - list of parity generating functions |
515 | * qfuncs - list of q generating functions |
516 | * |
517 | * A null qfuncs indicates single fault tolerant |
518 | *****************************************************************************/ |
519 | |
520 | void |
521 | rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, |
522 | RF_DagHeader_t *dag_h, void *bp, |
523 | RF_RaidAccessFlags_t flags, |
524 | RF_AllocListElem_t *allocList, |
525 | const RF_RedFuncs_t *pfuncs, |
526 | const RF_RedFuncs_t *qfuncs) |
527 | { |
528 | RF_DagNode_t *readDataNodes, *readParityNodes, *termNode; |
529 | RF_DagNode_t *tmpNode, *tmpreadDataNode, *tmpreadParityNode; |
530 | RF_DagNode_t *xorNodes, *blockNode, *commitNode; |
531 | RF_DagNode_t *writeDataNodes, *writeParityNodes; |
532 | RF_DagNode_t *tmpxorNode, *tmpwriteDataNode; |
533 | RF_DagNode_t *tmpwriteParityNode; |
534 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
535 | RF_DagNode_t *tmpwriteQNode, *tmpreadQNode, *tmpqNode, *readQNodes, |
536 | *writeQNodes, *qNodes; |
537 | #endif |
538 | int i, j, nNodes; |
539 | RF_ReconUnitNum_t which_ru; |
540 | int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *); |
541 | int (*qfunc) (RF_DagNode_t *) __unused; |
542 | int numDataNodes, numParityNodes; |
543 | RF_StripeNum_t parityStripeID; |
544 | RF_PhysDiskAddr_t *pda; |
545 | const char *name, *qname __unused; |
546 | long nfaults; |
547 | |
548 | nfaults = qfuncs ? 2 : 1; |
549 | |
550 | parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), |
551 | asmap->raidAddress, &which_ru); |
552 | pda = asmap->physInfo; |
553 | numDataNodes = asmap->numStripeUnitsAccessed; |
554 | numParityNodes = (asmap->parityInfo->next) ? 2 : 1; |
555 | |
556 | #if RF_DEBUG_DAG |
557 | if (rf_dagDebug) { |
558 | printf("[Creating small-write DAG]\n" ); |
559 | } |
560 | #endif |
561 | RF_ASSERT(numDataNodes > 0); |
562 | dag_h->creator = "SmallWriteDAG" ; |
563 | |
564 | dag_h->numCommitNodes = 1; |
565 | dag_h->numCommits = 0; |
566 | dag_h->numSuccedents = 1; |
567 | |
568 | /* |
569 | * DAG creation occurs in four steps: |
570 | * 1. count the number of nodes in the DAG |
571 | * 2. create the nodes |
572 | * 3. initialize the nodes |
573 | * 4. connect the nodes |
574 | */ |
575 | |
576 | /* |
577 | * Step 1. compute number of nodes in the graph |
578 | */ |
579 | |
580 | /* number of nodes: a read and write for each data unit a |
581 | * redundancy computation node for each parity node (nfaults * |
582 | * nparity) a read and write for each parity unit a block and |
583 | * commit node (2) a terminate node if atomic RMW an unlock |
584 | * node for each data unit, redundancy unit |
585 | * totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes) |
586 | * + (nfaults * 2 * numParityNodes) + 3; |
587 | */ |
588 | |
589 | /* |
590 | * Step 2. create the nodes |
591 | */ |
592 | |
593 | blockNode = rf_AllocDAGNode(); |
594 | blockNode->list_next = dag_h->nodes; |
595 | dag_h->nodes = blockNode; |
596 | |
597 | commitNode = rf_AllocDAGNode(); |
598 | commitNode->list_next = dag_h->nodes; |
599 | dag_h->nodes = commitNode; |
600 | |
601 | for (i = 0; i < numDataNodes; i++) { |
602 | tmpNode = rf_AllocDAGNode(); |
603 | tmpNode->list_next = dag_h->nodes; |
604 | dag_h->nodes = tmpNode; |
605 | } |
606 | readDataNodes = dag_h->nodes; |
607 | |
608 | for (i = 0; i < numParityNodes; i++) { |
609 | tmpNode = rf_AllocDAGNode(); |
610 | tmpNode->list_next = dag_h->nodes; |
611 | dag_h->nodes = tmpNode; |
612 | } |
613 | readParityNodes = dag_h->nodes; |
614 | |
615 | for (i = 0; i < numDataNodes; i++) { |
616 | tmpNode = rf_AllocDAGNode(); |
617 | tmpNode->list_next = dag_h->nodes; |
618 | dag_h->nodes = tmpNode; |
619 | } |
620 | writeDataNodes = dag_h->nodes; |
621 | |
622 | for (i = 0; i < numParityNodes; i++) { |
623 | tmpNode = rf_AllocDAGNode(); |
624 | tmpNode->list_next = dag_h->nodes; |
625 | dag_h->nodes = tmpNode; |
626 | } |
627 | writeParityNodes = dag_h->nodes; |
628 | |
629 | for (i = 0; i < numParityNodes; i++) { |
630 | tmpNode = rf_AllocDAGNode(); |
631 | tmpNode->list_next = dag_h->nodes; |
632 | dag_h->nodes = tmpNode; |
633 | } |
634 | xorNodes = dag_h->nodes; |
635 | |
636 | termNode = rf_AllocDAGNode(); |
637 | termNode->list_next = dag_h->nodes; |
638 | dag_h->nodes = termNode; |
639 | |
640 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
641 | if (nfaults == 2) { |
642 | for (i = 0; i < numParityNodes; i++) { |
643 | tmpNode = rf_AllocDAGNode(); |
644 | tmpNode->list_next = dag_h->nodes; |
645 | dag_h->nodes = tmpNode; |
646 | } |
647 | readQNodes = dag_h->nodes; |
648 | |
649 | for (i = 0; i < numParityNodes; i++) { |
650 | tmpNode = rf_AllocDAGNode(); |
651 | tmpNode->list_next = dag_h->nodes; |
652 | dag_h->nodes = tmpNode; |
653 | } |
654 | writeQNodes = dag_h->nodes; |
655 | |
656 | for (i = 0; i < numParityNodes; i++) { |
657 | tmpNode = rf_AllocDAGNode(); |
658 | tmpNode->list_next = dag_h->nodes; |
659 | dag_h->nodes = tmpNode; |
660 | } |
661 | qNodes = dag_h->nodes; |
662 | } else { |
663 | readQNodes = writeQNodes = qNodes = NULL; |
664 | } |
665 | #endif |
666 | |
667 | /* |
668 | * Step 3. initialize the nodes |
669 | */ |
670 | /* initialize block node (Nil) */ |
671 | nNodes = numDataNodes + (nfaults * numParityNodes); |
672 | rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, |
673 | rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, |
674 | dag_h, "Nil" , allocList); |
675 | |
676 | /* initialize commit node (Cmt) */ |
677 | rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, |
678 | rf_NullNodeUndoFunc, NULL, nNodes, |
679 | (nfaults * numParityNodes), 0, 0, dag_h, "Cmt" , allocList); |
680 | |
681 | /* initialize terminate node (Trm) */ |
682 | rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, |
683 | rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0, |
684 | dag_h, "Trm" , allocList); |
685 | |
686 | /* initialize nodes which read old data (Rod) */ |
687 | tmpreadDataNode = readDataNodes; |
688 | for (i = 0; i < numDataNodes; i++) { |
689 | rf_InitNode(tmpreadDataNode, rf_wait, RF_FALSE, |
690 | rf_DiskReadFunc, rf_DiskReadUndoFunc, |
691 | rf_GenericWakeupFunc, (nfaults * numParityNodes), |
692 | 1, 4, 0, dag_h, "Rod" , allocList); |
693 | RF_ASSERT(pda != NULL); |
694 | /* physical disk addr desc */ |
695 | tmpreadDataNode->params[0].p = pda; |
696 | /* buffer to hold old data */ |
697 | tmpreadDataNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); |
698 | tmpreadDataNode->params[2].v = parityStripeID; |
699 | tmpreadDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, |
700 | which_ru); |
701 | pda = pda->next; |
702 | for (j = 0; j < tmpreadDataNode->numSuccedents; j++) { |
703 | tmpreadDataNode->propList[j] = NULL; |
704 | } |
705 | tmpreadDataNode = tmpreadDataNode->list_next; |
706 | } |
707 | |
708 | /* initialize nodes which read old parity (Rop) */ |
709 | pda = asmap->parityInfo; |
710 | i = 0; |
711 | tmpreadParityNode = readParityNodes; |
712 | for (i = 0; i < numParityNodes; i++) { |
713 | RF_ASSERT(pda != NULL); |
714 | rf_InitNode(tmpreadParityNode, rf_wait, RF_FALSE, |
715 | rf_DiskReadFunc, rf_DiskReadUndoFunc, |
716 | rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, |
717 | dag_h, "Rop" , allocList); |
718 | tmpreadParityNode->params[0].p = pda; |
719 | /* buffer to hold old parity */ |
720 | tmpreadParityNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); |
721 | tmpreadParityNode->params[2].v = parityStripeID; |
722 | tmpreadParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, |
723 | which_ru); |
724 | pda = pda->next; |
725 | for (j = 0; j < tmpreadParityNode->numSuccedents; j++) { |
726 | tmpreadParityNode->propList[0] = NULL; |
727 | } |
728 | tmpreadParityNode = tmpreadParityNode->list_next; |
729 | } |
730 | |
731 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
732 | /* initialize nodes which read old Q (Roq) */ |
733 | if (nfaults == 2) { |
734 | pda = asmap->qInfo; |
735 | tmpreadQNode = readQNodes; |
736 | for (i = 0; i < numParityNodes; i++) { |
737 | RF_ASSERT(pda != NULL); |
738 | rf_InitNode(tmpreadQNode, rf_wait, RF_FALSE, |
739 | rf_DiskReadFunc, rf_DiskReadUndoFunc, |
740 | rf_GenericWakeupFunc, numParityNodes, |
741 | 1, 4, 0, dag_h, "Roq" , allocList); |
742 | tmpreadQNode->params[0].p = pda; |
743 | /* buffer to hold old Q */ |
744 | tmpreadQNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, |
745 | pda->numSector << raidPtr->logBytesPerSector); |
746 | tmpreadQNode->params[2].v = parityStripeID; |
747 | tmpreadQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, |
748 | which_ru); |
749 | pda = pda->next; |
750 | for (j = 0; j < tmpreadQNode->numSuccedents; j++) { |
751 | tmpreadQNode->propList[0] = NULL; |
752 | } |
753 | tmpreadQNode = tmpreadQNode->list_next; |
754 | } |
755 | } |
756 | #endif |
757 | /* initialize nodes which write new data (Wnd) */ |
758 | pda = asmap->physInfo; |
759 | tmpwriteDataNode = writeDataNodes; |
760 | for (i = 0; i < numDataNodes; i++) { |
761 | RF_ASSERT(pda != NULL); |
762 | rf_InitNode(tmpwriteDataNode, rf_wait, RF_FALSE, |
763 | rf_DiskWriteFunc, rf_DiskWriteUndoFunc, |
764 | rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, |
765 | "Wnd" , allocList); |
766 | /* physical disk addr desc */ |
767 | tmpwriteDataNode->params[0].p = pda; |
768 | /* buffer holding new data to be written */ |
769 | tmpwriteDataNode->params[1].p = pda->bufPtr; |
770 | tmpwriteDataNode->params[2].v = parityStripeID; |
771 | tmpwriteDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, |
772 | which_ru); |
773 | pda = pda->next; |
774 | tmpwriteDataNode = tmpwriteDataNode->list_next; |
775 | } |
776 | |
777 | /* |
778 | * Initialize nodes which compute new parity and Q. |
779 | */ |
780 | /* |
781 | * We use the simple XOR func in the double-XOR case, and when |
782 | * we're accessing only a portion of one stripe unit. The |
783 | * distinction between the two is that the regular XOR func |
784 | * assumes that the targbuf is a full SU in size, and examines |
785 | * the pda associated with the buffer to decide where within |
786 | * the buffer to XOR the data, whereas the simple XOR func |
787 | * just XORs the data into the start of the buffer. */ |
788 | if ((numParityNodes == 2) || ((numDataNodes == 1) |
789 | && (asmap->totalSectorsAccessed < |
790 | raidPtr->Layout.sectorsPerStripeUnit))) { |
791 | func = pfuncs->simple; |
792 | undoFunc = rf_NullNodeUndoFunc; |
793 | name = pfuncs->SimpleName; |
794 | if (qfuncs) { |
795 | qfunc = qfuncs->simple; |
796 | qname = qfuncs->SimpleName; |
797 | } else { |
798 | qfunc = NULL; |
799 | qname = NULL; |
800 | } |
801 | } else { |
802 | func = pfuncs->regular; |
803 | undoFunc = rf_NullNodeUndoFunc; |
804 | name = pfuncs->RegularName; |
805 | if (qfuncs) { |
806 | qfunc = qfuncs->regular; |
807 | qname = qfuncs->RegularName; |
808 | } else { |
809 | qfunc = NULL; |
810 | qname = NULL; |
811 | } |
812 | } |
813 | /* |
814 | * Initialize the xor nodes: params are {pda,buf} |
815 | * from {Rod,Wnd,Rop} nodes, and raidPtr |
816 | */ |
817 | if (numParityNodes == 2) { |
818 | /* double-xor case */ |
819 | tmpxorNode = xorNodes; |
820 | tmpreadDataNode = readDataNodes; |
821 | tmpreadParityNode = readParityNodes; |
822 | tmpwriteDataNode = writeDataNodes; |
823 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
824 | tmpqNode = qNodes; |
825 | tmpreadQNode = readQNodes; |
826 | #endif |
827 | for (i = 0; i < numParityNodes; i++) { |
828 | /* note: no wakeup func for xor */ |
829 | rf_InitNode(tmpxorNode, rf_wait, RF_FALSE, func, |
830 | undoFunc, NULL, 1, |
831 | (numDataNodes + numParityNodes), |
832 | 7, 1, dag_h, name, allocList); |
833 | tmpxorNode->flags |= RF_DAGNODE_FLAG_YIELD; |
834 | tmpxorNode->params[0] = tmpreadDataNode->params[0]; |
835 | tmpxorNode->params[1] = tmpreadDataNode->params[1]; |
836 | tmpxorNode->params[2] = tmpreadParityNode->params[0]; |
837 | tmpxorNode->params[3] = tmpreadParityNode->params[1]; |
838 | tmpxorNode->params[4] = tmpwriteDataNode->params[0]; |
839 | tmpxorNode->params[5] = tmpwriteDataNode->params[1]; |
840 | tmpxorNode->params[6].p = raidPtr; |
841 | /* use old parity buf as target buf */ |
842 | tmpxorNode->results[0] = tmpreadParityNode->params[1].p; |
843 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
844 | if (nfaults == 2) { |
845 | /* note: no wakeup func for qor */ |
846 | rf_InitNode(tmpqNode, rf_wait, RF_FALSE, |
847 | qfunc, undoFunc, NULL, 1, |
848 | (numDataNodes + numParityNodes), |
849 | 7, 1, dag_h, qname, allocList); |
850 | tmpqNode->params[0] = tmpreadDataNode->params[0]; |
851 | tmpqNode->params[1] = tmpreadDataNode->params[1]; |
852 | tmpqNode->params[2] = tmpreadQNode->.params[0]; |
853 | tmpqNode->params[3] = tmpreadQNode->params[1]; |
854 | tmpqNode->params[4] = tmpwriteDataNode->params[0]; |
855 | tmpqNode->params[5] = tmpwriteDataNode->params[1]; |
856 | tmpqNode->params[6].p = raidPtr; |
857 | /* use old Q buf as target buf */ |
858 | tmpqNode->results[0] = tmpreadQNode->params[1].p; |
859 | tmpqNode = tmpqNode->list_next; |
860 | tmpreadQNodes = tmpreadQNodes->list_next; |
861 | } |
862 | #endif |
863 | tmpxorNode = tmpxorNode->list_next; |
864 | tmpreadDataNode = tmpreadDataNode->list_next; |
865 | tmpreadParityNode = tmpreadParityNode->list_next; |
866 | tmpwriteDataNode = tmpwriteDataNode->list_next; |
867 | } |
868 | } else { |
869 | /* there is only one xor node in this case */ |
870 | rf_InitNode(xorNodes, rf_wait, RF_FALSE, func, |
871 | undoFunc, NULL, 1, (numDataNodes + numParityNodes), |
872 | (2 * (numDataNodes + numDataNodes + 1) + 1), 1, |
873 | dag_h, name, allocList); |
874 | xorNodes->flags |= RF_DAGNODE_FLAG_YIELD; |
875 | tmpreadDataNode = readDataNodes; |
876 | for (i = 0; i < numDataNodes; i++) { /* used to be"numDataNodes + 1" until we factored |
877 | out the "+1" into the "deal with Rop separately below */ |
878 | /* set up params related to Rod nodes */ |
879 | xorNodes->params[2 * i + 0] = tmpreadDataNode->params[0]; /* pda */ |
880 | xorNodes->params[2 * i + 1] = tmpreadDataNode->params[1]; /* buffer ptr */ |
881 | tmpreadDataNode = tmpreadDataNode->list_next; |
882 | } |
883 | /* deal with Rop separately */ |
884 | xorNodes->params[2 * numDataNodes + 0] = readParityNodes->params[0]; /* pda */ |
885 | xorNodes->params[2 * numDataNodes + 1] = readParityNodes->params[1]; /* buffer ptr */ |
886 | |
887 | tmpwriteDataNode = writeDataNodes; |
888 | for (i = 0; i < numDataNodes; i++) { |
889 | /* set up params related to Wnd and Wnp nodes */ |
890 | xorNodes->params[2 * (numDataNodes + 1 + i) + 0] = /* pda */ |
891 | tmpwriteDataNode->params[0]; |
892 | xorNodes->params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */ |
893 | tmpwriteDataNode->params[1]; |
894 | tmpwriteDataNode = tmpwriteDataNode->list_next; |
895 | } |
896 | /* xor node needs to get at RAID information */ |
897 | xorNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; |
898 | xorNodes->results[0] = readParityNodes->params[1].p; |
899 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
900 | if (nfaults == 2) { |
901 | rf_InitNode(qNodes, rf_wait, RF_FALSE, qfunc, |
902 | undoFunc, NULL, 1, |
903 | (numDataNodes + numParityNodes), |
904 | (2 * (numDataNodes + numDataNodes + 1) + 1), 1, |
905 | dag_h, qname, allocList); |
906 | tmpreadDataNode = readDataNodes; |
907 | for (i = 0; i < numDataNodes; i++) { |
908 | /* set up params related to Rod */ |
909 | qNodes->params[2 * i + 0] = tmpreadDataNode->params[0]; /* pda */ |
910 | qNodes->params[2 * i + 1] = tmpreadDataNode->params[1]; /* buffer ptr */ |
911 | tmpreadDataNode = tmpreadDataNode->list_next; |
912 | } |
913 | /* and read old q */ |
914 | qNodes->params[2 * numDataNodes + 0] = /* pda */ |
915 | readQNodes->params[0]; |
916 | qNodes->params[2 * numDataNodes + 1] = /* buffer ptr */ |
917 | readQNodes->params[1]; |
918 | tmpwriteDataNode = writeDataNodes; |
919 | for (i = 0; i < numDataNodes; i++) { |
920 | /* set up params related to Wnd nodes */ |
921 | qNodes->params[2 * (numDataNodes + 1 + i) + 0] = /* pda */ |
922 | tmpwriteDataNode->params[0]; |
923 | qNodes->params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */ |
924 | tmpwriteDataNode->params[1]; |
925 | tmpwriteDataNode = tmpwriteDataNode->list_next; |
926 | } |
927 | /* xor node needs to get at RAID information */ |
928 | qNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; |
929 | qNodes->results[0] = readQNodes->params[1].p; |
930 | } |
931 | #endif |
932 | } |
933 | |
934 | /* initialize nodes which write new parity (Wnp) */ |
935 | pda = asmap->parityInfo; |
936 | tmpwriteParityNode = writeParityNodes; |
937 | tmpxorNode = xorNodes; |
938 | for (i = 0; i < numParityNodes; i++) { |
939 | rf_InitNode(tmpwriteParityNode, rf_wait, RF_FALSE, |
940 | rf_DiskWriteFunc, rf_DiskWriteUndoFunc, |
941 | rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, |
942 | "Wnp" , allocList); |
943 | RF_ASSERT(pda != NULL); |
944 | tmpwriteParityNode->params[0].p = pda; /* param 1 (bufPtr) |
945 | * filled in by xor node */ |
946 | tmpwriteParityNode->params[1].p = tmpxorNode->results[0]; /* buffer pointer for |
947 | * parity write |
948 | * operation */ |
949 | tmpwriteParityNode->params[2].v = parityStripeID; |
950 | tmpwriteParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, |
951 | which_ru); |
952 | pda = pda->next; |
953 | tmpwriteParityNode = tmpwriteParityNode->list_next; |
954 | tmpxorNode = tmpxorNode->list_next; |
955 | } |
956 | |
957 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
958 | /* initialize nodes which write new Q (Wnq) */ |
959 | if (nfaults == 2) { |
960 | pda = asmap->qInfo; |
961 | tmpwriteQNode = writeQNodes; |
962 | tmpqNode = qNodes; |
963 | for (i = 0; i < numParityNodes; i++) { |
964 | rf_InitNode(tmpwriteQNode, rf_wait, RF_FALSE, |
965 | rf_DiskWriteFunc, rf_DiskWriteUndoFunc, |
966 | rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, |
967 | "Wnq" , allocList); |
968 | RF_ASSERT(pda != NULL); |
969 | tmpwriteQNode->params[0].p = pda; /* param 1 (bufPtr) |
970 | * filled in by xor node */ |
971 | tmpwriteQNode->params[1].p = tmpqNode->results[0]; /* buffer pointer for |
972 | * parity write |
973 | * operation */ |
974 | tmpwriteQNode->params[2].v = parityStripeID; |
975 | tmpwriteQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, |
976 | which_ru); |
977 | pda = pda->next; |
978 | tmpwriteQNode = tmpwriteQNode->list_next; |
979 | tmpqNode = tmpqNode->list_next; |
980 | } |
981 | } |
982 | #endif |
983 | /* |
984 | * Step 4. connect the nodes. |
985 | */ |
986 | |
987 | /* connect header to block node */ |
988 | dag_h->succedents[0] = blockNode; |
989 | |
990 | /* connect block node to read old data nodes */ |
991 | RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults))); |
992 | tmpreadDataNode = readDataNodes; |
993 | for (i = 0; i < numDataNodes; i++) { |
994 | blockNode->succedents[i] = tmpreadDataNode; |
995 | RF_ASSERT(tmpreadDataNode->numAntecedents == 1); |
996 | tmpreadDataNode->antecedents[0] = blockNode; |
997 | tmpreadDataNode->antType[0] = rf_control; |
998 | tmpreadDataNode = tmpreadDataNode->list_next; |
999 | } |
1000 | |
1001 | /* connect block node to read old parity nodes */ |
1002 | tmpreadParityNode = readParityNodes; |
1003 | for (i = 0; i < numParityNodes; i++) { |
1004 | blockNode->succedents[numDataNodes + i] = tmpreadParityNode; |
1005 | RF_ASSERT(tmpreadParityNode->numAntecedents == 1); |
1006 | tmpreadParityNode->antecedents[0] = blockNode; |
1007 | tmpreadParityNode->antType[0] = rf_control; |
1008 | tmpreadParityNode = tmpreadParityNode->list_next; |
1009 | } |
1010 | |
1011 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
1012 | /* connect block node to read old Q nodes */ |
1013 | if (nfaults == 2) { |
1014 | tmpreadQNode = readQNodes; |
1015 | for (i = 0; i < numParityNodes; i++) { |
1016 | blockNode->succedents[numDataNodes + numParityNodes + i] = tmpreadQNode; |
1017 | RF_ASSERT(tmpreadQNode->numAntecedents == 1); |
1018 | tmpreadQNode->antecedents[0] = blockNode; |
1019 | tmpreadQNode->antType[0] = rf_control; |
1020 | tmpreadQNode = tmpreadQNode->list_next; |
1021 | } |
1022 | } |
1023 | #endif |
1024 | /* connect read old data nodes to xor nodes */ |
1025 | tmpreadDataNode = readDataNodes; |
1026 | for (i = 0; i < numDataNodes; i++) { |
1027 | RF_ASSERT(tmpreadDataNode->numSuccedents == (nfaults * numParityNodes)); |
1028 | tmpxorNode = xorNodes; |
1029 | for (j = 0; j < numParityNodes; j++) { |
1030 | RF_ASSERT(tmpxorNode->numAntecedents == numDataNodes + numParityNodes); |
1031 | tmpreadDataNode->succedents[j] = tmpxorNode; |
1032 | tmpxorNode->antecedents[i] = tmpreadDataNode; |
1033 | tmpxorNode->antType[i] = rf_trueData; |
1034 | tmpxorNode = tmpxorNode->list_next; |
1035 | } |
1036 | tmpreadDataNode = tmpreadDataNode->list_next; |
1037 | } |
1038 | |
1039 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
1040 | /* connect read old data nodes to q nodes */ |
1041 | if (nfaults == 2) { |
1042 | tmpreadDataNode = readDataNodes; |
1043 | for (i = 0; i < numDataNodes; i++) { |
1044 | tmpqNode = qNodes; |
1045 | for (j = 0; j < numParityNodes; j++) { |
1046 | RF_ASSERT(tmpqNode->numAntecedents == numDataNodes + numParityNodes); |
1047 | tmpreadDataNode->succedents[numParityNodes + j] = tmpqNode; |
1048 | tmpqNode->antecedents[i] = tmpreadDataNode; |
1049 | tmpqNode->antType[i] = rf_trueData; |
1050 | tmpqNode = tmpqNode->list_next; |
1051 | } |
1052 | tmpreadDataNode = tmpreadDataNode->list_next; |
1053 | } |
1054 | } |
1055 | #endif |
1056 | /* connect read old parity nodes to xor nodes */ |
1057 | tmpreadParityNode = readParityNodes; |
1058 | for (i = 0; i < numParityNodes; i++) { |
1059 | RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes); |
1060 | tmpxorNode = xorNodes; |
1061 | for (j = 0; j < numParityNodes; j++) { |
1062 | tmpreadParityNode->succedents[j] = tmpxorNode; |
1063 | tmpxorNode->antecedents[numDataNodes + i] = tmpreadParityNode; |
1064 | tmpxorNode->antType[numDataNodes + i] = rf_trueData; |
1065 | tmpxorNode = tmpxorNode->list_next; |
1066 | } |
1067 | tmpreadParityNode = tmpreadParityNode->list_next; |
1068 | } |
1069 | |
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* connect read old q nodes to q nodes */
	if (nfaults == 2) {
		tmpreadParityNode = readParityNodes;
		tmpreadQNode = readQNodes;
		for (i = 0; i < numParityNodes; i++) {
			RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes);
			tmpqNode = qNodes;
			for (j = 0; j < numParityNodes; j++) {
				tmpreadQNode->succedents[j] = tmpqNode;
				/* fixed: was "tmpreadQNodes" (undeclared
				 * plural identifier) */
				tmpqNode->antecedents[numDataNodes + i] = tmpreadQNode;
				tmpqNode->antType[numDataNodes + i] = rf_trueData;
				tmpqNode = tmpqNode->list_next;
			}
			tmpreadParityNode = tmpreadParityNode->list_next;
			tmpreadQNode = tmpreadQNode->list_next;
		}
	}
#endif
1089 | /* connect xor nodes to commit node */ |
1090 | RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes)); |
1091 | tmpxorNode = xorNodes; |
1092 | for (i = 0; i < numParityNodes; i++) { |
1093 | RF_ASSERT(tmpxorNode->numSuccedents == 1); |
1094 | tmpxorNode->succedents[0] = commitNode; |
1095 | commitNode->antecedents[i] = tmpxorNode; |
1096 | commitNode->antType[i] = rf_control; |
1097 | tmpxorNode = tmpxorNode->list_next; |
1098 | } |
1099 | |
1100 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
1101 | /* connect q nodes to commit node */ |
1102 | if (nfaults == 2) { |
1103 | tmpqNode = qNodes; |
1104 | for (i = 0; i < numParityNodes; i++) { |
1105 | RF_ASSERT(tmpqNode->numSuccedents == 1); |
1106 | tmpqNode->succedents[0] = commitNode; |
1107 | commitNode->antecedents[i + numParityNodes] = tmpqNode; |
1108 | commitNode->antType[i + numParityNodes] = rf_control; |
1109 | tmpqNode = tmpqNode->list_next; |
1110 | } |
1111 | } |
1112 | #endif |
1113 | /* connect commit node to write nodes */ |
1114 | RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes))); |
1115 | tmpwriteDataNode = writeDataNodes; |
1116 | for (i = 0; i < numDataNodes; i++) { |
1117 | RF_ASSERT(tmpwriteDataNode->numAntecedents == 1); |
1118 | commitNode->succedents[i] = tmpwriteDataNode; |
1119 | tmpwriteDataNode->antecedents[0] = commitNode; |
1120 | tmpwriteDataNode->antType[0] = rf_trueData; |
1121 | tmpwriteDataNode = tmpwriteDataNode->list_next; |
1122 | } |
1123 | tmpwriteParityNode = writeParityNodes; |
1124 | for (i = 0; i < numParityNodes; i++) { |
1125 | RF_ASSERT(tmpwriteParityNode->numAntecedents == 1); |
1126 | commitNode->succedents[i + numDataNodes] = tmpwriteParityNode; |
1127 | tmpwriteParityNode->antecedents[0] = commitNode; |
1128 | tmpwriteParityNode->antType[0] = rf_trueData; |
1129 | tmpwriteParityNode = tmpwriteParityNode->list_next; |
1130 | } |
1131 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
1132 | if (nfaults == 2) { |
1133 | tmpwriteQNode = writeQNodes; |
1134 | for (i = 0; i < numParityNodes; i++) { |
1135 | RF_ASSERT(tmpwriteQNode->numAntecedents == 1); |
1136 | commitNode->succedents[i + numDataNodes + numParityNodes] = tmpwriteQNode; |
1137 | tmpwriteQNode->antecedents[0] = commitNode; |
1138 | tmpwriteQNode->antType[0] = rf_trueData; |
1139 | tmpwriteQNode = tmpwriteQNode->list_next; |
1140 | } |
1141 | } |
1142 | #endif |
1143 | RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); |
1144 | RF_ASSERT(termNode->numSuccedents == 0); |
1145 | tmpwriteDataNode = writeDataNodes; |
1146 | for (i = 0; i < numDataNodes; i++) { |
1147 | /* connect write new data nodes to term node */ |
1148 | RF_ASSERT(tmpwriteDataNode->numSuccedents == 1); |
1149 | RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); |
1150 | tmpwriteDataNode->succedents[0] = termNode; |
1151 | termNode->antecedents[i] = tmpwriteDataNode; |
1152 | termNode->antType[i] = rf_control; |
1153 | tmpwriteDataNode = tmpwriteDataNode->list_next; |
1154 | } |
1155 | |
1156 | tmpwriteParityNode = writeParityNodes; |
1157 | for (i = 0; i < numParityNodes; i++) { |
1158 | RF_ASSERT(tmpwriteParityNode->numSuccedents == 1); |
1159 | tmpwriteParityNode->succedents[0] = termNode; |
1160 | termNode->antecedents[numDataNodes + i] = tmpwriteParityNode; |
1161 | termNode->antType[numDataNodes + i] = rf_control; |
1162 | tmpwriteParityNode = tmpwriteParityNode->list_next; |
1163 | } |
1164 | |
1165 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
1166 | if (nfaults == 2) { |
1167 | tmpwriteQNode = writeQNodes; |
1168 | for (i = 0; i < numParityNodes; i++) { |
1169 | RF_ASSERT(tmpwriteQNode->numSuccedents == 1); |
1170 | tmpwriteQNode->succedents[0] = termNode; |
1171 | termNode->antecedents[numDataNodes + numParityNodes + i] = tmpwriteQNode; |
1172 | termNode->antType[numDataNodes + numParityNodes + i] = rf_control; |
1173 | tmpwriteQNode = tmpwriteQNode->list_next; |
1174 | } |
1175 | } |
1176 | #endif |
1177 | } |
1178 | |
1179 | |
1180 | /****************************************************************************** |
1181 | * create a write graph (fault-free or degraded) for RAID level 1 |
1182 | * |
1183 | * Hdr -> Commit -> Wpd -> Nil -> Trm |
1184 | * -> Wsd -> |
1185 | * |
1186 | * The "Wpd" node writes data to the primary copy in the mirror pair |
1187 | * The "Wsd" node writes data to the secondary copy in the mirror pair |
1188 | * |
1189 | * Parameters: raidPtr - description of the physical array |
1190 | * asmap - logical & physical addresses for this access |
1191 | * bp - buffer ptr (holds write data) |
1192 | * flags - general flags (e.g. disk locking) |
1193 | * allocList - list of memory allocated in DAG creation |
1194 | *****************************************************************************/ |
1195 | |
void
rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
			 RF_DagHeader_t *dag_h, void *bp,
			 RF_RaidAccessFlags_t flags,
			 RF_AllocListElem_t *allocList)
{
	RF_DagNode_t *unblockNode, *termNode, *commitNode;
	RF_DagNode_t *wndNode, *wmirNode;
	RF_DagNode_t *tmpNode, *tmpwndNode, *tmpwmirNode;
	int nWndNodes, nWmirNodes, i;
	RF_ReconUnitNum_t which_ru;
	RF_PhysDiskAddr_t *pda, *pdaP;
	RF_StripeNum_t parityStripeID;

	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
	    asmap->raidAddress, &which_ru);
#if RF_DEBUG_DAG
	if (rf_dagDebug) {
		printf("[Creating RAID level 1 write DAG]\n" );
	}
#endif
	dag_h->creator = "RaidOneWriteDAG" ;

	/* 2 implies access not SU aligned: the access is split across
	 * two physical regions, so two write nodes are needed per copy */
	nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
	nWndNodes = (asmap->physInfo->next) ? 2 : 1;

	/* alloc the Wnd nodes and the Wmir node */
	/* in degraded mode, skip the write to the failed half of the
	 * mirror pair */
	if (asmap->numDataFailed == 1)
		nWndNodes--;
	if (asmap->numParityFailed == 1)
		nWmirNodes--;

	/* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
	 * + terminator) */
	/* nodes are pushed onto dag_h->nodes, so each group's head is
	 * wherever dag_h->nodes points right after its allocation loop */
	for (i = 0; i < nWndNodes; i++) {
		tmpNode = rf_AllocDAGNode();
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wndNode = dag_h->nodes;

	for (i = 0; i < nWmirNodes; i++) {
		tmpNode = rf_AllocDAGNode();
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wmirNode = dag_h->nodes;

	commitNode = rf_AllocDAGNode();
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	unblockNode = rf_AllocDAGNode();
	unblockNode->list_next = dag_h->nodes;
	dag_h->nodes = unblockNode;

	termNode = rf_AllocDAGNode();
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

	/* this dag can commit immediately */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* initialize the commit, unblock, and term nodes */
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
		    rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes),
		    0, 0, 0, dag_h, "Cmt" , allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
		    rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes),
		    0, 0, dag_h, "Nil" , allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
		    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0,
		    dag_h, "Trm" , allocList);

	/* initialize the wnd nodes */
	/* each Wpd node writes one physInfo pda worth of data to the
	 * primary copy */
	if (nWndNodes > 0) {
		pda = asmap->physInfo;
		tmpwndNode = wndNode;
		for (i = 0; i < nWndNodes; i++) {
			rf_InitNode(tmpwndNode, rf_wait, RF_FALSE,
				    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
				    rf_GenericWakeupFunc, 1, 1, 4, 0,
				    dag_h, "Wpd" , allocList);
			RF_ASSERT(pda != NULL);
			tmpwndNode->params[0].p = pda;
			tmpwndNode->params[1].p = pda->bufPtr;
			tmpwndNode->params[2].v = parityStripeID;
			tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			pda = pda->next;
			tmpwndNode = tmpwndNode->list_next;
		}
		/* pda list must be exactly nWndNodes long */
		RF_ASSERT(pda == NULL);
	}
	/* initialize the mirror nodes */
	/* each Wsd node writes the same data buffer (from physInfo) to
	 * the secondary copy's address (from parityInfo) */
	if (nWmirNodes > 0) {
		pda = asmap->physInfo;
		pdaP = asmap->parityInfo;
		tmpwmirNode = wmirNode;
		for (i = 0; i < nWmirNodes; i++) {
			rf_InitNode(tmpwmirNode, rf_wait, RF_FALSE,
				    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
				    rf_GenericWakeupFunc, 1, 1, 4, 0,
				    dag_h, "Wsd" , allocList);
			RF_ASSERT(pda != NULL);
			/* address from the mirror copy, data from the
			 * primary copy's buffer */
			tmpwmirNode->params[0].p = pdaP;
			tmpwmirNode->params[1].p = pda->bufPtr;
			tmpwmirNode->params[2].v = parityStripeID;
			tmpwmirNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			pda = pda->next;
			pdaP = pdaP->next;
			tmpwmirNode = tmpwmirNode->list_next;
		}
		RF_ASSERT(pda == NULL);
		RF_ASSERT(pdaP == NULL);
	}
	/* link the header node to the commit node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 0);
	dag_h->succedents[0] = commitNode;

	/* link the commit node to the write nodes */
	RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
	tmpwndNode = wndNode;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numAntecedents == 1);
		commitNode->succedents[i] = tmpwndNode;
		tmpwndNode->antecedents[0] = commitNode;
		tmpwndNode->antType[0] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}
	/* mirror writes occupy succedent slots after the Wnd nodes */
	tmpwmirNode = wmirNode;
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(tmpwmirNode->numAntecedents == 1);
		commitNode->succedents[i + nWndNodes] = tmpwmirNode;
		tmpwmirNode->antecedents[0] = commitNode;
		tmpwmirNode->antType[0] = rf_control;
		tmpwmirNode = tmpwmirNode->list_next;
	}

	/* link the write nodes to the unblock node */
	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
	tmpwndNode = wndNode;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numSuccedents == 1);
		tmpwndNode->succedents[0] = unblockNode;
		unblockNode->antecedents[i] = tmpwndNode;
		unblockNode->antType[i] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}
	tmpwmirNode = wmirNode;
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(tmpwmirNode->numSuccedents == 1);
		tmpwmirNode->succedents[0] = unblockNode;
		unblockNode->antecedents[i + nWndNodes] = tmpwmirNode;
		unblockNode->antType[i + nWndNodes] = rf_control;
		tmpwmirNode = tmpwmirNode->list_next;
	}

	/* link the unblock node to the term node */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}
1365 | |