/* $NetBSD: rf_parityloggingdags.c,v 1.21 2014/03/23 09:30:59 christos Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * DAGs specific to parity logging are created here.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_parityloggingdags.c,v 1.21 2014/03/23 09:30:59 christos Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_diagnostic.h"
#endif

#include "rf_archs.h"

#if RF_INCLUDE_PARITYLOGGING > 0

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_paritylog.h"
#include "rf_general.h"

#include "rf_parityloggingdags.h"

/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *           / Rod \     / Wnd \
 * H -- NIL- Rod - NIL - Wnd ------ NIL - T
 *           \ Rod /     \ Xor - Lpo /
 *
 * The writes are not done until the reads complete because if they were done in
 * parallel, a failure on one of the reads could leave the parity in an inconsistent
 * state, so that the retry with a new DAG would produce erroneous parity.
 *
 * Note: this DAG has the nasty property that none of the buffers allocated for reading
 * old data can be freed until the XOR node fires.  Need to fix this.
 *
 * The last two arguments are the number of faults tolerated and the function for the
 * redundancy calculation.  The undo for the redundancy calculation is assumed to be null.
 *
 *****************************************************************************/

void
rf_CommonCreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
	    *lpoNode, *blockNode, *unblockNode, *termNode;
	int nWndNodes, nRodNodes, i;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	int nodeNum, asmNum;
	RF_ReconUnitNum_t which_ru;
	char *sosBuffer, *eosBuffer;
	RF_PhysDiskAddr_t *pda;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);

	if (rf_dagDebug)
		printf("[Creating parity-logging large-write DAG]\n");
	RF_ASSERT(nfaults == 1);	/* this architecture is single-fault
					 * tolerant only */
	dag_h->creator = "ParityLoggingLargeWriteDAG";

	/* alloc the Wnd nodes, the Xor and Lpo nodes, and the block, sync,
	 * unblock, and terminator nodes */
	nWndNodes = asmap->numStripeUnitsAccessed;
	RF_MallocAndAdd(nodes, (nWndNodes + 6) * sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	i = 0;
	wndNodes = &nodes[i];
	i += nWndNodes;
	xorNode = &nodes[i];
	i += 1;
	lpoNode = &nodes[i];
	i += 1;
	blockNode = &nodes[i];
	i += 1;
	syncNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;

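	/* the nWndNodes Wnd nodes plus the Xor node are the commit nodes
	 * (they are initialized with the commit flag set below), hence
	 * nWndNodes + 1 */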
	dag_h->numCommitNodes = nWndNodes + 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

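	/* map the portion of the stripe that this access does not cover;
	 * the Rod nodes created from this map read the old data needed
	 * to compute parity over the full stripe */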
	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
	if (nRodNodes > 0)
		RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t),
		    (RF_DagNode_t *), allocList);

	/* begin node initialization */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize the Rod nodes */
	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
		if (new_asm_h[asmNum]) {
			pda = new_asm_h[asmNum]->stripeMap->physInfo;
			while (pda) {
				rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
				rodNodes[nodeNum].params[0].p = pda;
				rodNodes[nodeNum].params[1].p = pda->bufPtr;
				rodNodes[nodeNum].params[2].v = parityStripeID;
				rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
				nodeNum++;
				pda = pda->next;
			}
		}
	}
	RF_ASSERT(nodeNum == nRodNodes);

	/* initialize the Wnd nodes */
	pda = asmap->physInfo;
	for (i = 0; i < nWndNodes; i++) {
		rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda != NULL);
		wndNodes[i].params[0].p = pda;
		wndNodes[i].params[1].p = pda->bufPtr;
		wndNodes[i].params[2].v = parityStripeID;
		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
	}

	/* initialize the redundancy node */
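	/* the Xor node takes one {pda, buffer} parameter pair from each Wnd
	 * and each Rod node, plus a trailing raidPtr, which accounts for the
	 * 2 * (nWndNodes + nRodNodes) + 1 parameter count below */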
	rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
	for (i = 0; i < nWndNodes; i++) {
		xorNode->params[2 * i + 0] = wndNodes[i].params[0];	/* pda */
		xorNode->params[2 * i + 1] = wndNodes[i].params[1];	/* buf ptr */
	}
	for (i = 0; i < nRodNodes; i++) {
		xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];	/* pda */
		xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];	/* buf ptr */
	}
	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;	/* xor node needs to get
									 * at RAID information */

	/* look for a Rod node that reads a complete SU.  If none, alloc a
	 * buffer to receive the parity info.  Note that we can't use a new
	 * data buffer because it will not have been written when the xor
	 * occurs. */
	for (i = 0; i < nRodNodes; i++)
		if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
			break;
	if (i == nRodNodes) {
		RF_MallocAndAdd(xorNode->results[0],
		    rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
	} else {
		xorNode->results[0] = rodNodes[i].params[1].p;
	}

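	/* the Lpo ("parity log overwrite") node hands the newly computed
	 * parity image to the parity log via rf_ParityLogOverwriteFunc
	 * instead of writing the parity unit in place */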
	/* initialize the Lpo node */
	rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);

	lpoNode->params[0].p = asmap->parityInfo;
	lpoNode->params[1].p = xorNode->results[0];
	RF_ASSERT(asmap->parityInfo->next == NULL);	/* parityInfo must
							 * describe entire
							 * parity unit */

	/* connect nodes to form graph */

	/* connect dag header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect the block node to the Rod nodes */
	RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
	for (i = 0; i < nRodNodes; i++) {
		RF_ASSERT(rodNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &rodNodes[i];
		rodNodes[i].antecedents[0] = blockNode;
		rodNodes[i].antType[0] = rf_control;
	}

	/* connect the block node to the sync node */
	/* necessary if nRodNodes == 0 */
	RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
	blockNode->succedents[nRodNodes] = syncNode;
	syncNode->antecedents[0] = blockNode;
	syncNode->antType[0] = rf_control;

	/* connect the Rod nodes to the sync node */
	for (i = 0; i < nRodNodes; i++) {
		rodNodes[i].succedents[0] = syncNode;
		syncNode->antecedents[1 + i] = &rodNodes[i];
		syncNode->antType[1 + i] = rf_control;
	}

	/* connect the sync node to the xor node */
	RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
	RF_ASSERT(xorNode->numAntecedents == 1);
	syncNode->succedents[0] = xorNode;
	xorNode->antecedents[0] = syncNode;
	xorNode->antType[0] = rf_trueData;	/* carry forward from sync */

	/* connect the sync node to the Wnd nodes */
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numAntecedents == 1);
		syncNode->succedents[1 + i] = &wndNodes[i];
		wndNodes[i].antecedents[0] = syncNode;
		wndNodes[i].antType[0] = rf_control;
	}

	/* connect the xor node to the Lpo node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(lpoNode->numAntecedents == 1);
	xorNode->succedents[0] = lpoNode;
	lpoNode->antecedents[0] = xorNode;
	lpoNode->antType[0] = rf_trueData;

	/* connect the Wnd nodes to the unblock node */
	RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numSuccedents == 1);
		wndNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* connect the Lpo node to the unblock node */
	RF_ASSERT(lpoNode->numSuccedents == 1);
	lpoNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = lpoNode;
	unblockNode->antType[nWndNodes] = rf_control;

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}



/******************************************************************************
 *
 * creates a DAG to perform a small-write operation (either RAID 5 or PQ), which is as follows:
 *
 *                Header
 *                  |
 *                Block
 *              / |  ... \   \
 *             /  |       \   \
 *           Rod  Rod     Rod  Rop
 *           | \ / | \    / | \/ |
 *           |  |  |  \  /  | /\ |
 *          Wnd   Wnd   Wnd      X
 *           |      \   /        |
 *           |       \ /         |
 *            \       \         Lpu
 *             \       \        /
 *              +-> Unblock <--+
 *                    |
 *                    T
 *
 *
 * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
 * When the access spans a stripe unit boundary and is less than one SU in size, there will
 * be two Rop -- X -- Lpu branches.  I call this the "double-XOR" case.
 * The second output from each Rod node goes to the X node.  In the double-XOR
 * case, there are exactly 2 Rod nodes, and each sends one output to one X node.
 * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
 *
 * The block and unblock nodes do no work; they serve only as synchronization points.
 * See comment above CreateFaultFreeReadDAG.
 *
 * Note: this DAG ignores all the optimizations related to making the RMWs atomic.
 * It also has the nasty property that none of the buffers allocated for reading
 * old data & parity can be freed until the XOR node fires.  Need to fix this.
 *
 * A null qfuncs indicates single-fault tolerance.
 *****************************************************************************/

void
rf_CommonCreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    const RF_RedFuncs_t * pfuncs,
    const RF_RedFuncs_t * qfuncs)
{
	RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
	RF_DagNode_t *readDataNodes, *readParityNodes;
	RF_DagNode_t *writeDataNodes, *lpuNodes;
	RF_DagNode_t *termNode;
	RF_PhysDiskAddr_t *pda = asmap->physInfo;
	int numDataNodes = asmap->numStripeUnitsAccessed;
	int numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
	int i, j, nNodes, totalNumNodes;
	RF_ReconUnitNum_t which_ru;
	int (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
	const char *name;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
	long nfaults __unused = qfuncs ? 2 : 1;

	if (rf_dagDebug)
		printf("[Creating parity-logging small-write DAG]\n");
	RF_ASSERT(numDataNodes > 0);
	RF_ASSERT(nfaults == 1);
	dag_h->creator = "ParityLoggingSmallWriteDAG";

	/* DAG creation occurs in four steps:
	 * 1. count the number of nodes in the DAG
	 * 2. create the nodes
	 * 3. initialize the nodes
	 * 4. connect the nodes */

	/* Step 1. compute number of nodes in the graph */

	/* number of nodes:
	 *   a read and a write for each data unit
	 *   a redundancy computation node for each parity unit
	 *   a read and an Lpu for each parity unit
	 *   a block and an unblock node (2)
	 *   a terminator node
	 * (if atomic RMW, also an unlock node for each data unit and each
	 * redundancy unit; not used here) */
	totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
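	/* e.g. an access touching two data units whose parity range spans
	 * an SU boundary gives numDataNodes = 2 and numParityNodes = 2,
	 * so totalNumNodes = 4 + 2 + 4 + 3 = 13 */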

	nNodes = numDataNodes + numParityNodes;

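	/* the Wnd and Xor nodes are the commit nodes (they are initialized
	 * with the commit flag set below), hence
	 * numDataNodes + numParityNodes */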
	dag_h->numCommitNodes = numDataNodes + numParityNodes;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* Step 2. create the nodes */
	RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	i = 0;
	blockNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	readDataNodes = &nodes[i];
	i += numDataNodes;
	readParityNodes = &nodes[i];
	i += numParityNodes;
	writeDataNodes = &nodes[i];
	i += numDataNodes;
	lpuNodes = &nodes[i];
	i += numParityNodes;
	xorNodes = &nodes[i];
	i += numParityNodes;
	termNode = &nodes[i];
	i += 1;

	RF_ASSERT(i == totalNumNodes);
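	/* note: the carve order above matters: the single-xor parameter
	 * setup below relies on readParityNodes[0] directly following
	 * readDataNodes[numDataNodes - 1] in memory */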

	/* Step 3. initialize the nodes */
	/* initialize block node (Nil) */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);

	/* initialize unblock node (Nil) */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);

	/* initialize terminator node (Trm) */
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize nodes which read old data (Rod) */
	for (i = 0; i < numDataNodes; i++) {
		rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
		RF_ASSERT(pda != NULL);
		readDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);	/* buffer to hold old data */
		readDataNodes[i].params[2].v = parityStripeID;
		readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
		readDataNodes[i].propList[0] = NULL;
		readDataNodes[i].propList[1] = NULL;
	}

	/* initialize nodes which read old parity (Rop) */
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
		readParityNodes[i].params[0].p = pda;
		readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);	/* buffer to hold old parity */
		readParityNodes[i].params[2].v = parityStripeID;
		readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		readParityNodes[i].propList[0] = NULL;
		pda = pda->next;
	}

	/* initialize nodes which write new data (Wnd) */
	pda = asmap->physInfo;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
		writeDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		writeDataNodes[i].params[1].p = pda->bufPtr;	/* buffer holding new
								 * data to be written */
		writeDataNodes[i].params[2].v = parityStripeID;
		writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);

		pda = pda->next;
	}


	/* initialize nodes which compute new parity */
	/* we use the simple XOR func in the double-XOR case, and when we're
	 * accessing only a portion of one stripe unit.  The distinction
	 * between the two is that the regular XOR func assumes that the
	 * targbuf is a full SU in size, and examines the pda associated with
	 * the buffer to decide where within the buffer to XOR the data,
	 * whereas the simple XOR func just XORs the data into the start of
	 * the buffer. */
	if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
		func = pfuncs->simple;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->SimpleName;
	} else {
		func = pfuncs->regular;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->RegularName;
	}
	/* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
	 * nodes, and raidPtr */
	if (numParityNodes == 2) {	/* double-xor case */
		for (i = 0; i < numParityNodes; i++) {
			/* no wakeup func for xor */
			rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList);
			xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
			xorNodes[i].params[0] = readDataNodes[i].params[0];
			xorNodes[i].params[1] = readDataNodes[i].params[1];
			xorNodes[i].params[2] = readParityNodes[i].params[0];
			xorNodes[i].params[3] = readParityNodes[i].params[1];
			xorNodes[i].params[4] = writeDataNodes[i].params[0];
			xorNodes[i].params[5] = writeDataNodes[i].params[1];
			xorNodes[i].params[6].p = raidPtr;
			xorNodes[i].results[0] = readParityNodes[i].params[1].p;	/* use old parity buf as
											 * target buf */
		}
	} else {
		/* there is only one xor node in this case */
		rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
		xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
		for (i = 0; i < numDataNodes + 1; i++) {
			/* set up params related to Rod and Rop nodes */
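			/* when i == numDataNodes this picks up the Rop
			 * node: readDataNodes[numDataNodes] aliases
			 * readParityNodes[0] because both arrays were
			 * carved contiguously from the same allocation */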
			xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1];	/* buffer pointer */
		}
		for (i = 0; i < numDataNodes; i++) {
			/* set up params related to Wnd nodes */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1];	/* buffer pointer */
		}
		xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;	/* xor node needs to get
											 * at RAID information */
		xorNodes[0].results[0] = readParityNodes[0].params[1].p;
	}

	/* initialize the log node(s) */
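	/* each Lpu ("parity log update") node appends the parity image
	 * computed by its xor node to the parity log, to be reintegrated
	 * with the parity disk later, rather than updating the parity
	 * unit in place */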
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda);
		rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
		lpuNodes[i].params[0].p = pda;	/* PhysDiskAddr of parity */
		lpuNodes[i].params[1].p = xorNodes[i].results[0];	/* buffer pointer to
									 * parity */
		pda = pda->next;
	}


	/* Step 4. connect the nodes */

	/* connect header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect block node to read old data nodes */
	RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
	for (i = 0; i < numDataNodes; i++) {
		blockNode->succedents[i] = &readDataNodes[i];
		RF_ASSERT(readDataNodes[i].numAntecedents == 1);
		readDataNodes[i].antecedents[0] = blockNode;
		readDataNodes[i].antType[0] = rf_control;
	}

	/* connect block node to read old parity nodes */
	for (i = 0; i < numParityNodes; i++) {
		blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
		RF_ASSERT(readParityNodes[i].numAntecedents == 1);
		readParityNodes[i].antecedents[0] = blockNode;
		readParityNodes[i].antType[0] = rf_control;
	}

	/* connect read old data nodes to write new data nodes */
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[i] = &readDataNodes[i];
			if (i == j)
				writeDataNodes[j].antType[i] = rf_antiData;
			else
				writeDataNodes[j].antType[i] = rf_control;
		}
	}

	/* connect read old data nodes to xor nodes */
	for (i = 0; i < numDataNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[i] = &readDataNodes[i];
			xorNodes[j].antType[i] = rf_trueData;
		}

	/* connect read old parity nodes to write new data nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			readParityNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			writeDataNodes[j].antType[numDataNodes + i] = rf_control;
		}
	}

	/* connect read old parity nodes to xor nodes */
	for (i = 0; i < numParityNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			xorNodes[j].antType[numDataNodes + i] = rf_trueData;
		}

	/* connect xor nodes to parity log update (Lpu) nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(xorNodes[i].numSuccedents == 1);
		RF_ASSERT(lpuNodes[i].numAntecedents == 1);
		xorNodes[i].succedents[0] = &lpuNodes[i];
		lpuNodes[i].antecedents[0] = &xorNodes[i];
		lpuNodes[i].antType[0] = rf_trueData;
	}

	for (i = 0; i < numDataNodes; i++) {
		/* connect write new data nodes to unblock node */
		RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
		RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
		writeDataNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &writeDataNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* connect Lpu nodes to unblock node */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(lpuNodes[i].numSuccedents == 1);
		lpuNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
		unblockNode->antType[numDataNodes + i] = rf_control;
	}

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}


void
rf_CreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    const RF_RedFuncs_t * pfuncs,
    const RF_RedFuncs_t * qfuncs)
{
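	/* note: pfuncs and qfuncs are accepted but ignored; parity logging
	 * is single-fault tolerant, so the standard xor functions are
	 * hardwired here */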
	dag_h->creator = "ParityLoggingSmallWriteDAG";
	rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
}


void
rf_CreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
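	/* note: nfaults and redFunc are accepted but ignored; single-fault
	 * tolerance and the regular xor function are hardwired here */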
	dag_h->creator = "ParityLoggingLargeWriteDAG";
	rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */