1 | /* $NetBSD: rf_dagfuncs.c,v 1.30 2009/03/23 18:38:54 oster Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland, William V. Courtright II |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /* |
30 | * dagfuncs.c -- DAG node execution routines |
31 | * |
32 | * Rules: |
33 | * 1. Every DAG execution function must eventually cause node->status to |
34 | * get set to "good" or "bad", and "FinishNode" to be called. In the |
35 | * case of nodes that complete immediately (xor, NullNodeFunc, etc), |
36 | * the node execution function can do these two things directly. In |
37 | * the case of nodes that have to wait for some event (a disk read to |
38 | * complete, a lock to be released, etc) to occur before they can |
39 | * complete, this is typically achieved by having whatever module |
40 | * is doing the operation call GenericWakeupFunc upon completion. |
41 | * 2. DAG execution functions should check the status in the DAG header |
42 | * and NOP out their operations if the status is not "enable". However, |
43 | * execution functions that release resources must be sure to release |
44 | * them even when they NOP out the function that would use them. |
45 | * Functions that acquire resources should go ahead and acquire them |
46 | * even when they NOP, so that a downstream release node will not have |
47 | * to check to find out whether or not the acquire was suppressed. |
48 | */ |
49 | |
50 | #include <sys/cdefs.h> |
51 | __KERNEL_RCSID(0, "$NetBSD: rf_dagfuncs.c,v 1.30 2009/03/23 18:38:54 oster Exp $" ); |
52 | |
53 | #include <sys/param.h> |
54 | #include <sys/ioctl.h> |
55 | |
56 | #include "rf_archs.h" |
57 | #include "rf_raid.h" |
58 | #include "rf_dag.h" |
59 | #include "rf_layout.h" |
60 | #include "rf_etimer.h" |
61 | #include "rf_acctrace.h" |
62 | #include "rf_diskqueue.h" |
63 | #include "rf_dagfuncs.h" |
64 | #include "rf_general.h" |
65 | #include "rf_engine.h" |
66 | #include "rf_dagutils.h" |
67 | |
68 | #include "rf_kintf.h" |
69 | |
70 | #if RF_INCLUDE_PARITYLOGGING > 0 |
71 | #include "rf_paritylog.h" |
72 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
73 | |
/*
 * Indirect hooks for the disk-I/O and xor-undo node execution routines.
 * DAG construction code calls through these pointers; they are bound to
 * concrete implementations in rf_ConfigureDAGFuncs() below.
 */
int (*rf_DiskReadFunc) (RF_DagNode_t *);
int (*rf_DiskWriteFunc) (RF_DagNode_t *);
int (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
int (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
int (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
int (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
int (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
81 | |
82 | /***************************************************************************** |
83 | * main (only) configuration routine for this module |
84 | ****************************************************************************/ |
int
rf_ConfigureDAGFuncs(RF_ShutdownList_t **listp)
{
	/* the xor helpers below rely on RF_LONGSHIFT being
	 * log2(sizeof(long)) for byte <-> longword conversions */
	RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) ||
		  ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
	/* bind the indirect hooks to the thread-context implementations */
	rf_DiskReadFunc = rf_DiskReadFuncForThreads;
	rf_DiskReadUndoFunc = rf_DiskUndoFunc;
	rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
	rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
	rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
	rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
	rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
	return (0);
}
99 | |
100 | |
101 | |
102 | /***************************************************************************** |
103 | * the execution function associated with a terminate node |
104 | ****************************************************************************/ |
105 | int |
106 | rf_TerminateFunc(RF_DagNode_t *node) |
107 | { |
108 | RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes); |
109 | node->status = rf_good; |
110 | return (rf_FinishNode(node, RF_THREAD_CONTEXT)); |
111 | } |
112 | |
113 | int |
114 | rf_TerminateUndoFunc(RF_DagNode_t *node) |
115 | { |
116 | return (0); |
117 | } |
118 | |
119 | |
120 | /***************************************************************************** |
121 | * execution functions associated with a mirror node |
122 | * |
123 | * parameters: |
124 | * |
 * 0 - physical disk address of data
126 | * 1 - buffer for holding read data |
127 | * 2 - parity stripe ID |
128 | * 3 - flags |
129 | * 4 - physical disk address of mirror (parity) |
130 | * |
131 | ****************************************************************************/ |
132 | |
133 | int |
134 | rf_DiskReadMirrorIdleFunc(RF_DagNode_t *node) |
135 | { |
136 | /* select the mirror copy with the shortest queue and fill in node |
137 | * parameters with physical disk address */ |
138 | |
139 | rf_SelectMirrorDiskIdle(node); |
140 | return (rf_DiskReadFunc(node)); |
141 | } |
142 | |
143 | #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0) || (RF_DEBUG_VALIDATE_DAG > 0) |
144 | int |
145 | rf_DiskReadMirrorPartitionFunc(RF_DagNode_t *node) |
146 | { |
147 | /* select the mirror copy with the shortest queue and fill in node |
148 | * parameters with physical disk address */ |
149 | |
150 | rf_SelectMirrorDiskPartition(node); |
151 | return (rf_DiskReadFunc(node)); |
152 | } |
153 | #endif |
154 | |
155 | int |
156 | rf_DiskReadMirrorUndoFunc(RF_DagNode_t *node) |
157 | { |
158 | return (0); |
159 | } |
160 | |
161 | |
162 | |
163 | #if RF_INCLUDE_PARITYLOGGING > 0 |
164 | /***************************************************************************** |
165 | * the execution function associated with a parity log update node |
166 | ****************************************************************************/ |
/*
 * Append an "update" record for the region described by params[0] (PDA)
 * and params[1] (buffer) to the in-core parity log.  Completion is
 * reported through node->wakeFunc by the parity-log module.
 *
 * NOTE(review): `timer` is declared only when RF_ACC_TRACE > 0 yet is
 * passed to rf_CreateParityLogData() unconditionally (as is
 * node->dagHdr->tracerec); this looks like it cannot compile with
 * RF_ACC_TRACE == 0 and RF_INCLUDE_PARITYLOGGING > 0 — confirm the
 * supported build configurations.
 */
int
rf_ParityLogUpdateFunc(RF_DagNode_t *node)
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	void *bf = (void *) node->params[1].p;
	RF_ParityLogData_t *logData;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
#endif

	/* NOP the whole operation unless the DAG is still enabled */
	if (node->dagHdr->status == rf_enable) {
#if RF_ACC_TRACE > 0
		RF_ETIMER_START(timer);
#endif
		logData = rf_CreateParityLogData(RF_UPDATE, pda, bf,
		    (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node,
		    node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			/* allocation failed: charge the elapsed time and
			 * wake the node with an error */
#if RF_ACC_TRACE > 0
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
#endif
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
199 | |
200 | |
201 | /***************************************************************************** |
202 | * the execution function associated with a parity log overwrite node |
203 | ****************************************************************************/ |
/*
 * Append an "overwrite" record for the region described by params[0]
 * (PDA) and params[1] (buffer) to the in-core parity log.  Completion
 * is reported through node->wakeFunc by the parity-log module.
 *
 * NOTE(review): same RF_ACC_TRACE == 0 concern as in
 * rf_ParityLogUpdateFunc() — `timer` is declared only when tracing is
 * compiled in but is passed to rf_CreateParityLogData() unconditionally.
 */
int
rf_ParityLogOverwriteFunc(RF_DagNode_t *node)
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	void *bf = (void *) node->params[1].p;
	RF_ParityLogData_t *logData;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
#endif

	/* NOP the whole operation unless the DAG is still enabled */
	if (node->dagHdr->status == rf_enable) {
#if RF_ACC_TRACE > 0
		RF_ETIMER_START(timer);
#endif
		logData = rf_CreateParityLogData(RF_OVERWRITE, pda, bf,
		    (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			/* allocation failed: charge the elapsed time and
			 * wake the node with an error */
#if RF_ACC_TRACE > 0
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
#endif
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
235 | |
236 | int |
237 | rf_ParityLogUpdateUndoFunc(RF_DagNode_t *node) |
238 | { |
239 | return (0); |
240 | } |
241 | |
242 | int |
243 | rf_ParityLogOverwriteUndoFunc(RF_DagNode_t *node) |
244 | { |
245 | return (0); |
246 | } |
247 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
248 | |
249 | /***************************************************************************** |
250 | * the execution function associated with a NOP node |
251 | ****************************************************************************/ |
252 | int |
253 | rf_NullNodeFunc(RF_DagNode_t *node) |
254 | { |
255 | node->status = rf_good; |
256 | return (rf_FinishNode(node, RF_THREAD_CONTEXT)); |
257 | } |
258 | |
259 | int |
260 | rf_NullNodeUndoFunc(RF_DagNode_t *node) |
261 | { |
262 | node->status = rf_undone; |
263 | return (rf_FinishNode(node, RF_THREAD_CONTEXT)); |
264 | } |
265 | |
266 | |
267 | /***************************************************************************** |
268 | * the execution function associated with a disk-read node |
269 | ****************************************************************************/ |
/*
 * Build a disk-queue request for the read described by the node's
 * parameters and enqueue it on the target column's queue.  Completion
 * arrives through node->wakeFunc (normally rf_GenericWakeupFunc), which
 * frees the request via node->dagFuncData.
 */
int
rf_DiskReadFuncForThreads(RF_DagNode_t *node)
{
	RF_DiskQueueData_t *req;
	/* node params: 0 = physical disk address, 1 = data buffer,
	 * 2 = parity stripe ID, 3 = packed priority / reconstruction unit */
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	void *bf = (void *) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	/* if the DAG is no longer enabled, issue a NOP instead of a real
	 * read so the node still completes and fires its successors */
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
	RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void *b_proc = NULL;

	/* propagate the originating process from the kernel buf, if any */
	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    bf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    node,
#if RF_ACC_TRACE > 0
	    node->dagHdr->tracerec,
#else
	    NULL,
#endif
	    (void *) (node->dagHdr->raidPtr), 0, b_proc, PR_NOWAIT);
	if (!req) {
		/* allocation failed: wake the node with an error so the
		 * engine can process the failure */
		(node->wakeFunc) (node, ENOMEM);
	} else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority);
	}
	return (0);
}
304 | |
305 | |
306 | /***************************************************************************** |
307 | * the execution function associated with a disk-write node |
308 | ****************************************************************************/ |
/*
 * Build a disk-queue request for the write described by the node's
 * parameters and enqueue it on the target column's queue.  Parameter
 * layout and completion path are identical to
 * rf_DiskReadFuncForThreads() above.
 */
int
rf_DiskWriteFuncForThreads(RF_DagNode_t *node)
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	void *bf = (void *) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	/* downgrade to a NOP if the DAG has been disabled, so the node
	 * still completes without touching the disk */
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
	RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void *b_proc = NULL;

	/* propagate the originating process from the kernel buf, if any */
	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

	/* normal processing (rollaway or forward recovery) begins here */
	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    bf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
#if RF_ACC_TRACE > 0
	    node->dagHdr->tracerec,
#else
	    NULL,
#endif
	    (void *) (node->dagHdr->raidPtr),
	    0, b_proc, PR_NOWAIT);

	if (!req) {
		/* allocation failed: wake the node with an error */
		(node->wakeFunc) (node, ENOMEM);
	} else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority);
	}

	return (0);
}
347 | /***************************************************************************** |
348 | * the undo function for disk nodes |
349 | * Note: this is not a proper undo of a write node, only locks are released. |
350 | * old data is not restored to disk! |
351 | ****************************************************************************/ |
int
rf_DiskUndoFunc(RF_DagNode_t *node)
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	/* enqueue a NOP request on the same column so the queue's normal
	 * completion path runs (and wakes this node); no data is moved */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
#if RF_ACC_TRACE > 0
	    node->dagHdr->tracerec,
#else
	    NULL,
#endif
	    (void *) (node->dagHdr->raidPtr),
	    0, NULL, PR_NOWAIT);
	if (!req)
		/* allocation failed: wake the node with an error */
		(node->wakeFunc) (node, ENOMEM);
	else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}

	return (0);
}
379 | |
380 | /***************************************************************************** |
381 | * Callback routine for DiskRead and DiskWrite nodes. When the disk |
382 | * op completes, the routine is called to set the node status and |
383 | * inform the execution engine that the node has fired. |
384 | ****************************************************************************/ |
/*
 * Map the I/O completion status onto the node's state, free the disk
 * queue request (if any), and hand the node back to the DAG engine.
 * status == 0 means the operation succeeded; nonzero means it failed.
 */
int
rf_GenericWakeupFunc(RF_DagNode_t *node, int status)
{

	switch (node->status) {
	case rf_fired:
		/* normal forward execution completed */
		if (status)
			node->status = rf_bad;
		else
			node->status = rf_good;
		break;
	case rf_recover:
		/* probably should never reach this case */
		if (status)
			node->status = rf_panic;
		else
			node->status = rf_undone;
		break;
	default:
		/* a completion for a node in any other state is fatal */
		printf("rf_GenericWakeupFunc:" );
		printf("node->status is %d," , node->status);
		printf("status is %d \n" , status);
		RF_PANIC();
		break;
	}
	/* dagFuncData holds the RF_DiskQueueData_t created at dispatch */
	if (node->dagFuncData)
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
	return (rf_FinishNode(node, RF_INTR_CONTEXT));
}
414 | |
415 | |
416 | /***************************************************************************** |
417 | * there are three distinct types of xor nodes: |
418 | |
419 | * A "regular xor" is used in the fault-free case where the access |
420 | * spans a complete stripe unit. It assumes that the result buffer is |
421 | * one full stripe unit in size, and uses the stripe-unit-offset |
422 | * values that it computes from the PDAs to determine where within the |
423 | * stripe unit to XOR each argument buffer. |
424 | * |
425 | * A "simple xor" is used in the fault-free case where the access |
426 | * touches only a portion of one (or two, in some cases) stripe |
427 | * unit(s). It assumes that all the argument buffers are of the same |
428 | * size and have the same stripe unit offset. |
429 | * |
430 | * A "recovery xor" is used in the degraded-mode case. It's similar |
431 | * to the regular xor function except that it takes the failed PDA as |
432 | * an additional parameter, and uses it to determine what portions of |
433 | * the argument buffers need to be xor'd into the result buffer, and |
434 | * where in the result buffer they should go. |
435 | ****************************************************************************/ |
436 | |
437 | /* xor the params together and store the result in the result field. |
438 | * assume the result field points to a buffer that is the size of one |
439 | * SU, and use the pda params to determine where within the buffer to |
440 | * XOR the input buffers. */ |
/*
 * node->params holds (pda, buffer) pairs with the RF_Raid_t pointer as
 * the final parameter; node->results[0] is a buffer one full stripe
 * unit in size.  Each argument buffer is xored into the result at the
 * stripe-unit offset given by its PDA.
 *
 * NOTE(review): retcode retains only the status of the last pair xored;
 * earlier failures are overwritten.
 */
int
rf_RegularXorFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
#endif
	int i, retcode;

	retcode = 0;
	if (node->dagHdr->status == rf_enable) {
		/* don't do the XOR if the input is the same as the output */
#if RF_ACC_TRACE > 0
		RF_ETIMER_START(timer);
#endif
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
				    (char *) node->params[i + 1].p, (char *) node->results[0]);
			}
#if RF_ACC_TRACE > 0
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
#endif
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
472 | /* xor the inputs into the result buffer, ignoring placement issues */ |
/*
 * Xor the (pda, buffer) pairs in node->params directly into
 * node->results[0], ignoring stripe-unit placement — all buffers are
 * assumed to share the same size and offset (see the comment block
 * above).  The RF_Raid_t pointer is the final parameter.
 */
int
rf_SimpleXorFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	int i, retcode = 0;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
#endif

	if (node->dagHdr->status == rf_enable) {
#if RF_ACC_TRACE > 0
		RF_ETIMER_START(timer);
#endif
		/* don't do the XOR if the input is the same as the output */
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				/* length comes from the pair's PDA sector count */
				retcode = rf_bxor((char *) node->params[i + 1].p, (char *) node->results[0],
				    rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[i].p)->numSector));
			}
#if RF_ACC_TRACE > 0
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
#endif
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
503 | /* this xor is used by the degraded-mode dag functions to recover lost |
504 | * data. the second-to-last parameter is the PDA for the failed |
505 | * portion of the access. the code here looks at this PDA and assumes |
506 | * that the xor target buffer is equal in size to the number of |
507 | * sectors in the failed PDA. It then uses the other PDAs in the |
508 | * parameter list to determine where within the target buffer the |
509 | * corresponding data should be xored. */ |
/*
 * Degraded-mode xor: node->params holds (pda, buffer) pairs, then the
 * failed PDA (second-to-last) and the RF_Raid_t pointer (last).  Each
 * source buffer is xored into node->results[0] at the offset of its
 * stripe unit relative to the failed PDA's stripe-unit offset, thereby
 * reconstructing the failed portion.
 */
int
rf_RecoveryXorFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	int i, retcode = 0;
	RF_PhysDiskAddr_t *pda;
	int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char *srcbuf, *destbuf;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
#endif

	if (node->dagHdr->status == rf_enable) {
#if RF_ACC_TRACE > 0
		RF_ETIMER_START(timer);
#endif
		/* skip any pair whose buffer is the result buffer itself */
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
				srcbuf = (char *) node->params[i + 1].p;
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				/* place this source relative to the failed
				 * region's offset within the stripe unit */
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector));
			}
#if RF_ACC_TRACE > 0
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
#endif
	}
	return (rf_GenericWakeupFunc(node, retcode));
}
545 | /***************************************************************************** |
546 | * The next three functions are utilities used by the above |
547 | * xor-execution functions. |
548 | ****************************************************************************/ |
549 | |
550 | |
551 | /* |
552 | * this is just a glorified buffer xor. targbuf points to a buffer |
553 | * that is one full stripe unit in size. srcbuf points to a buffer |
554 | * that may be less than 1 SU, but never more. When the access |
555 | * described by pda is one SU in size (which by implication means it's |
556 | * SU-aligned), all that happens is (targbuf) <- (srcbuf ^ targbuf). |
557 | * When the access is less than one SU in size the XOR occurs on only |
558 | * the portion of targbuf identified in the pda. */ |
559 | |
560 | int |
561 | rf_XorIntoBuffer(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, |
562 | char *srcbuf, char *targbuf) |
563 | { |
564 | char *targptr; |
565 | int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
566 | int SUOffset = pda->startSector % sectPerSU; |
567 | int length, retcode = 0; |
568 | |
569 | RF_ASSERT(pda->numSector <= sectPerSU); |
570 | |
571 | targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset); |
572 | length = rf_RaidAddressToByte(raidPtr, pda->numSector); |
573 | retcode = rf_bxor(srcbuf, targptr, length); |
574 | return (retcode); |
575 | } |
576 | /* it really should be the case that the buffer pointers (returned by |
577 | * malloc) are aligned to the natural word size of the machine, so |
578 | * this is the only case we optimize for. The length should always be |
579 | * a multiple of the sector size, so there should be no problem with |
580 | * leftover bytes at the end. */ |
581 | int |
582 | rf_bxor(char *src, char *dest, int len) |
583 | { |
584 | unsigned mask = sizeof(long) - 1, retcode = 0; |
585 | |
586 | if (!(((unsigned long) src) & mask) && |
587 | !(((unsigned long) dest) & mask) && !(len & mask)) { |
588 | retcode = rf_longword_bxor((unsigned long *) src, |
589 | (unsigned long *) dest, |
590 | len >> RF_LONGSHIFT); |
591 | } else { |
592 | RF_ASSERT(0); |
593 | } |
594 | return (retcode); |
595 | } |
596 | |
597 | /* When XORing in kernel mode, we need to map each user page to kernel |
598 | * space before we can access it. We don't want to assume anything |
599 | * about which input buffers are in kernel/user space, nor about their |
600 | * alignment, so in each loop we compute the maximum number of bytes |
601 | * that we can xor without crossing any page boundaries, and do only |
602 | * this many bytes before the next remap. |
603 | * |
604 | * len - is in longwords |
605 | */ |
int
rf_longword_bxor(unsigned long *src, unsigned long *dest, int len)
{
	unsigned long *end = src + len;
	unsigned long d0, d1, d2, d3, s0, s1, s2, s3;	/* temps */
	unsigned long *pg_src, *pg_dest;	/* per-page source/dest pointers */
	int longs_this_time;/* # longwords to xor in the current iteration */

	/* pg_src/pg_dest are the working pointers; src/dest track overall
	 * progress and are used to detect page-boundary crossings */
	pg_src = src;
	pg_dest = dest;
	if (!pg_src || !pg_dest)
		return (EFAULT);

	while (len >= 4) {
		/* xor only up to the nearest page boundary of either
		 * pointer; RF_BLIP(p) presumably yields the bytes left in
		 * p's page — TODO confirm */
		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT); /* note len in longwords */
		src += longs_this_time;
		dest += longs_this_time;
		len -= longs_this_time;
		/* unrolled by 4: load all eight operands before storing to
		 * give the compiler/CPU scheduling freedom */
		while (longs_this_time >= 4) {
			d0 = pg_dest[0];
			d1 = pg_dest[1];
			d2 = pg_dest[2];
			d3 = pg_dest[3];
			s0 = pg_src[0];
			s1 = pg_src[1];
			s2 = pg_src[2];
			s3 = pg_src[3];
			pg_dest[0] = d0 ^ s0;
			pg_dest[1] = d1 ^ s1;
			pg_dest[2] = d2 ^ s2;
			pg_dest[3] = d3 ^ s3;
			pg_src += 4;
			pg_dest += 4;
			longs_this_time -= 4;
		}
		while (longs_this_time > 0) {	/* cannot cross any page
						 * boundaries here */
			*pg_dest++ ^= *pg_src++;
			longs_this_time--;
		}

		/* either we're done, or we've reached a page boundary on one
		 * (or possibly both) of the pointers */
		if (len) {
			if (RF_PAGE_ALIGNED(src))
				pg_src = src;
			if (RF_PAGE_ALIGNED(dest))
				pg_dest = dest;
			if (!pg_src || !pg_dest)
				return (EFAULT);
		}
	}
	/* fewer than 4 longwords remain; finish one at a time, still
	 * re-deriving the working pointers at page boundaries */
	while (src < end) {
		*pg_dest++ ^= *pg_src++;
		src++;
		dest++;
		len--;
		if (RF_PAGE_ALIGNED(src))
			pg_src = src;
		if (RF_PAGE_ALIGNED(dest))
			pg_dest = dest;
	}
	RF_ASSERT(len == 0);
	return (0);
}
671 | |
672 | #if 0 |
673 | /* |
674 | dst = a ^ b ^ c; |
675 | a may equal dst |
676 | see comment above longword_bxor |
677 | len is length in longwords |
678 | */ |
679 | int |
680 | rf_longword_bxor3(unsigned long *dst, unsigned long *a, unsigned long *b, |
681 | unsigned long *c, int len, void *bp) |
682 | { |
683 | unsigned long a0, a1, a2, a3, b0, b1, b2, b3; |
684 | unsigned long *pg_a, *pg_b, *pg_c, *pg_dst; /* per-page source/dest |
685 | * pointers */ |
686 | int longs_this_time;/* # longs to xor in the current iteration */ |
687 | char dst_is_a = 0; |
688 | |
689 | pg_a = a; |
690 | pg_b = b; |
691 | pg_c = c; |
692 | if (a == dst) { |
693 | pg_dst = pg_a; |
694 | dst_is_a = 1; |
695 | } else { |
696 | pg_dst = dst; |
697 | } |
698 | |
699 | /* align dest to cache line. Can't cross a pg boundary on dst here. */ |
700 | while ((((unsigned long) pg_dst) & 0x1f)) { |
701 | *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; |
702 | dst++; |
703 | a++; |
704 | b++; |
705 | c++; |
706 | if (RF_PAGE_ALIGNED(a)) { |
707 | pg_a = a; |
708 | if (!pg_a) |
709 | return (EFAULT); |
710 | } |
711 | if (RF_PAGE_ALIGNED(b)) { |
712 | pg_b = a; |
713 | if (!pg_b) |
714 | return (EFAULT); |
715 | } |
716 | if (RF_PAGE_ALIGNED(c)) { |
717 | pg_c = a; |
718 | if (!pg_c) |
719 | return (EFAULT); |
720 | } |
721 | len--; |
722 | } |
723 | |
724 | while (len > 4) { |
725 | longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT); |
726 | a += longs_this_time; |
727 | b += longs_this_time; |
728 | c += longs_this_time; |
729 | dst += longs_this_time; |
730 | len -= longs_this_time; |
731 | while (longs_this_time >= 4) { |
732 | a0 = pg_a[0]; |
733 | longs_this_time -= 4; |
734 | |
735 | a1 = pg_a[1]; |
736 | a2 = pg_a[2]; |
737 | |
738 | a3 = pg_a[3]; |
739 | pg_a += 4; |
740 | |
741 | b0 = pg_b[0]; |
742 | b1 = pg_b[1]; |
743 | |
744 | b2 = pg_b[2]; |
745 | b3 = pg_b[3]; |
746 | /* start dual issue */ |
747 | a0 ^= b0; |
748 | b0 = pg_c[0]; |
749 | |
750 | pg_b += 4; |
751 | a1 ^= b1; |
752 | |
753 | a2 ^= b2; |
754 | a3 ^= b3; |
755 | |
756 | b1 = pg_c[1]; |
757 | a0 ^= b0; |
758 | |
759 | b2 = pg_c[2]; |
760 | a1 ^= b1; |
761 | |
762 | b3 = pg_c[3]; |
763 | a2 ^= b2; |
764 | |
765 | pg_dst[0] = a0; |
766 | a3 ^= b3; |
767 | pg_dst[1] = a1; |
768 | pg_c += 4; |
769 | pg_dst[2] = a2; |
770 | pg_dst[3] = a3; |
771 | pg_dst += 4; |
772 | } |
773 | while (longs_this_time > 0) { /* cannot cross any page |
774 | * boundaries here */ |
775 | *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; |
776 | longs_this_time--; |
777 | } |
778 | |
779 | if (len) { |
780 | if (RF_PAGE_ALIGNED(a)) { |
781 | pg_a = a; |
782 | if (!pg_a) |
783 | return (EFAULT); |
784 | if (dst_is_a) |
785 | pg_dst = pg_a; |
786 | } |
787 | if (RF_PAGE_ALIGNED(b)) { |
788 | pg_b = b; |
789 | if (!pg_b) |
790 | return (EFAULT); |
791 | } |
792 | if (RF_PAGE_ALIGNED(c)) { |
793 | pg_c = c; |
794 | if (!pg_c) |
795 | return (EFAULT); |
796 | } |
797 | if (!dst_is_a) |
798 | if (RF_PAGE_ALIGNED(dst)) { |
799 | pg_dst = dst; |
800 | if (!pg_dst) |
801 | return (EFAULT); |
802 | } |
803 | } |
804 | } |
805 | while (len) { |
806 | *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; |
807 | dst++; |
808 | a++; |
809 | b++; |
810 | c++; |
811 | if (RF_PAGE_ALIGNED(a)) { |
812 | pg_a = a; |
813 | if (!pg_a) |
814 | return (EFAULT); |
815 | if (dst_is_a) |
816 | pg_dst = pg_a; |
817 | } |
818 | if (RF_PAGE_ALIGNED(b)) { |
819 | pg_b = b; |
820 | if (!pg_b) |
821 | return (EFAULT); |
822 | } |
823 | if (RF_PAGE_ALIGNED(c)) { |
824 | pg_c = c; |
825 | if (!pg_c) |
826 | return (EFAULT); |
827 | } |
828 | if (!dst_is_a) |
829 | if (RF_PAGE_ALIGNED(dst)) { |
830 | pg_dst = dst; |
831 | if (!pg_dst) |
832 | return (EFAULT); |
833 | } |
834 | len--; |
835 | } |
836 | return (0); |
837 | } |
838 | |
839 | int |
840 | rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b, |
841 | unsigned char *c, unsigned long len, void *bp) |
842 | { |
843 | RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0); |
844 | |
845 | return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a, |
846 | (unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp)); |
847 | } |
848 | #endif |
849 | |