1 | /* $NetBSD: rf_pq.c,v 1.16 2009/03/14 15:36:20 dsl Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Daniel Stodolsky |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /* |
30 | * Code for RAID level 6 (P + Q) disk array architecture. |
31 | */ |
32 | |
33 | #include <sys/cdefs.h> |
34 | __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.16 2009/03/14 15:36:20 dsl Exp $" ); |
35 | |
36 | #include "rf_archs.h" |
37 | |
38 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) |
39 | |
40 | #include <dev/raidframe/raidframevar.h> |
41 | |
42 | #include "rf_raid.h" |
43 | #include "rf_dag.h" |
44 | #include "rf_dagffrd.h" |
45 | #include "rf_dagffwr.h" |
46 | #include "rf_dagdegrd.h" |
47 | #include "rf_dagdegwr.h" |
48 | #include "rf_dagutils.h" |
49 | #include "rf_dagfuncs.h" |
50 | #include "rf_etimer.h" |
51 | #include "rf_pqdeg.h" |
52 | #include "rf_general.h" |
53 | #include "rf_map.h" |
54 | #include "rf_pq.h" |
55 | |
56 | RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P" , rf_SimpleONPFunc, "Simple Old-New P" }; |
57 | RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func" , rf_RecoveryPFunc, "Recovery P Func" }; |
58 | |
59 | int |
60 | rf_RegularONPFunc(RF_DagNode_t *node) |
61 | { |
62 | return (rf_RegularXorFunc(node)); |
63 | } |
64 | /* |
65 | same as simpleONQ func, but the coefficient is always 1 |
66 | */ |
67 | |
68 | int |
69 | rf_SimpleONPFunc(RF_DagNode_t *node) |
70 | { |
71 | return (rf_SimpleXorFunc(node)); |
72 | } |
73 | |
74 | int |
75 | rf_RecoveryPFunc(RF_DagNode_t *node) |
76 | { |
77 | return (rf_RecoveryXorFunc(node)); |
78 | } |
79 | |
80 | int |
81 | rf_RegularPFunc(RF_DagNode_t *node) |
82 | { |
83 | return (rf_RegularXorFunc(node)); |
84 | } |
85 | #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */ |
86 | #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) |
87 | |
88 | static void |
89 | QDelta(char *dest, char *obuf, char *nbuf, unsigned length, |
90 | unsigned char coeff); |
91 | static void |
92 | rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, |
93 | unsigned length, unsigned coeff); |
94 | |
95 | RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q" , rf_SimpleONQFunc, "Simple Old-New Q" }; |
96 | RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func" , rf_RecoveryQFunc, "Recovery Q Func" }; |
97 | RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func" , rf_RecoveryPQFunc, "Recovery PQ Func" }; |
98 | |
99 | void |
100 | rf_PQDagSelect( |
101 | RF_Raid_t * raidPtr, |
102 | RF_IoType_t type, |
103 | RF_AccessStripeMap_t * asmap, |
104 | RF_VoidFuncPtr * createFunc) |
105 | { |
106 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
107 | unsigned ndfail = asmap->numDataFailed; |
108 | unsigned npfail = asmap->numParityFailed; |
109 | unsigned ntfail = npfail + ndfail; |
110 | |
111 | RF_ASSERT(RF_IO_IS_R_OR_W(type)); |
112 | if (ntfail > 2) { |
113 | RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n" ); |
114 | *createFunc = NULL; |
115 | return; |
116 | } |
117 | /* ok, we can do this I/O */ |
118 | if (type == RF_IO_TYPE_READ) { |
119 | switch (ndfail) { |
120 | case 0: |
121 | /* fault free read */ |
122 | *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; /* same as raid 5 */ |
123 | break; |
124 | case 1: |
125 | /* lost a single data unit */ |
126 | /* two cases: (1) parity is not lost. do a normal raid |
127 | * 5 reconstruct read. (2) parity is lost. do a |
128 | * reconstruct read using "q". */ |
129 | if (ntfail == 2) { /* also lost redundancy */ |
130 | if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) |
131 | *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG; |
132 | else |
133 | *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG; |
134 | } else { |
135 | /* P and Q are ok. But is there a failure in |
136 | * some unaccessed data unit? */ |
137 | if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) |
138 | *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG; |
139 | else |
140 | *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG; |
141 | } |
142 | break; |
143 | case 2: |
144 | /* lost two data units */ |
145 | *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG; |
146 | break; |
147 | } |
148 | return; |
149 | } |
150 | /* a write */ |
151 | switch (ntfail) { |
152 | case 0: /* fault free */ |
153 | if (rf_suppressLocksAndLargeWrites || |
154 | (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) || |
155 | (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { |
156 | |
157 | *createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG; |
158 | } else { |
159 | *createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG; |
160 | } |
161 | break; |
162 | |
163 | case 1: /* single disk fault */ |
164 | if (npfail == 1) { |
165 | RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)); |
166 | if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) { /* q died, treat like |
167 | * normal mode raid5 |
168 | * write. */ |
169 | if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) |
170 | || rf_NumFailedDataUnitsInStripe(raidPtr, asmap)) |
171 | *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG; |
172 | else |
173 | *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG; |
174 | } else {/* parity died, small write only updating Q */ |
175 | if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) |
176 | || rf_NumFailedDataUnitsInStripe(raidPtr, asmap)) |
177 | *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG; |
178 | else |
179 | *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG; |
180 | } |
181 | } else { /* data missing. Do a P reconstruct write if |
182 | * only a single data unit is lost in the |
183 | * stripe, otherwise a PQ reconstruct write. */ |
184 | if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) |
185 | *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG; |
186 | else |
187 | *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG; |
188 | } |
189 | break; |
190 | |
191 | case 2: /* two disk faults */ |
192 | switch (npfail) { |
193 | case 2: /* both p and q dead */ |
194 | *createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG; |
195 | break; |
196 | case 1: /* either p or q and dead data */ |
197 | RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA); |
198 | RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)); |
199 | if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) |
200 | *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG; |
201 | else |
202 | *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG; |
203 | break; |
204 | case 0: /* double data loss */ |
205 | *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG; |
206 | break; |
207 | } |
208 | break; |
209 | |
210 | default: /* more than 2 disk faults */ |
211 | *createFunc = NULL; |
212 | RF_PANIC(); |
213 | } |
214 | return; |
215 | } |
216 | /* |
217 | Used as a stop gap info function |
218 | */ |
#if 0
/*
 * Compiled out.  Stores 1 in both *nSucc and *nAnte; raidPtr and asmap
 * are not used.
 */
static void
PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = *nAnte = 1;
}

/*
 * Compiled out.  Stores 1 in *nSucc and 2 in *nAnte; raidPtr and asmap
 * are not used.
 */
static void
PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
233 | |
234 | RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG) |
235 | { |
236 | rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, |
237 | rf_RegularPQFunc, RF_FALSE); |
238 | } |
239 | |
240 | int |
241 | rf_RegularONQFunc(RF_DagNode_t *node) |
242 | { |
243 | int np = node->numParams; |
244 | int d; |
245 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; |
246 | int i; |
247 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
248 | RF_Etimer_t timer; |
249 | char *qbuf, *qpbuf; |
250 | char *obuf, *nbuf; |
251 | RF_PhysDiskAddr_t *old, *new; |
252 | unsigned long coeff; |
253 | unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
254 | |
255 | RF_ETIMER_START(timer); |
256 | |
257 | d = (np - 3) / 4; |
258 | RF_ASSERT(4 * d + 3 == np); |
259 | qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */ |
260 | for (i = 0; i < d; i++) { |
261 | old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; |
262 | obuf = (char *) node->params[2 * i + 1].p; |
263 | new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p; |
264 | nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p; |
265 | RF_ASSERT(new->numSector == old->numSector); |
266 | RF_ASSERT(new->raidAddress == old->raidAddress); |
267 | /* the stripe unit within the stripe tells us the coefficient |
268 | * to use for the multiply. */ |
269 | coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress); |
270 | /* compute the data unit offset within the column, then add |
271 | * one */ |
272 | coeff = (coeff % raidPtr->Layout.numDataCol); |
273 | qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU); |
274 | QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); |
275 | } |
276 | |
277 | RF_ETIMER_STOP(timer); |
278 | RF_ETIMER_EVAL(timer); |
279 | tracerec->q_us += RF_ETIMER_VAL_US(timer); |
280 | rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no |
281 | * I/O in this node */ |
282 | return (0); |
283 | } |
284 | /* |
285 | See the SimpleXORFunc for the difference between a simple and regular func. |
286 | These Q functions should be used for |
287 | |
288 | new q = Q(data,old data,old q) |
289 | |
290 | style updates and not for |
291 | |
292 | q = ( new data, new data, .... ) |
293 | |
294 | computations. |
295 | |
296 | The simple q takes 2(2d+1)+1 params, where d is the number |
297 | of stripes written. The order of params is |
298 | old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d |
299 | [2d] old q pda_0, old q buffer |
   [2d+2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d
301 | raidPtr |
302 | */ |
303 | |
304 | int |
305 | rf_SimpleONQFunc(RF_DagNode_t *node) |
306 | { |
307 | int np = node->numParams; |
308 | int d; |
309 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; |
310 | int i; |
311 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
312 | RF_Etimer_t timer; |
313 | char *qbuf; |
314 | char *obuf, *nbuf; |
315 | RF_PhysDiskAddr_t *old, *new; |
316 | unsigned long coeff; |
317 | |
318 | RF_ETIMER_START(timer); |
319 | |
320 | d = (np - 3) / 4; |
321 | RF_ASSERT(4 * d + 3 == np); |
322 | qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */ |
323 | for (i = 0; i < d; i++) { |
324 | old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; |
325 | obuf = (char *) node->params[2 * i + 1].p; |
326 | new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p; |
327 | nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p; |
328 | RF_ASSERT(new->numSector == old->numSector); |
329 | RF_ASSERT(new->raidAddress == old->raidAddress); |
330 | /* the stripe unit within the stripe tells us the coefficient |
331 | * to use for the multiply. */ |
332 | coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress); |
333 | /* compute the data unit offset within the column, then add |
334 | * one */ |
335 | coeff = (coeff % raidPtr->Layout.numDataCol); |
336 | QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); |
337 | } |
338 | |
339 | RF_ETIMER_STOP(timer); |
340 | RF_ETIMER_EVAL(timer); |
341 | tracerec->q_us += RF_ETIMER_VAL_US(timer); |
342 | rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no |
343 | * I/O in this node */ |
344 | return (0); |
345 | } |
346 | RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG) |
347 | { |
348 | rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs); |
349 | } |
350 | |
351 | static void RegularQSubr(RF_DagNode_t *node, char *qbuf); |
352 | |
353 | static void |
354 | RegularQSubr(RF_DagNode_t *node, char *qbuf) |
355 | { |
356 | int np = node->numParams; |
357 | int d; |
358 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; |
359 | unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
360 | int i; |
361 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
362 | RF_Etimer_t timer; |
363 | char *obuf, *qpbuf; |
364 | RF_PhysDiskAddr_t *old; |
365 | unsigned long coeff; |
366 | |
367 | RF_ETIMER_START(timer); |
368 | |
369 | d = (np - 1) / 2; |
370 | RF_ASSERT(2 * d + 1 == np); |
371 | for (i = 0; i < d; i++) { |
372 | old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; |
373 | obuf = (char *) node->params[2 * i + 1].p; |
374 | coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); |
375 | /* compute the data unit offset within the column, then add |
376 | * one */ |
377 | coeff = (coeff % raidPtr->Layout.numDataCol); |
378 | /* the input buffers may not all be aligned with the start of |
379 | * the stripe. so shift by their sector offset within the |
380 | * stripe unit */ |
381 | qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU); |
382 | rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); |
383 | } |
384 | |
385 | RF_ETIMER_STOP(timer); |
386 | RF_ETIMER_EVAL(timer); |
387 | tracerec->q_us += RF_ETIMER_VAL_US(timer); |
388 | } |
389 | /* |
390 | used in degraded writes. |
391 | */ |
392 | |
393 | static void DegrQSubr(RF_DagNode_t *node); |
394 | |
395 | static void |
396 | DegrQSubr(RF_DagNode_t *node) |
397 | { |
398 | int np = node->numParams; |
399 | int d; |
400 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; |
401 | unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
402 | int i; |
403 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
404 | RF_Etimer_t timer; |
405 | char *qbuf = node->results[1]; |
406 | char *obuf, *qpbuf; |
407 | RF_PhysDiskAddr_t *old; |
408 | unsigned long coeff; |
409 | unsigned fail_start; |
410 | int j; |
411 | |
412 | old = (RF_PhysDiskAddr_t *) node->params[np - 2].p; |
413 | fail_start = old->startSector % secPerSU; |
414 | |
415 | RF_ETIMER_START(timer); |
416 | |
417 | d = (np - 2) / 2; |
418 | RF_ASSERT(2 * d + 2 == np); |
419 | for (i = 0; i < d; i++) { |
420 | old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; |
421 | obuf = (char *) node->params[2 * i + 1].p; |
422 | coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); |
423 | /* compute the data unit offset within the column, then add |
424 | * one */ |
425 | coeff = (coeff % raidPtr->Layout.numDataCol); |
426 | /* the input buffers may not all be aligned with the start of |
427 | * the stripe. so shift by their sector offset within the |
428 | * stripe unit */ |
429 | j = old->startSector % secPerSU; |
430 | RF_ASSERT(j >= fail_start); |
431 | qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start); |
432 | rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); |
433 | } |
434 | |
435 | RF_ETIMER_STOP(timer); |
436 | RF_ETIMER_EVAL(timer); |
437 | tracerec->q_us += RF_ETIMER_VAL_US(timer); |
438 | } |
439 | /* |
440 | Called by large write code to compute the new parity and the new q. |
441 | |
442 | structure of the params: |
443 | |
   pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
445 | raidPtr |
446 | |
447 | for a total of 2d+1 arguments. |
448 | The result buffers results[0], results[1] are the buffers for the p and q, |
449 | respectively. |
450 | |
451 | We compute Q first, then compute P. The P calculation may try to reuse |
452 | one of the input buffers for its output, so if we computed P first, we would |
453 | corrupt the input for the q calculation. |
454 | */ |
455 | |
456 | int |
457 | rf_RegularPQFunc(RF_DagNode_t *node) |
458 | { |
459 | RegularQSubr(node, node->results[1]); |
460 | return (rf_RegularXorFunc(node)); /* does the wakeup */ |
461 | } |
462 | |
463 | int |
464 | rf_RegularQFunc(RF_DagNode_t *node) |
465 | { |
466 | /* Almost ... adjust Qsubr args */ |
467 | RegularQSubr(node, node->results[0]); |
468 | rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no |
469 | * I/O in this node */ |
470 | return (0); |
471 | } |
472 | /* |
473 | Called by singly degraded write code to compute the new parity and the new q. |
474 | |
475 | structure of the params: |
476 | |
477 | pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d |
478 | failedPDA raidPtr |
479 | |
480 | for a total of 2d+2 arguments. |
481 | The result buffers results[0], results[1] are the buffers for the parity and q, |
482 | respectively. |
483 | |
484 | We compute Q first, then compute parity. The parity calculation may try to reuse |
485 | one of the input buffers for its output, so if we computed parity first, we would |
486 | corrupt the input for the q calculation. |
487 | |
488 | We treat this identically to the regularPQ case, ignoring the failedPDA extra argument. |
489 | */ |
490 | |
491 | void |
492 | rf_Degraded_100_PQFunc(RF_DagNode_t *node) |
493 | { |
494 | int np = node->numParams; |
495 | |
496 | RF_ASSERT(np >= 2); |
497 | DegrQSubr(node); |
498 | rf_RecoveryXorFunc(node); |
499 | } |
500 | |
501 | |
502 | /* |
503 | The two below are used when reading a stripe with a single lost data unit. |
504 | The parameters are |
505 | |
506 | pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr |
507 | |
508 | and results[0] contains the data buffer. Which is originally zero-filled. |
509 | |
510 | */ |
511 | |
512 | /* this Q func is used by the degraded-mode dag functions to recover lost data. |
513 | * the second-to-last parameter is the PDA for the failed portion of the access. |
514 | * the code here looks at this PDA and assumes that the xor target buffer is |
515 | * equal in size to the number of sectors in the failed PDA. It then uses |
516 | * the other PDAs in the parameter list to determine where within the target |
517 | * buffer the corresponding data should be xored. |
518 | * |
519 | * Recall the basic equation is |
520 | * |
521 | * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256 |
522 | * |
523 | * so to recover data_j we need |
524 | * |
525 | * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256 |
526 | * |
 * So the coefficient for each buffer is (255 - data_col), and the buffer for
 * data_j should be initialized by copying Q into it. Finally a table lookup
 * is needed to solve data_j /= J
530 | * |
531 | * |
532 | */ |
533 | int |
534 | rf_RecoveryQFunc(RF_DagNode_t *node) |
535 | { |
536 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; |
537 | RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; |
538 | RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; |
539 | int i; |
540 | RF_PhysDiskAddr_t *pda; |
541 | RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); |
542 | char *srcbuf, *destbuf; |
543 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
544 | RF_Etimer_t timer; |
545 | unsigned long coeff; |
546 | |
547 | RF_ETIMER_START(timer); |
548 | /* start by copying Q into the buffer */ |
549 | memcpy(node->results[0], node->params[node->numParams - 3].p, |
550 | rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); |
551 | for (i = 0; i < node->numParams - 4; i += 2) { |
552 | RF_ASSERT(node->params[i + 1].p != node->results[0]); |
553 | pda = (RF_PhysDiskAddr_t *) node->params[i].p; |
554 | srcbuf = (char *) node->params[i + 1].p; |
555 | suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); |
556 | destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); |
557 | coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress); |
558 | /* compute the data unit offset within the column */ |
559 | coeff = (coeff % raidPtr->Layout.numDataCol); |
560 | rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); |
561 | } |
562 | /* Do the nasty inversion now */ |
563 | coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol); |
564 | rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); |
565 | RF_ETIMER_STOP(timer); |
566 | RF_ETIMER_EVAL(timer); |
567 | tracerec->q_us += RF_ETIMER_VAL_US(timer); |
568 | rf_GenericWakeupFunc(node, 0); |
569 | return (0); |
570 | } |
571 | |
572 | int |
573 | rf_RecoveryPQFunc(RF_DagNode_t *node) |
574 | { |
575 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; |
576 | printf("raid%d: Recovery from PQ not implemented.\n" ,raidPtr->raidid); |
577 | return (1); |
578 | } |
579 | /* |
580 | Degraded write Q subroutine. |
581 | Used when P is dead. |
582 | Large-write style Q computation. |
583 | Parameters |
584 | |
585 | (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr. |
586 | |
587 | We ignore failedPDA. |
588 | |
589 | This is a "simple style" recovery func. |
590 | */ |
591 | |
592 | void |
593 | rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node) |
594 | { |
595 | int np = node->numParams; |
596 | int d; |
597 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; |
598 | unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; |
599 | int i; |
600 | RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; |
601 | RF_Etimer_t timer; |
602 | char *qbuf = node->results[0]; |
603 | char *obuf, *qpbuf; |
604 | RF_PhysDiskAddr_t *old; |
605 | unsigned long coeff; |
606 | int fail_start, j; |
607 | |
608 | old = (RF_PhysDiskAddr_t *) node->params[np - 2].p; |
609 | fail_start = old->startSector % secPerSU; |
610 | |
611 | RF_ETIMER_START(timer); |
612 | |
613 | d = (np - 2) / 2; |
614 | RF_ASSERT(2 * d + 2 == np); |
615 | |
616 | for (i = 0; i < d; i++) { |
617 | old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; |
618 | obuf = (char *) node->params[2 * i + 1].p; |
619 | coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); |
620 | /* compute the data unit offset within the column, then add |
621 | * one */ |
622 | coeff = (coeff % raidPtr->Layout.numDataCol); |
623 | j = old->startSector % secPerSU; |
624 | RF_ASSERT(j >= fail_start); |
625 | qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start); |
626 | rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); |
627 | } |
628 | |
629 | RF_ETIMER_STOP(timer); |
630 | RF_ETIMER_EVAL(timer); |
631 | tracerec->q_us += RF_ETIMER_VAL_US(timer); |
632 | rf_GenericWakeupFunc(node, 0); |
633 | } |
634 | |
635 | |
636 | |
637 | |
638 | /* Q computations */ |
639 | |
640 | /* |
   coeff - column;
642 | |
643 | compute dest ^= qfor[28-coeff][rn[coeff+1] a] |
644 | |
645 | on 5-bit basis; |
646 | length in bytes; |
647 | */ |
648 | |
649 | void |
650 | rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length, unsigned coeff) |
651 | { |
652 | unsigned long a, d, new; |
653 | unsigned long a1, a2; |
654 | unsigned int *q = &(rf_qfor[28 - coeff][0]); |
655 | unsigned r = rf_rn[coeff + 1]; |
656 | |
657 | #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f) |
658 | #define INSERT(a,i) (a << (5L*i)) |
659 | |
660 | length /= 8; |
661 | /* 13 5 bit quants in a 64 bit word */ |
662 | while (length) { |
663 | a = *buf++; |
664 | d = *dest; |
665 | a1 = EXTRACT(a, 0) ^ r; |
666 | a2 = EXTRACT(a, 1) ^ r; |
667 | new = INSERT(a2, 1) | a1; |
668 | a1 = EXTRACT(a, 2) ^ r; |
669 | a2 = EXTRACT(a, 3) ^ r; |
670 | a1 = q[a1]; |
671 | a2 = q[a2]; |
672 | new = new | INSERT(a1, 2) | INSERT(a2, 3); |
673 | a1 = EXTRACT(a, 4) ^ r; |
674 | a2 = EXTRACT(a, 5) ^ r; |
675 | a1 = q[a1]; |
676 | a2 = q[a2]; |
677 | new = new | INSERT(a1, 4) | INSERT(a2, 5); |
678 | a1 = EXTRACT(a, 5) ^ r; |
679 | a2 = EXTRACT(a, 6) ^ r; |
680 | a1 = q[a1]; |
681 | a2 = q[a2]; |
682 | new = new | INSERT(a1, 5) | INSERT(a2, 6); |
683 | #if RF_LONGSHIFT > 2 |
684 | a1 = EXTRACT(a, 7) ^ r; |
685 | a2 = EXTRACT(a, 8) ^ r; |
686 | a1 = q[a1]; |
687 | a2 = q[a2]; |
688 | new = new | INSERT(a1, 7) | INSERT(a2, 8); |
689 | a1 = EXTRACT(a, 9) ^ r; |
690 | a2 = EXTRACT(a, 10) ^ r; |
691 | a1 = q[a1]; |
692 | a2 = q[a2]; |
693 | new = new | INSERT(a1, 9) | INSERT(a2, 10); |
694 | a1 = EXTRACT(a, 11) ^ r; |
695 | a2 = EXTRACT(a, 12) ^ r; |
696 | a1 = q[a1]; |
697 | a2 = q[a2]; |
698 | new = new | INSERT(a1, 11) | INSERT(a2, 12); |
699 | #endif /* RF_LONGSHIFT > 2 */ |
700 | d ^= new; |
701 | *dest++ = d; |
702 | length--; |
703 | } |
704 | } |
705 | /* |
706 | compute |
707 | |
708 | dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ] |
709 | |
710 | on a five bit basis. |
711 | optimization: compute old ^ new on 64 bit basis. |
712 | |
713 | length in bytes. |
714 | */ |
715 | |
716 | static void |
717 | QDelta( |
718 | char *dest, |
719 | char *obuf, |
720 | char *nbuf, |
721 | unsigned length, |
722 | unsigned char coeff) |
723 | { |
724 | unsigned long a, d, new; |
725 | unsigned long a1, a2; |
726 | unsigned int *q = &(rf_qfor[28 - coeff][0]); |
727 | unsigned int r = rf_rn[coeff + 1]; |
728 | |
729 | r = a1 = a2 = new = d = a = 0; /* XXX for now... */ |
730 | q = NULL; /* XXX for now */ |
731 | |
732 | #ifdef _KERNEL |
733 | /* PQ in kernel currently not supported because the encoding/decoding |
734 | * table is not present */ |
735 | memset(dest, 0, length); |
736 | #else /* KERNEL */ |
737 | /* this code probably doesn't work and should be rewritten -wvcii */ |
738 | /* 13 5 bit quants in a 64 bit word */ |
739 | length /= 8; |
740 | while (length) { |
741 | a = *obuf++; /* XXX need to reorg to avoid cache conflicts */ |
742 | a ^= *nbuf++; |
743 | d = *dest; |
744 | a1 = EXTRACT(a, 0) ^ r; |
745 | a2 = EXTRACT(a, 1) ^ r; |
746 | a1 = q[a1]; |
747 | a2 = q[a2]; |
748 | new = INSERT(a2, 1) | a1; |
749 | a1 = EXTRACT(a, 2) ^ r; |
750 | a2 = EXTRACT(a, 3) ^ r; |
751 | a1 = q[a1]; |
752 | a2 = q[a2]; |
753 | new = new | INSERT(a1, 2) | INSERT(a2, 3); |
754 | a1 = EXTRACT(a, 4) ^ r; |
755 | a2 = EXTRACT(a, 5) ^ r; |
756 | a1 = q[a1]; |
757 | a2 = q[a2]; |
758 | new = new | INSERT(a1, 4) | INSERT(a2, 5); |
759 | a1 = EXTRACT(a, 5) ^ r; |
760 | a2 = EXTRACT(a, 6) ^ r; |
761 | a1 = q[a1]; |
762 | a2 = q[a2]; |
763 | new = new | INSERT(a1, 5) | INSERT(a2, 6); |
764 | #if RF_LONGSHIFT > 2 |
765 | a1 = EXTRACT(a, 7) ^ r; |
766 | a2 = EXTRACT(a, 8) ^ r; |
767 | a1 = q[a1]; |
768 | a2 = q[a2]; |
769 | new = new | INSERT(a1, 7) | INSERT(a2, 8); |
770 | a1 = EXTRACT(a, 9) ^ r; |
771 | a2 = EXTRACT(a, 10) ^ r; |
772 | a1 = q[a1]; |
773 | a2 = q[a2]; |
774 | new = new | INSERT(a1, 9) | INSERT(a2, 10); |
775 | a1 = EXTRACT(a, 11) ^ r; |
776 | a2 = EXTRACT(a, 12) ^ r; |
777 | a1 = q[a1]; |
778 | a2 = q[a2]; |
779 | new = new | INSERT(a1, 11) | INSERT(a2, 12); |
780 | #endif /* RF_LONGSHIFT > 2 */ |
781 | d ^= new; |
782 | *dest++ = d; |
783 | length--; |
784 | } |
785 | #endif /* _KERNEL */ |
786 | } |
787 | /* |
788 | recover columns a and b from the given p and q into |
789 | bufs abuf and bbuf. All bufs are word aligned. |
790 | Length is in bytes. |
791 | */ |
792 | |
793 | |
794 | /* |
795 | * XXX |
796 | * |
797 | * Everything about this seems wrong. |
798 | */ |
799 | void |
800 | rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf, unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b) |
801 | { |
802 | unsigned long p, q, a, a0, a1; |
803 | int col = (29 * coeff_a) + coeff_b; |
804 | unsigned char *q0 = &(rf_qinv[col][0]); |
805 | |
806 | length /= 8; |
807 | while (length) { |
808 | p = *pbuf++; |
809 | q = *qbuf++; |
810 | a0 = EXTRACT(p, 0); |
811 | a1 = EXTRACT(q, 0); |
812 | a = q0[a0 << 5 | a1]; |
813 | #define MF(i) \ |
814 | a0 = EXTRACT(p,i); \ |
815 | a1 = EXTRACT(q,i); \ |
816 | a = a | INSERT(q0[a0<<5 | a1],i) |
817 | |
818 | MF(1); |
819 | MF(2); |
820 | MF(3); |
821 | MF(4); |
822 | MF(5); |
823 | MF(6); |
824 | #if 0 |
825 | MF(7); |
826 | MF(8); |
827 | MF(9); |
828 | MF(10); |
829 | MF(11); |
830 | MF(12); |
831 | #endif /* 0 */ |
832 | *abuf++ = a; |
833 | *bbuf++ = a ^ p; |
834 | length--; |
835 | } |
836 | } |
837 | /* |
838 | Lost parity and a data column. Recover that data column. |
839 | Assume col coeff is lost. Let q the contents of Q after |
840 | all surviving data columns have been q-xored out of it. |
841 | Then we have the equation |
842 | |
843 | q[28-coeff][a_i ^ r_i+1] = q |
844 | |
845 | but q is cyclic with period 31. |
846 | So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] = |
847 | q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} . |
848 | |
849 | so a_i = r_{coeff+1} ^ q[3+coeff][q] |
850 | |
851 | The routine is passed q buffer and the buffer |
   the data is to be recovered into. They can be the same.
853 | */ |
854 | |
855 | |
856 | |
857 | static void |
858 | rf_InvertQ( |
859 | unsigned long *qbuf, |
860 | unsigned long *abuf, |
861 | unsigned length, |
862 | unsigned coeff) |
863 | { |
864 | unsigned long a, new; |
865 | unsigned long a1, a2; |
866 | unsigned int *q = &(rf_qfor[3 + coeff][0]); |
867 | unsigned r = rf_rn[coeff + 1]; |
868 | |
869 | /* 13 5 bit quants in a 64 bit word */ |
870 | length /= 8; |
871 | while (length) { |
872 | a = *qbuf++; |
873 | a1 = EXTRACT(a, 0); |
874 | a2 = EXTRACT(a, 1); |
875 | a1 = r ^ q[a1]; |
876 | a2 = r ^ q[a2]; |
877 | new = INSERT(a2, 1) | a1; |
878 | #define M(i,j) \ |
879 | a1 = EXTRACT(a,i); \ |
880 | a2 = EXTRACT(a,j); \ |
881 | a1 = r ^ q[a1]; \ |
882 | a2 = r ^ q[a2]; \ |
883 | new = new | INSERT(a1,i) | INSERT(a2,j) |
884 | |
885 | M(2, 3); |
886 | M(4, 5); |
887 | M(5, 6); |
888 | #if RF_LONGSHIFT > 2 |
889 | M(7, 8); |
890 | M(9, 10); |
891 | M(11, 12); |
892 | #endif /* RF_LONGSHIFT > 2 */ |
893 | *abuf++ = new; |
894 | length--; |
895 | } |
896 | } |
897 | #endif /* (RF_INCLUDE_DECL_PQ > 0) || |
898 | * (RF_INCLUDE_RAID6 > 0) */ |
899 | |