1/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.22 2014/03/23 09:30:59 christos Exp $ */
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: ChangMing Wu
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * Code for RAID-EVENODD architecture.
31 */
32
33#include <sys/cdefs.h>
34__KERNEL_RCSID(0, "$NetBSD: rf_evenodd_dagfuncs.c,v 1.22 2014/03/23 09:30:59 christos Exp $");
35
36#include "rf_archs.h"
37
38#ifdef _KERNEL_OPT
39#include "opt_raid_diagnostic.h"
40#endif
41
42#if RF_INCLUDE_EVENODD > 0
43
44#include <dev/raidframe/raidframevar.h>
45
46#include "rf_raid.h"
47#include "rf_dag.h"
48#include "rf_dagffrd.h"
49#include "rf_dagffwr.h"
50#include "rf_dagdegrd.h"
51#include "rf_dagdegwr.h"
52#include "rf_dagutils.h"
53#include "rf_dagfuncs.h"
54#include "rf_etimer.h"
55#include "rf_general.h"
56#include "rf_parityscan.h"
57#include "rf_evenodd.h"
58#include "rf_evenodd_dagfuncs.h"
59
60/* These redundant functions are for small write */
61RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"};
62RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"};
63/* These redundant functions are for degraded read */
64RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
65RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"};
66/**********************************************************************************************
67 * the following encoding node functions is used in EO_000_CreateLargeWriteDAG
68 **********************************************************************************************/
69int
70rf_RegularPEFunc(RF_DagNode_t *node)
71{
72 rf_RegularESubroutine(node, node->results[1]);
73 rf_RegularXorFunc(node);/* does the wakeup here! */
74#if 1
75 return (0); /* XXX This was missing... GO */
76#endif
77}
78
79
80/************************************************************************************************
81 * For EO_001_CreateSmallWriteDAG, there are (i)RegularONEFunc() and (ii)SimpleONEFunc() to
82 * be used. The previous case is when write access at least sectors of full stripe unit.
83 * The later function is used when the write access two stripe units but with total sectors
84 * less than sectors per SU. In this case, the access of parity and 'E' are shown as disconnected
85 * areas in their stripe unit and parity write and 'E' write are both devided into two distinct
86 * writes( totally four). This simple old-new write and regular old-new write happen as in RAID-5
87 ************************************************************************************************/
88
89/* Algorithm:
90 1. Store the difference of old data and new data in the Rod buffer.
91 2. then encode this buffer into the buffer which already have old 'E' information inside it,
92 the result can be shown to be the new 'E' information.
93 3. xor the Wnd buffer into the difference buffer to recover the original old data.
94 Here we have another alternative: to allocate a temporary buffer for storing the difference of
95 old data and new data, then encode temp buf into old 'E' buf to form new 'E', but this approach
96 take the same speed as the previous, and need more memory.
97*/
98int
99rf_RegularONEFunc(RF_DagNode_t *node)
100{
101 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
102 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
103 int EpdaIndex = (node->numParams - 1) / 2 - 1; /* the parameter of node
104 * where you can find
105 * e-pda */
106 int i, k;
107 int suoffset, length;
108 RF_RowCol_t scol;
109 char *srcbuf, *destbuf;
110 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
111 RF_Etimer_t timer;
112 RF_PhysDiskAddr_t *pda;
113#ifdef RAID_DIAGNOSTIC
114 RF_PhysDiskAddr_t *EPDA =
115 (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
116 int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);
117
118 RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
119 RF_ASSERT(ESUOffset == 0);
120#endif /* RAID_DIAGNOSTIC */
121
122 RF_ETIMER_START(timer);
123
124 /* Xor the Wnd buffer into Rod buffer, the difference of old data and
125 * new data is stored in Rod buffer */
126 for (k = 0; k < EpdaIndex; k += 2) {
127 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
128 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
129 }
130 /* Start to encoding the buffer storing the difference of old data and
131 * new data into 'E' buffer */
132 for (i = 0; i < EpdaIndex; i += 2)
133 if (node->params[i + 1].p != node->results[0]) { /* results[0] is buf ptr
134 * of E */
135 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
136 srcbuf = (char *) node->params[i + 1].p;
137 scol = rf_EUCol(layoutPtr, pda->raidAddress);
138 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
139 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset);
140 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
141 }
142 /* Recover the original old data to be used by parity encoding
143 * function in XorNode */
144 for (k = 0; k < EpdaIndex; k += 2) {
145 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
146 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
147 }
148 RF_ETIMER_STOP(timer);
149 RF_ETIMER_EVAL(timer);
150 tracerec->q_us += RF_ETIMER_VAL_US(timer);
151 rf_GenericWakeupFunc(node, 0);
152#if 1
153 return (0); /* XXX this was missing.. GO */
154#endif
155}
156
157int
158rf_SimpleONEFunc(RF_DagNode_t *node)
159{
160 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
161 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
162 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
163 int retcode = 0;
164 char *srcbuf, *destbuf;
165 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
166 int length;
167 RF_RowCol_t scol;
168 RF_Etimer_t timer;
169
170 RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
171 if (node->dagHdr->status == rf_enable) {
172 RF_ETIMER_START(timer);
173 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector); /* this is a pda of
174 * writeDataNodes */
175 /* bxor to buffer of readDataNodes */
176 retcode = rf_bxor(node->params[5].p, node->params[1].p, length);
177 /* find out the corresponding colume in encoding matrix for
178 * write colume to be encoded into redundant disk 'E' */
179 scol = rf_EUCol(layoutPtr, pda->raidAddress);
180 srcbuf = node->params[1].p;
181 destbuf = node->params[3].p;
182 /* Start encoding process */
183 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
184 rf_bxor(node->params[5].p, node->params[1].p, length);
185 RF_ETIMER_STOP(timer);
186 RF_ETIMER_EVAL(timer);
187 tracerec->q_us += RF_ETIMER_VAL_US(timer);
188
189 }
190 return (rf_GenericWakeupFunc(node, retcode)); /* call wake func
191 * explicitly since no
192 * I/O in this node */
193}
194
195
196/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write ********/
197void
198rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf)
199{
200 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
201 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
202 RF_PhysDiskAddr_t *pda;
203 int i, suoffset;
204 RF_RowCol_t scol;
205 char *srcbuf, *destbuf;
206 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
207 RF_Etimer_t timer;
208
209 RF_ETIMER_START(timer);
210 for (i = 0; i < node->numParams - 2; i += 2) {
211 RF_ASSERT(node->params[i + 1].p != ebuf);
212 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
213 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
214 scol = rf_EUCol(layoutPtr, pda->raidAddress);
215 srcbuf = (char *) node->params[i + 1].p;
216 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset);
217 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
218 }
219 RF_ETIMER_STOP(timer);
220 RF_ETIMER_EVAL(timer);
221 tracerec->xor_us += RF_ETIMER_VAL_US(timer);
222}
223
224
225/*******************************************************************************************
226 * Used in EO_001_CreateLargeWriteDAG
227 ******************************************************************************************/
228int
229rf_RegularEFunc(RF_DagNode_t *node)
230{
231 rf_RegularESubroutine(node, node->results[0]);
232 rf_GenericWakeupFunc(node, 0);
233#if 1
234 return (0); /* XXX this was missing?.. GO */
235#endif
236}
237/*******************************************************************************************
238 * This degraded function allow only two case:
239 * 1. when write access the full failed stripe unit, then the access can be more than
240 * one tripe units.
241 * 2. when write access only part of the failed SU, we assume accesses of more than
242 * one stripe unit is not allowed so that the write can be dealt with like a
243 * large write.
244 * The following function is based on these assumptions. So except in the second case,
245 * it looks the same as a large write encodeing function. But this is not exactly the
246 * normal way for doing a degraded write, since raidframe have to break cases of access
247 * other than the above two into smaller accesses. We may have to change
248 * DegrESubroutin in the future.
249 *******************************************************************************************/
250void
251rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf)
252{
253 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
254 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
255 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
256 RF_PhysDiskAddr_t *pda;
257 int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
258 RF_RowCol_t scol;
259 char *srcbuf, *destbuf;
260 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
261 RF_Etimer_t timer;
262
263 RF_ETIMER_START(timer);
264 for (i = 0; i < node->numParams - 2; i += 2) {
265 RF_ASSERT(node->params[i + 1].p != ebuf);
266 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
267 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
268 scol = rf_EUCol(layoutPtr, pda->raidAddress);
269 srcbuf = (char *) node->params[i + 1].p;
270 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
271 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
272 }
273
274 RF_ETIMER_STOP(timer);
275 RF_ETIMER_EVAL(timer);
276 tracerec->q_us += RF_ETIMER_VAL_US(timer);
277}
278
279
280/**************************************************************************************
281 * This function is used in case where one data disk failed and both redundant disks
282 * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk
283 * failed in the stripe but not accessed at this time, then we should, instead, use
284 * the rf_EOWriteDoubleRecoveryFunc().
285 **************************************************************************************/
286int
287rf_Degraded_100_EOFunc(RF_DagNode_t *node)
288{
289 rf_DegrESubroutine(node, node->results[1]);
290 rf_RecoveryXorFunc(node); /* does the wakeup here! */
291#if 1
292 return (0); /* XXX this was missing... SHould these be
293 * void functions??? GO */
294#endif
295}
296/**************************************************************************************
297 * This function is to encode one sector in one of the data disks to the E disk.
298 * However, in evenodd this function can also be used as decoding function to recover
299 * data from dead disk in the case of parity failure and a single data failure.
300 **************************************************************************************/
301void
302rf_e_EncOneSect(
303 RF_RowCol_t srcLogicCol,
304 char *srcSecbuf,
305 RF_RowCol_t destLogicCol,
306 char *destSecbuf,
307 int bytesPerSector)
308{
309 int S_index; /* index of the EU in the src col which need
310 * be Xored into all EUs in a dest sector */
311 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
312 RF_RowCol_t j, indexInDest, /* row index of an encoding unit in
313 * the destination colume of encoding
314 * matrix */
315 indexInSrc; /* row index of an encoding unit in the source
316 * colume used for recovery */
317 int bytesPerEU = bytesPerSector / numRowInEncMatix;
318
319#if RF_EO_MATRIX_DIM > 17
320 int shortsPerEU = bytesPerEU / sizeof(short);
321 short *destShortBuf, *srcShortBuf1, *srcShortBuf2;
322 short temp1;
323#elif RF_EO_MATRIX_DIM == 17
324 int longsPerEU = bytesPerEU / sizeof(long);
325 long *destLongBuf, *srcLongBuf1, *srcLongBuf2;
326 long temp1;
327#endif
328
329#if RF_EO_MATRIX_DIM > 17
330 RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
331 RF_ASSERT(bytesPerEU % sizeof(short) == 0);
332#elif RF_EO_MATRIX_DIM == 17
333 RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
334 RF_ASSERT(bytesPerEU % sizeof(long) == 0);
335#endif
336
337 S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
338#if RF_EO_MATRIX_DIM > 17
339 srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU);
340#elif RF_EO_MATRIX_DIM == 17
341 srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU);
342#endif
343
344 for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) {
345 indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
346
347#if RF_EO_MATRIX_DIM > 17
348 destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU);
349 srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU);
350 for (j = 0; j < shortsPerEU; j++) {
351 temp1 = destShortBuf[j] ^ srcShortBuf1[j];
352 /* note: S_index won't be at the end row for any src
353 * col! */
354 if (indexInSrc != RF_EO_MATRIX_DIM - 1)
355 destShortBuf[j] = (srcShortBuf2[j]) ^ temp1;
356 /* if indexInSrc is at the end row, ie.
357 * RF_EO_MATRIX_DIM -1, then all elements are zero! */
358 else
359 destShortBuf[j] = temp1;
360 }
361
362#elif RF_EO_MATRIX_DIM == 17
363 destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU);
364 srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU);
365 for (j = 0; j < longsPerEU; j++) {
366 temp1 = destLongBuf[j] ^ srcLongBuf1[j];
367 if (indexInSrc != RF_EO_MATRIX_DIM - 1)
368 destLongBuf[j] = (srcLongBuf2[j]) ^ temp1;
369 else
370 destLongBuf[j] = temp1;
371 }
372#endif
373 }
374}
375
376void
377rf_e_encToBuf(
378 RF_Raid_t * raidPtr,
379 RF_RowCol_t srcLogicCol,
380 char *srcbuf,
381 RF_RowCol_t destLogicCol,
382 char *destbuf,
383 int numSector)
384{
385 int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
386
387 for (i = 0; i < numSector; i++) {
388 rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector);
389 srcbuf += bytesPerSector;
390 destbuf += bytesPerSector;
391 }
392}
393/**************************************************************************************
394 * when parity die and one data die, We use second redundant information, 'E',
395 * to recover the data in dead disk. This function is used in the recovery node of
396 * for EO_110_CreateReadDAG
397 **************************************************************************************/
398int
399rf_RecoveryEFunc(RF_DagNode_t *node)
400{
401 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
402 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
403 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
404 RF_RowCol_t scol, /* source logical column */
405 fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress); /* logical column of
406 * failed SU */
407 int i;
408 RF_PhysDiskAddr_t *pda;
409 int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
410 char *srcbuf, *destbuf;
411 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
412 RF_Etimer_t timer;
413
414 memset((char *) node->results[0], 0,
415 rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
416 if (node->dagHdr->status == rf_enable) {
417 RF_ETIMER_START(timer);
418 for (i = 0; i < node->numParams - 2; i += 2)
419 if (node->params[i + 1].p != node->results[0]) {
420 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
421 if (i == node->numParams - 4)
422 scol = RF_EO_MATRIX_DIM - 2; /* the colume of
423 * redundant E */
424 else
425 scol = rf_EUCol(layoutPtr, pda->raidAddress);
426 srcbuf = (char *) node->params[i + 1].p;
427 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
428 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
429 rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
430 }
431 RF_ETIMER_STOP(timer);
432 RF_ETIMER_EVAL(timer);
433 tracerec->xor_us += RF_ETIMER_VAL_US(timer);
434 }
435 return (rf_GenericWakeupFunc(node, 0)); /* node execute successfully */
436}
437/**************************************************************************************
438 * This function is used in the case where one data and the parity have filed.
439 * (in EO_110_CreateWriteDAG )
440 **************************************************************************************/
441int
442rf_EO_DegradedWriteEFunc(RF_DagNode_t * node)
443{
444 rf_DegrESubroutine(node, node->results[0]);
445 rf_GenericWakeupFunc(node, 0);
446#if 1
447 return (0); /* XXX Yet another one!! GO */
448#endif
449}
450
451
452
453/**************************************************************************************
454 * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
455 **************************************************************************************/
456
457void
458rf_doubleEOdecode(
459 RF_Raid_t * raidPtr,
460 char **rrdbuf,
461 char **dest,
462 RF_RowCol_t * fcol,
463 char *pbuf,
464 char *ebuf)
465{
466 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
467 int i, j, k, f1, f2, row;
468 int rrdrow, erow, count = 0;
469 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
470 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
471#if 0
472 int pcol = (RF_EO_MATRIX_DIM) - 1;
473#endif
474 int ecol = (RF_EO_MATRIX_DIM) - 2;
475 int bytesPerEU = bytesPerSector / numRowInEncMatix;
476 int numDataCol = layoutPtr->numDataCol;
477#if RF_EO_MATRIX_DIM > 17
478 int shortsPerEU = bytesPerEU / sizeof(short);
479 short *rrdbuf_current, *pbuf_current, *ebuf_current;
480 short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
481 short *temp;
482 short *P;
483
484 RF_ASSERT(bytesPerEU % sizeof(short) == 0);
485 RF_Malloc(P, bytesPerEU, (short *));
486 RF_Malloc(temp, bytesPerEU, (short *));
487#elif RF_EO_MATRIX_DIM == 17
488 int longsPerEU = bytesPerEU / sizeof(long);
489 long *rrdbuf_current, *pbuf_current, *ebuf_current;
490 long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
491 long *temp;
492 long *P;
493
494 RF_ASSERT(bytesPerEU % sizeof(long) == 0);
495 RF_Malloc(P, bytesPerEU, (long *));
496 RF_Malloc(temp, bytesPerEU, (long *));
497#endif
498 RF_ASSERT(*((long *) dest[0]) == 0);
499 RF_ASSERT(*((long *) dest[1]) == 0);
500 memset((char *) P, 0, bytesPerEU);
501 memset((char *) temp, 0, bytesPerEU);
502 RF_ASSERT(*P == 0);
503 /* calculate the 'P' parameter, which, not parity, is the Xor of all
504 * elements in the last two column, ie. 'E' and 'parity' colume, see
505 * the Ref. paper by Blaum, et al 1993 */
506 for (i = 0; i < numRowInEncMatix; i++)
507 for (k = 0; k < longsPerEU; k++) {
508#if RF_EO_MATRIX_DIM > 17
509 ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
510 pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
511#elif RF_EO_MATRIX_DIM == 17
512 ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
513 pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
514#endif
515 P[k] ^= *ebuf_current;
516 P[k] ^= *pbuf_current;
517 }
518 RF_ASSERT(fcol[0] != fcol[1]);
519 if (fcol[0] < fcol[1]) {
520#if RF_EO_MATRIX_DIM > 17
521 dest_smaller = (short *) (dest[0]);
522 dest_larger = (short *) (dest[1]);
523#elif RF_EO_MATRIX_DIM == 17
524 dest_smaller = (long *) (dest[0]);
525 dest_larger = (long *) (dest[1]);
526#endif
527 f1 = fcol[0];
528 f2 = fcol[1];
529 } else {
530#if RF_EO_MATRIX_DIM > 17
531 dest_smaller = (short *) (dest[1]);
532 dest_larger = (short *) (dest[0]);
533#elif RF_EO_MATRIX_DIM == 17
534 dest_smaller = (long *) (dest[1]);
535 dest_larger = (long *) (dest[0]);
536#endif
537 f1 = fcol[1];
538 f2 = fcol[0];
539 }
540 row = (RF_EO_MATRIX_DIM) - 1;
541 while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) {
542#if RF_EO_MATRIX_DIM > 17
543 dest_larger_current = dest_larger + row * shortsPerEU;
544 dest_smaller_current = dest_smaller + row * shortsPerEU;
545#elif RF_EO_MATRIX_DIM == 17
546 dest_larger_current = dest_larger + row * longsPerEU;
547 dest_smaller_current = dest_smaller + row * longsPerEU;
548#endif
549 /** Do the diagonal recovery. Initially, temp[k] = (failed 1),
550 which is the failed data in the colume which has smaller col index. **/
551 /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
552 for (j = 0; j < numDataCol; j++) {
553 if (j == f1 || j == f2)
554 continue;
555 rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
556 if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
557#if RF_EO_MATRIX_DIM > 17
558 rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU;
559 for (k = 0; k < shortsPerEU; k++)
560 temp[k] ^= *(rrdbuf_current + k);
561#elif RF_EO_MATRIX_DIM == 17
562 rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU;
563 for (k = 0; k < longsPerEU; k++)
564 temp[k] ^= *(rrdbuf_current + k);
565#endif
566 }
567 }
568 /* step 2: ^E(erow,m-2), If erow is at the buttom row, don't
569 * Xor into it E(erow,m-2) = (principle diagonal) ^ (failed
570 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal
571 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle
572 * diagonal) ^ (failed 2) */
573
574 erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
575 if (erow != (RF_EO_MATRIX_DIM) - 1) {
576#if RF_EO_MATRIX_DIM > 17
577 ebuf_current = (short *) ebuf + shortsPerEU * erow;
578 for (k = 0; k < shortsPerEU; k++)
579 temp[k] ^= *(ebuf_current + k);
580#elif RF_EO_MATRIX_DIM == 17
581 ebuf_current = (long *) ebuf + longsPerEU * erow;
582 for (k = 0; k < longsPerEU; k++)
583 temp[k] ^= *(ebuf_current + k);
584#endif
585 }
586 /* step 3: ^P to obtain the failed data (failed 2). P can be
587 * proved to be actually (principle diagonal) After this
588 * step, temp[k] = (failed 2), the failed data to be recovered */
589#if RF_EO_MATRIX_DIM > 17
590 for (k = 0; k < shortsPerEU; k++)
591 temp[k] ^= P[k];
592 /* Put the data to the destination buffer */
593 for (k = 0; k < shortsPerEU; k++)
594 dest_larger_current[k] = temp[k];
595#elif RF_EO_MATRIX_DIM == 17
596 for (k = 0; k < longsPerEU; k++)
597 temp[k] ^= P[k];
598 /* Put the data to the destination buffer */
599 for (k = 0; k < longsPerEU; k++)
600 dest_larger_current[k] = temp[k];
601#endif
602
603 /** THE FOLLOWING DO THE HORIZONTAL XOR **/
604 /* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data
605 * columes */
606 for (j = 0; j < numDataCol; j++) {
607 if (j == f1 || j == f2)
608 continue;
609#if RF_EO_MATRIX_DIM > 17
610 rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU;
611 for (k = 0; k < shortsPerEU; k++)
612 temp[k] ^= *(rrdbuf_current + k);
613#elif RF_EO_MATRIX_DIM == 17
614 rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU;
615 for (k = 0; k < longsPerEU; k++)
616 temp[k] ^= *(rrdbuf_current + k);
617#endif
618 }
619 /* step 2: ^A(row,m-1) */
620 /* step 3: Put the data to the destination buffer */
621#if RF_EO_MATRIX_DIM > 17
622 pbuf_current = (short *) pbuf + shortsPerEU * row;
623 for (k = 0; k < shortsPerEU; k++)
624 temp[k] ^= *(pbuf_current + k);
625 for (k = 0; k < shortsPerEU; k++)
626 dest_smaller_current[k] = temp[k];
627#elif RF_EO_MATRIX_DIM == 17
628 pbuf_current = (long *) pbuf + longsPerEU * row;
629 for (k = 0; k < longsPerEU; k++)
630 temp[k] ^= *(pbuf_current + k);
631 for (k = 0; k < longsPerEU; k++)
632 dest_smaller_current[k] = temp[k];
633#endif
634 count++;
635 }
636 /* Check if all Encoding Unit in the data buffer have been decoded,
637 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
638 * this algorithm will covered all buffer */
639 RF_ASSERT(count == numRowInEncMatix);
640 RF_Free((char *) P, bytesPerEU);
641 RF_Free((char *) temp, bytesPerEU);
642}
643
644
645/***************************************************************************************
* This function is called by double degraded read
647* EO_200_CreateReadDAG
648*
649***************************************************************************************/
650int
651rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node)
652{
653 int ndataParam = 0;
654 int np = node->numParams;
655 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
656 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
657 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
658 int i, prm, sector, nresults = node->numResults;
659 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
660 unsigned sosAddr;
661 int mallc_one = 0, mallc_two = 0; /* flags to indicate if
662 * memory is allocated */
663 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
664 RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
665 npda;
666 RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
667 char **buf, *ebuf, *pbuf, *dest[2];
668 long *suoff = NULL, *suend = NULL, *prmToCol = NULL,
669 psuoff = 0, esuoff = 0;
670 RF_SectorNum_t startSector, endSector;
671 RF_Etimer_t timer;
672 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
673
674 RF_ETIMER_START(timer);
675
676 /* Find out the number of parameters which are pdas for data
677 * information */
678 for (i = 0; i <= np; i++)
679 if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) {
680 ndataParam = i;
681 break;
682 }
683 RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
684 if (ndataParam != 0) {
685 RF_Malloc(suoff, ndataParam * sizeof(long), (long *));
686 RF_Malloc(suend, ndataParam * sizeof(long), (long *));
687 RF_Malloc(prmToCol, ndataParam * sizeof(long), (long *));
688 }
689 if (asmap->failedPDAs[1] &&
690 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
691 RF_ASSERT(0); /* currently, no support for this situation */
692 ppda = node->params[np - 6].p;
693 ppda2 = node->params[np - 5].p;
694 RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
695 epda = node->params[np - 4].p;
696 epda2 = node->params[np - 3].p;
697 RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
698 } else {
699 ppda = node->params[np - 4].p;
700 epda = node->params[np - 3].p;
701 psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
702 esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
703 RF_ASSERT(psuoff == esuoff);
704 }
705 /*
706 the followings have three goals:
707 1. determine the startSector to begin decoding and endSector to end decoding.
708 2. determine the colume numbers of the two failed disks.
709 3. determine the offset and end offset of the access within each failed stripe unit.
710 */
711 if (nresults == 1) {
712 /* find the startSector to begin decoding */
713 pda = node->results[0];
714 memset(pda->bufPtr, 0, bytesPerSector * pda->numSector);
715 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
716 fsuend[0] = fsuoff[0] + pda->numSector;
717 fsuoff[1] = 0;
718 fsuend[1] = 0;
719 startSector = fsuoff[0];
720 endSector = fsuend[0];
721
722 /* find out the column of failed disk being accessed */
723 fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);
724
725 /* find out the other failed colume not accessed */
726 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
727 for (i = 0; i < numDataCol; i++) {
728 npda.raidAddress = sosAddr + (i * secPerSU);
729 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
730 /* skip over dead disks */
731 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
732 if (i != fcol[0])
733 break;
734 }
735 RF_ASSERT(i < numDataCol);
736 fcol[1] = i;
737 } else {
738 RF_ASSERT(nresults == 2);
739 pda0 = node->results[0];
740 memset(pda0->bufPtr, 0, bytesPerSector * pda0->numSector);
741 pda1 = node->results[1];
742 memset(pda1->bufPtr, 0, bytesPerSector * pda1->numSector);
743 /* determine the failed colume numbers of the two failed
744 * disks. */
745 fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
746 fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
747 /* determine the offset and end offset of the access within
748 * each failed stripe unit. */
749 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
750 fsuend[0] = fsuoff[0] + pda0->numSector;
751 fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
752 fsuend[1] = fsuoff[1] + pda1->numSector;
753 /* determine the startSector to begin decoding */
754 startSector = RF_MIN(pda0->startSector, pda1->startSector);
755 /* determine the endSector to end decoding */
756 endSector = RF_MAX(fsuend[0], fsuend[1]);
757 }
758 /*
759 assign the beginning sector and the end sector for each parameter
760 find out the corresponding colume # for each parameter
761 */
762 for (prm = 0; prm < ndataParam; prm++) {
763 pda = node->params[prm].p;
764 suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
765 suend[prm] = suoff[prm] + pda->numSector;
766 prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
767 }
768 /* 'sector' is the sector for the current decoding algorithm. For each
769 * sector in the failed SU, find out the corresponding parameters that
770 * cover the current sector and that are needed for decoding of this
771 * sector in failed SU. 2. Find out if sector is in the shadow of any
772 * accessed failed SU. If not, malloc a temporary space of a sector in
773 * size. */
774 for (sector = startSector; sector < endSector; sector++) {
775 if (nresults == 2)
776 if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1]))
777 continue;
778 for (prm = 0; prm < ndataParam; prm++)
779 if (suoff[prm] <= sector && sector < suend[prm])
780 buf[(prmToCol[prm])] = (char *)((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr +
781 rf_RaidAddressToByte(raidPtr, sector - suoff[prm]);
782 /* find out if sector is in the shadow of any accessed failed
783 * SU. If yes, assign dest[0], dest[1] to point at suitable
784 * position of the buffer corresponding to failed SUs. if no,
785 * malloc a temporary space of a sector in size for
786 * destination of decoding. */
787 RF_ASSERT(nresults == 1 || nresults == 2);
788 if (nresults == 1) {
789 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
790 /* Always malloc temp buffer to dest[1] */
791 RF_Malloc(dest[1], bytesPerSector, (char *));
792 memset(dest[1], 0, bytesPerSector);
793 mallc_two = 1;
794 } else {
795 if (fsuoff[0] <= sector && sector < fsuend[0])
796 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
797 else {
798 RF_Malloc(dest[0], bytesPerSector, (char *));
799 memset(dest[0], 0, bytesPerSector);
800 mallc_one = 1;
801 }
802 if (fsuoff[1] <= sector && sector < fsuend[1])
803 dest[1] = (char *)((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]);
804 else {
805 RF_Malloc(dest[1], bytesPerSector, (char *));
806 memset(dest[1], 0, bytesPerSector);
807 mallc_two = 1;
808 }
809 RF_ASSERT(mallc_one == 0 || mallc_two == 0);
810 }
811 pbuf = (char *)ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff);
812 ebuf = (char *)epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff);
813 /*
814 * After finish finding all needed sectors, call doubleEOdecode function for decoding
815 * one sector to destination.
816 */
817 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
818 /* free all allocated memory, and mark flag to indicate no
819 * memory is being allocated */
820 if (mallc_one == 1)
821 RF_Free(dest[0], bytesPerSector);
822 if (mallc_two == 1)
823 RF_Free(dest[1], bytesPerSector);
824 mallc_one = mallc_two = 0;
825 }
826 RF_Free(buf, numDataCol * sizeof(char *));
827 if (ndataParam != 0) {
828 RF_Free(suoff, ndataParam * sizeof(long));
829 RF_Free(suend, ndataParam * sizeof(long));
830 RF_Free(prmToCol, ndataParam * sizeof(long));
831 }
832 RF_ETIMER_STOP(timer);
833 RF_ETIMER_EVAL(timer);
834 if (tracerec) {
835 tracerec->q_us += RF_ETIMER_VAL_US(timer);
836 }
837 rf_GenericWakeupFunc(node, 0);
838#if 1
839 return (0); /* XXX is this even close!!?!?!!? GO */
840#endif
841}
842
843
844/* currently, only access of one of the two failed SU is allowed in this function.
845 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into
846 * many accesses of single stripe unit.
847 */
848
849int
850rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node)
851{
852 int np = node->numParams;
853 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
854 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
855 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
856 RF_SectorNum_t sector;
857 RF_RowCol_t col, scol;
858 int prm, i, j;
859 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
860 unsigned sosAddr;
861 unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
862 RF_int64 numbytes;
863 RF_SectorNum_t startSector, endSector;
864 RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
865 RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
866 char **buf; /* buf[0], buf[1], buf[2], ...etc. point to
867 * buffer storing data read from col0, col1,
868 * col2 */
869 char *ebuf, *pbuf, *dest[2], *olddata[2];
870 RF_Etimer_t timer;
871 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
872
873 RF_ASSERT(asmap->numDataFailed == 1); /* currently only support this
874 * case, the other failed SU
875 * is not being accessed */
876 RF_ETIMER_START(timer);
877 RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
878
879 ppda = node->results[0];/* Instead of being buffers, node->results[0]
880 * and [1] are Ppda and Epda */
881 epda = node->results[1];
882 fpda = asmap->failedPDAs[0];
883
884 /* First, recovery the failed old SU using EvenOdd double decoding */
885 /* determine the startSector and endSector for decoding */
886 startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
887 endSector = startSector + fpda->numSector;
888 /* Assign buf[col] pointers to point to each non-failed colume and
889 * initialize the pbuf and ebuf to point at the beginning of each
890 * source buffers and destination buffers */
891 for (prm = 0; prm < numDataCol - 2; prm++) {
892 pda = (RF_PhysDiskAddr_t *) node->params[prm].p;
893 col = rf_EUCol(layoutPtr, pda->raidAddress);
894 buf[col] = pda->bufPtr;
895 }
896 /* pbuf and ebuf: they will change values as double recovery decoding
897 * goes on */
898 pbuf = ppda->bufPtr;
899 ebuf = epda->bufPtr;
900 /* find out the logical colume numbers in the encoding matrix of the
901 * two failed columes */
902 fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);
903
904 /* find out the other failed colume not accessed this time */
905 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
906 for (i = 0; i < numDataCol; i++) {
907 npda.raidAddress = sosAddr + (i * secPerSU);
908 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
909 /* skip over dead disks */
910 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
911 if (i != fcol[0])
912 break;
913 }
914 RF_ASSERT(i < numDataCol);
915 fcol[1] = i;
916 /* assign temporary space to put recovered failed SU */
917 numbytes = fpda->numSector * bytesPerSector;
918 RF_Malloc(olddata[0], numbytes, (char *));
919 RF_Malloc(olddata[1], numbytes, (char *));
920 dest[0] = olddata[0];
921 dest[1] = olddata[1];
922 memset(olddata[0], 0, numbytes);
923 memset(olddata[1], 0, numbytes);
924 /* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j]
925 * have already pointed at the beginning of each source buffers and
926 * destination buffers */
927 for (sector = startSector, i = 0; sector < endSector; sector++, i++) {
928 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
929 for (j = 0; j < numDataCol; j++)
930 if ((j != fcol[0]) && (j != fcol[1]))
931 buf[j] += bytesPerSector;
932 dest[0] += bytesPerSector;
933 dest[1] += bytesPerSector;
934 ebuf += bytesPerSector;
935 pbuf += bytesPerSector;
936 }
937 /* after recovery, the buffer pointed by olddata[0] is the old failed
938 * data. With new writing data and this old data, use small write to
939 * calculate the new redundant informations */
940 /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
941 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
942 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
943 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
944 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
945 * wudNodes; For current implementation, we assume the simplest case:
946 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
947 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
948 * data to be writen to the failed disk. We first bxor the new data
949 * into the old recovered data, then do the same things as small
950 * write. */
951
952 rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes);
953 /* do new 'E' calculation */
954 /* find out the corresponding colume in encoding matrix for write
955 * colume to be encoded into redundant disk 'E' */
956 scol = rf_EUCol(layoutPtr, fpda->raidAddress);
957 /* olddata[0] now is source buffer pointer; epda->bufPtr is the dest
958 * buffer pointer */
959 rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
960
961 /* do new 'P' calculation */
962 rf_bxor(olddata[0], ppda->bufPtr, numbytes);
963 /* Free the allocated buffer */
964 RF_Free(olddata[0], numbytes);
965 RF_Free(olddata[1], numbytes);
966 RF_Free(buf, numDataCol * sizeof(char *));
967
968 RF_ETIMER_STOP(timer);
969 RF_ETIMER_EVAL(timer);
970 if (tracerec) {
971 tracerec->q_us += RF_ETIMER_VAL_US(timer);
972 }
973 rf_GenericWakeupFunc(node, 0);
974 return (0);
975}
976#endif /* RF_INCLUDE_EVENODD > 0 */
977