1 | /* $NetBSD: rf_dagutils.c,v 1.54 2016/01/07 21:57:00 joerg Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Authors: Mark Holland, William V. Courtright II, Jim Zelenka |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /****************************************************************************** |
30 | * |
31 | * rf_dagutils.c -- utility routines for manipulating dags |
32 | * |
33 | *****************************************************************************/ |
34 | |
35 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: rf_dagutils.c,v 1.54 2016/01/07 21:57:00 joerg Exp $");
37 | |
38 | #include <dev/raidframe/raidframevar.h> |
39 | |
40 | #include "rf_archs.h" |
41 | #include "rf_threadstuff.h" |
42 | #include "rf_raid.h" |
43 | #include "rf_dag.h" |
44 | #include "rf_dagutils.h" |
45 | #include "rf_dagfuncs.h" |
46 | #include "rf_general.h" |
47 | #include "rf_map.h" |
48 | #include "rf_shutdown.h" |
49 | |
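/* absolute difference of two sector numbers; used by the mirror
   disk-selection code below as a crude estimate of seek distance */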
50 | #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_))) |
51 | |
const RF_RedFuncs_t rf_xorFuncs = {
	rf_RegularXorFunc, "Reg Xr",
	rf_SimpleXorFunc, "Simple Xr" };

const RF_RedFuncs_t rf_xorRecoveryFuncs = {
	rf_RecoveryXorFunc, "Recovery Xr",
	rf_RecoveryXorFunc, "Recovery Xr" };
59 | |
60 | #if RF_DEBUG_VALIDATE_DAG |
61 | static void rf_RecurPrintDAG(RF_DagNode_t *, int, int); |
62 | static void rf_PrintDAG(RF_DagHeader_t *); |
63 | static int rf_ValidateBranch(RF_DagNode_t *, int *, int *, |
64 | RF_DagNode_t **, int); |
65 | static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int); |
66 | static void rf_ValidateVisitedBits(RF_DagHeader_t *); |
67 | #endif /* RF_DEBUG_VALIDATE_DAG */ |
68 | |
69 | /* The maximum number of nodes in a DAG is bounded by |
70 | |
71 | (2 * raidPtr->Layout->numDataCol) + (1 * layoutPtr->numParityCol) + |
72 | (1 * 2 * layoutPtr->numParityCol) + 3 |
73 | |
74 | which is: 2*RF_MAXCOL+1*2+1*2*2+3 |
75 | |
For RF_MAXCOL of 40, this works out to 89. We use this value to provide an
estimate of the maximum size needed for RF_DAGPCACHE_SIZE. For RF_MAXCOL of 40,
this structure would be 534 bytes. Too much to have on-hand in an RF_DagNode_t,
but should be ok to have a few kicking around.
80 | */ |
#define RF_DAGPCACHE_SIZE ((2*RF_MAXCOL+1*2+1*2*2+3) * (RF_MAX(sizeof(RF_DagParam_t), sizeof(RF_DagNode_t *))))
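/*
 * e.g., for RF_MAXCOL == 40 the entry count above works out to
 * 2*40 + 1*2 + 1*2*2 + 3 = 80 + 2 + 4 + 3 = 89.
 */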
82 | |
83 | |
84 | /****************************************************************************** |
85 | * |
86 | * InitNode - initialize a dag node |
87 | * |
88 | * the size of the propList array is always the same as that of the |
89 | * successors array. |
90 | * |
91 | *****************************************************************************/ |
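/*
 * An illustrative call, after the style of the DAG creation routines
 * (the function pointers, counts, and node name vary by node type;
 * this particular combination is a sketch, not a real call site):
 *
 *	rf_InitNode(node, rf_wait, RF_FALSE,
 *	    rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc,
 *	    1, 1, 4, 0, dag_h, "Rod", allocList);
 *
 * i.e. a disk-read node with one succedent, one antecedent, four
 * parameters, and no results.
 */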
92 | void |
93 | rf_InitNode(RF_DagNode_t *node, RF_NodeStatus_t initstatus, int commit, |
94 | int (*doFunc) (RF_DagNode_t *node), |
95 | int (*undoFunc) (RF_DagNode_t *node), |
96 | int (*wakeFunc) (RF_DagNode_t *node, int status), |
97 | int nSucc, int nAnte, int nParam, int nResult, |
98 | RF_DagHeader_t *hdr, const char *name, RF_AllocListElem_t *alist) |
99 | { |
100 | void **ptrs; |
101 | int nptrs; |
102 | |
103 | if (nAnte > RF_MAX_ANTECEDENTS) |
104 | RF_PANIC(); |
105 | node->status = initstatus; |
106 | node->commitNode = commit; |
107 | node->doFunc = doFunc; |
108 | node->undoFunc = undoFunc; |
109 | node->wakeFunc = wakeFunc; |
110 | node->numParams = nParam; |
111 | node->numResults = nResult; |
112 | node->numAntecedents = nAnte; |
113 | node->numAntDone = 0; |
114 | node->next = NULL; |
115 | /* node->list_next = NULL */ /* Don't touch this here! |
116 | It may already be |
117 | in use by the caller! */ |
118 | node->numSuccedents = nSucc; |
119 | node->name = name; |
120 | node->dagHdr = hdr; |
121 | node->big_dag_ptrs = NULL; |
122 | node->big_dag_params = NULL; |
123 | node->visited = 0; |
124 | |
125 | /* allocate all the pointers with one call to malloc */ |
	nptrs = nSucc + nAnte + nResult + nSucc; /* 2nd nSucc is the propList */
127 | |
128 | if (nptrs <= RF_DAG_PTRCACHESIZE) { |
129 | /* |
130 | * The dag_ptrs field of the node is basically some scribble |
131 | * space to be used here. We could get rid of it, and always |
132 | * allocate the range of pointers, but that's expensive. So, |
133 | * we pick a "common case" size for the pointer cache. Hopefully, |
134 | * we'll find that: |
	 * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
	 *     more than a little bit (least efficient case)
	 * (2) Generally, nptrs isn't a lot less than RF_DAG_PTRCACHESIZE
	 *     (wasted memory)
139 | */ |
140 | ptrs = (void **) node->dag_ptrs; |
141 | } else if (nptrs <= (RF_DAGPCACHE_SIZE / sizeof(RF_DagNode_t *))) { |
142 | node->big_dag_ptrs = rf_AllocDAGPCache(); |
143 | ptrs = (void **) node->big_dag_ptrs; |
144 | } else { |
145 | RF_MallocAndAdd(ptrs, nptrs * sizeof(void *), |
146 | (void **), alist); |
147 | } |
148 | node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL; |
149 | node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs + nSucc) : NULL; |
150 | node->results = (nResult) ? (void **) (ptrs + nSucc + nAnte) : NULL; |
151 | node->propList = (nSucc) ? (RF_PropHeader_t **) (ptrs + nSucc + nAnte + nResult) : NULL; |
152 | |
153 | if (nParam) { |
154 | if (nParam <= RF_DAG_PARAMCACHESIZE) { |
155 | node->params = (RF_DagParam_t *) node->dag_params; |
156 | } else if (nParam <= (RF_DAGPCACHE_SIZE / sizeof(RF_DagParam_t))) { |
157 | node->big_dag_params = rf_AllocDAGPCache(); |
158 | node->params = node->big_dag_params; |
159 | } else { |
160 | RF_MallocAndAdd(node->params, |
161 | nParam * sizeof(RF_DagParam_t), |
162 | (RF_DagParam_t *), alist); |
163 | } |
164 | } else { |
165 | node->params = NULL; |
166 | } |
167 | } |
168 | |
169 | |
170 | |
171 | /****************************************************************************** |
172 | * |
173 | * allocation and deallocation routines |
174 | * |
175 | *****************************************************************************/ |
176 | |
177 | void |
178 | rf_FreeDAG(RF_DagHeader_t *dag_h) |
179 | { |
180 | RF_AccessStripeMapHeader_t *asmap, *t_asmap; |
181 | RF_PhysDiskAddr_t *pda; |
182 | RF_DagNode_t *tmpnode; |
183 | RF_DagHeader_t *nextDag; |
184 | |
185 | while (dag_h) { |
186 | nextDag = dag_h->next; |
187 | rf_FreeAllocList(dag_h->allocList); |
188 | for (asmap = dag_h->asmList; asmap;) { |
189 | t_asmap = asmap; |
190 | asmap = asmap->next; |
191 | rf_FreeAccessStripeMap(t_asmap); |
192 | } |
193 | while (dag_h->pda_cleanup_list) { |
194 | pda = dag_h->pda_cleanup_list; |
195 | dag_h->pda_cleanup_list = dag_h->pda_cleanup_list->next; |
196 | rf_FreePhysDiskAddr(pda); |
197 | } |
198 | while (dag_h->nodes) { |
199 | tmpnode = dag_h->nodes; |
200 | dag_h->nodes = dag_h->nodes->list_next; |
201 | rf_FreeDAGNode(tmpnode); |
202 | } |
203 | rf_FreeDAGHeader(dag_h); |
204 | dag_h = nextDag; |
205 | } |
206 | } |
207 | |
208 | #define RF_MAX_FREE_DAGH 128 |
209 | #define RF_MIN_FREE_DAGH 32 |
210 | |
211 | #define RF_MAX_FREE_DAGNODE 512 /* XXX Tune this... */ |
212 | #define RF_MIN_FREE_DAGNODE 128 /* XXX Tune this... */ |
213 | |
214 | #define RF_MAX_FREE_DAGLIST 128 |
215 | #define RF_MIN_FREE_DAGLIST 32 |
216 | |
217 | #define RF_MAX_FREE_DAGPCACHE 128 |
218 | #define RF_MIN_FREE_DAGPCACHE 8 |
219 | |
220 | #define RF_MAX_FREE_FUNCLIST 128 |
221 | #define RF_MIN_FREE_FUNCLIST 32 |
222 | |
223 | #define RF_MAX_FREE_BUFFERS 128 |
224 | #define RF_MIN_FREE_BUFFERS 32 |
225 | |
226 | static void rf_ShutdownDAGs(void *); |
227 | static void |
228 | rf_ShutdownDAGs(void *ignored) |
229 | { |
230 | pool_destroy(&rf_pools.dagh); |
231 | pool_destroy(&rf_pools.dagnode); |
232 | pool_destroy(&rf_pools.daglist); |
233 | pool_destroy(&rf_pools.dagpcache); |
234 | pool_destroy(&rf_pools.funclist); |
235 | } |
236 | |
237 | int |
238 | rf_ConfigureDAGs(RF_ShutdownList_t **listp) |
239 | { |
240 | |
	rf_pool_init(&rf_pools.dagnode, sizeof(RF_DagNode_t),
	    "rf_dagnode_pl", RF_MIN_FREE_DAGNODE, RF_MAX_FREE_DAGNODE);
	rf_pool_init(&rf_pools.dagh, sizeof(RF_DagHeader_t),
	    "rf_dagh_pl", RF_MIN_FREE_DAGH, RF_MAX_FREE_DAGH);
	rf_pool_init(&rf_pools.daglist, sizeof(RF_DagList_t),
	    "rf_daglist_pl", RF_MIN_FREE_DAGLIST, RF_MAX_FREE_DAGLIST);
	rf_pool_init(&rf_pools.dagpcache, RF_DAGPCACHE_SIZE,
	    "rf_dagpcache_pl", RF_MIN_FREE_DAGPCACHE, RF_MAX_FREE_DAGPCACHE);
	rf_pool_init(&rf_pools.funclist, sizeof(RF_FuncList_t),
	    "rf_funclist_pl", RF_MIN_FREE_FUNCLIST, RF_MAX_FREE_FUNCLIST);
251 | rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL); |
252 | |
253 | return (0); |
254 | } |
255 | |
RF_DagHeader_t *
rf_AllocDAGHeader(void)
258 | { |
259 | RF_DagHeader_t *dh; |
260 | |
261 | dh = pool_get(&rf_pools.dagh, PR_WAITOK); |
262 | memset((char *) dh, 0, sizeof(RF_DagHeader_t)); |
263 | return (dh); |
264 | } |
265 | |
void
rf_FreeDAGHeader(RF_DagHeader_t * dh)
268 | { |
269 | pool_put(&rf_pools.dagh, dh); |
270 | } |
271 | |
272 | RF_DagNode_t * |
273 | rf_AllocDAGNode(void) |
274 | { |
275 | RF_DagNode_t *node; |
276 | |
277 | node = pool_get(&rf_pools.dagnode, PR_WAITOK); |
278 | memset(node, 0, sizeof(RF_DagNode_t)); |
279 | return (node); |
280 | } |
281 | |
282 | void |
283 | rf_FreeDAGNode(RF_DagNode_t *node) |
284 | { |
285 | if (node->big_dag_ptrs) { |
286 | rf_FreeDAGPCache(node->big_dag_ptrs); |
287 | } |
288 | if (node->big_dag_params) { |
289 | rf_FreeDAGPCache(node->big_dag_params); |
290 | } |
291 | pool_put(&rf_pools.dagnode, node); |
292 | } |
293 | |
294 | RF_DagList_t * |
295 | rf_AllocDAGList(void) |
296 | { |
297 | RF_DagList_t *dagList; |
298 | |
299 | dagList = pool_get(&rf_pools.daglist, PR_WAITOK); |
300 | memset(dagList, 0, sizeof(RF_DagList_t)); |
301 | |
302 | return (dagList); |
303 | } |
304 | |
305 | void |
306 | rf_FreeDAGList(RF_DagList_t *dagList) |
307 | { |
308 | pool_put(&rf_pools.daglist, dagList); |
309 | } |
310 | |
311 | void * |
312 | rf_AllocDAGPCache(void) |
313 | { |
314 | void *p; |
315 | p = pool_get(&rf_pools.dagpcache, PR_WAITOK); |
316 | memset(p, 0, RF_DAGPCACHE_SIZE); |
317 | |
318 | return (p); |
319 | } |
320 | |
321 | void |
322 | rf_FreeDAGPCache(void *p) |
323 | { |
324 | pool_put(&rf_pools.dagpcache, p); |
325 | } |
326 | |
327 | RF_FuncList_t * |
328 | rf_AllocFuncList(void) |
329 | { |
330 | RF_FuncList_t *funcList; |
331 | |
332 | funcList = pool_get(&rf_pools.funclist, PR_WAITOK); |
333 | memset(funcList, 0, sizeof(RF_FuncList_t)); |
334 | |
335 | return (funcList); |
336 | } |
337 | |
338 | void |
339 | rf_FreeFuncList(RF_FuncList_t *funcList) |
340 | { |
341 | pool_put(&rf_pools.funclist, funcList); |
342 | } |
343 | |
344 | /* allocates a stripe buffer -- a buffer large enough to hold all the data |
345 | in an entire stripe. |
346 | */ |
347 | |
348 | void * |
349 | rf_AllocStripeBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h, |
350 | int size) |
351 | { |
352 | RF_VoidPointerListElem_t *vple; |
353 | void *p; |
354 | |
355 | RF_ASSERT((size <= (raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit << |
356 | raidPtr->logBytesPerSector)))); |
357 | |
	p = malloc(raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit <<
	    raidPtr->logBytesPerSector), M_RAIDFRAME, M_NOWAIT);
361 | if (!p) { |
362 | rf_lock_mutex2(raidPtr->mutex); |
363 | if (raidPtr->stripebuf_count > 0) { |
364 | vple = raidPtr->stripebuf; |
365 | raidPtr->stripebuf = vple->next; |
366 | p = vple->p; |
367 | rf_FreeVPListElem(vple); |
368 | raidPtr->stripebuf_count--; |
369 | } else { |
370 | #ifdef DIAGNOSTIC |
371 | printf("raid%d: Help! Out of emergency full-stripe buffers!\n" , raidPtr->raidid); |
372 | #endif |
373 | } |
374 | rf_unlock_mutex2(raidPtr->mutex); |
		if (!p) {
			/* We didn't get a buffer... not much we can do
			   other than wait, and hope that someone frees
			   up memory for us. */
			p = malloc(raidPtr->numCol *
			    (raidPtr->Layout.sectorsPerStripeUnit <<
			    raidPtr->logBytesPerSector), M_RAIDFRAME, M_WAITOK);
		}
381 | } |
382 | memset(p, 0, raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit << raidPtr->logBytesPerSector)); |
383 | |
384 | vple = rf_AllocVPListElem(); |
385 | vple->p = p; |
386 | vple->next = dag_h->desc->stripebufs; |
387 | dag_h->desc->stripebufs = vple; |
388 | |
389 | return (p); |
390 | } |
391 | |
392 | |
393 | void |
394 | rf_FreeStripeBuffer(RF_Raid_t *raidPtr, RF_VoidPointerListElem_t *vple) |
395 | { |
396 | rf_lock_mutex2(raidPtr->mutex); |
397 | if (raidPtr->stripebuf_count < raidPtr->numEmergencyStripeBuffers) { |
398 | /* just tack it in */ |
399 | vple->next = raidPtr->stripebuf; |
400 | raidPtr->stripebuf = vple; |
401 | raidPtr->stripebuf_count++; |
402 | } else { |
403 | free(vple->p, M_RAIDFRAME); |
404 | rf_FreeVPListElem(vple); |
405 | } |
406 | rf_unlock_mutex2(raidPtr->mutex); |
407 | } |
408 | |
/* allocates a buffer big enough to hold the data described by the
   caller (i.e. the data of the associated PDA).  Glue this buffer
   into our dag_h cleanup structure. */
412 | |
413 | void * |
414 | rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h, int size) |
415 | { |
416 | RF_VoidPointerListElem_t *vple; |
417 | void *p; |
418 | |
419 | p = rf_AllocIOBuffer(raidPtr, size); |
420 | vple = rf_AllocVPListElem(); |
421 | vple->p = p; |
422 | vple->next = dag_h->desc->iobufs; |
423 | dag_h->desc->iobufs = vple; |
424 | |
425 | return (p); |
426 | } |
427 | |
428 | void * |
429 | rf_AllocIOBuffer(RF_Raid_t *raidPtr, int size) |
430 | { |
431 | RF_VoidPointerListElem_t *vple; |
432 | void *p; |
433 | |
434 | RF_ASSERT((size <= (raidPtr->Layout.sectorsPerStripeUnit << |
435 | raidPtr->logBytesPerSector))); |
436 | |
	p = malloc(raidPtr->Layout.sectorsPerStripeUnit <<
	    raidPtr->logBytesPerSector, M_RAIDFRAME, M_NOWAIT);
440 | if (!p) { |
441 | rf_lock_mutex2(raidPtr->mutex); |
442 | if (raidPtr->iobuf_count > 0) { |
443 | vple = raidPtr->iobuf; |
444 | raidPtr->iobuf = vple->next; |
445 | p = vple->p; |
446 | rf_FreeVPListElem(vple); |
447 | raidPtr->iobuf_count--; |
448 | } else { |
449 | #ifdef DIAGNOSTIC |
450 | printf("raid%d: Help! Out of emergency buffers!\n" , raidPtr->raidid); |
451 | #endif |
452 | } |
453 | rf_unlock_mutex2(raidPtr->mutex); |
		if (!p) {
			/* We didn't get a buffer... not much we can do
			   other than wait, and hope that someone frees
			   up memory for us. */
			p = malloc(raidPtr->Layout.sectorsPerStripeUnit <<
			    raidPtr->logBytesPerSector,
			    M_RAIDFRAME, M_WAITOK);
		}
461 | } |
462 | memset(p, 0, raidPtr->Layout.sectorsPerStripeUnit << raidPtr->logBytesPerSector); |
463 | return (p); |
464 | } |
465 | |
466 | void |
467 | rf_FreeIOBuffer(RF_Raid_t *raidPtr, RF_VoidPointerListElem_t *vple) |
468 | { |
469 | rf_lock_mutex2(raidPtr->mutex); |
470 | if (raidPtr->iobuf_count < raidPtr->numEmergencyBuffers) { |
471 | /* just tack it in */ |
472 | vple->next = raidPtr->iobuf; |
473 | raidPtr->iobuf = vple; |
474 | raidPtr->iobuf_count++; |
475 | } else { |
476 | free(vple->p, M_RAIDFRAME); |
477 | rf_FreeVPListElem(vple); |
478 | } |
479 | rf_unlock_mutex2(raidPtr->mutex); |
480 | } |
481 | |
482 | |
483 | |
484 | #if RF_DEBUG_VALIDATE_DAG |
485 | /****************************************************************************** |
486 | * |
487 | * debug routines |
488 | * |
489 | *****************************************************************************/ |
490 | |
491 | char * |
492 | rf_NodeStatusString(RF_DagNode_t *node) |
493 | { |
	switch (node->status) {
	case rf_wait:
		return ("wait");
	case rf_fired:
		return ("fired");
	case rf_good:
		return ("good");
	case rf_bad:
		return ("bad");
	default:
		return ("?");
	}
506 | } |
507 | |
508 | void |
509 | rf_PrintNodeInfoString(RF_DagNode_t *node) |
510 | { |
511 | RF_PhysDiskAddr_t *pda; |
512 | int (*df) (RF_DagNode_t *) = node->doFunc; |
513 | int i, lk, unlk; |
514 | void *bufPtr; |
515 | |
516 | if ((df == rf_DiskReadFunc) || (df == rf_DiskWriteFunc) |
517 | || (df == rf_DiskReadMirrorIdleFunc) |
518 | || (df == rf_DiskReadMirrorPartitionFunc)) { |
519 | pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
520 | bufPtr = (void *) node->params[1].p; |
521 | lk = 0; |
522 | unlk = 0; |
523 | RF_ASSERT(!(lk && unlk)); |
524 | printf("c %d offs %ld nsect %d buf 0x%lx %s\n" , pda->col, |
525 | (long) pda->startSector, (int) pda->numSector, (long) bufPtr, |
526 | (lk) ? "LOCK" : ((unlk) ? "UNLK" : " " )); |
527 | return; |
528 | } |
529 | if ((df == rf_SimpleXorFunc) || (df == rf_RegularXorFunc) |
530 | || (df == rf_RecoveryXorFunc)) { |
531 | printf("result buf 0x%lx\n" , (long) node->results[0]); |
532 | for (i = 0; i < node->numParams - 1; i += 2) { |
533 | pda = (RF_PhysDiskAddr_t *) node->params[i].p; |
534 | bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p; |
535 | printf(" buf 0x%lx c%d offs %ld nsect %d\n" , |
536 | (long) bufPtr, pda->col, |
537 | (long) pda->startSector, (int) pda->numSector); |
538 | } |
539 | return; |
540 | } |
541 | #if RF_INCLUDE_PARITYLOGGING > 0 |
542 | if (df == rf_ParityLogOverwriteFunc || df == rf_ParityLogUpdateFunc) { |
543 | for (i = 0; i < node->numParams - 1; i += 2) { |
544 | pda = (RF_PhysDiskAddr_t *) node->params[i].p; |
545 | bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p; |
546 | printf(" c%d offs %ld nsect %d buf 0x%lx\n" , |
547 | pda->col, (long) pda->startSector, |
548 | (int) pda->numSector, (long) bufPtr); |
549 | } |
550 | return; |
551 | } |
552 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
553 | |
	if ((df == rf_TerminateFunc) || (df == rf_NullNodeFunc)) {
		printf("\n");
		return;
	}
	printf("?\n");
559 | } |
560 | #ifdef DEBUG |
561 | static void |
562 | rf_RecurPrintDAG(RF_DagNode_t *node, int depth, int unvisited) |
563 | { |
564 | char *anttype; |
565 | int i; |
566 | |
567 | node->visited = (unvisited) ? 0 : 1; |
568 | printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{" , depth, |
569 | node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node), |
570 | node->numSuccedents, node->numSuccFired, node->numSuccDone, |
571 | node->numAntecedents, node->numAntDone, node->numParams, node->numResults); |
572 | for (i = 0; i < node->numSuccedents; i++) { |
573 | printf("%d%s" , node->succedents[i]->nodeNum, |
574 | ((i == node->numSuccedents - 1) ? "\0" : " " )); |
575 | } |
576 | printf("} A{" ); |
577 | for (i = 0; i < node->numAntecedents; i++) { |
		switch (node->antType[i]) {
		case rf_trueData:
			anttype = "T";
			break;
		case rf_antiData:
			anttype = "A";
			break;
		case rf_outputData:
			anttype = "O";
			break;
		case rf_control:
			anttype = "C";
			break;
		default:
			anttype = "?";
			break;
		}
		printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype,
		    (i == node->numAntecedents - 1) ? "\0" : " ");
	}
	printf("}; ");
598 | rf_PrintNodeInfoString(node); |
599 | for (i = 0; i < node->numSuccedents; i++) { |
600 | if (node->succedents[i]->visited == unvisited) |
601 | rf_RecurPrintDAG(node->succedents[i], depth + 1, unvisited); |
602 | } |
603 | } |
604 | |
605 | static void |
606 | rf_PrintDAG(RF_DagHeader_t *dag_h) |
607 | { |
608 | int unvisited, i; |
609 | char *status; |
610 | |
611 | /* set dag status */ |
	switch (dag_h->status) {
	case rf_enable:
		status = "enable";
		break;
	case rf_rollForward:
		status = "rollForward";
		break;
	case rf_rollBackward:
		status = "rollBackward";
		break;
	default:
		status = "illegal!";
		break;
	}
626 | /* find out if visited bits are currently set or clear */ |
627 | unvisited = dag_h->succedents[0]->visited; |
628 | |
629 | printf("DAG type: %s\n" , dag_h->creator); |
630 | printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n" ); |
631 | printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{" , dag_h->nodeNum, |
632 | status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits); |
633 | for (i = 0; i < dag_h->numSuccedents; i++) { |
634 | printf("%d%s" , dag_h->succedents[i]->nodeNum, |
635 | ((i == dag_h->numSuccedents - 1) ? "\0" : " " )); |
636 | } |
637 | printf("};\n" ); |
638 | for (i = 0; i < dag_h->numSuccedents; i++) { |
639 | if (dag_h->succedents[i]->visited == unvisited) |
640 | rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited); |
641 | } |
642 | } |
643 | #endif |
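/*
 * The DAG traversal routines in this file share a visited-bit
 * convention: whatever value succedents[0]->visited happens to hold at
 * entry is taken to mean "unvisited", and each node is flipped to the
 * opposite value as it is reached.  This way no separate pass is needed
 * to clear the bits between traversals.
 */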
644 | /* assigns node numbers */ |
645 | int |
646 | rf_AssignNodeNums(RF_DagHeader_t * dag_h) |
647 | { |
648 | int unvisited, i, nnum; |
649 | RF_DagNode_t *node; |
650 | |
651 | nnum = 0; |
652 | unvisited = dag_h->succedents[0]->visited; |
653 | |
654 | dag_h->nodeNum = nnum++; |
655 | for (i = 0; i < dag_h->numSuccedents; i++) { |
656 | node = dag_h->succedents[i]; |
657 | if (node->visited == unvisited) { |
658 | nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited); |
659 | } |
660 | } |
661 | return (nnum); |
662 | } |
663 | |
664 | int |
665 | rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited) |
666 | { |
667 | int i; |
668 | |
669 | node->visited = (unvisited) ? 0 : 1; |
670 | |
671 | node->nodeNum = num++; |
672 | for (i = 0; i < node->numSuccedents; i++) { |
673 | if (node->succedents[i]->visited == unvisited) { |
674 | num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited); |
675 | } |
676 | } |
677 | return (num); |
678 | } |
679 | /* set the header pointers in each node to "newptr" */ |
680 | void |
681 | rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr) |
682 | { |
683 | int i; |
684 | for (i = 0; i < dag_h->numSuccedents; i++) |
685 | if (dag_h->succedents[i]->dagHdr != newptr) |
686 | rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr); |
687 | } |
688 | |
689 | void |
690 | rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr) |
691 | { |
692 | int i; |
693 | node->dagHdr = newptr; |
694 | for (i = 0; i < node->numSuccedents; i++) |
695 | if (node->succedents[i]->dagHdr != newptr) |
696 | rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr); |
697 | } |
698 | |
699 | |
700 | void |
701 | rf_PrintDAGList(RF_DagHeader_t * dag_h) |
702 | { |
703 | int i = 0; |
704 | |
705 | for (; dag_h; dag_h = dag_h->next) { |
706 | rf_AssignNodeNums(dag_h); |
707 | printf("\n\nDAG %d IN LIST:\n" , i++); |
708 | rf_PrintDAG(dag_h); |
709 | } |
710 | } |
711 | |
712 | static int |
713 | rf_ValidateBranch(RF_DagNode_t *node, int *scount, int *acount, |
714 | RF_DagNode_t **nodes, int unvisited) |
715 | { |
716 | int i, retcode = 0; |
717 | |
718 | /* construct an array of node pointers indexed by node num */ |
719 | node->visited = (unvisited) ? 0 : 1; |
720 | nodes[node->nodeNum] = node; |
721 | |
	if (node->next != NULL) {
		printf("INVALID DAG: next pointer in node is not NULL\n");
		retcode = 1;
	}
	if (node->status != rf_wait) {
		printf("INVALID DAG: Node status is not wait\n");
		retcode = 1;
	}
	if (node->numAntDone != 0) {
		printf("INVALID DAG: numAntDone is not zero\n");
		retcode = 1;
	}
	if (node->doFunc == rf_TerminateFunc) {
		if (node->numSuccedents != 0) {
			printf("INVALID DAG: Terminator node has succedents\n");
			retcode = 1;
		}
	} else {
		if (node->numSuccedents == 0) {
			printf("INVALID DAG: Non-terminator node has no succedents\n");
			retcode = 1;
		}
	}
	for (i = 0; i < node->numSuccedents; i++) {
		if (!node->succedents[i]) {
			printf("INVALID DAG: succedent %d of node %s is NULL\n", i, node->name);
			retcode = 1;
			continue;	/* can't count or recurse on a NULL pointer */
		}
		scount[node->succedents[i]->nodeNum]++;
	}
	for (i = 0; i < node->numAntecedents; i++) {
		if (!node->antecedents[i]) {
			printf("INVALID DAG: antecedent %d of node %s is NULL\n", i, node->name);
			retcode = 1;
			continue;
		}
		acount[node->antecedents[i]->nodeNum]++;
	}
	for (i = 0; i < node->numSuccedents; i++) {
		if (node->succedents[i] != NULL &&
		    node->succedents[i]->visited == unvisited) {
			if (rf_ValidateBranch(node->succedents[i], scount,
			    acount, nodes, unvisited)) {
				retcode = 1;
			}
		}
	}
767 | return (retcode); |
768 | } |
769 | |
770 | static void |
771 | rf_ValidateBranchVisitedBits(RF_DagNode_t *node, int unvisited, int rl) |
772 | { |
773 | int i; |
774 | |
775 | RF_ASSERT(node->visited == unvisited); |
776 | for (i = 0; i < node->numSuccedents; i++) { |
777 | if (node->succedents[i] == NULL) { |
778 | printf("node=%lx node->succedents[%d] is NULL\n" , (long) node, i); |
779 | RF_ASSERT(0); |
780 | } |
781 | rf_ValidateBranchVisitedBits(node->succedents[i], unvisited, rl + 1); |
782 | } |
783 | } |
/* NOTE: never call this on a big dag, because it is exponential in
 * execution time -- it deliberately leaves the visited bits alone, so
 * a shared node is re-walked once for every path that leads to it.
 */
787 | static void |
788 | rf_ValidateVisitedBits(RF_DagHeader_t *dag) |
789 | { |
790 | int i, unvisited; |
791 | |
792 | unvisited = dag->succedents[0]->visited; |
793 | |
794 | for (i = 0; i < dag->numSuccedents; i++) { |
795 | if (dag->succedents[i] == NULL) { |
796 | printf("dag=%lx dag->succedents[%d] is NULL\n" , (long) dag, i); |
797 | RF_ASSERT(0); |
798 | } |
799 | rf_ValidateBranchVisitedBits(dag->succedents[i], unvisited, 0); |
800 | } |
801 | } |
802 | /* validate a DAG. _at entry_ verify that: |
803 | * -- numNodesCompleted is zero |
804 | * -- node queue is null |
805 | * -- dag status is rf_enable |
806 | * -- next pointer is null on every node |
807 | * -- all nodes have status wait |
808 | * -- numAntDone is zero in all nodes |
809 | * -- terminator node has zero successors |
810 | * -- no other node besides terminator has zero successors |
811 | * -- no successor or antecedent pointer in a node is NULL |
812 | * -- number of times that each node appears as a successor of another node |
813 | * is equal to the antecedent count on that node |
814 | * -- number of times that each node appears as an antecedent of another node |
815 | * is equal to the succedent count on that node |
816 | * -- what else? |
817 | */ |
818 | int |
819 | rf_ValidateDAG(RF_DagHeader_t *dag_h) |
820 | { |
821 | int i, nodecount; |
822 | int *scount, *acount;/* per-node successor and antecedent counts */ |
823 | RF_DagNode_t **nodes; /* array of ptrs to nodes in dag */ |
824 | int retcode = 0; |
825 | int unvisited; |
826 | int commitNodeCount = 0; |
827 | |
828 | if (rf_validateVisitedDebug) |
829 | rf_ValidateVisitedBits(dag_h); |
830 | |
	if (dag_h->numNodesCompleted != 0) {
		printf("INVALID DAG: num nodes completed is %d, should be 0\n", dag_h->numNodesCompleted);
		retcode = 1;
		goto validate_dag_bad;
	}
	if (dag_h->status != rf_enable) {
		printf("INVALID DAG: not enabled\n");
		retcode = 1;
		goto validate_dag_bad;
	}
	if (dag_h->numCommits != 0) {
		printf("INVALID DAG: numCommits != 0 (%d)\n", dag_h->numCommits);
		retcode = 1;
		goto validate_dag_bad;
	}
	if (dag_h->numSuccedents != 1) {
		/* currently, all dags must have only one succedent */
		printf("INVALID DAG: numSuccedents != 1 (%d)\n", dag_h->numSuccedents);
		retcode = 1;
		goto validate_dag_bad;
	}
852 | nodecount = rf_AssignNodeNums(dag_h); |
853 | |
854 | unvisited = dag_h->succedents[0]->visited; |
855 | |
856 | RF_Malloc(scount, nodecount * sizeof(int), (int *)); |
857 | RF_Malloc(acount, nodecount * sizeof(int), (int *)); |
858 | RF_Malloc(nodes, nodecount * sizeof(RF_DagNode_t *), |
859 | (RF_DagNode_t **)); |
860 | for (i = 0; i < dag_h->numSuccedents; i++) { |
861 | if ((dag_h->succedents[i]->visited == unvisited) |
862 | && rf_ValidateBranch(dag_h->succedents[i], scount, |
863 | acount, nodes, unvisited)) { |
864 | retcode = 1; |
865 | } |
866 | } |
867 | /* start at 1 to skip the header node */ |
868 | for (i = 1; i < nodecount; i++) { |
869 | if (nodes[i]->commitNode) |
870 | commitNodeCount++; |
		if (nodes[i]->doFunc == NULL) {
			printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
			retcode = 1;
			goto validate_dag_out;
		}
		if (nodes[i]->undoFunc == NULL) {
			printf("INVALID DAG: node %s has an undefined undoFunc\n", nodes[i]->name);
			retcode = 1;
			goto validate_dag_out;
		}
		if (nodes[i]->numAntecedents != scount[nodes[i]->nodeNum]) {
			printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
			    nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
			retcode = 1;
			goto validate_dag_out;
		}
		if (nodes[i]->numSuccedents != acount[nodes[i]->nodeNum]) {
			printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
			    nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
			retcode = 1;
			goto validate_dag_out;
		}
893 | } |
894 | |
	if (dag_h->numCommitNodes != commitNodeCount) {
		printf("INVALID DAG: incorrect commit node count. hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
		    dag_h->numCommitNodes, commitNodeCount);
898 | retcode = 1; |
899 | goto validate_dag_out; |
900 | } |
901 | validate_dag_out: |
902 | RF_Free(scount, nodecount * sizeof(int)); |
903 | RF_Free(acount, nodecount * sizeof(int)); |
904 | RF_Free(nodes, nodecount * sizeof(RF_DagNode_t *)); |
905 | if (retcode) |
906 | rf_PrintDAGList(dag_h); |
907 | |
908 | if (rf_validateVisitedDebug) |
909 | rf_ValidateVisitedBits(dag_h); |
910 | |
911 | return (retcode); |
912 | |
913 | validate_dag_bad: |
914 | rf_PrintDAGList(dag_h); |
915 | return (retcode); |
916 | } |
917 | |
918 | #endif /* RF_DEBUG_VALIDATE_DAG */ |
919 | |
920 | /****************************************************************************** |
921 | * |
922 | * misc construction routines |
923 | * |
924 | *****************************************************************************/ |
925 | |
926 | void |
927 | rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap) |
928 | { |
929 | int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0; |
930 | int fcol = raidPtr->reconControl->fcol; |
931 | int scol = raidPtr->reconControl->spareCol; |
932 | RF_PhysDiskAddr_t *pda; |
933 | |
934 | RF_ASSERT(raidPtr->status == rf_rs_reconstructing); |
935 | for (pda = asmap->physInfo; pda; pda = pda->next) { |
936 | if (pda->col == fcol) { |
937 | #if RF_DEBUG_DAG |
938 | if (rf_dagDebug) { |
939 | if (!rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, |
940 | pda->startSector)) { |
941 | RF_PANIC(); |
942 | } |
943 | } |
944 | #endif |
945 | /* printf("Remapped data for large write\n"); */ |
946 | if (ds) { |
947 | raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress, |
948 | &pda->col, &pda->startSector, RF_REMAP); |
949 | } else { |
950 | pda->col = scol; |
951 | } |
952 | } |
953 | } |
	for (pda = asmap->parityInfo; pda; pda = pda->next) {
		if (pda->col == fcol) {
#if RF_DEBUG_DAG
			if (rf_dagDebug) {
				if (!rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, pda->startSector)) {
					RF_PANIC();
				}
			}
#endif
			/* remap only parity on the failed column, just
			   as for the data PDAs above */
			if (ds) {
				(raidPtr->Layout.map->MapParity) (raidPtr, pda->raidAddress, &pda->col, &pda->startSector, RF_REMAP);
			} else {
				pda->col = scol;
			}
		}
	}
970 | } |
971 | |
972 | |
973 | /* this routine allocates read buffers and generates stripe maps for the |
974 | * regions of the array from the start of the stripe to the start of the |
975 | * access, and from the end of the access to the end of the stripe. It also |
976 | * computes and returns the number of DAG nodes needed to read all this data. |
977 | * Note that this routine does the wrong thing if the access is fully |
978 | * contained within one stripe unit, so we RF_ASSERT against this case at the |
979 | * start. |
980 | * |
981 | * layoutPtr - in: layout information |
982 | * asmap - in: access stripe map |
983 | * dag_h - in: header of the dag to create |
 * new_asm_h - in/out: ptr to array of 2 headers, to be filled in
985 | * nRodNodes - out: num nodes to be generated to read unaccessed data |
986 | * sosBuffer, eosBuffer - out: pointers to newly allocated buffer |
987 | */ |
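/*
 * Pictorially, the two new access maps cover the regions of the stripe
 * on either side of the user access:
 *
 *	|<---------------------- stripe ----------------------->|
 *	| sos region |<-------- user access -------->| eos region|
 *	 new_asm_h[0]                                 new_asm_h[1]
 */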
988 | void |
989 | rf_MapUnaccessedPortionOfStripe(RF_Raid_t *raidPtr, |
990 | RF_RaidLayout_t *layoutPtr, |
991 | RF_AccessStripeMap_t *asmap, |
992 | RF_DagHeader_t *dag_h, |
993 | RF_AccessStripeMapHeader_t **new_asm_h, |
994 | int *nRodNodes, |
995 | char **sosBuffer, char **eosBuffer, |
996 | RF_AllocListElem_t *allocList) |
997 | { |
998 | RF_RaidAddr_t sosRaidAddress, eosRaidAddress; |
999 | RF_SectorNum_t sosNumSector, eosNumSector; |
1000 | |
1001 | RF_ASSERT(asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol / 2)); |
1002 | /* generate an access map for the region of the array from start of |
1003 | * stripe to start of access */ |
1004 | new_asm_h[0] = new_asm_h[1] = NULL; |
1005 | *nRodNodes = 0; |
1006 | if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) { |
1007 | sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); |
1008 | sosNumSector = asmap->raidAddress - sosRaidAddress; |
1009 | *sosBuffer = rf_AllocStripeBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, sosNumSector)); |
1010 | new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP); |
1011 | new_asm_h[0]->next = dag_h->asmList; |
1012 | dag_h->asmList = new_asm_h[0]; |
1013 | *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed; |
1014 | |
1015 | RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL); |
1016 | /* we're totally within one stripe here */ |
1017 | if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE) |
1018 | rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap); |
1019 | } |
1020 | /* generate an access map for the region of the array from end of |
1021 | * access to end of stripe */ |
1022 | if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) { |
1023 | eosRaidAddress = asmap->endRaidAddress; |
1024 | eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress; |
1025 | *eosBuffer = rf_AllocStripeBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, eosNumSector)); |
1026 | new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP); |
1027 | new_asm_h[1]->next = dag_h->asmList; |
1028 | dag_h->asmList = new_asm_h[1]; |
1029 | *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed; |
1030 | |
1031 | RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL); |
1032 | /* we're totally within one stripe here */ |
1033 | if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE) |
1034 | rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap); |
1035 | } |
1036 | } |
1037 | |
1038 | |
1039 | |
1040 | /* returns non-zero if the indicated ranges of stripe unit offsets overlap */ |
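/*
 * e.g., if src covers stripe-unit offsets 4..9 and dest covers 7..12,
 * then RF_MAX(4,7) == 7 <= RF_MIN(9,12) == 9, so the two ranges overlap.
 */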
1041 | int |
1042 | rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, |
1043 | RF_PhysDiskAddr_t *src, RF_PhysDiskAddr_t *dest) |
1044 | { |
1045 | RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector); |
1046 | RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector); |
1047 | /* use -1 to be sure we stay within SU */ |
1048 | RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector - 1); |
1049 | RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector - 1); |
1050 | return ((RF_MAX(soffs, doffs) <= RF_MIN(send, dend)) ? 1 : 0); |
1051 | } |
1052 | |
1053 | |
1054 | /* GenerateFailedAccessASMs |
1055 | * |
1056 | * this routine figures out what portion of the stripe needs to be read |
 * to effect the degraded read or write operation. Its primary function
1058 | * is to identify everything required to recover the data, and then |
1059 | * eliminate anything that is already being accessed by the user. |
1060 | * |
1061 | * The main result is two new ASMs, one for the region from the start of the |
1062 | * stripe to the start of the access, and one for the region from the end of |
1063 | * the access to the end of the stripe. These ASMs describe everything that |
1064 | * needs to be read to effect the degraded access. Other results are: |
1065 | * nXorBufs -- the total number of buffers that need to be XORed together to |
1066 | * recover the lost data, |
1067 | * rpBufPtr -- ptr to a newly-allocated buffer to hold the parity. If NULL |
1068 | * at entry, not allocated. |
1069 | * overlappingPDAs -- |
1070 | * describes which of the non-failed PDAs in the user access |
1071 | * overlap data that needs to be read to effect recovery. |
1072 | * overlappingPDAs[i]==1 if and only if, neglecting the failed |
1073 | * PDA, the ith pda in the input asm overlaps data that needs |
1074 | * to be read for recovery. |
1075 | */ |
1076 | /* in: asm - ASM for the actual access, one stripe only */ |
1077 | /* in: failedPDA - which component of the access has failed */ |
1078 | /* in: dag_h - header of the DAG we're going to create */ |
1079 | /* out: new_asm_h - the two new ASMs */ |
1080 | /* out: nXorBufs - the total number of xor bufs required */ |
1081 | /* out: rpBufPtr - a buffer for the parity read */ |
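/*
 * Pictorially (the middle region is the union of the user access and
 * the failed stripe unit):
 *
 *	sosAddr       sosEndAddr          eosStartAddr         eosAddr
 *	   |<-new_asm_h[0]->|<-access U failed SU->|<-new_asm_h[1]->|
 */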
1082 | void |
1083 | rf_GenerateFailedAccessASMs(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, |
1084 | RF_PhysDiskAddr_t *failedPDA, |
1085 | RF_DagHeader_t *dag_h, |
1086 | RF_AccessStripeMapHeader_t **new_asm_h, |
1087 | int *nXorBufs, char **rpBufPtr, |
1088 | char *overlappingPDAs, |
1089 | RF_AllocListElem_t *allocList) |
1090 | { |
1091 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
1092 | |
1093 | /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */ |
1094 | RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr; |
1095 | RF_PhysDiskAddr_t *pda; |
1096 | int foundit, i; |
1097 | |
1098 | foundit = 0; |
1099 | /* first compute the following raid addresses: start of stripe, |
1100 | * (sosAddr) MIN(start of access, start of failed SU), (sosEndAddr) |
1101 | * MAX(end of access, end of failed SU), (eosStartAddr) end of |
1102 | * stripe (i.e. start of next stripe) (eosAddr) */ |
1103 | sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); |
1104 | sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->raidAddress)); |
1105 | eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress)); |
1106 | eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress); |
1107 | |
1108 | /* now generate access stripe maps for each of the above regions of |
1109 | * the stripe. Use a dummy (NULL) buf ptr for now */ |
1110 | |
1111 | new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr - sosAddr, NULL, RF_DONT_REMAP) : NULL; |
1112 | new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr - eosStartAddr, NULL, RF_DONT_REMAP) : NULL; |
1113 | |
1114 | /* walk through the PDAs and range-restrict each SU to the region of |
1115 | * the SU touched on the failed PDA. also compute total data buffer |
1116 | * space requirements in this step. Ignore the parity for now. */ |
1117 | /* Also count nodes to find out how many bufs need to be xored together */ |
1118 | (*nXorBufs) = 1; /* in read case, 1 is for parity. In write |
1119 | * case, 1 is for failed data */ |
1120 | |
1121 | if (new_asm_h[0]) { |
1122 | new_asm_h[0]->next = dag_h->asmList; |
1123 | dag_h->asmList = new_asm_h[0]; |
1124 | for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) { |
1125 | rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); |
1126 | pda->bufPtr = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); |
1127 | } |
1128 | (*nXorBufs) += new_asm_h[0]->stripeMap->numStripeUnitsAccessed; |
1129 | } |
1130 | if (new_asm_h[1]) { |
1131 | new_asm_h[1]->next = dag_h->asmList; |
1132 | dag_h->asmList = new_asm_h[1]; |
1133 | for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) { |
1134 | rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); |
1135 | pda->bufPtr = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); |
1136 | } |
1137 | (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed; |
1138 | } |
1139 | |
1140 | /* allocate a buffer for parity */ |
1141 | if (rpBufPtr) |
1142 | *rpBufPtr = rf_AllocBuffer(raidPtr, dag_h, failedPDA->numSector << raidPtr->logBytesPerSector); |
1143 | |
1144 | /* the last step is to figure out how many more distinct buffers need |
1145 | * to get xor'd to produce the missing unit. there's one for each |
1146 | * user-data read node that overlaps the portion of the failed unit |
1147 | * being accessed */ |
1148 | |
1149 | for (foundit = i = 0, pda = asmap->physInfo; pda; i++, pda = pda->next) { |
1150 | if (pda == failedPDA) { |
1151 | i--; |
1152 | foundit = 1; |
1153 | continue; |
1154 | } |
1155 | if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) { |
1156 | overlappingPDAs[i] = 1; |
1157 | (*nXorBufs)++; |
1158 | } |
1159 | } |
1160 | if (!foundit) { |
1161 | RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n" ); |
1162 | RF_ASSERT(0); |
1163 | } |
1164 | #if RF_DEBUG_DAG |
1165 | if (rf_degDagDebug) { |
		if (new_asm_h[0]) {
			printf("First asm:\n");
			rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
		}
		if (new_asm_h[1]) {
			printf("Second asm:\n");
			rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
		}
1174 | } |
1175 | #endif |
1176 | } |
1177 | |
1178 | |
1179 | /* adjusts the offset and number of sectors in the destination pda so that |
1180 | * it covers at most the region of the SU covered by the source PDA. This |
1181 | * is exclusively a restriction: the number of sectors indicated by the |
1182 | * target PDA can only shrink. |
1183 | * |
1184 | * For example: s = sectors within SU indicated by source PDA |
1185 | * d = sectors within SU indicated by dest PDA |
1186 | * r = results, stored in dest PDA |
1187 | * |
1188 | * |--------------- one stripe unit ---------------------| |
1189 | * | sssssssssssssssssssssssssssssssss | |
1190 | * | ddddddddddddddddddddddddddddddddddddddddddddd | |
1191 | * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr | |
1192 | * |
1193 | * Another example: |
1194 | * |
1195 | * |--------------- one stripe unit ---------------------| |
1196 | * | sssssssssssssssssssssssssssssssss | |
1197 | * | ddddddddddddddddddddddd | |
1198 | * | rrrrrrrrrrrrrrrr | |
1199 | * |
1200 | */ |
1201 | void |
1202 | rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src, |
1203 | RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr) |
1204 | { |
1205 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
1206 | RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector); |
1207 | RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector); |
1208 | RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector - 1); /* use -1 to be sure we |
1209 | * stay within SU */ |
1210 | RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector - 1); |
1211 | RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */ |
1212 | |
1213 | dest->startSector = subAddr + RF_MAX(soffs, doffs); |
1214 | dest->numSector = subAddr + RF_MIN(send, dend) + 1 - dest->startSector; |
1215 | |
1216 | if (dobuffer) |
1217 | dest->bufPtr = (char *)(dest->bufPtr) + ((soffs > doffs) ? rf_RaidAddressToByte(raidPtr, soffs - doffs) : 0); |
1218 | if (doraidaddr) { |
1219 | dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) + |
1220 | rf_StripeUnitOffset(layoutPtr, dest->startSector); |
1221 | } |
1222 | } |
1223 | |
1224 | #if (RF_INCLUDE_CHAINDECLUSTER > 0) |
1225 | |
1226 | /* |
1227 | * Want the highest of these primes to be the largest one |
1228 | * less than the max expected number of columns (won't hurt |
1229 | * to be too small or too large, but won't be optimal, either) |
1230 | * --jimz |
1231 | */ |
1232 | #define NLOWPRIMES 8 |
1233 | static int lowprimes[NLOWPRIMES] = {2, 3, 5, 7, 11, 13, 17, 19}; |
1234 | /***************************************************************************** |
1235 | * compute the workload shift factor. (chained declustering) |
1236 | * |
1237 | * return nonzero if access should shift to secondary, otherwise, |
1238 | * access is to primary |
1239 | *****************************************************************************/ |
1240 | int |
1241 | rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda) |
1242 | { |
1243 | /* |
1244 | * variables: |
1245 | * d = column of disk containing primary |
1246 | * f = column of failed disk |
1247 | * n = number of disks in array |
1248 | * sd = "shift distance" (number of columns that d is to the right of f) |
1249 | * v = numerator of redirection ratio |
1250 | * k = denominator of redirection ratio |
1251 | */ |
1252 | RF_RowCol_t d, f, sd, n; |
1253 | int k, v, ret, i; |
1254 | |
1255 | n = raidPtr->numCol; |
1256 | |
1257 | /* assign column of primary copy to d */ |
1258 | d = pda->col; |
1259 | |
1260 | /* assign column of dead disk to f */ |
	for (f = 0; ((f < n) && (!RF_DEAD_DISK(raidPtr->Disks[f].status))); f++)
		continue;	/* test f < n first so we never read Disks[n] */
1263 | |
1264 | RF_ASSERT(f < n); |
1265 | RF_ASSERT(f != d); |
1266 | |
1267 | sd = (f > d) ? (n + d - f) : (d - f); |
1268 | RF_ASSERT(sd < n); |
1269 | |
1270 | /* |
1271 | * v of every k accesses should be redirected |
1272 | * |
1273 | * v/k := (n-1-sd)/(n-1) |
1274 | */ |
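	/*
	 * e.g., with n = 5, f = 1, d = 3: sd = 2 and v/k = 2/4, which
	 * reduces to 1/2 below, so every other access to this primary
	 * is redirected to its secondary.
	 */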
1275 | v = (n - 1 - sd); |
1276 | k = (n - 1); |
1277 | |
1278 | #if 1 |
1279 | /* |
1280 | * XXX |
1281 | * Is this worth it? |
1282 | * |
1283 | * Now reduce the fraction, by repeatedly factoring |
1284 | * out primes (just like they teach in elementary school!) |
1285 | */ |
1286 | for (i = 0; i < NLOWPRIMES; i++) { |
1287 | if (lowprimes[i] > v) |
1288 | break; |
1289 | while (((v % lowprimes[i]) == 0) && ((k % lowprimes[i]) == 0)) { |
1290 | v /= lowprimes[i]; |
1291 | k /= lowprimes[i]; |
1292 | } |
1293 | } |
1294 | #endif |
1295 | |
1296 | raidPtr->hist_diskreq[d]++; |
1297 | if (raidPtr->hist_diskreq[d] > v) { |
1298 | ret = 0; /* do not redirect */ |
1299 | } else { |
1300 | ret = 1; /* redirect */ |
1301 | } |
1302 | |
1303 | #if 0 |
1304 | printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n" , d, f, sd, v, k, ret, |
1305 | raidPtr->hist_diskreq[d]); |
1306 | #endif |
1307 | |
1308 | if (raidPtr->hist_diskreq[d] >= k) { |
1309 | /* reset counter */ |
1310 | raidPtr->hist_diskreq[d] = 0; |
1311 | } |
1312 | return (ret); |
1313 | } |
1314 | #endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */ |
1315 | |
1316 | /* |
1317 | * Disk selection routines |
1318 | */ |
1319 | |
1320 | /* |
1321 | * Selects the disk with the shortest queue from a mirror pair. |
1322 | * Both the disk I/Os queued in RAIDframe as well as those at the physical |
1323 | * disk are counted as members of the "queue" |
1324 | */ |
1325 | void |
1326 | rf_SelectMirrorDiskIdle(RF_DagNode_t * node) |
1327 | { |
1328 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr; |
1329 | RF_RowCol_t colData, colMirror; |
1330 | int dataQueueLength, mirrorQueueLength, usemirror; |
1331 | RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
1332 | RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p; |
1333 | RF_PhysDiskAddr_t *tmp_pda; |
1334 | RF_RaidDisk_t *disks = raidPtr->Disks; |
1335 | RF_DiskQueue_t *dqs = raidPtr->Queues, *dataQueue, *mirrorQueue; |
1336 | |
1337 | /* return the [row col] of the disk with the shortest queue */ |
1338 | colData = data_pda->col; |
1339 | colMirror = mirror_pda->col; |
1340 | dataQueue = &(dqs[colData]); |
1341 | mirrorQueue = &(dqs[colMirror]); |
1342 | |
#ifdef RF_LOCK_QUEUES_TO_READ_LEN
	RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
#endif				/* RF_LOCK_QUEUES_TO_READ_LEN */
	dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
#ifdef RF_LOCK_QUEUES_TO_READ_LEN
	RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
	RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
#endif				/* RF_LOCK_QUEUES_TO_READ_LEN */
	mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
#ifdef RF_LOCK_QUEUES_TO_READ_LEN
	RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
#endif				/* RF_LOCK_QUEUES_TO_READ_LEN */
1355 | |
1356 | usemirror = 0; |
1357 | if (RF_DEAD_DISK(disks[colMirror].status)) { |
1358 | usemirror = 0; |
1359 | } else |
1360 | if (RF_DEAD_DISK(disks[colData].status)) { |
1361 | usemirror = 1; |
1362 | } else |
1363 | if (raidPtr->parity_good == RF_RAID_DIRTY) { |
1364 | /* Trust only the main disk */ |
1365 | usemirror = 0; |
1366 | } else |
1367 | if (dataQueueLength < mirrorQueueLength) { |
1368 | usemirror = 0; |
1369 | } else |
1370 | if (mirrorQueueLength < dataQueueLength) { |
1371 | usemirror = 1; |
1372 | } else { |
1373 | /* queues are equal length. attempt |
1374 | * cleverness. */ |
1375 | if (SNUM_DIFF(dataQueue->last_deq_sector, data_pda->startSector) |
1376 | <= SNUM_DIFF(mirrorQueue->last_deq_sector, mirror_pda->startSector)) { |
1377 | usemirror = 0; |
1378 | } else { |
1379 | usemirror = 1; |
1380 | } |
1381 | } |
1382 | |
1383 | if (usemirror) { |
1384 | /* use mirror (parity) disk, swap params 0 & 4 */ |
1385 | tmp_pda = data_pda; |
1386 | node->params[0].p = mirror_pda; |
1387 | node->params[4].p = tmp_pda; |
1388 | } else { |
1389 | /* use data disk, leave param 0 unchanged */ |
1390 | } |
1391 | /* printf("dataQueueLength %d, mirrorQueueLength |
1392 | * %d\n",dataQueueLength, mirrorQueueLength); */ |
1393 | } |
1394 | #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0) || (RF_DEBUG_VALIDATE_DAG > 0) |
1395 | /* |
1396 | * Do simple partitioning. This assumes that |
1397 | * the data and parity disks are laid out identically. |
1398 | */ |
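/*
 * e.g., on disks with numBlocks == 1000, accesses starting in sectors
 * 0..499 use the primary and accesses starting in sectors 500..999 use
 * the mirror (assuming both disks are alive and parity is clean).
 */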
1399 | void |
1400 | rf_SelectMirrorDiskPartition(RF_DagNode_t * node) |
1401 | { |
1402 | RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr; |
1403 | RF_RowCol_t colData, colMirror; |
1404 | RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p; |
1405 | RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p; |
1406 | RF_PhysDiskAddr_t *tmp_pda; |
1407 | RF_RaidDisk_t *disks = raidPtr->Disks; |
1408 | int usemirror; |
1409 | |
	/* pick the disk based on which half of the address space the access falls in */
1411 | colData = data_pda->col; |
1412 | colMirror = mirror_pda->col; |
1413 | |
1414 | usemirror = 0; |
1415 | if (RF_DEAD_DISK(disks[colMirror].status)) { |
1416 | usemirror = 0; |
1417 | } else |
1418 | if (RF_DEAD_DISK(disks[colData].status)) { |
1419 | usemirror = 1; |
1420 | } else |
1421 | if (raidPtr->parity_good == RF_RAID_DIRTY) { |
1422 | /* Trust only the main disk */ |
1423 | usemirror = 0; |
1424 | } else |
1425 | if (data_pda->startSector < |
1426 | (disks[colData].numBlocks / 2)) { |
1427 | usemirror = 0; |
1428 | } else { |
1429 | usemirror = 1; |
1430 | } |
1431 | |
1432 | if (usemirror) { |
1433 | /* use mirror (parity) disk, swap params 0 & 4 */ |
1434 | tmp_pda = data_pda; |
1435 | node->params[0].p = mirror_pda; |
1436 | node->params[4].p = tmp_pda; |
1437 | } else { |
1438 | /* use data disk, leave param 0 unchanged */ |
1439 | } |
1440 | } |
1441 | #endif |
1442 | |