/*	$NetBSD: rf_reconstruct.c,v 1.121 2014/11/14 14:29:16 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/************************************************************
 *
 * rf_reconstruct.c -- code to perform on-line reconstruction
 *
 ************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.121 2014/11/14 14:29:16 oster Exp $");

#include <sys/param.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/namei.h>	/* for pathbuf */
#include <dev/raidframe/raidframevar.h>

#include <miscfs/specfs/specdev.h>	/* for v_rdev */

#include "rf_raid.h"
#include "rf_reconutil.h"
#include "rf_revent.h"
#include "rf_reconbuffer.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_dag.h"
#include "rf_desc.h"
#include "rf_debugprint.h"
#include "rf_general.h"
#include "rf_driver.h"
#include "rf_utils.h"
#include "rf_shutdown.h"

#include "rf_kintf.h"

/* Debug printf wrappers: these expand to rf_debug_printf() calls when RF_DEBUG_RECON is set, and to nothing otherwise. */

#if RF_DEBUG_RECON
#define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)

#define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)

#else /* RF_DEBUG_RECON */

#define Dprintf(s) {}
#define Dprintf1(s,a) {}
#define Dprintf2(s,a,b) {}
#define Dprintf3(s,a,b,c) {}
#define Dprintf4(s,a,b,c,d) {}
#define Dprintf5(s,a,b,c,d,e) {}
#define Dprintf6(s,a,b,c,d,e,f) {}
#define Dprintf7(s,a,b,c,d,e,f,g) {}

#define DDprintf1(s,a) {}
#define DDprintf2(s,a,b) {}

#endif /* RF_DEBUG_RECON */

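/* Return values from ProcessReconEvent(). */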
#define RF_RECON_DONE_READS   1
#define RF_RECON_READ_ERROR   2
#define RF_RECON_WRITE_ERROR  3
#define RF_RECON_READ_STOPPED 4
#define RF_RECON_WRITE_DONE   5

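/* Bounds on the number of free RF_ReconBuffer_t structures kept in the
   reconbuffer pool; see rf_ConfigureReconstruction(). */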
#define RF_MAX_FREE_RECONBUFFER 32
#define RF_MIN_FREE_RECONBUFFER 16

static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
					      RF_RaidDisk_t *, int, RF_RowCol_t);
static void FreeReconDesc(RF_RaidReconDesc_t *);
static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
static int TryToRead(RF_Raid_t *, RF_RowCol_t);
static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
				RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
				RF_SectorNum_t *);
static int IssueNextWriteRequest(RF_Raid_t *);
static int ReconReadDoneProc(void *, int);
static int ReconWriteDoneProc(void *, int);
static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
			       RF_RowCol_t, RF_HeadSepLimit_t,
			       RF_ReconUnitNum_t);
static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
					      RF_ReconParityStripeStatus_t *,
					      RF_PerDiskReconCtrl_t *,
					      RF_RowCol_t, RF_StripeNum_t,
					      RF_ReconUnitNum_t);
static void ForceReconReadDoneProc(void *, int);
static void rf_ShutdownReconstruction(void *);

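/* An entry on the list of procedures to be called (with 'arg') when a
   reconstruction run completes. */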
struct RF_ReconDoneProc_s {
	void    (*proc) (RF_Raid_t *, void *);
	void   *arg;
	RF_ReconDoneProc_t *next;
};

/**************************************************************************
 *
 * Sets up the parameters that will be used by the reconstruction process.
 * Currently there are none, except for those that the layout-specific
 * configuration routine (e.g. rf_ConfigureDeclustered) sets up.
 *
 * In the kernel, we fire off the recon thread.
 *
 **************************************************************************/
static void
rf_ShutdownReconstruction(void *ignored)
{
	pool_destroy(&rf_pools.reconbuffer);
}

int
rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
		     "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
	rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);

	return (0);
}

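/* Allocate and initialize the descriptor that tracks a single
   reconstruction: rebuilding column 'col' onto the disk at column 'scol'
   (the spare, or the same column for in-place rebuilds). */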
static RF_RaidReconDesc_t *
AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
		   RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
		   RF_RowCol_t scol)
{

	RF_RaidReconDesc_t *reconDesc;

	RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
		  (RF_RaidReconDesc_t *));
	reconDesc->raidPtr = raidPtr;
	reconDesc->col = col;
	reconDesc->spareDiskPtr = spareDiskPtr;
	reconDesc->numDisksDone = numDisksDone;
	reconDesc->scol = scol;
	reconDesc->next = NULL;

	return (reconDesc);
}

static void
FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
{
#if RF_RECON_STATS > 0
	printf("raid%d: %lu recon event waits, %lu recon delays\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->numReconEventWaits,
	       (long) reconDesc->numReconExecDelays);
#endif /* RF_RECON_STATS > 0 */
	printf("raid%d: %lu max exec ticks\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->maxReconExecTicks);
	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
}


/*****************************************************************************
 *
 * primary routine to reconstruct a failed disk.  This should be called from
 * within its own thread.  It won't return until reconstruction completes,
 * fails, or is aborted.
 *****************************************************************************/
int
rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	const RF_LayoutSW_t *lp;
	int rc;

	lp = raidPtr->Layout.map;
	if (lp->SubmitReconBuffer) {
		/*
		 * The current infrastructure only supports reconstructing one
		 * disk at a time for each array.
		 */
		rf_lock_mutex2(raidPtr->mutex);
		while (raidPtr->reconInProgress) {
			rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex);
		}
		raidPtr->reconInProgress++;
		rf_unlock_mutex2(raidPtr->mutex);
		rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
	} else {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		rc = EIO;
		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_signal_cond2(raidPtr->waitForReconCond);
	rf_unlock_mutex2(raidPtr->mutex);
	return (rc);
}

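/* Locate a spare, rebuild the failed column onto it, and then update the
   component labels and disk status to reflect the outcome. */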
int
rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *c_label;
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_RowCol_t scol;
	int numDisksDone = 0, rc;

	/* first look for a spare drive onto which to reconstruct the data */
	/* spare disk descriptors are stored in row 0.  This may have to
	 * change eventually */

	rf_lock_mutex2(raidPtr->mutex);
	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		if (raidPtr->status != rf_rs_degraded) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		scol = (-1);
	} else {
#endif
		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
			if (raidPtr->Disks[scol].status == rf_ds_spare) {
				spareDiskPtr = &raidPtr->Disks[scol];
				spareDiskPtr->status = rf_ds_rebuilding_spare;
				break;
			}
		}
		if (!spareDiskPtr) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
			rf_unlock_mutex2(raidPtr->mutex);
			return (ENOSPC);
		}
		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	rf_unlock_mutex2(raidPtr->mutex);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif /* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, scol);

		raid_init_component_label(raidPtr, c_label);
		c_label->row = 0;
		c_label->column = col;
		c_label->clean = RF_RAID_DIRTY;
		c_label->status = rf_ds_optimal;
		rf_component_label_set_partitionsize(c_label,
		    raidPtr->Disks[scol].partitionSize);

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!! */

		rf_lock_mutex2(raidPtr->mutex);
		/* The failed disk has already been marked as rf_ds_spared
		   (or rf_ds_dist_spared) in
		   rf_ContinueReconstructFailedDisk()
		   so we just update the spare disk as being a used spare
		*/

		spareDiskPtr->status = rf_ds_used_spare;
		raidPtr->parity_good = RF_RAID_CLEAN;
		rf_unlock_mutex2(raidPtr->mutex);

		/* XXXX MORE NEEDED HERE */

		raidflush_component_label(raidPtr, scol);
	} else {
		/* Reconstruct failed. */

		rf_lock_mutex2(raidPtr->mutex);
		/* Failed disk goes back to "failed" status */
		raidPtr->Disks[col].status = rf_ds_failed;

		/* Spare disk goes back to "spare" status. */
		spareDiskPtr->status = rf_ds_spare;
		rf_unlock_mutex2(raidPtr->mutex);

	}
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
	return (rc);
}

/*

   Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
   and you don't get a spare until the next Monday.  With this function
   (and hot-swappable drives) you can now put your new disk containing
   /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
   rebuild the data "on the spot".

*/
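
/* A typical invocation via raidctl(8) (device and unit names here are
   hypothetical):

	raidctl -R /dev/sd2e raid0

   which fails the component and immediately rebuilds back onto it in place. */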

int
rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	const RF_LayoutSW_t *lp;
	RF_ComponentLabel_t *c_label;
	int numDisksDone = 0, rc;
	uint64_t numsec;
	unsigned int secsize;
	struct pathbuf *pb;
	struct vnode *vp;
	int retcode;
	int ac;

	rf_lock_mutex2(raidPtr->mutex);
	lp = raidPtr->Layout.map;
	if (!lp->SubmitReconBuffer) {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		/* wakeup anyone who might be waiting to do a reconstruct */
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(EIO);
	}

	/*
	 * The current infrastructure only supports reconstructing one
	 * disk at a time for each array.
	 */

	if (raidPtr->Disks[col].status != rf_ds_failed) {
		/* "It's gone..." */
		raidPtr->numFailures++;
		raidPtr->Disks[col].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
	}

	while (raidPtr->reconInProgress) {
		rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex);
	}

	raidPtr->reconInProgress++;

	/* first look for a spare drive onto which to reconstruct the
	   data.  spare disk descriptors are stored in row 0.  This
	   may have to change eventually */

	/* Actually, we don't care if it's failed or not...  On a RAID
	   set with correct parity, this function should be callable
	   on any component without ill effects. */
	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */

#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);

		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return (EINVAL);
	}
#endif

	/* This device may have been opened successfully the
	   first time.  Close it before trying to open it again.. */

	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
#if 0
		printf("Closed the open device: %s\n",
		       raidPtr->Disks[col].devname);
#endif
		vp = raidPtr->raid_cinfo[col].ci_vp;
		ac = raidPtr->Disks[col].auto_configured;
		rf_unlock_mutex2(raidPtr->mutex);
		rf_close_component(raidPtr, vp, ac);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->raid_cinfo[col].ci_vp = NULL;
	}
	/* note that this disk was *not* auto_configured (any longer) */
	raidPtr->Disks[col].auto_configured = 0;

#if 0
	printf("About to (re-)open the device for rebuilding: %s\n",
	       raidPtr->Disks[col].devname);
#endif
	rf_unlock_mutex2(raidPtr->mutex);
	pb = pathbuf_create(raidPtr->Disks[col].devname);
	if (pb == NULL) {
		retcode = ENOMEM;
	} else {
		retcode = dk_lookup(pb, curlwp, &vp);
		pathbuf_destroy(pb);
	}

	if (retcode) {
		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n", raidPtr->raidid,
		       raidPtr->Disks[col].devname, retcode);

		/* the component isn't responding properly...
		   must be still dead :-( */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(retcode);
	}

	/* The lookup succeeded, so we now have a vnode for the component.
	   Find out how big the component actually is. */

	retcode = getdisksize(vp, &numsec, &secsize);
	if (retcode) {
		vn_close(vp, FREAD | FWRITE, kauth_cred_get());
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(retcode);
	}
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->Disks[col].blockSize = secsize;
	raidPtr->Disks[col].numBlocks = numsec - rf_protectedSectors;

	raidPtr->raid_cinfo[col].ci_vp = vp;
	raidPtr->raid_cinfo[col].ci_dev = vp->v_rdev;

	raidPtr->Disks[col].dev = vp->v_rdev;

	/* We allow the user to specify that only a fraction of each
	   disk should be used.  This is just for debugging: it speeds
	   up the parity scan. */
	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
		rf_sizePercentage / 100;
	rf_unlock_mutex2(raidPtr->mutex);

	spareDiskPtr = &raidPtr->Disks[col];
	spareDiskPtr->status = rf_ds_rebuilding_spare;

	printf("raid%d: initiating in-place reconstruction on column %d\n",
	       raidPtr->raidid, col);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
				       numDisksDone, col);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif /* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		rf_lock_mutex2(raidPtr->mutex);
		/* Need to set these here, as at this point it'll be claiming
		   that the disk is in rf_ds_spared!  But we know better :-) */

		raidPtr->Disks[col].status = rf_ds_optimal;
		raidPtr->status = rf_rs_optimal;
		rf_unlock_mutex2(raidPtr->mutex);

		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, col);

		rf_lock_mutex2(raidPtr->mutex);
		raid_init_component_label(raidPtr, c_label);

		c_label->row = 0;
		c_label->column = col;

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!! */

		raidPtr->parity_good = RF_RAID_CLEAN;
		rf_unlock_mutex2(raidPtr->mutex);

		raidflush_component_label(raidPtr, col);
	} else {
		/* Reconstruct-in-place failed.  Disk goes back to
		   "failed" status, regardless of what it was before. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->Disks[col].status = rf_ds_failed;
		rf_unlock_mutex2(raidPtr->mutex);
	}

	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->reconInProgress--;
	rf_signal_cond2(raidPtr->waitForReconCond);
	rf_unlock_mutex2(raidPtr->mutex);

	return (rc);
}


int
rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_RowCol_t col = reconDesc->col;
	RF_RowCol_t scol = reconDesc->scol;
	RF_ReconMap_t *mapPtr;
	RF_ReconCtrl_t *tmp_reconctrl;
	RF_ReconEvent_t *event;
	RF_StripeCount_t incPSID, lastPSID, num_writes, pending_writes, prev;
#if RF_INCLUDE_RAID5_RS > 0
	RF_StripeCount_t startPSID, endPSID, aPSID, bPSID, offPSID;
#endif
	RF_ReconUnitCount_t RUsPerPU;
	struct timeval etime, elpsd;
	unsigned long xor_s, xor_resid_us;
	int i, ds;
	int status, done;
	int recon_error, write_error;

	raidPtr->accumXorTimeUs = 0;
#if RF_ACC_TRACE > 0
	/* create one trace record per physical disk */
	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
#endif

	/* quiesce the array prior to starting recon.  this is needed
	 * to ensure no nasty interactions with pending user writes.
	 * We need to do this before we change the disk or row status. */

	Dprintf("RECON: begin request suspend\n");
	rf_SuspendNewRequestsAndWait(raidPtr);
	Dprintf("RECON: end request suspend\n");

	/* allocate our RF_ReconCtrl_t before we protect raidPtr->reconControl */
	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);

	rf_lock_mutex2(raidPtr->mutex);

	/* create the reconstruction control pointer and install it in
	 * the right slot */
	raidPtr->reconControl = tmp_reconctrl;
	mapPtr = raidPtr->reconControl->reconMap;
	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
	raidPtr->reconControl->numRUsComplete = 0;
	raidPtr->status = rf_rs_reconstructing;
	raidPtr->Disks[col].status = rf_ds_reconstructing;
	raidPtr->Disks[col].spareCol = scol;

	rf_unlock_mutex2(raidPtr->mutex);

	RF_GETTIME(raidPtr->reconControl->starttime);

	Dprintf("RECON: resume requests\n");
	rf_ResumeNewRequests(raidPtr);


	mapPtr = raidPtr->reconControl->reconMap;

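	/* Reconstruction proceeds in rounds of at most incPSID parity
	   stripe IDs at a time; lastPSID is the total number of parity
	   stripes to cover.  Bounding each round to RF_RECONMAP_SIZE
	   (presumably) keeps the recon map's memory footprint fixed
	   regardless of array size. */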
	incPSID = RF_RECONMAP_SIZE;
	lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
	RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
	recon_error = 0;
	write_error = 0;
	pending_writes = incPSID;
	raidPtr->reconControl->lastPSID = incPSID - 1;

	/* bounds check raidPtr->reconControl->lastPSID and
	   pending_writes so that we don't attempt to wait for more IO
	   than can possibly happen */

	if (raidPtr->reconControl->lastPSID > lastPSID)
		raidPtr->reconControl->lastPSID = lastPSID;

	if (pending_writes > lastPSID)
		pending_writes = lastPSID;

	/* start the actual reconstruction */

	done = 0;
	while (!done) {

		if (raidPtr->waitShutdown) {
			/* someone is unconfiguring this array... bail on the reconstruction... */
			recon_error = 1;
			break;
		}

		num_writes = 0;

#if RF_INCLUDE_RAID5_RS > 0
		/* For RAID5 with Rotated Spares we will be 'short'
		   some number of writes since no writes will get
		   issued for stripes where the spare is on the
		   component being rebuilt.  Account for the shortage
		   here so that we don't hang indefinitely below
		   waiting for writes to complete that were never
		   scheduled.

		   XXX: Should be fixed for PARITY_DECLUSTERING and
		   others too!

		*/

		if (raidPtr->Layout.numDataCol <
		    raidPtr->numCol - raidPtr->Layout.numParityCol) {
			/* numDataCol is at least 2 less than numCol, so
			   should be RAID 5 with Rotated Spares */

			/* XXX need to update for RAID 6 */

			startPSID = raidPtr->reconControl->lastPSID - pending_writes + 1;
			endPSID = raidPtr->reconControl->lastPSID;

			offPSID = raidPtr->numCol - col - 1;

			aPSID = startPSID - startPSID % raidPtr->numCol + offPSID;
			if (aPSID < startPSID) {
				aPSID += raidPtr->numCol;
			}

			bPSID = endPSID - ((endPSID - offPSID) % raidPtr->numCol);

			if (aPSID < endPSID) {
				num_writes = ((bPSID - aPSID) / raidPtr->numCol) + 1;
			}

			if ((aPSID == endPSID) && (bPSID == endPSID)) {
				num_writes++;
			}
		}
#endif

		/* issue a read for each surviving disk */

		reconDesc->numDisksDone = 0;
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				/* find and issue the next I/O on the
				 * indicated disk */
				if (IssueNextReadRequest(raidPtr, i)) {
					Dprintf1("RECON: done issuing for c%d\n", i);
					reconDesc->numDisksDone++;
				}
			}
		}

		/* process reconstruction events until all disks report that
		 * they've completed all work */

		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {

			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			/* the normal case is that a read completes, and all is well. */
			if (status == RF_RECON_DONE_READS) {
				reconDesc->numDisksDone++;
			} else if ((status == RF_RECON_READ_ERROR) ||
				   (status == RF_RECON_WRITE_ERROR)) {
				/* an error was encountered while reconstructing...
				   Pretend we've finished this disk.
				*/
				recon_error = 1;
				raidPtr->reconControl->error = 1;

				/* bump the numDisksDone count for reads,
				   but not for writes */
				if (status == RF_RECON_READ_ERROR)
					reconDesc->numDisksDone++;

				/* write errors are special -- when we are
				   done dealing with the reads that are
				   finished, we don't want to wait for any
				   writes */
				if (status == RF_RECON_WRITE_ERROR) {
					write_error = 1;
					num_writes++;
				}

			} else if (status == RF_RECON_READ_STOPPED) {
				/* count this component as being "done" */
				reconDesc->numDisksDone++;
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			}

			if (recon_error) {
				/* make sure any stragglers are woken up so that
				   their threads will complete, and we can get out
				   of here with all IO processed */

				rf_WakeupHeadSepCBWaiters(raidPtr);
			}

			raidPtr->reconControl->numRUsTotal =
				mapPtr->totalRUs;
			raidPtr->reconControl->numRUsComplete =
				mapPtr->totalRUs -
				rf_UnitsLeftToReconstruct(mapPtr);

#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete =
				(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}

		/* reads done, wakeup any waiters, and then wait for writes */

		rf_WakeupHeadSepCBWaiters(raidPtr);

		while (!recon_error && (num_writes < pending_writes)) {
			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			if (status == RF_RECON_WRITE_ERROR) {
				num_writes++;
				recon_error = 1;
				raidPtr->reconControl->error = 1;
				/* an error was encountered at the very end... bail */
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			} /* else it's something else, and we don't care */
		}
		if (recon_error ||
		    (raidPtr->reconControl->lastPSID == lastPSID)) {
			done = 1;
			break;
		}

		prev = raidPtr->reconControl->lastPSID;
		raidPtr->reconControl->lastPSID += incPSID;

		if (raidPtr->reconControl->lastPSID > lastPSID) {
			pending_writes = lastPSID - prev;
			raidPtr->reconControl->lastPSID = lastPSID;
		}

		/* back down curPSID to get ready for the next round... */
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				raidPtr->reconControl->perDiskInfo[i].curPSID--;
				raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
			}
		}
	}

	mapPtr = raidPtr->reconControl->reconMap;
	if (rf_reconDebug) {
		printf("RECON: all reads completed\n");
	}
	/* at this point all the reads have completed.  We now wait
	 * for any pending writes to complete, and then we're done */

	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {

		event = rf_GetNextReconEvent(reconDesc);
		status = ProcessReconEvent(raidPtr, event);

		if (status == RF_RECON_WRITE_ERROR) {
			recon_error = 1;
			raidPtr->reconControl->error = 1;
			/* an error was encountered at the very end... bail */
		} else {
#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}
	}

	if (recon_error) {
		/* we've encountered an error in reconstructing. */
		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);

		/* we start by blocking IO to the RAID set. */
		rf_SuspendNewRequestsAndWait(raidPtr);

		rf_lock_mutex2(raidPtr->mutex);
		/* mark set as being degraded, rather than
		   rf_rs_reconstructing as we were before the problem.
		   After this is done we can update status of the
		   component disks without worrying about someone
		   trying to read from a failed component.
		*/
		raidPtr->status = rf_rs_degraded;
		rf_unlock_mutex2(raidPtr->mutex);

		/* resume IO */
		rf_ResumeNewRequests(raidPtr);

		/* At this point there are two cases:
		   1) If we've experienced a read error, then we've
		   already waited for all the reads we're going to get,
		   and we just need to wait for the writes.

		   2) If we've experienced a write error, we've also
		   already waited for all the reads to complete,
		   but there is little point in waiting for the writes --
		   when they do complete, they will just be ignored.

		   So we just wait for writes to complete if we didn't have a
		   write error.
		*/

		if (!write_error) {
			/* wait for writes to complete */
			while (raidPtr->reconControl->pending_writes > 0) {

				event = rf_GetNextReconEvent(reconDesc);
				status = ProcessReconEvent(raidPtr, event);

				if (status == RF_RECON_WRITE_ERROR) {
					raidPtr->reconControl->error = 1;
					/* an error was encountered at the very end... bail.
					   This will be very bad news for the user, since
					   at this point there will have been a read error
					   on one component, and a write error on another!
					*/
					break;
				}
			}
		}


		/* cleanup */

		/* drain the event queue - after waiting for the writes above,
		   there shouldn't be much (if anything!) left in the queue. */

		rf_DrainReconEventQueue(reconDesc);

		/* XXX As much as we'd like to free the recon control structure
		   and the reconDesc, we have no way of knowing if/when those will
		   be touched by IO that has yet to occur.  It is rather poor to be
		   basically causing a 'memory leak' here, but there doesn't seem to be
		   a cleaner alternative at this time.  Perhaps when the reconstruct code
		   gets a makeover this problem will go away.
		*/
#if 0
		rf_FreeReconControl(raidPtr);
#endif

#if RF_ACC_TRACE > 0
		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
		/* XXX see comment above */
#if 0
		FreeReconDesc(reconDesc);
#endif

		return (1);
	}

	/* Success: mark the dead disk as reconstructed.  We quiesce
	 * the array here to ensure no nasty interactions with pending
	 * user accesses when we free up the psstatus structure as
	 * part of rf_FreeReconControl(). */

	rf_SuspendNewRequestsAndWait(raidPtr);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->numFailures--;
	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
	rf_unlock_mutex2(raidPtr->mutex);
	RF_GETTIME(etime);
	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);

	rf_ResumeNewRequests(raidPtr);

	printf("raid%d: Reconstruction of disk at col %d completed\n",
	       raidPtr->raidid, col);
	xor_s = raidPtr->accumXorTimeUs / 1000000;
	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
	       raidPtr->raidid,
	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
	printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
	       raidPtr->raidid,
	       (int) raidPtr->reconControl->starttime.tv_sec,
	       (int) raidPtr->reconControl->starttime.tv_usec,
	       (int) etime.tv_sec, (int) etime.tv_usec);
#if RF_RECON_STATS > 0
	printf("raid%d: Total head-sep stall count was %d\n",
	       raidPtr->raidid, (int) reconDesc->hsStallCount);
#endif /* RF_RECON_STATS > 0 */
	rf_FreeReconControl(raidPtr);
#if RF_ACC_TRACE > 0
	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
	FreeReconDesc(reconDesc);

	return (0);

}
/*****************************************************************************
 * do the right thing upon each reconstruction event.
 *****************************************************************************/
static int
ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
{
	int retcode = 0, submitblocked;
	RF_ReconBuffer_t *rbuf;
	RF_SectorCount_t sectorsPerRU;

	retcode = RF_RECON_READ_STOPPED;

	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);

	switch (event->type) {

		/* a read I/O has completed */
	case RF_REVENT_READDONE:
		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
			 event->col, rbuf->parityStripeID);
		Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
			 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
			 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
			if (!submitblocked)
				retcode = IssueNextReadRequest(raidPtr, event->col);
			else
				retcode = 0;
		}
		break;

		/* a write I/O has completed */
	case RF_REVENT_WRITEDONE:
#if RF_DEBUG_RECON
		if (rf_floatingRbufDebug) {
			rf_CheckFloatingRbufCount(raidPtr, 1);
		}
#endif
		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
			 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
				  rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);

		rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

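		/* Dispose of the rbuf: a floating rbuf goes back to the
		   free list (serialized by the rb_lock/rb_cv hand-rolled
		   lock below), while a forced rbuf was allocated for this
		   request alone and is simply freed. */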
		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
			rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
			while (raidPtr->reconControl->rb_lock) {
				rf_wait_cond2(raidPtr->reconControl->rb_cv,
					      raidPtr->reconControl->rb_mutex);
			}
			raidPtr->reconControl->rb_lock = 1;
			rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

			raidPtr->numFullReconBuffers--;
			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);

			rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
			raidPtr->reconControl->rb_lock = 0;
			rf_broadcast_cond2(raidPtr->reconControl->rb_cv);
			rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);
		} else
			if (rbuf->type == RF_RBUF_TYPE_FORCED)
				rf_FreeReconBuffer(rbuf);
			else
				RF_ASSERT(0);
		retcode = RF_RECON_WRITE_DONE;
		break;

	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
					 * cleared */
		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
							     0, (int) (long) event->arg);
			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
							 * BUFCLEAR event if we
							 * couldn't submit */
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
					 * blockage has been cleared */
		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
					 * reconstruction blockage has been
					 * cleared */
		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

		/* a buffer has become ready to write */
	case RF_REVENT_BUFREADY:
		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextWriteRequest(raidPtr);
#if RF_DEBUG_RECON
			if (rf_floatingRbufDebug) {
				rf_CheckFloatingRbufCount(raidPtr, 1);
			}
#endif
		}
		break;

		/* we need to skip the current RU entirely because it got
		 * recon'd while we were waiting for something else to happen */
	case RF_REVENT_SKIP:
		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

		/* a forced-reconstruction read access has completed.  Just
		 * submit the buffer */
	case RF_REVENT_FORCEDREADDONE:
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
			RF_ASSERT(!submitblocked);
			retcode = 0;
		}
		break;

		/* A read I/O failed to complete */
	case RF_REVENT_READ_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

		/* A write I/O failed to complete */
	case RF_REVENT_WRITE_FAILED:
		retcode = RF_RECON_WRITE_ERROR;

		/* This is an error, but it was a pending write.
		   Account for it. */
		rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

		rbuf = (RF_ReconBuffer_t *) event->arg;

		/* cleanup the disk queue data */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);

		/* At this point we're erroring out, badly, and floatingRbufs
		   may not even be valid.  Rather than putting this back onto
		   the floatingRbufs list, just arrange for its immediate
		   destruction.
		*/
		rf_FreeReconBuffer(rbuf);
		break;

		/* a forced read I/O failed to complete */
	case RF_REVENT_FORCEDREAD_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

	default:
		RF_PANIC();
	}
	rf_FreeReconEventDesc(event);
	return (retcode);
}
/*****************************************************************************
 *
 * find the next thing that's needed on the indicated disk, and issue
 * a read request for it.  We assume that the reconstruction buffer
 * associated with this process is free to receive the data.  If
 * reconstruction is blocked on the indicated RU, we issue a
 * blockage-release request instead of a physical disk read request.
 * If the current disk gets too far ahead of the others, we issue a
 * head-separation wait request and return.
 *
 * ctrl->{ru_count, curPSID, diskOffset} and
 * rbuf->failedDiskSectorOffset are maintained to point to the unit
 * we're currently accessing.  Note that this deviates from the
 * standard C idiom of having counters point to the next thing to be
 * accessed.  This allows us to easily retry when we're blocked by
 * head separation or reconstruction-blockage events.
 *
 *****************************************************************************/
static int
IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
	int do_new_check = 0, retcode = 0, status;

	/* if we are currently the slowest disk, mark that we have to do a new
	 * check */
	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
		do_new_check = 1;

	while (1) {

		ctrl->ru_count++;
		if (ctrl->ru_count < RUsPerPU) {
			ctrl->diskOffset += sectorsPerRU;
			rbuf->failedDiskSectorOffset += sectorsPerRU;
		} else {
			ctrl->curPSID++;
			ctrl->ru_count = 0;
			/* code left over from when head-sep was based on
			 * parity stripe id */
			if (ctrl->curPSID > raidPtr->reconControl->lastPSID) {
				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
				return (RF_RECON_DONE_READS);	/* finito! */
			}
			/* find the disk offsets of the start of the parity
			 * stripe on both the current disk and the failed
			 * disk.  skip this entire parity stripe if either disk
			 * does not appear in the indicated PS */
			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
						      &rbuf->spCol, &rbuf->spOffset);
			if (status) {
				ctrl->ru_count = RUsPerPU - 1;
				continue;
			}
		}
		rbuf->which_ru = ctrl->ru_count;

		/* skip this RU if it's already been reconstructed */
		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
			continue;
		}
		break;
	}
	ctrl->headSepCounter++;
	if (do_new_check)
		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */


	/* at this point, we have definitely decided what to do, and we have
	 * only to see if we can actually do it now */
	rbuf->parityStripeID = ctrl->curPSID;
	rbuf->which_ru = ctrl->ru_count;
#if RF_ACC_TRACE > 0
	memset((char *) &raidPtr->recon_tracerecs[col], 0,
	       sizeof(raidPtr->recon_tracerecs[col]));
	raidPtr->recon_tracerecs[col].reconacc = 1;
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	retcode = TryToRead(raidPtr, col);
	return (retcode);
}

/*
 * tries to issue the next read on the indicated disk.  We may be
 * blocked by (a) the heads being too far apart, or (b) recon on the
 * indicated RU being blocked due to a write by a user thread.  In
 * this case, we issue a head-sep or blockage wait request, which will
 * cause this same routine to be invoked again later when the blockage
 * has cleared.
 */

static int
TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
	RF_StripeNum_t psid = ctrl->curPSID;
	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
	RF_DiskQueueData_t *req;
	int status;
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;

	/* if the current disk is too far ahead of the others, issue a
	 * head-separation wait and return */
	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
		return (0);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);

	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is blocked on the indicated parity stripe, issue a
	 * block-wait request and return.  this also must mark the indicated RU
	 * in the stripe as under reconstruction if not blocked. */
	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
	if (status == RF_PSS_RECON_BLOCKED) {
		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
		goto out;
	} else
		if (status == RF_PSS_FORCED_ON_WRITE) {
			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
			goto out;
		}
	/* make one last check to be sure that the indicated RU didn't get
	 * reconstructed while we were waiting for something else to happen.
	 * This is unfortunate in that it causes us to make this check twice
	 * in the normal case.  Might want to make some attempt to re-work
	 * this so that we only do this check if we've definitely blocked on
	 * one of the above checks.  When this condition is detected, we may
	 * have just created a bogus status entry, which we need to delete. */
	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
		if (pssPtr == newpssPtr)
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
		goto out;
	}
	/* found something to read.  issue the I/O */
	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
		 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
		RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	/* should be ok to use a NULL proc pointer here, all the bufs we use
	 * should be in kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
				     ReconReadDoneProc, (void *) ctrl,
#if RF_ACC_TRACE > 0
				     &raidPtr->recon_tracerecs[col],
#else
				     NULL,
#endif
				     (void *) raidPtr, 0, NULL, PR_WAITOK);

	ctrl->rbuf->arg = (void *) req;
	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
	pssPtr->issued[col] = 1;

out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}


/*
 * given a parity stripe ID, we want to find out whether both the
 * current disk and the failed disk exist in that parity stripe.  If
 * not, we want to skip this whole PS.  If so, we want to find the
 * disk offset of the start of the PS on both the current disk and the
 * failed disk.
 *
 * this works by getting a list of disks comprising the indicated
 * parity stripe, and searching the list for the current and failed
 * disks.  Once we've decided they both exist in the parity stripe, we
 * need to decide whether each is data or parity, so that we'll know
 * which mapping function to call to get the corresponding disk
 * offsets.
 *
 * this is kind of unpleasant, but doing it this way allows the
 * reconstruction code to use parity stripe IDs rather than physical
 * disk addresses to march through the failed disk, which greatly
 * simplifies a lot of code, as well as eliminating the need for a
 * reverse-mapping function.  I also think it will execute faster,
 * since the calls to the mapping module are kept to a minimum.
 *
 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
 * THE STRIPE IN THE CORRECT ORDER
 *
 * raidPtr - raid descriptor
 * psid - parity stripe identifier
 * col - column of disk to find the offsets for
 * outDiskOffset - out: offset of the start of the PS on the indicated disk
 * outFailedDiskSectorOffset - out: offset of the start of the PS on the
 *	failed disk
 * spCol - out: col of spare unit for failed unit
 * spOffset - out: offset into disk containing spare unit
 *
 */


static int
ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
		     RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
		     RF_SectorNum_t *outFailedDiskSectorOffset,
		     RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
	RF_RowCol_t *diskids;
	u_int i, j, k, i_offset, j_offset;
	RF_RowCol_t pcol;
	int testcol;
	RF_SectorNum_t poffset;
	char i_is_parity = 0, j_is_parity = 0;
	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;

	/* get a listing of the disks comprising that stripe */
	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
	RF_ASSERT(diskids);

	/* reject this entire parity stripe if it does not contain the
	 * indicated disk or it does not contain the failed disk */

	for (i = 0; i < stripeWidth; i++) {
		if (col == diskids[i])
			break;
	}
	if (i == stripeWidth)
		goto skipit;
	for (j = 0; j < stripeWidth; j++) {
		if (fcol == diskids[j])
			break;
	}
	if (j == stripeWidth) {
		goto skipit;
	}
	/* find out which disk the parity is on */
	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);

	/* find out if either the current RU or the failed RU is parity */
	/* also, if the parity occurs in this stripe prior to the data and/or
	 * failed col, we need to decrement i and/or j */
	for (k = 0; k < stripeWidth; k++)
		if (diskids[k] == pcol)
			break;
	RF_ASSERT(k < stripeWidth);
	i_offset = i;
	j_offset = j;
	if (k < i)
		i_offset--;
	else
		if (k == i) {
			i_is_parity = 1;
			i_offset = 0;
		}		/* set offsets to zero to disable multiply
				 * below */
	if (k < j)
		j_offset--;
	else
		if (k == j) {
			j_is_parity = 1;
			j_offset = 0;
		}
	/* at this point, [ij]_is_parity tells us whether the [current,failed]
	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
	 * tells us how far into the stripe the [current,failed] disk is. */

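	/* For example, if the stripe's disks are listed as [A B C D] with
	   parity on B (k == 1) and col is D (i == 3), then i_offset becomes
	   2: D holds the stripe's third data unit (0-based index 2). */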
1426 | /* call the mapping routine to get the offset into the current disk, |
1427 | * repeat for failed disk. */ |
1428 | if (i_is_parity) |
1429 | layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); |
1430 | else |
1431 | layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); |
1432 | |
1433 | RF_ASSERT(col == testcol); |
1434 | |
1435 | if (j_is_parity) |
1436 | layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); |
1437 | else |
1438 | layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); |
1439 | RF_ASSERT(fcol == testcol); |
1440 | |
1441 | /* now locate the spare unit for the failed unit */ |
1442 | #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 |
1443 | if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { |
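		/*
		 * Distributed sparing: re-run the mapping with RF_REMAP
		 * so it resolves to the spare space for the failed unit
		 * rather than to the failed disk itself.
		 */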
1444 | if (j_is_parity) |
1445 | layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); |
1446 | else |
1447 | layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); |
1448 | } else { |
1449 | #endif |
1450 | *spCol = raidPtr->reconControl->spareCol; |
1451 | *spOffset = *outFailedDiskSectorOffset; |
1452 | #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 |
1453 | } |
1454 | #endif |
1455 | return (0); |
1456 | |
1457 | skipit: |
1458 | Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n" , |
1459 | psid, col); |
1460 | return (1); |
1461 | } |
1462 | /* this is called when a buffer has become ready to write to the replacement disk */ |
1463 | static int |
1464 | IssueNextWriteRequest(RF_Raid_t *raidPtr) |
1465 | { |
1466 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
1467 | RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; |
1468 | #if RF_ACC_TRACE > 0 |
1469 | RF_RowCol_t fcol = raidPtr->reconControl->fcol; |
1470 | #endif |
1471 | RF_ReconBuffer_t *rbuf; |
1472 | RF_DiskQueueData_t *req; |
1473 | |
1474 | rbuf = rf_GetFullReconBuffer(raidPtr->reconControl); |
1475 | RF_ASSERT(rbuf); /* there must be one available, or we wouldn't |
1476 | * have gotten the event that sent us here */ |
1477 | RF_ASSERT(rbuf->pssPtr); |
1478 | |
1479 | rbuf->pssPtr->writeRbuf = rbuf; |
1480 | rbuf->pssPtr = NULL; |
1481 | |
1482 | Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n" , |
1483 | rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, |
1484 | rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); |
1485 | Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n" , |
1486 | rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, |
1487 | rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); |
1488 | |
1489 | /* should be ok to use a NULL b_proc here b/c all addrs should be in |
1490 | * kernel space */ |
1491 | req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, |
1492 | sectorsPerRU, rbuf->buffer, |
1493 | rbuf->parityStripeID, rbuf->which_ru, |
1494 | ReconWriteDoneProc, (void *) rbuf, |
1495 | #if RF_ACC_TRACE > 0 |
1496 | &raidPtr->recon_tracerecs[fcol], |
1497 | #else |
1498 | NULL, |
1499 | #endif |
1500 | (void *) raidPtr, 0, NULL, PR_WAITOK); |
1501 | |
1502 | rbuf->arg = (void *) req; |
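	/*
	 * Record the pending write (under rb_mutex) before enqueueing
	 * it, so the completion-side accounting always sees it.
	 */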
1503 | rf_lock_mutex2(raidPtr->reconControl->rb_mutex); |
1504 | raidPtr->reconControl->pending_writes++; |
1505 | rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); |
1506 | rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY); |
1507 | |
1508 | return (0); |
1509 | } |
1510 | |
1511 | /* |
1512 | * this gets called upon the completion of a reconstruction read |
1513 | * operation the arg is a pointer to the per-disk reconstruction |
1514 | * control structure for the process that just finished a read. |
1515 | * |
1516 | * called at interrupt context in the kernel, so don't do anything |
1517 | * illegal here. |
1518 | */ |
1519 | static int |
1520 | ReconReadDoneProc(void *arg, int status) |
1521 | { |
1522 | RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg; |
1523 | RF_Raid_t *raidPtr; |
1524 | |
1525 | /* Detect that reconCtrl is no longer valid, and if that |
1526 | is the case, bail without calling rf_CauseReconEvent(). |
1527 | There won't be anyone listening for this event anyway */ |
1528 | |
1529 | if (ctrl->reconCtrl == NULL) |
1530 | return(0); |
1531 | |
1532 | raidPtr = ctrl->reconCtrl->reconDesc->raidPtr; |
1533 | |
1534 | if (status) { |
1535 | printf("raid%d: Recon read failed: %d\n" , raidPtr->raidid, status); |
1536 | rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED); |
1537 | return(0); |
1538 | } |
1539 | #if RF_ACC_TRACE > 0 |
1540 | RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer); |
1541 | RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer); |
1542 | raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us = |
1543 | RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer); |
1544 | RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer); |
1545 | #endif |
1546 | rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE); |
1547 | return (0); |
1548 | } |
1549 | /* this gets called upon the completion of a reconstruction write operation. |
1550 | * the arg is a pointer to the rbuf that was just written |
1551 | * |
1552 | * called at interrupt context in the kernel, so don't do anything illegal here. |
1553 | */ |
1554 | static int |
1555 | ReconWriteDoneProc(void *arg, int status) |
1556 | { |
1557 | RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg; |
1558 | |
1559 | /* Detect that reconControl is no longer valid, and if that |
1560 | is the case, bail without calling rf_CauseReconEvent(). |
1561 | There won't be anyone listening for this event anyway */ |
1562 | |
1563 | if (rbuf->raidPtr->reconControl == NULL) |
1564 | return(0); |
1565 | |
1566 | Dprintf2("Reconstruction completed on psid %ld ru %d\n" , rbuf->parityStripeID, rbuf->which_ru); |
1567 | if (status) { |
1568 | printf("raid%d: Recon write failed (status %d(0x%x))!\n" , rbuf->raidPtr->raidid,status,status); |
1569 | rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED); |
1570 | return(0); |
1571 | } |
1572 | rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE); |
1573 | return (0); |
1574 | } |
1575 | |
1576 | |
1577 | /* |
1578 | * computes a new minimum head sep, and wakes up anyone who needs to |
1579 | * be woken as a result |
1580 | */ |
1581 | static void |
1582 | CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr) |
1583 | { |
1584 | RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; |
1585 | RF_HeadSepLimit_t new_min; |
1586 | RF_RowCol_t i; |
1587 | RF_CallbackDesc_t *p; |
1588 | RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition |
1589 | * of a minimum */ |
1590 | |
1591 | |
1592 | rf_lock_mutex2(reconCtrlPtr->rb_mutex); |
1593 | while(reconCtrlPtr->rb_lock) { |
1594 | rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex); |
1595 | } |
1596 | reconCtrlPtr->rb_lock = 1; |
1597 | rf_unlock_mutex2(reconCtrlPtr->rb_mutex); |
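	/*
	 * The scan and wakeups below run under rb_lock, a sleepable
	 * lock built from rb_mutex/rb_cv, rather than under the mutex
	 * itself.
	 */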
1598 | |
	new_min = ~(1L << (8 * sizeof(long) - 1));	/* largest positive long: 0x7FFF...FFF */
1600 | for (i = 0; i < raidPtr->numCol; i++) |
1601 | if (i != reconCtrlPtr->fcol) { |
1602 | if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) |
1603 | new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter; |
1604 | } |
1605 | /* set the new minimum and wake up anyone who can now run again */ |
1606 | if (new_min != reconCtrlPtr->minHeadSepCounter) { |
1607 | reconCtrlPtr->minHeadSepCounter = new_min; |
1608 | Dprintf1("RECON: new min head pos counter val is %ld\n" , new_min); |
1609 | while (reconCtrlPtr->headSepCBList) { |
1610 | if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) |
1611 | break; |
1612 | p = reconCtrlPtr->headSepCBList; |
1613 | reconCtrlPtr->headSepCBList = p->next; |
1614 | p->next = NULL; |
1615 | rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); |
1616 | rf_FreeCallbackDesc(p); |
1617 | } |
1618 | |
1619 | } |
1620 | rf_lock_mutex2(reconCtrlPtr->rb_mutex); |
1621 | reconCtrlPtr->rb_lock = 0; |
1622 | rf_broadcast_cond2(reconCtrlPtr->rb_cv); |
1623 | rf_unlock_mutex2(reconCtrlPtr->rb_mutex); |
1624 | } |
1625 | |
1626 | /* |
1627 | * checks to see that the maximum head separation will not be violated |
1628 | * if we initiate a reconstruction I/O on the indicated disk. |
1629 | * Limiting the maximum head separation between two disks eliminates |
1630 | * the nasty buffer-stall conditions that occur when one disk races |
1631 | * ahead of the others and consumes all of the floating recon buffers. |
1632 | * This code is complex and unpleasant but it's necessary to avoid |
1633 | * some very nasty, albeit fairly rare, reconstruction behavior. |
1634 | * |
1635 | * returns non-zero if and only if we have to stop working on the |
1636 | * indicated disk due to a head-separation delay. |
1637 | */ |
1638 | static int |
1639 | CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl, |
1640 | RF_RowCol_t col, RF_HeadSepLimit_t hsCtr, |
1641 | RF_ReconUnitNum_t which_ru) |
1642 | { |
1643 | RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; |
1644 | RF_CallbackDesc_t *cb, *p, *pt; |
1645 | int retval = 0; |
1646 | |
1647 | /* if we're too far ahead of the slowest disk, stop working on this |
1648 | * disk until the slower ones catch up. We do this by scheduling a |
1649 | * wakeup callback for the time when the slowest disk has caught up. |
1650 | * We define "caught up" with 20% hysteresis, i.e. the head separation |
1651 | * must have fallen to at most 80% of the max allowable head |
1652 | * separation before we'll wake up. |
1653 | * |
1654 | */ |
1655 | rf_lock_mutex2(reconCtrlPtr->rb_mutex); |
1656 | while(reconCtrlPtr->rb_lock) { |
1657 | rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex); |
1658 | } |
1659 | reconCtrlPtr->rb_lock = 1; |
1660 | rf_unlock_mutex2(reconCtrlPtr->rb_mutex); |
1661 | if ((raidPtr->headSepLimit >= 0) && |
1662 | ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) { |
1663 | Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n" , |
1664 | raidPtr->raidid, col, ctrl->headSepCounter, |
1665 | reconCtrlPtr->minHeadSepCounter, |
1666 | raidPtr->headSepLimit); |
1667 | cb = rf_AllocCallbackDesc(); |
1668 | /* the minHeadSepCounter value we have to get to before we'll |
1669 | * wake up. build in 20% hysteresis. */ |
1670 | cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5); |
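		/*
		 * Illustrative numbers: with headSepLimit == 1000 and
		 * headSepCounter == 5000, we wake once minHeadSepCounter
		 * reaches 5000 - 1000 + 200 == 4200, i.e. once the
		 * separation has fallen to 800, 80% of the limit.
		 */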
1671 | cb->col = col; |
1672 | cb->next = NULL; |
1673 | |
1674 | /* insert this callback descriptor into the sorted list of |
1675 | * pending head-sep callbacks */ |
1676 | p = reconCtrlPtr->headSepCBList; |
1677 | if (!p) |
1678 | reconCtrlPtr->headSepCBList = cb; |
1679 | else |
1680 | if (cb->callbackArg.v < p->callbackArg.v) { |
1681 | cb->next = reconCtrlPtr->headSepCBList; |
1682 | reconCtrlPtr->headSepCBList = cb; |
1683 | } else { |
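			/* advance to the first entry whose wakeup value
			 * exceeds ours; the loop body is intentionally
			 * empty */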
1684 | for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next); |
1685 | cb->next = p; |
1686 | pt->next = cb; |
1687 | } |
1688 | retval = 1; |
1689 | #if RF_RECON_STATS > 0 |
1690 | ctrl->reconCtrl->reconDesc->hsStallCount++; |
1691 | #endif /* RF_RECON_STATS > 0 */ |
1692 | } |
1693 | rf_lock_mutex2(reconCtrlPtr->rb_mutex); |
1694 | reconCtrlPtr->rb_lock = 0; |
1695 | rf_broadcast_cond2(reconCtrlPtr->rb_cv); |
1696 | rf_unlock_mutex2(reconCtrlPtr->rb_mutex); |
1697 | |
1698 | return (retval); |
1699 | } |
1700 | /* |
1701 | * checks to see if reconstruction has been either forced or blocked |
1702 | * by a user operation. if forced, we skip this RU entirely. else if |
1703 | * blocked, put ourselves on the wait list. else return 0. |
1704 | * |
1705 | * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY |
1706 | */ |
1707 | static int |
1708 | CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr, |
1709 | RF_ReconParityStripeStatus_t *pssPtr, |
1710 | RF_PerDiskReconCtrl_t *ctrl, |
1711 | RF_RowCol_t col, |
1712 | RF_StripeNum_t psid, |
1713 | RF_ReconUnitNum_t which_ru) |
1714 | { |
1715 | RF_CallbackDesc_t *cb; |
1716 | int retcode = 0; |
1717 | |
1718 | if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) |
1719 | retcode = RF_PSS_FORCED_ON_WRITE; |
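		/* either forced flag is reported as RF_PSS_FORCED_ON_WRITE */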
1720 | else |
1721 | if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { |
1722 | Dprintf3("RECON: col %d blocked at psid %ld ru %d\n" , col, psid, which_ru); |
1723 | cb = rf_AllocCallbackDesc(); /* append ourselves to |
1724 | * the blockage-wait |
1725 | * list */ |
1726 | cb->col = col; |
1727 | cb->next = pssPtr->blockWaitList; |
1728 | pssPtr->blockWaitList = cb; |
1729 | retcode = RF_PSS_RECON_BLOCKED; |
1730 | } |
1731 | if (!retcode) |
1732 | pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under |
1733 | * reconstruction */ |
1734 | |
1735 | return (retcode); |
1736 | } |
1737 | /* |
1738 | * if reconstruction is currently ongoing for the indicated stripeID, |
1739 | * reconstruction is forced to completion and we return non-zero to |
1740 | * indicate that the caller must wait. If not, then reconstruction is |
1741 | * blocked on the indicated stripe and the routine returns zero. If |
1742 | * and only if we return non-zero, we'll cause the cbFunc to get |
1743 | * invoked with the cbArg when the reconstruction has completed. |
1744 | */ |
1745 | int |
1746 | rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, |
1747 | void (*cbFunc)(RF_Raid_t *, void *), void *cbArg) |
1748 | { |
1749 | RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're |
1750 | * forcing recon on */ |
1751 | RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */ |
1752 | RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity |
1753 | * stripe status structure */ |
1754 | RF_StripeNum_t psid; /* parity stripe id */ |
1755 | RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk |
1756 | * offset */ |
1757 | RF_RowCol_t *diskids; |
1758 | RF_ReconUnitNum_t which_ru; /* RU within parity stripe */ |
1759 | RF_RowCol_t fcol, diskno, i; |
1760 | RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */ |
1761 | RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */ |
1762 | RF_CallbackDesc_t *cb; |
1763 | int nPromoted; |
1764 | |
1765 | psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); |
1766 | |
1767 | /* allocate a new PSS in case we need it */ |
1768 | newpssPtr = rf_AllocPSStatus(raidPtr); |
1769 | |
1770 | RF_LOCK_PSS_MUTEX(raidPtr, psid); |
1771 | |
1772 | pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr); |
1773 | |
1774 | if (pssPtr != newpssPtr) { |
1775 | rf_FreePSStatus(raidPtr, newpssPtr); |
1776 | } |
1777 | |
1778 | /* if recon is not ongoing on this PS, just return */ |
1779 | if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { |
1780 | RF_UNLOCK_PSS_MUTEX(raidPtr, psid); |
1781 | return (0); |
1782 | } |
1783 | /* otherwise, we have to wait for reconstruction to complete on this |
1784 | * RU. */ |
1785 | /* In order to avoid waiting for a potentially large number of |
1786 | * low-priority accesses to complete, we force a normal-priority (i.e. |
1787 | * not low-priority) reconstruction on this RU. */ |
1788 | if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) { |
1789 | DDprintf1("Forcing recon on psid %ld\n" , psid); |
1790 | pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under |
1791 | * forced recon */ |
1792 | pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage |
1793 | * that we just set */ |
1794 | fcol = raidPtr->reconControl->fcol; |
1795 | |
1796 | /* get a listing of the disks comprising the indicated stripe */ |
1797 | (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids); |
1798 | |
1799 | /* For previously issued reads, elevate them to normal |
1800 | * priority. If the I/O has already completed, it won't be |
1801 | * found in the queue, and hence this will be a no-op. For |
1802 | * unissued reads, allocate buffers and issue new reads. The |
1803 | * fact that we've set the FORCED bit means that the regular |
1804 | * recon procs will not re-issue these reqs */ |
1805 | for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++) |
1806 | if ((diskno = diskids[i]) != fcol) { |
1807 | if (pssPtr->issued[diskno]) { |
1808 | nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru); |
1809 | if (rf_reconDebug && nPromoted) |
1810 | printf("raid%d: promoted read from col %d\n" , raidPtr->raidid, diskno); |
1811 | } else { |
1812 | new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */ |
1813 | ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset, |
1814 | &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare |
1815 | * location */ |
1816 | new_rbuf->parityStripeID = psid; /* fill in the buffer */ |
1817 | new_rbuf->which_ru = which_ru; |
1818 | new_rbuf->failedDiskSectorOffset = fd_offset; |
1819 | new_rbuf->priority = RF_IO_NORMAL_PRIORITY; |
1820 | |
1821 | /* use NULL b_proc b/c all addrs |
1822 | * should be in kernel space */ |
1823 | req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer, |
1824 | psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf, |
1825 | NULL, (void *) raidPtr, 0, NULL, PR_WAITOK); |
1826 | |
1827 | new_rbuf->arg = req; |
1828 | rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */ |
1829 | Dprintf2("raid%d: Issued new read req on col %d\n" , raidPtr->raidid, diskno); |
1830 | } |
1831 | } |
1832 | /* if the write is sitting in the disk queue, elevate its |
1833 | * priority */ |
1834 | if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru)) |
1835 | if (rf_reconDebug) |
1836 | printf("raid%d: promoted write to col %d\n" , |
1837 | raidPtr->raidid, fcol); |
1838 | } |
1839 | /* install a callback descriptor to be invoked when recon completes on |
1840 | * this parity stripe. */ |
1841 | cb = rf_AllocCallbackDesc(); |
	/* XXX the following cast is bogus: these function types don't
	 * really match!!  GO */
1844 | cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc; |
1845 | cb->callbackArg.p = (void *) cbArg; |
1846 | cb->next = pssPtr->procWaitList; |
1847 | pssPtr->procWaitList = cb; |
1848 | DDprintf2("raid%d: Waiting for forced recon on psid %ld\n" , |
1849 | raidPtr->raidid, psid); |
1850 | |
1851 | RF_UNLOCK_PSS_MUTEX(raidPtr, psid); |
1852 | return (1); |
1853 | } |
1854 | /* called upon the completion of a forced reconstruction read. |
1855 | * all we do is schedule the FORCEDREADONE event. |
1856 | * called at interrupt context in the kernel, so don't do anything illegal here. |
1857 | */ |
1858 | static void |
1859 | ForceReconReadDoneProc(void *arg, int status) |
1860 | { |
1861 | RF_ReconBuffer_t *rbuf = arg; |
1862 | |
1863 | /* Detect that reconControl is no longer valid, and if that |
1864 | is the case, bail without calling rf_CauseReconEvent(). |
1865 | There won't be anyone listening for this event anyway */ |
1866 | |
1867 | if (rbuf->raidPtr->reconControl == NULL) |
1868 | return; |
1869 | |
1870 | if (status) { |
1871 | printf("raid%d: Forced recon read failed!\n" , rbuf->raidPtr->raidid); |
1872 | rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED); |
1873 | return; |
1874 | } |
1875 | rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); |
1876 | } |
1877 | /* releases a block on the reconstruction of the indicated stripe */ |
1878 | int |
1879 | rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap) |
1880 | { |
1881 | RF_StripeNum_t stripeID = asmap->stripeID; |
1882 | RF_ReconParityStripeStatus_t *pssPtr; |
1883 | RF_ReconUnitNum_t which_ru; |
1884 | RF_StripeNum_t psid; |
1885 | RF_CallbackDesc_t *cb; |
1886 | |
1887 | psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); |
1888 | RF_LOCK_PSS_MUTEX(raidPtr, psid); |
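	/* look up the PSS descriptor without creating one (RF_PSS_NONE) */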
1889 | pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL); |
1890 | |
1891 | /* When recon is forced, the pss desc can get deleted before we get |
1892 | * back to unblock recon. But, this can _only_ happen when recon is |
1893 | * forced. It would be good to put some kind of sanity check here, but |
1894 | * how to decide if recon was just forced or not? */ |
1895 | if (!pssPtr) { |
1896 | /* printf("Warning: no pss descriptor upon unblock on psid %ld |
1897 | * RU %d\n",psid,which_ru); */ |
1898 | #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0) |
1899 | if (rf_reconDebug || rf_pssDebug) |
1900 | printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n" , (long) psid, which_ru); |
1901 | #endif |
1902 | goto out; |
1903 | } |
1904 | pssPtr->blockCount--; |
1905 | Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n" , |
1906 | raidPtr->raidid, psid, pssPtr->blockCount); |
1907 | if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ |
1908 | |
1909 | /* unblock recon before calling CauseReconEvent in case |
1910 | * CauseReconEvent causes us to try to issue a new read before |
1911 | * returning here. */ |
1912 | pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; |
1913 | |
1914 | |
1915 | while (pssPtr->blockWaitList) { |
1916 | /* spin through the block-wait list and |
1917 | release all the waiters */ |
1918 | cb = pssPtr->blockWaitList; |
1919 | pssPtr->blockWaitList = cb->next; |
1920 | cb->next = NULL; |
1921 | rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR); |
1922 | rf_FreeCallbackDesc(cb); |
1923 | } |
1924 | if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { |
1925 | /* if no recon was requested while recon was blocked */ |
1926 | rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); |
1927 | } |
1928 | } |
1929 | out: |
1930 | RF_UNLOCK_PSS_MUTEX(raidPtr, psid); |
1931 | return (0); |
1932 | } |
1933 | |
/*
 * Release everyone currently waiting on the head-separation callback
 * list, regardless of the minimum head-separation counter value.
 */
void
1935 | rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr) |
1936 | { |
1937 | RF_CallbackDesc_t *p; |
1938 | |
1939 | rf_lock_mutex2(raidPtr->reconControl->rb_mutex); |
1940 | while(raidPtr->reconControl->rb_lock) { |
1941 | rf_wait_cond2(raidPtr->reconControl->rb_cv, |
1942 | raidPtr->reconControl->rb_mutex); |
1943 | } |
1944 | |
1945 | raidPtr->reconControl->rb_lock = 1; |
1946 | rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); |
1947 | |
1948 | while (raidPtr->reconControl->headSepCBList) { |
1949 | p = raidPtr->reconControl->headSepCBList; |
1950 | raidPtr->reconControl->headSepCBList = p->next; |
1951 | p->next = NULL; |
1952 | rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); |
1953 | rf_FreeCallbackDesc(p); |
1954 | } |
1955 | rf_lock_mutex2(raidPtr->reconControl->rb_mutex); |
1956 | raidPtr->reconControl->rb_lock = 0; |
1957 | rf_broadcast_cond2(raidPtr->reconControl->rb_cv); |
1958 | rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); |
1959 | |
1960 | } |
1961 | |
1962 | |