/* $NetBSD: rf_driver.c,v 1.132 2015/12/26 00:58:45 pgoyette Exp $ */
/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II,
 *         Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
/******************************************************************************
 *
 * rf_driver.c -- main setup, teardown, and access routines for the RAID driver
 *
 * All routines are prefixed with rf_ (RAIDframe), to avoid name conflicts.
 *
 ******************************************************************************/


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_driver.c,v 1.132 2015/12/26 00:58:45 pgoyette Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_diagnostic.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>


#include "rf_archs.h"
#include "rf_threadstuff.h"

#include <sys/errno.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_aselect.h"
#include "rf_diskqueue.h"
#include "rf_parityscan.h"
#include "rf_alloclist.h"
#include "rf_dagutils.h"
#include "rf_utils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_desc.h"
#include "rf_states.h"
#include "rf_decluster.h"
#include "rf_map.h"
#include "rf_revent.h"
#include "rf_callback.h"
#include "rf_engine.h"
#include "rf_mcpair.h"
#include "rf_nwayxor.h"
#include "rf_copyback.h"
#include "rf_driver.h"
#include "rf_options.h"
#include "rf_shutdown.h"
#include "rf_kintf.h"
#include "rf_paritymap.h"

#include <sys/buf.h>

#ifndef RF_ACCESS_DEBUG
#define RF_ACCESS_DEBUG 0
#endif

/* rad == RF_RaidAccessDesc_t */
#define RF_MAX_FREE_RAD 128
#define RF_MIN_FREE_RAD 32
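/*
 * These are the minimum and maximum number of free access descriptors
 * kept in the rf_pools.rad pool; both are passed to rf_pool_init() in
 * rf_ConfigureRDFreeList() below.
 */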

/* debug variables */
char rf_panicbuf[2048];		/* a buffer to hold an error msg when we panic */

/* main configuration routines */
static int raidframe_booted = 0;

static void rf_ConfigureDebug(RF_Config_t * cfgPtr);
static void set_debug_option(char *name, long val);
static void rf_UnconfigureArray(void);
static void rf_ShutdownRDFreeList(void *);
static int rf_ConfigureRDFreeList(RF_ShutdownList_t **);

rf_declare_mutex2(rf_printf_mutex);	/* debug only: avoids interleaved
					 * printfs by different stripes */

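/*
 * Shorthand for the quiescence handshake used by
 * rf_SuspendNewRequestsAndWait() and rf_SignalQuiescenceLock() below.
 */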
#define SIGNAL_QUIESCENT_COND(_raid_) \
	rf_broadcast_cond2((_raid_)->access_suspend_cv)
#define WAIT_FOR_QUIESCENCE(_raid_) \
	rf_wait_cond2((_raid_)->access_suspend_cv, \
		      (_raid_)->access_suspend_mutex)

static int configureCount = 0;	/* number of active configurations */
static int isconfigged = 0;	/* is basic raidframe (non per-array)
				 * stuff configured */
static rf_declare_mutex2(configureMutex); /* used to lock the configuration
					   * stuff */
static RF_ShutdownList_t *globalShutdown; /* non array-specific
					   * stuff */

static int rf_AllocEmergBuffers(RF_Raid_t *);
static void rf_FreeEmergBuffers(RF_Raid_t *);
static void rf_destroy_mutex_cond(RF_Raid_t *);
static void rf_alloc_mutex_cond(RF_Raid_t *);

/* Called at system boot time (boot == true); a false argument tears the
 * global state back down again, e.g. when the driver is detached. */
int
rf_BootRaidframe(bool boot)
{

	if (boot) {
		if (raidframe_booted)
			return (EBUSY);
		raidframe_booted = 1;
		rf_init_mutex2(configureMutex, IPL_NONE);
		configureCount = 0;
		isconfigged = 0;
		globalShutdown = NULL;
	} else {
		rf_destroy_mutex2(configureMutex);
		raidframe_booted = 0;
	}
	return (0);
}

/*
 * Called whenever an array is shut down.
 */
static void
rf_UnconfigureArray(void)
{

	rf_lock_mutex2(configureMutex);
	if (--configureCount == 0) {	/* if no active configurations, shut
					 * everything down */
		rf_destroy_mutex2(rf_printf_mutex);
		isconfigged = 0;
		rf_ShutdownList(&globalShutdown);

		/*
		 * We must wait until now, because the AllocList module
		 * uses the DebugMem module.
		 */
#if RF_DEBUG_MEM
		if (rf_memDebug)
			rf_print_unfreed();
#endif
	}
	rf_unlock_mutex2(configureMutex);
}

/*
 * Called to shut down an array: drains outstanding accesses, parity
 * rewrites, and reconstruction, then releases the array's resources.
 */
int
rf_Shutdown(RF_Raid_t *raidPtr)
{

	if (!raidPtr->valid) {
		RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver.  Aborting shutdown\n");
		return (EINVAL);
	}
	/*
	 * Wait for outstanding I/Os to land.  As described in rf_raid.h,
	 * we use the rad_freelist lock to protect the per-array info
	 * about outstanding descriptors, since we need to do freelist
	 * locking anyway and this cuts down on the serialization.
	 */
	rf_lock_mutex2(raidPtr->rad_lock);
	if (raidPtr->waitShutdown) {
		rf_unlock_mutex2(raidPtr->rad_lock);
		return (EBUSY);
	}
	raidPtr->waitShutdown = 1;
	while (raidPtr->nAccOutstanding) {
		rf_wait_cond2(raidPtr->outstandingCond, raidPtr->rad_lock);
	}
	rf_unlock_mutex2(raidPtr->rad_lock);

	/* Wait for any parity re-writes to stop... */
	while (raidPtr->parity_rewrite_in_progress) {
		printf("raid%d: Waiting for parity re-write to exit...\n",
		       raidPtr->raidid);
		tsleep(&raidPtr->parity_rewrite_in_progress, PRIBIO,
		       "rfprwshutdown", 0);
	}

	/* Wait for any reconstruction to stop... */
	rf_lock_mutex2(raidPtr->mutex);
	while (raidPtr->reconInProgress) {
		printf("raid%d: Waiting for reconstruction to stop...\n",
		       raidPtr->raidid);
		rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);

	raidPtr->valid = 0;

	if (raidPtr->parity_map != NULL)
		rf_paritymap_detach(raidPtr);

	rf_update_component_labels(raidPtr, RF_FINAL_COMPONENT_UPDATE);

	rf_UnconfigureVnodes(raidPtr);

	rf_FreeEmergBuffers(raidPtr);

	rf_ShutdownList(&raidPtr->shutdownList);

	rf_destroy_mutex_cond(raidPtr);

	rf_UnconfigureArray();

	return (0);
}


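/*
 * Error-handling helpers for rf_Configure(): DO_INIT_CONFIGURE runs a
 * one-time global configuration step and unwinds the global state on
 * failure; DO_RAID_INIT_CONFIGURE runs a per-array step and, via
 * DO_RAID_FAIL, tears the partially configured array back down.
 */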
#define DO_INIT_CONFIGURE(f) { \
	rc = f (&globalShutdown); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		rf_ShutdownList(&globalShutdown); \
		configureCount--; \
		rf_unlock_mutex2(configureMutex); \
		rf_destroy_mutex2(rf_printf_mutex); \
		return(rc); \
	} \
}

#define DO_RAID_FAIL() { \
	rf_UnconfigureVnodes(raidPtr); \
	rf_FreeEmergBuffers(raidPtr); \
	rf_ShutdownList(&raidPtr->shutdownList); \
	rf_UnconfigureArray(); \
	rf_destroy_mutex_cond(raidPtr); \
}

#define DO_RAID_INIT_CONFIGURE(f) { \
	rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
}

int
rf_Configure(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr, RF_AutoConfig_t *ac)
{
	RF_RowCol_t col;
	int rc;

	rf_lock_mutex2(configureMutex);
	configureCount++;
	if (isconfigged == 0) {
		rf_init_mutex2(rf_printf_mutex, IPL_VM);

		/* initialize globals */

		DO_INIT_CONFIGURE(rf_ConfigureAllocList);

		/*
		 * Yes, this does make debugging general to the whole
		 * system instead of being array specific.  Bummer, drag.
		 */
		rf_ConfigureDebug(cfgPtr);
		DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
#if RF_ACC_TRACE > 0
		DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
#endif
		DO_INIT_CONFIGURE(rf_ConfigureMapModule);
		DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
		DO_INIT_CONFIGURE(rf_ConfigureCallback);
		DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
		DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
		DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
		DO_INIT_CONFIGURE(rf_ConfigureMCPair);
		DO_INIT_CONFIGURE(rf_ConfigureDAGs);
		DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
		DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
		DO_INIT_CONFIGURE(rf_ConfigureCopyback);
		DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
		DO_INIT_CONFIGURE(rf_ConfigurePSStatus);
		isconfigged = 1;
	}
	rf_unlock_mutex2(configureMutex);

	rf_alloc_mutex_cond(raidPtr);

	/* Set up the cleanup list.  Do this after ConfigureDebug so that
	 * the value of memDebug will be set. */

	rf_MakeAllocList(raidPtr->cleanupList);
	if (raidPtr->cleanupList == NULL) {
		DO_RAID_FAIL();
		return (ENOMEM);
	}
	rf_ShutdownCreate(&raidPtr->shutdownList,
			  (void (*) (void *)) rf_FreeAllocList,
			  raidPtr->cleanupList);

	raidPtr->numCol = cfgPtr->numCol;
	raidPtr->numSpare = cfgPtr->numSpare;

	raidPtr->status = rf_rs_optimal;
	raidPtr->reconControl = NULL;

	DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
	DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);

	raidPtr->nAccOutstanding = 0;
	raidPtr->waitShutdown = 0;

	if (ac != NULL) {
		/* We have an AutoConfig structure.  Skip the normal
		   disk configuration and use the autoconfiguration
		   information instead. */
		rf_AutoConfigureDisks(raidPtr, cfgPtr, ac);
	} else {
		DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
		DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
	}
	/* Do this after ConfigureDisks & ConfigureSpareDisks to be sure
	 * the device numbers are set. */
	DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);

	DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);

	/* Initialize per-RAID PSS bits */
	rf_InitPSStatus(raidPtr);

#if RF_INCLUDE_CHAINDECLUSTER > 0
	for (col = 0; col < raidPtr->numCol; col++) {
		/*
		 * XXX better distribution
		 */
		raidPtr->hist_diskreq[col] = 0;
	}
#endif
	raidPtr->numNewFailures = 0;
	raidPtr->copyback_in_progress = 0;
	raidPtr->parity_rewrite_in_progress = 0;
	raidPtr->adding_hot_spare = 0;
	raidPtr->recon_in_progress = 0;

	raidPtr->maxOutstanding = cfgPtr->maxOutstandingDiskReqs;

	/* autoconfigure and root_partition will actually get filled in
	   after the config is done */
	raidPtr->autoconfigure = 0;
	raidPtr->root_partition = 0;
	raidPtr->last_unit = raidPtr->raidid;
	raidPtr->config_order = 0;

	if (rf_keepAccTotals) {
		raidPtr->keep_acc_totals = 1;
	}

	/* Allocate a bunch of buffers to be used in low-memory conditions */
	raidPtr->iobuf = NULL;

	rc = rf_AllocEmergBuffers(raidPtr);
	if (rc) {
		printf("raid%d: Unable to allocate emergency buffers.\n",
		       raidPtr->raidid);
		DO_RAID_FAIL();
		return(rc);
	}

	/* Set up parity map stuff, if applicable. */
#ifndef RF_NO_PARITY_MAP
	rf_paritymap_attach(raidPtr, cfgPtr->force);
#endif

	raidPtr->valid = 1;

	printf("raid%d: %s\n", raidPtr->raidid,
	       raidPtr->Layout.map->configName);
	printf("raid%d: Components:", raidPtr->raidid);

	for (col = 0; col < raidPtr->numCol; col++) {
		printf(" %s", raidPtr->Disks[col].devname);
		if (RF_DEAD_DISK(raidPtr->Disks[col].status)) {
			printf("[**FAILED**]");
		}
	}
	printf("\n");
	printf("raid%d: Total Sectors: %" PRIu64 " (%" PRIu64 " MB)\n",
	       raidPtr->raidid,
	       raidPtr->totalSectors,
	       (raidPtr->totalSectors / 1024 *
		(1 << raidPtr->logBytesPerSector) / 1024));

	return (0);
}


/*
 * Routines to allocate and free the "emergency buffers" for a given
 * RAID set.  These emergency buffers will be used when the kernel runs
 * out of kernel memory.
 */
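/*
 * Each emergency I/O buffer holds one stripe unit; each emergency
 * stripe buffer holds a full stripe (numCol stripe units), matching
 * the sizes malloc'ed below.
 */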

static int
rf_AllocEmergBuffers(RF_Raid_t *raidPtr)
{
	void *tmpbuf;
	RF_VoidPointerListElem_t *vple;
	int i;

	/* XXX next line needs tuning... */
	raidPtr->numEmergencyBuffers = 10 * raidPtr->numCol;
#if DEBUG
	printf("raid%d: allocating %d buffers of %d bytes.\n",
	       raidPtr->raidid,
	       raidPtr->numEmergencyBuffers,
	       (int)(raidPtr->Layout.sectorsPerStripeUnit <<
		     raidPtr->logBytesPerSector));
#endif
	for (i = 0; i < raidPtr->numEmergencyBuffers; i++) {
		tmpbuf = malloc(raidPtr->Layout.sectorsPerStripeUnit <<
				raidPtr->logBytesPerSector,
				M_RAIDFRAME, M_WAITOK);
		if (tmpbuf) {
			vple = rf_AllocVPListElem();
			vple->p = tmpbuf;
			vple->next = raidPtr->iobuf;
			raidPtr->iobuf = vple;
			raidPtr->iobuf_count++;
		} else {
			printf("raid%d: failed to allocate emergency buffer!\n",
			       raidPtr->raidid);
			return 1;
		}
	}

	/* XXX next line needs tuning too... */
	raidPtr->numEmergencyStripeBuffers = 10;
	for (i = 0; i < raidPtr->numEmergencyStripeBuffers; i++) {
		tmpbuf = malloc(raidPtr->numCol *
				(raidPtr->Layout.sectorsPerStripeUnit <<
				 raidPtr->logBytesPerSector),
				M_RAIDFRAME, M_WAITOK);
		if (tmpbuf) {
			vple = rf_AllocVPListElem();
			vple->p = tmpbuf;
			vple->next = raidPtr->stripebuf;
			raidPtr->stripebuf = vple;
			raidPtr->stripebuf_count++;
		} else {
			printf("raid%d: failed to allocate emergency stripe buffer!\n",
			       raidPtr->raidid);
			return 1;
		}
	}

	return (0);
}

static void
rf_FreeEmergBuffers(RF_Raid_t *raidPtr)
{
	RF_VoidPointerListElem_t *tmp;

	/* Free the emergency IO buffers */
	while (raidPtr->iobuf != NULL) {
		tmp = raidPtr->iobuf;
		raidPtr->iobuf = raidPtr->iobuf->next;
		free(tmp->p, M_RAIDFRAME);
		rf_FreeVPListElem(tmp);
	}

	/* Free the emergency stripe buffers */
	while (raidPtr->stripebuf != NULL) {
		tmp = raidPtr->stripebuf;
		raidPtr->stripebuf = raidPtr->stripebuf->next;
		free(tmp->p, M_RAIDFRAME);
		rf_FreeVPListElem(tmp);
	}
}


static void
rf_ShutdownRDFreeList(void *ignored)
{
	pool_destroy(&rf_pools.rad);
}

static int
rf_ConfigureRDFreeList(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.rad, sizeof(RF_RaidAccessDesc_t),
		     "rf_rad_pl", RF_MIN_FREE_RAD, RF_MAX_FREE_RAD);
	rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
	return (0);
}

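/*
 * Allocate and initialize an access descriptor.  Returns NULL (and
 * releases the descriptor) if the array is shutting down; otherwise
 * bumps nAccOutstanding, the count rf_Shutdown() waits on.
 */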
RF_RaidAccessDesc_t *
rf_AllocRaidAccDesc(RF_Raid_t *raidPtr, RF_IoType_t type,
		    RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
		    void *bufPtr, void *bp, RF_RaidAccessFlags_t flags,
		    const RF_AccessState_t *states)
{
	RF_RaidAccessDesc_t *desc;

	desc = pool_get(&rf_pools.rad, PR_WAITOK);

	rf_lock_mutex2(raidPtr->rad_lock);
	if (raidPtr->waitShutdown) {
		/*
		 * Actually, we're shutting the array down.  Free the desc
		 * and return NULL.
		 */

		rf_unlock_mutex2(raidPtr->rad_lock);
		pool_put(&rf_pools.rad, desc);
		return (NULL);
	}
	raidPtr->nAccOutstanding++;

	rf_unlock_mutex2(raidPtr->rad_lock);

	desc->raidPtr = (void *) raidPtr;
	desc->type = type;
	desc->raidAddress = raidAddress;
	desc->numBlocks = numBlocks;
	desc->bufPtr = bufPtr;
	desc->bp = bp;
	desc->flags = flags;
	desc->states = states;
	desc->state = 0;
	desc->dagList = NULL;

	desc->status = 0;
	desc->numRetries = 0;
#if RF_ACC_TRACE > 0
	memset((char *) &desc->tracerec, 0, sizeof(RF_AccTraceEntry_t));
#endif
	desc->callbackFunc = NULL;
	desc->callbackArg = NULL;
	desc->next = NULL;
	desc->iobufs = NULL;
	desc->stripebufs = NULL;

	return (desc);
}

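/*
 * Release an access descriptor: free its DAG lists and any emergency
 * I/O or stripe buffers it borrowed, then drop nAccOutstanding and
 * wake rf_Shutdown() if it is waiting for the array to drain.
 */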
void
rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc)
{
	RF_Raid_t *raidPtr = desc->raidPtr;
	RF_DagList_t *dagList, *temp;
	RF_VoidPointerListElem_t *tmp;

	RF_ASSERT(desc);

	/* Cleanup the dagList(s) */
	dagList = desc->dagList;
	while (dagList != NULL) {
		temp = dagList;
		dagList = dagList->next;
		rf_FreeDAGList(temp);
	}

	while (desc->iobufs) {
		tmp = desc->iobufs;
		desc->iobufs = desc->iobufs->next;
		rf_FreeIOBuffer(raidPtr, tmp);
	}

	while (desc->stripebufs) {
		tmp = desc->stripebufs;
		desc->stripebufs = desc->stripebufs->next;
		rf_FreeStripeBuffer(raidPtr, tmp);
	}

	pool_put(&rf_pools.rad, desc);
	rf_lock_mutex2(raidPtr->rad_lock);
	raidPtr->nAccOutstanding--;
	if (raidPtr->waitShutdown) {
		rf_signal_cond2(raidPtr->outstandingCond);
	}
	rf_unlock_mutex2(raidPtr->rad_lock);
}
/*********************************************************************
 * Main routine for performing an access.
 * Accesses are retried until a DAG cannot be selected.  This occurs
 * when either the DAG library is incomplete or there are too many
 * failures in a parity group.
 *
 * type must be RF_IO_TYPE_READ or RF_IO_TYPE_WRITE; async_flag must
 * be RF_TRUE or RF_FALSE; bp is the buf for the access (historically
 * passed as a void * so it could be ignored outside the kernel).
 ********************************************************************/
int
rf_DoAccess(RF_Raid_t * raidPtr, RF_IoType_t type, int async_flag,
	    RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
	    void *bufPtr, struct buf *bp, RF_RaidAccessFlags_t flags)
{
	RF_RaidAccessDesc_t *desc;
	void *lbufPtr = bufPtr;

	raidAddress += rf_raidSectorOffset;

#if RF_ACCESS_DEBUG
	if (rf_accessDebug) {

		printf("logBytes is: %d %d %d\n", raidPtr->raidid,
		       raidPtr->logBytesPerSector,
		       (int) rf_RaidAddressToByte(raidPtr, numBlocks));
		printf("raid%d: %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n", raidPtr->raidid,
		       (type == RF_IO_TYPE_READ) ? "READ" : "WRITE", (int) raidAddress,
		       (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
		       (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress + numBlocks - 1),
		       (int) numBlocks,
		       (int) rf_RaidAddressToByte(raidPtr, numBlocks),
		       (long) bufPtr);
	}
#endif

	desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
	    numBlocks, lbufPtr, bp, flags, raidPtr->Layout.map->states);

	if (desc == NULL) {
		return (ENOMEM);
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_START(desc->tracerec.tot_timer);
#endif
	desc->async_flag = async_flag;

	if (raidPtr->parity_map != NULL &&
	    type == RF_IO_TYPE_WRITE)
		rf_paritymap_begin(raidPtr->parity_map, raidAddress,
		    numBlocks);

	rf_ContinueRaidAccess(desc);

	return (0);
}
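/*
 * A minimal usage sketch with hypothetical arguments (the block-device
 * entry points in the kernel interface code are the real callers):
 *
 *	error = rf_DoAccess(raidPtr, RF_IO_TYPE_READ, RF_FALSE,
 *	    raidAddress, numBlocks, bufPtr, bp, 0);
 */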
#if 0
/* force the array into reconfigured mode without doing reconstruction */
int
rf_SetReconfiguredMode(RF_Raid_t *raidPtr, int col)
{
	if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		printf("Can't set reconfigured mode in dedicated-spare array\n");
		RF_PANIC();
	}
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->numFailures++;
	raidPtr->Disks[col].status = rf_ds_dist_spared;
	raidPtr->status = rf_rs_reconfigured;
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
	/* install spare table only if declustering + distributed sparing
	 * architecture. */
	if (raidPtr->Layout.map->flags & RF_BD_DECLUSTERED)
		rf_InstallSpareTable(raidPtr, col);
	rf_unlock_mutex2(raidPtr->mutex);
	return (0);
}
#endif

int
rf_FailDisk(RF_Raid_t *raidPtr, int fcol, int initRecon)
{

	/* Need to suspend I/Os here -- if there are DAGs in flight
	   and we pull the rug out from under ci_vp, Bad Things
	   can happen. */

	rf_SuspendNewRequestsAndWait(raidPtr);

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->Disks[fcol].status != rf_ds_failed) {
		/* must be failing something that is valid, or else it's
		   already marked as failed (in which case we don't
		   want to mark it failed again!) */
		raidPtr->numFailures++;
		raidPtr->Disks[fcol].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	/* Close the component, so that it's not "locked" if someone
	   else wants to use it! */

	rf_close_component(raidPtr, raidPtr->raid_cinfo[fcol].ci_vp,
	    raidPtr->Disks[fcol].auto_configured);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->raid_cinfo[fcol].ci_vp = NULL;

	/* Need to mark the component as not being auto_configured
	   (in case it was previously). */

	raidPtr->Disks[fcol].auto_configured = 0;
	rf_unlock_mutex2(raidPtr->mutex);
	/* now we can allow IO to continue -- we'll be suspending it
	   again in rf_ReconstructFailedDisk() if we have to.. */

	rf_ResumeNewRequests(raidPtr);

	if (initRecon)
		rf_ReconstructFailedDisk(raidPtr, fcol);
	return (0);
}
/* Release a thread that is waiting for the array to become quiesced.
 * access_suspend_mutex must be held when calling this.
 */
void
rf_SignalQuiescenceLock(RF_Raid_t *raidPtr)
{
#if RF_DEBUG_QUIESCE
	if (rf_quiesceDebug) {
		printf("raid%d: Signalling quiescence lock\n",
		       raidPtr->raidid);
	}
#endif
	raidPtr->access_suspend_release = 1;

	if (raidPtr->waiting_for_quiescence) {
		SIGNAL_QUIESCENT_COND(raidPtr);
	}
}
/* Suspend all new requests to the array.  No effect on accesses that are in flight. */
int
rf_SuspendNewRequestsAndWait(RF_Raid_t *raidPtr)
{
#if RF_DEBUG_QUIESCE
	if (rf_quiesceDebug)
		printf("raid%d: Suspending new reqs\n", raidPtr->raidid);
#endif
	rf_lock_mutex2(raidPtr->access_suspend_mutex);
	raidPtr->accesses_suspended++;
	raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;

	if (raidPtr->waiting_for_quiescence) {
		raidPtr->access_suspend_release = 0;
		while (!raidPtr->access_suspend_release) {
#if RF_DEBUG_QUIESCE
			printf("raid%d: Suspending: Waiting for Quiescence\n",
			       raidPtr->raidid);
#endif
			WAIT_FOR_QUIESCENCE(raidPtr);
			raidPtr->waiting_for_quiescence = 0;
		}
	}
#if RF_DEBUG_QUIESCE
	printf("raid%d: Quiescence reached..\n", raidPtr->raidid);
#endif

	rf_unlock_mutex2(raidPtr->access_suspend_mutex);
	return (raidPtr->waiting_for_quiescence);
}
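/*
 * Callers pair the above with rf_ResumeNewRequests(), as rf_FailDisk()
 * does:
 *
 *	rf_SuspendNewRequestsAndWait(raidPtr);
 *	... update shared array state ...
 *	rf_ResumeNewRequests(raidPtr);
 */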
/* Wake up everyone waiting for quiescence to be released */
void
rf_ResumeNewRequests(RF_Raid_t *raidPtr)
{
	RF_CallbackDesc_t *t, *cb;

#if RF_DEBUG_QUIESCE
	if (rf_quiesceDebug)
		printf("raid%d: Resuming new requests\n", raidPtr->raidid);
#endif

	rf_lock_mutex2(raidPtr->access_suspend_mutex);
	raidPtr->accesses_suspended--;
	if (raidPtr->accesses_suspended == 0)
		cb = raidPtr->quiesce_wait_list;
	else
		cb = NULL;
	raidPtr->quiesce_wait_list = NULL;
	rf_unlock_mutex2(raidPtr->access_suspend_mutex);

	while (cb) {
		t = cb;
		cb = cb->next;
		(t->callbackFunc) (t->callbackArg);
		rf_FreeCallbackDesc(t);
	}
}
/******************************************************************************
 *
 * debug routines
 *
 ******************************************************************************/

static void
set_debug_option(char *name, long val)
{
	RF_DebugName_t *p;

	for (p = rf_debugNames; p->name; p++) {
		if (!strcmp(p->name, name)) {
			*(p->ptr) = val;
			printf("[Set debug variable %s to %ld]\n", name, val);
			return;
		}
	}
	RF_ERRORMSG1("Unknown debug string \"%s\"\n", name);
}


/* would like to use sscanf here, but apparently not available in kernel */
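/*
 * Each debugVars[] entry is expected to hold "<name> <value>", e.g.
 * "memDebug 1" or "memDebug 0x1"; a "0x" prefix makes the value parse
 * as hex.  (The valid names are whatever rf_debugNames contains.)
 */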
/*ARGSUSED*/
static void
rf_ConfigureDebug(RF_Config_t *cfgPtr)
{
	char *val_p, *name_p, *white_p;
	long val;
	int i;

	rf_ResetDebugOptions();
	for (i = 0; i < RF_MAXDBGV && cfgPtr->debugVars[i][0]; i++) {
		name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
		white_p = rf_find_white(name_p);	/* skip to start of 2nd
							 * word */
		val_p = rf_find_non_white(white_p);
		if (*val_p == '0' && *(val_p + 1) == 'x')
			val = rf_htoi(val_p + 2);
		else
			val = rf_atoi(val_p);
		*white_p = '\0';
		set_debug_option(name_p, val);
	}
}

void
rf_print_panic_message(int line, const char *file)
{
	snprintf(rf_panicbuf, sizeof(rf_panicbuf),
	    "raidframe error at line %d file %s", line, file);
}

#ifdef RAID_DIAGNOSTIC
void
rf_print_assert_panic_message(int line, const char *file, const char *condition)
{
	snprintf(rf_panicbuf, sizeof(rf_panicbuf),
	    "raidframe error at line %d file %s (failed asserting %s)\n",
	    line, file, condition);
}
#endif

void
rf_print_unable_to_init_mutex(const char *file, int line, int rc)
{
	RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
	    file, line, rc);
}

void
rf_print_unable_to_add_shutdown(const char *file, int line, int rc)
{
	RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
	    file, line, rc);
}

static void
rf_alloc_mutex_cond(RF_Raid_t *raidPtr)
{

	rf_init_mutex2(raidPtr->mutex, IPL_VM);

	rf_init_cond2(raidPtr->outstandingCond, "rfocond");
	rf_init_mutex2(raidPtr->rad_lock, IPL_VM);

	rf_init_mutex2(raidPtr->access_suspend_mutex, IPL_VM);
	rf_init_cond2(raidPtr->access_suspend_cv, "rfquiesce");

	rf_init_cond2(raidPtr->waitForReconCond, "rfrcnw");

	rf_init_cond2(raidPtr->adding_hot_spare_cv, "raidhs");
}

static void
rf_destroy_mutex_cond(RF_Raid_t *raidPtr)
{

	rf_destroy_cond2(raidPtr->waitForReconCond);
	rf_destroy_cond2(raidPtr->adding_hot_spare_cv);

	rf_destroy_mutex2(raidPtr->access_suspend_mutex);
	rf_destroy_cond2(raidPtr->access_suspend_cv);

	rf_destroy_cond2(raidPtr->outstandingCond);
	rf_destroy_mutex2(raidPtr->rad_lock);

	rf_destroy_mutex2(raidPtr->mutex);
}