1 | /* $NetBSD: rf_paritylogging.c,v 1.34 2011/05/11 06:20:33 mrg Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: William V. Courtright II |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | |
30 | /* |
31 | parity logging configuration, dag selection, and mapping is implemented here |
32 | */ |
33 | |
34 | #include <sys/cdefs.h> |
35 | __KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.34 2011/05/11 06:20:33 mrg Exp $" ); |
36 | |
37 | #include "rf_archs.h" |
38 | |
39 | #if RF_INCLUDE_PARITYLOGGING > 0 |
40 | |
41 | #include <dev/raidframe/raidframevar.h> |
42 | |
43 | #include "rf_raid.h" |
44 | #include "rf_dag.h" |
45 | #include "rf_dagutils.h" |
46 | #include "rf_dagfuncs.h" |
47 | #include "rf_dagffrd.h" |
48 | #include "rf_dagffwr.h" |
49 | #include "rf_dagdegrd.h" |
50 | #include "rf_dagdegwr.h" |
51 | #include "rf_paritylog.h" |
52 | #include "rf_paritylogDiskMgr.h" |
53 | #include "rf_paritylogging.h" |
54 | #include "rf_parityloggingdags.h" |
55 | #include "rf_general.h" |
56 | #include "rf_map.h" |
57 | #include "rf_utils.h" |
58 | #include "rf_shutdown.h" |
59 | |
/*
 * Layout-specific state for the parity-logging architecture; hung off
 * RF_RaidLayout_t.layoutSpecificInfo at configuration time.
 */
typedef struct RF_ParityLoggingConfigInfo_s {
	RF_RowCol_t **stripeIdentifier;	/* filled in at config time & used by
					 * IdentifyStripe */
} RF_ParityLoggingConfigInfo_t;

/* teardown helpers; all but FreeRegionInfo are registered on the
 * shutdown list by rf_ConfigureParityLogging() */
static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
72 | |
73 | int |
74 | rf_ConfigureParityLogging( |
75 | RF_ShutdownList_t ** listp, |
76 | RF_Raid_t * raidPtr, |
77 | RF_Config_t * cfgPtr) |
78 | { |
79 | int i, j, startdisk, rc; |
80 | RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; |
81 | RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; |
82 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
83 | RF_ParityLoggingConfigInfo_t *info; |
84 | RF_ParityLog_t *l = NULL, *next; |
85 | void *lHeapPtr; |
86 | |
87 | if (rf_numParityRegions <= 0) |
88 | return(EINVAL); |
89 | |
90 | /* |
91 | * We create multiple entries on the shutdown list here, since |
92 | * this configuration routine is fairly complicated in and of |
93 | * itself, and this makes backing out of a failed configuration |
94 | * much simpler. |
95 | */ |
96 | |
97 | raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; |
98 | |
99 | /* create a parity logging configuration structure */ |
100 | RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), |
101 | (RF_ParityLoggingConfigInfo_t *), |
102 | raidPtr->cleanupList); |
103 | if (info == NULL) |
104 | return (ENOMEM); |
105 | layoutPtr->layoutSpecificInfo = (void *) info; |
106 | |
107 | /* the stripe identifier must identify the disks in each stripe, IN |
108 | * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ |
109 | info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), |
110 | (raidPtr->numCol), |
111 | raidPtr->cleanupList); |
112 | if (info->stripeIdentifier == NULL) |
113 | return (ENOMEM); |
114 | |
115 | startdisk = 0; |
116 | for (i = 0; i < (raidPtr->numCol); i++) { |
117 | for (j = 0; j < (raidPtr->numCol); j++) { |
118 | info->stripeIdentifier[i][j] = (startdisk + j) % |
119 | (raidPtr->numCol - 1); |
120 | } |
121 | if ((--startdisk) < 0) |
122 | startdisk = raidPtr->numCol - 1 - 1; |
123 | } |
124 | |
125 | /* fill in the remaining layout parameters */ |
126 | layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; |
127 | layoutPtr->numParityCol = 1; |
128 | layoutPtr->numParityLogCol = 1; |
129 | layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - |
130 | layoutPtr->numParityLogCol; |
131 | layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * |
132 | layoutPtr->sectorsPerStripeUnit; |
133 | layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; |
134 | raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * |
135 | layoutPtr->sectorsPerStripeUnit; |
136 | |
137 | raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * |
138 | layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; |
139 | |
140 | /* configure parity log parameters |
141 | * |
142 | * parameter comment/constraints |
143 | * ------------------------------------------- |
144 | * numParityRegions* all regions (except possibly last) |
145 | * of equal size |
146 | * totalInCoreLogCapacity* amount of memory in bytes available |
147 | * for in-core logs (default 1 MB) |
148 | * numSectorsPerLog# capacity of an in-core log in sectors |
149 | * (1 * disk track) |
150 | * numParityLogs total number of in-core logs, |
151 | * should be at least numParityRegions |
152 | * regionLogCapacity size of a region log (except possibly |
153 | * last one) in sectors |
154 | * totalLogCapacity total amount of log space in sectors |
155 | * |
156 | * where '*' denotes a user settable parameter. |
157 | * Note that logs are fixed to be the size of a disk track, |
158 | * value #defined in rf_paritylog.h |
159 | * |
160 | */ |
161 | |
162 | totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; |
163 | raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; |
164 | if (rf_parityLogDebug) |
165 | printf("bytes per sector %d\n" , raidPtr->bytesPerSector); |
166 | |
167 | /* reduce fragmentation within a disk region by adjusting the number |
168 | * of regions in an attempt to allow an integral number of logs to fit |
169 | * into a disk region */ |
170 | fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; |
171 | if (fragmentation > 0) |
172 | for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) { |
173 | if (((totalLogCapacity / (rf_numParityRegions + i)) % |
174 | raidPtr->numSectorsPerLog) < fragmentation) { |
175 | rf_numParityRegions++; |
176 | raidPtr->regionLogCapacity = totalLogCapacity / |
177 | rf_numParityRegions; |
178 | fragmentation = raidPtr->regionLogCapacity % |
179 | raidPtr->numSectorsPerLog; |
180 | } |
181 | if (((totalLogCapacity / (rf_numParityRegions - i)) % |
182 | raidPtr->numSectorsPerLog) < fragmentation) { |
183 | rf_numParityRegions--; |
184 | raidPtr->regionLogCapacity = totalLogCapacity / |
185 | rf_numParityRegions; |
186 | fragmentation = raidPtr->regionLogCapacity % |
187 | raidPtr->numSectorsPerLog; |
188 | } |
189 | } |
190 | /* ensure integral number of regions per log */ |
191 | raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / |
192 | raidPtr->numSectorsPerLog) * |
193 | raidPtr->numSectorsPerLog; |
194 | |
195 | raidPtr->numParityLogs = rf_totalInCoreLogCapacity / |
196 | (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); |
197 | /* to avoid deadlock, must ensure that enough logs exist for each |
198 | * region to have one simultaneously */ |
199 | if (raidPtr->numParityLogs < rf_numParityRegions) |
200 | raidPtr->numParityLogs = rf_numParityRegions; |
201 | |
202 | /* create region information structs */ |
203 | printf("Allocating %d bytes for in-core parity region info\n" , |
204 | (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t))); |
205 | RF_Malloc(raidPtr->regionInfo, |
206 | (rf_numParityRegions * sizeof(RF_RegionInfo_t)), |
207 | (RF_RegionInfo_t *)); |
208 | if (raidPtr->regionInfo == NULL) |
209 | return (ENOMEM); |
210 | |
211 | /* last region may not be full capacity */ |
212 | lastRegionCapacity = raidPtr->regionLogCapacity; |
213 | while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + |
214 | lastRegionCapacity > totalLogCapacity) |
215 | lastRegionCapacity = lastRegionCapacity - |
216 | raidPtr->numSectorsPerLog; |
217 | |
218 | raidPtr->regionParityRange = raidPtr->sectorsPerDisk / |
219 | rf_numParityRegions; |
220 | maxRegionParityRange = raidPtr->regionParityRange; |
221 | |
222 | /* i can't remember why this line is in the code -wvcii 6/30/95 */ |
223 | /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) |
224 | regionParityRange++; */ |
225 | |
226 | /* build pool of unused parity logs */ |
227 | printf("Allocating %d bytes for %d parity logs\n" , |
228 | raidPtr->numParityLogs * raidPtr->numSectorsPerLog * |
229 | raidPtr->bytesPerSector, |
230 | raidPtr->numParityLogs); |
231 | RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * |
232 | raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, |
233 | (void *)); |
234 | if (raidPtr->parityLogBufferHeap == NULL) |
235 | return (ENOMEM); |
236 | lHeapPtr = raidPtr->parityLogBufferHeap; |
237 | rf_init_mutex2(raidPtr->parityLogPool.mutex, IPL_VM); |
238 | for (i = 0; i < raidPtr->numParityLogs; i++) { |
239 | if (i == 0) { |
240 | RF_Malloc(raidPtr->parityLogPool.parityLogs, |
241 | sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); |
242 | if (raidPtr->parityLogPool.parityLogs == NULL) { |
243 | RF_Free(raidPtr->parityLogBufferHeap, |
244 | raidPtr->numParityLogs * |
245 | raidPtr->numSectorsPerLog * |
246 | raidPtr->bytesPerSector); |
247 | return (ENOMEM); |
248 | } |
249 | l = raidPtr->parityLogPool.parityLogs; |
250 | } else { |
251 | RF_Malloc(l->next, sizeof(RF_ParityLog_t), |
252 | (RF_ParityLog_t *)); |
253 | if (l->next == NULL) { |
254 | RF_Free(raidPtr->parityLogBufferHeap, |
255 | raidPtr->numParityLogs * |
256 | raidPtr->numSectorsPerLog * |
257 | raidPtr->bytesPerSector); |
258 | for (l = raidPtr->parityLogPool.parityLogs; |
259 | l; |
260 | l = next) { |
261 | next = l->next; |
262 | if (l->records) |
263 | RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); |
264 | RF_Free(l, sizeof(RF_ParityLog_t)); |
265 | } |
266 | return (ENOMEM); |
267 | } |
268 | l = l->next; |
269 | } |
270 | l->bufPtr = lHeapPtr; |
271 | lHeapPtr = (char *)lHeapPtr + raidPtr->numSectorsPerLog * |
272 | raidPtr->bytesPerSector; |
273 | RF_Malloc(l->records, (raidPtr->numSectorsPerLog * |
274 | sizeof(RF_ParityLogRecord_t)), |
275 | (RF_ParityLogRecord_t *)); |
276 | if (l->records == NULL) { |
277 | RF_Free(raidPtr->parityLogBufferHeap, |
278 | raidPtr->numParityLogs * |
279 | raidPtr->numSectorsPerLog * |
280 | raidPtr->bytesPerSector); |
281 | for (l = raidPtr->parityLogPool.parityLogs; |
282 | l; |
283 | l = next) { |
284 | next = l->next; |
285 | if (l->records) |
286 | RF_Free(l->records, |
287 | (raidPtr->numSectorsPerLog * |
288 | sizeof(RF_ParityLogRecord_t))); |
289 | RF_Free(l, sizeof(RF_ParityLog_t)); |
290 | } |
291 | return (ENOMEM); |
292 | } |
293 | } |
294 | rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); |
295 | /* build pool of region buffers */ |
296 | rf_init_mutex2(raidPtr->regionBufferPool.mutex, IPL_VM); |
297 | rf_init_cond2(raidPtr->regionBufferPool.cond, "rfrbpl" ); |
298 | raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * |
299 | raidPtr->bytesPerSector; |
300 | printf("regionBufferPool.bufferSize %d\n" , |
301 | raidPtr->regionBufferPool.bufferSize); |
302 | |
303 | /* for now, only one region at a time may be reintegrated */ |
304 | raidPtr->regionBufferPool.totalBuffers = 1; |
305 | |
306 | raidPtr->regionBufferPool.availableBuffers = |
307 | raidPtr->regionBufferPool.totalBuffers; |
308 | raidPtr->regionBufferPool.availBuffersIndex = 0; |
309 | raidPtr->regionBufferPool.emptyBuffersIndex = 0; |
310 | printf("Allocating %d bytes for regionBufferPool\n" , |
311 | (int) (raidPtr->regionBufferPool.totalBuffers * |
312 | sizeof(void *))); |
313 | RF_Malloc(raidPtr->regionBufferPool.buffers, |
314 | raidPtr->regionBufferPool.totalBuffers * sizeof(void *), |
315 | (void **)); |
316 | if (raidPtr->regionBufferPool.buffers == NULL) { |
317 | return (ENOMEM); |
318 | } |
319 | for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { |
320 | printf("Allocating %d bytes for regionBufferPool#%d\n" , |
321 | (int) (raidPtr->regionBufferPool.bufferSize * |
322 | sizeof(char)), i); |
323 | RF_Malloc(raidPtr->regionBufferPool.buffers[i], |
324 | raidPtr->regionBufferPool.bufferSize * sizeof(char), |
325 | (void *)); |
326 | if (raidPtr->regionBufferPool.buffers[i] == NULL) { |
327 | for (j = 0; j < i; j++) { |
328 | RF_Free(raidPtr->regionBufferPool.buffers[i], |
329 | raidPtr->regionBufferPool.bufferSize * |
330 | sizeof(char)); |
331 | } |
332 | RF_Free(raidPtr->regionBufferPool.buffers, |
333 | raidPtr->regionBufferPool.totalBuffers * |
334 | sizeof(void *)); |
335 | return (ENOMEM); |
336 | } |
337 | printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n" , i, |
338 | (long) raidPtr->regionBufferPool.buffers[i]); |
339 | } |
340 | rf_ShutdownCreate(listp, |
341 | rf_ShutdownParityLoggingRegionBufferPool, |
342 | raidPtr); |
343 | /* build pool of parity buffers */ |
344 | parityBufferCapacity = maxRegionParityRange; |
345 | rf_init_mutex2(raidPtr->parityBufferPool.mutex, IPL_VM); |
346 | rf_init_cond2(raidPtr->parityBufferPool.cond, "rfpbpl" ); |
347 | raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * |
348 | raidPtr->bytesPerSector; |
349 | printf("parityBufferPool.bufferSize %d\n" , |
350 | raidPtr->parityBufferPool.bufferSize); |
351 | |
352 | /* for now, only one region at a time may be reintegrated */ |
353 | raidPtr->parityBufferPool.totalBuffers = 1; |
354 | |
355 | raidPtr->parityBufferPool.availableBuffers = |
356 | raidPtr->parityBufferPool.totalBuffers; |
357 | raidPtr->parityBufferPool.availBuffersIndex = 0; |
358 | raidPtr->parityBufferPool.emptyBuffersIndex = 0; |
359 | printf("Allocating %d bytes for parityBufferPool of %d units\n" , |
360 | (int) (raidPtr->parityBufferPool.totalBuffers * |
361 | sizeof(void *)), |
362 | raidPtr->parityBufferPool.totalBuffers ); |
363 | RF_Malloc(raidPtr->parityBufferPool.buffers, |
364 | raidPtr->parityBufferPool.totalBuffers * sizeof(void *), |
365 | (void **)); |
366 | if (raidPtr->parityBufferPool.buffers == NULL) { |
367 | return (ENOMEM); |
368 | } |
369 | for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { |
370 | printf("Allocating %d bytes for parityBufferPool#%d\n" , |
371 | (int) (raidPtr->parityBufferPool.bufferSize * |
372 | sizeof(char)),i); |
373 | RF_Malloc(raidPtr->parityBufferPool.buffers[i], |
374 | raidPtr->parityBufferPool.bufferSize * sizeof(char), |
375 | (void *)); |
376 | if (raidPtr->parityBufferPool.buffers == NULL) { |
377 | for (j = 0; j < i; j++) { |
378 | RF_Free(raidPtr->parityBufferPool.buffers[i], |
379 | raidPtr->regionBufferPool.bufferSize * |
380 | sizeof(char)); |
381 | } |
382 | RF_Free(raidPtr->parityBufferPool.buffers, |
383 | raidPtr->regionBufferPool.totalBuffers * |
384 | sizeof(void *)); |
385 | return (ENOMEM); |
386 | } |
387 | printf("parityBufferPool.buffers[%d] = %lx\n" , i, |
388 | (long) raidPtr->parityBufferPool.buffers[i]); |
389 | } |
390 | rf_ShutdownCreate(listp, |
391 | rf_ShutdownParityLoggingParityBufferPool, |
392 | raidPtr); |
393 | /* initialize parityLogDiskQueue */ |
394 | rf_init_mutex2(raidPtr->parityLogDiskQueue.mutex, IPL_VM); |
395 | rf_init_cond2(raidPtr->parityLogDiskQueue.cond, "rfpldq" ); |
396 | raidPtr->parityLogDiskQueue.flushQueue = NULL; |
397 | raidPtr->parityLogDiskQueue.reintQueue = NULL; |
398 | raidPtr->parityLogDiskQueue.bufHead = NULL; |
399 | raidPtr->parityLogDiskQueue.bufTail = NULL; |
400 | raidPtr->parityLogDiskQueue.reintHead = NULL; |
401 | raidPtr->parityLogDiskQueue.reintTail = NULL; |
402 | raidPtr->parityLogDiskQueue.logBlockHead = NULL; |
403 | raidPtr->parityLogDiskQueue.logBlockTail = NULL; |
404 | raidPtr->parityLogDiskQueue.reintBlockHead = NULL; |
405 | raidPtr->parityLogDiskQueue.reintBlockTail = NULL; |
406 | raidPtr->parityLogDiskQueue.freeDataList = NULL; |
407 | raidPtr->parityLogDiskQueue.freeCommonList = NULL; |
408 | |
409 | rf_ShutdownCreate(listp, |
410 | rf_ShutdownParityLoggingDiskQueue, |
411 | raidPtr); |
412 | for (i = 0; i < rf_numParityRegions; i++) { |
413 | rf_init_mutex2(raidPtr->regionInfo[i].mutex, IPL_VM); |
414 | rf_init_mutex2(raidPtr->regionInfo[i].reintMutex, IPL_VM); |
415 | raidPtr->regionInfo[i].reintInProgress = RF_FALSE; |
416 | raidPtr->regionInfo[i].regionStartAddr = |
417 | raidPtr->regionLogCapacity * i; |
418 | raidPtr->regionInfo[i].parityStartAddr = |
419 | raidPtr->regionParityRange * i; |
420 | if (i < rf_numParityRegions - 1) { |
421 | raidPtr->regionInfo[i].capacity = |
422 | raidPtr->regionLogCapacity; |
423 | raidPtr->regionInfo[i].numSectorsParity = |
424 | raidPtr->regionParityRange; |
425 | } else { |
426 | raidPtr->regionInfo[i].capacity = |
427 | lastRegionCapacity; |
428 | raidPtr->regionInfo[i].numSectorsParity = |
429 | raidPtr->sectorsPerDisk - |
430 | raidPtr->regionParityRange * i; |
431 | if (raidPtr->regionInfo[i].numSectorsParity > |
432 | maxRegionParityRange) |
433 | maxRegionParityRange = |
434 | raidPtr->regionInfo[i].numSectorsParity; |
435 | } |
436 | raidPtr->regionInfo[i].diskCount = 0; |
437 | RF_ASSERT(raidPtr->regionInfo[i].capacity + |
438 | raidPtr->regionInfo[i].regionStartAddr <= |
439 | totalLogCapacity); |
440 | RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + |
441 | raidPtr->regionInfo[i].numSectorsParity <= |
442 | raidPtr->sectorsPerDisk); |
443 | printf("Allocating %d bytes for region %d\n" , |
444 | (int) (raidPtr->regionInfo[i].capacity * |
445 | sizeof(RF_DiskMap_t)), i); |
446 | RF_Malloc(raidPtr->regionInfo[i].diskMap, |
447 | (raidPtr->regionInfo[i].capacity * |
448 | sizeof(RF_DiskMap_t)), |
449 | (RF_DiskMap_t *)); |
450 | if (raidPtr->regionInfo[i].diskMap == NULL) { |
451 | for (j = 0; j < i; j++) |
452 | FreeRegionInfo(raidPtr, j); |
453 | RF_Free(raidPtr->regionInfo, |
454 | (rf_numParityRegions * |
455 | sizeof(RF_RegionInfo_t))); |
456 | return (ENOMEM); |
457 | } |
458 | raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; |
459 | raidPtr->regionInfo[i].coreLog = NULL; |
460 | } |
461 | rf_ShutdownCreate(listp, |
462 | rf_ShutdownParityLoggingRegionInfo, |
463 | raidPtr); |
464 | RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); |
465 | raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; |
466 | rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, |
467 | rf_ParityLoggingDiskManager, raidPtr,"rf_log" ); |
468 | if (rc) { |
469 | raidPtr->parityLogDiskQueue.threadState = 0; |
470 | RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n" , |
471 | __FILE__, __LINE__, rc); |
472 | return (ENOMEM); |
473 | } |
474 | /* wait for thread to start */ |
475 | rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex); |
476 | while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) { |
477 | rf_wait_cond2(raidPtr->parityLogDiskQueue.cond, |
478 | raidPtr->parityLogDiskQueue.mutex); |
479 | } |
480 | rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex); |
481 | |
482 | rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); |
483 | if (rf_parityLogDebug) { |
484 | printf(" size of disk log in sectors: %d\n" , |
485 | (int) totalLogCapacity); |
486 | printf(" total number of parity regions is %d\n" , (int) rf_numParityRegions); |
487 | printf(" nominal sectors of log per parity region is %d\n" , (int) raidPtr->regionLogCapacity); |
488 | printf(" nominal region fragmentation is %d sectors\n" , (int) fragmentation); |
489 | printf(" total number of parity logs is %d\n" , raidPtr->numParityLogs); |
490 | printf(" parity log size is %d sectors\n" , raidPtr->numSectorsPerLog); |
491 | printf(" total in-core log space is %d bytes\n" , (int) rf_totalInCoreLogCapacity); |
492 | } |
493 | rf_EnableParityLogging(raidPtr); |
494 | |
495 | return (0); |
496 | } |
497 | |
498 | static void |
499 | FreeRegionInfo( |
500 | RF_Raid_t * raidPtr, |
501 | RF_RegionId_t regionID) |
502 | { |
503 | RF_Free(raidPtr->regionInfo[regionID].diskMap, |
504 | (raidPtr->regionInfo[regionID].capacity * |
505 | sizeof(RF_DiskMap_t))); |
506 | if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { |
507 | rf_ReleaseParityLogs(raidPtr, |
508 | raidPtr->regionInfo[regionID].coreLog); |
509 | raidPtr->regionInfo[regionID].coreLog = NULL; |
510 | } else { |
511 | RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); |
512 | RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); |
513 | } |
514 | rf_destroy_mutex2(raidPtr->regionInfo[regionID].reintMutex); |
515 | rf_destroy_mutex2(raidPtr->regionInfo[regionID].mutex); |
516 | } |
517 | |
518 | |
519 | static void |
520 | FreeParityLogQueue(RF_Raid_t * raidPtr) |
521 | { |
522 | RF_ParityLog_t *l1, *l2; |
523 | |
524 | l1 = raidPtr->parityLogPool.parityLogs; |
525 | while (l1) { |
526 | l2 = l1; |
527 | l1 = l2->next; |
528 | RF_Free(l2->records, (raidPtr->numSectorsPerLog * |
529 | sizeof(RF_ParityLogRecord_t))); |
530 | RF_Free(l2, sizeof(RF_ParityLog_t)); |
531 | } |
532 | rf_destroy_mutex2(raidPtr->parityLogPool.mutex); |
533 | } |
534 | |
535 | |
536 | static void |
537 | FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue) |
538 | { |
539 | int i; |
540 | |
541 | if (queue->availableBuffers != queue->totalBuffers) { |
542 | printf("Attempt to free region queue which is still in use!\n" ); |
543 | RF_ASSERT(0); |
544 | } |
545 | for (i = 0; i < queue->totalBuffers; i++) |
546 | RF_Free(queue->buffers[i], queue->bufferSize); |
547 | RF_Free(queue->buffers, queue->totalBuffers * sizeof(void *)); |
548 | rf_destroy_mutex2(queue->mutex); |
549 | rf_destroy_cond2(queue->cond); |
550 | } |
551 | |
552 | static void |
553 | rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) |
554 | { |
555 | RF_Raid_t *raidPtr; |
556 | RF_RegionId_t i; |
557 | |
558 | raidPtr = (RF_Raid_t *) arg; |
559 | if (rf_parityLogDebug) { |
560 | printf("raid%d: ShutdownParityLoggingRegionInfo\n" , |
561 | raidPtr->raidid); |
562 | } |
563 | /* free region information structs */ |
564 | for (i = 0; i < rf_numParityRegions; i++) |
565 | FreeRegionInfo(raidPtr, i); |
566 | RF_Free(raidPtr->regionInfo, (rf_numParityRegions * |
567 | sizeof(raidPtr->regionInfo))); |
568 | raidPtr->regionInfo = NULL; |
569 | } |
570 | |
571 | static void |
572 | rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) |
573 | { |
574 | RF_Raid_t *raidPtr; |
575 | |
576 | raidPtr = (RF_Raid_t *) arg; |
577 | if (rf_parityLogDebug) { |
578 | printf("raid%d: ShutdownParityLoggingPool\n" , raidPtr->raidid); |
579 | } |
580 | /* free contents of parityLogPool */ |
581 | FreeParityLogQueue(raidPtr); |
582 | RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * |
583 | raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); |
584 | } |
585 | |
586 | static void |
587 | rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) |
588 | { |
589 | RF_Raid_t *raidPtr; |
590 | |
591 | raidPtr = (RF_Raid_t *) arg; |
592 | if (rf_parityLogDebug) { |
593 | printf("raid%d: ShutdownParityLoggingRegionBufferPool\n" , |
594 | raidPtr->raidid); |
595 | } |
596 | FreeRegionBufferQueue(&raidPtr->regionBufferPool); |
597 | } |
598 | |
599 | static void |
600 | rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) |
601 | { |
602 | RF_Raid_t *raidPtr; |
603 | |
604 | raidPtr = (RF_Raid_t *) arg; |
605 | if (rf_parityLogDebug) { |
606 | printf("raid%d: ShutdownParityLoggingParityBufferPool\n" , |
607 | raidPtr->raidid); |
608 | } |
609 | FreeRegionBufferQueue(&raidPtr->parityBufferPool); |
610 | } |
611 | |
612 | static void |
613 | rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) |
614 | { |
615 | RF_ParityLogData_t *d; |
616 | RF_CommonLogData_t *c; |
617 | RF_Raid_t *raidPtr; |
618 | |
619 | raidPtr = (RF_Raid_t *) arg; |
620 | if (rf_parityLogDebug) { |
621 | printf("raid%d: ShutdownParityLoggingDiskQueue\n" , |
622 | raidPtr->raidid); |
623 | } |
624 | /* free disk manager stuff */ |
625 | RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); |
626 | RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); |
627 | RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); |
628 | RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); |
629 | while (raidPtr->parityLogDiskQueue.freeDataList) { |
630 | d = raidPtr->parityLogDiskQueue.freeDataList; |
631 | raidPtr->parityLogDiskQueue.freeDataList = |
632 | raidPtr->parityLogDiskQueue.freeDataList->next; |
633 | RF_Free(d, sizeof(RF_ParityLogData_t)); |
634 | } |
635 | while (raidPtr->parityLogDiskQueue.freeCommonList) { |
636 | c = raidPtr->parityLogDiskQueue.freeCommonList; |
637 | raidPtr->parityLogDiskQueue.freeCommonList = c->next; |
638 | /* init is in rf_paritylog.c */ |
639 | rf_destroy_mutex2(c->mutex); |
640 | RF_Free(c, sizeof(RF_CommonLogData_t)); |
641 | } |
642 | |
643 | rf_destroy_mutex2(raidPtr->parityLogDiskQueue.mutex); |
644 | rf_destroy_cond2(raidPtr->parityLogDiskQueue.cond); |
645 | } |
646 | |
/*
 * Shutdown-list hook: stop the parity logging disk manager thread.
 * Signals RF_PLOG_TERMINATE under the queue mutex, then blocks until
 * the thread acknowledges with RF_PLOG_SHUTDOWN.
 */
static void
rf_ShutdownParityLogging(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging\n" , raidPtr->raidid);
	}
	/* shutdown disk thread */
	/* This has the desirable side-effect of forcing all regions to be
	 * reintegrated.  This is necessary since all parity log maps are
	 * currently held in volatile memory. */

	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
	rf_signal_cond2(raidPtr->parityLogDiskQueue.cond);
	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
	/*
	 * pLogDiskThread will now terminate when queues are cleared
	 * now wait for it to be done
	 */
	/* re-acquire the queue mutex and sleep until the disk thread
	 * posts RF_PLOG_SHUTDOWN */
	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
		rf_wait_cond2(raidPtr->parityLogDiskQueue.cond,
			      raidPtr->parityLogDiskQueue.mutex);
	}
	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging done (thread completed)\n" , raidPtr->raidid);
	}
}
679 | |
680 | int |
681 | rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr) |
682 | { |
683 | return (20); |
684 | } |
685 | |
686 | RF_HeadSepLimit_t |
687 | rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr) |
688 | { |
689 | return (10); |
690 | } |
/* return the region ID for a given RAID address */
RF_RegionId_t
rf_MapRegionIDParityLogging(
    RF_Raid_t * raidPtr,
    RF_SectorNum_t address)
{
	RF_RegionId_t regionID;

	/* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
	/* regions partition a disk's parity space into equal
	 * regionParityRange-sized slices */
	regionID = address / raidPtr->regionParityRange;
	if (regionID == rf_numParityRegions) {
		/* last region may be larger than other regions */
		regionID--;
	}
	/* address must fall within the parity range the chosen region covers */
	RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
	RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
		  raidPtr->regionInfo[regionID].numSectorsParity);
	RF_ASSERT(regionID < rf_numParityRegions);
	return (regionID);
}
711 | |
712 | |
713 | /* given a logical RAID sector, determine physical disk address of data */ |
714 | void |
715 | rf_MapSectorParityLogging( |
716 | RF_Raid_t * raidPtr, |
717 | RF_RaidAddr_t raidSector, |
718 | RF_RowCol_t * col, |
719 | RF_SectorNum_t * diskSector, |
720 | int remap) |
721 | { |
722 | RF_StripeNum_t SUID = raidSector / |
723 | raidPtr->Layout.sectorsPerStripeUnit; |
724 | /* *col = (SUID % (raidPtr->numCol - |
725 | * raidPtr->Layout.numParityLogCol)); */ |
726 | *col = SUID % raidPtr->Layout.numDataCol; |
727 | *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * |
728 | raidPtr->Layout.sectorsPerStripeUnit + |
729 | (raidSector % raidPtr->Layout.sectorsPerStripeUnit); |
730 | } |
731 | |
732 | |
733 | /* given a logical RAID sector, determine physical disk address of parity */ |
734 | void |
735 | rf_MapParityParityLogging( |
736 | RF_Raid_t * raidPtr, |
737 | RF_RaidAddr_t raidSector, |
738 | RF_RowCol_t * col, |
739 | RF_SectorNum_t * diskSector, |
740 | int remap) |
741 | { |
742 | RF_StripeNum_t SUID = raidSector / |
743 | raidPtr->Layout.sectorsPerStripeUnit; |
744 | |
745 | /* *col = |
746 | * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt |
747 | * r->numCol - raidPtr->Layout.numParityLogCol); */ |
748 | *col = raidPtr->Layout.numDataCol; |
749 | *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * |
750 | raidPtr->Layout.sectorsPerStripeUnit + |
751 | (raidSector % raidPtr->Layout.sectorsPerStripeUnit); |
752 | } |
753 | |
754 | |
/* given a regionID and sector offset, determine the physical disk address of the parity log */
void
rf_MapLogParityLogging(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_SectorNum_t regionOffset,
    RF_RowCol_t * col,
    RF_SectorNum_t * startSector)
{
	/* the parity log always lives on the last column */
	*col = raidPtr->numCol - 1;
	/* offset within the region's slice of the log disk */
	*startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
}
767 | |
768 | |
/* given a regionID, determine the physical disk address of the logged
   parity for that region */
void
rf_MapRegionParity(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_RowCol_t * col,
    RF_SectorNum_t * startSector,
    RF_SectorCount_t * numSector)
{
	/* parity is on the second-to-last column (last column is the log) */
	*col = raidPtr->numCol - 2;
	*startSector = raidPtr->regionInfo[regionID].parityStartAddr;
	*numSector = raidPtr->regionInfo[regionID].numSectorsParity;
}
783 | |
784 | |
785 | /* given a logical RAID address, determine the participating disks in |
786 | the stripe */ |
787 | void |
788 | rf_IdentifyStripeParityLogging( |
789 | RF_Raid_t * raidPtr, |
790 | RF_RaidAddr_t addr, |
791 | RF_RowCol_t ** diskids) |
792 | { |
793 | RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, |
794 | addr); |
795 | RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) |
796 | raidPtr->Layout.layoutSpecificInfo; |
797 | *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; |
798 | } |
799 | |
800 | |
801 | void |
802 | rf_MapSIDToPSIDParityLogging( |
803 | RF_RaidLayout_t * layoutPtr, |
804 | RF_StripeNum_t stripeID, |
805 | RF_StripeNum_t * psID, |
806 | RF_ReconUnitNum_t * which_ru) |
807 | { |
808 | *which_ru = 0; |
809 | *psID = stripeID; |
810 | } |
811 | |
812 | |
/* select an algorithm for performing an access.  Returns two pointers,
 * one to a function that will return information about the DAG, and
 * another to a function that will create the dag.
 *
 * On return, *createFunc points at the DAG-creation function chosen
 * for this access, or is NULL when no DAG can service it (more than
 * one failed disk in the group, or an unserviceable degraded write).
 * As a side effect, when the failed unit has already been
 * reconstructed to a spare, the failed PDA(s) in *asmp are redirected
 * in place and the failure counters are cleared.
 */
void
rf_ParityLoggingDagSelect(
	RF_Raid_t * raidPtr,
	RF_IoType_t type,
	RF_AccessStripeMap_t * asmp,
	RF_VoidFuncPtr * createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = NULL;
	RF_RowCol_t fcol;
	RF_RowStatus_t rstat;
	int prior_recon;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	/* two or more failures in one parity group cannot be serviced */
	if (asmp->numDataFailed + asmp->numParityFailed > 1) {
		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n" );
		*createFunc = NULL;
		return;
	} else
		if (asmp->numDataFailed + asmp->numParityFailed == 1) {

			/* if under recon & already reconstructed, redirect
			 * the access to the spare drive and eliminate the
			 * failure indication */
			failedPDA = asmp->failedPDAs[0];
			fcol = failedPDA->col;
			rstat = raidPtr->status;
			/* the failed unit counts as recovered if the array was
			 * fully reconfigured, or if reconstruction is running
			 * and this RU has already been rebuilt */
			prior_recon = (rstat == rf_rs_reconfigured) || (
			    (rstat == rf_rs_reconstructing) ?
			    rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0
			    );
			if (prior_recon) {
				/* remember the original location for the
				 * debug printf below */
				RF_RowCol_t oc = failedPDA->col;
				RF_SectorNum_t oo = failedPDA->startSector;
				if (layoutPtr->map->flags &
				    RF_DISTRIBUTE_SPARE) {
					/* redirect to dist spare space */

					if (failedPDA == asmp->parityInfo) {

						/* parity has failed */
						(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress,
						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						if (asmp->parityInfo->next) {	/* redir 2nd component,
										 * if any */
							RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
							RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
							p->col = failedPDA->col;
							/* keep the second component at the same
							 * offset within the remapped stripe unit */
							p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
							    SUoffs;	/* cheating:
									 * startSector is not
									 * really a RAID address */
						}
					} else
						if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
							RF_ASSERT(0);	/* should not ever
									 * happen */
						} else {

							/* data has failed */
							(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress,
							    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						}

				} else {
					/* redirect to dedicated spare space */

					failedPDA->col = raidPtr->Disks[fcol].spareCol;

					/* the parity may have two distinct
					 * components, both of which may need
					 * to be redirected */
					if (asmp->parityInfo->next) {
						if (failedPDA == asmp->parityInfo) {
							failedPDA->next->col = failedPDA->col;
						} else
							if (failedPDA == asmp->parityInfo->next) {	/* paranoid: should never occur */
								asmp->parityInfo->col = failedPDA->col;
							}
					}
				}

				RF_ASSERT(failedPDA->col != -1);

				if (rf_dagDebug || rf_mapDebug) {
					printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n" ,
					    raidPtr->raidid, type, oc, (long) oo, failedPDA->col, (long) failedPDA->startSector);
				}
				/* redirection succeeded: treat the access as
				 * fault-free from here on */
				asmp->numDataFailed = asmp->numParityFailed = 0;
			}
		}
	if (type == RF_IO_TYPE_READ) {

		/* reads: fault-free path unless user data is still failed */
		if (asmp->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;

	} else {


		/* if mirroring, always use large writes.  If the access
		 * requires two distinct parity updates, always do a small
		 * write.  If the stripe contains a failure but the access
		 * does not, do a small write.  The first conditional
		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
		 * less-than-or-equal rather than just a less-than because
		 * when G is 3 or 4, numDataCol/2 is 1, and I want
		 * single-stripe-unit updates to use just one disk. */
		if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
			if (((asmp->numStripeUnitsAccessed <=
			    (layoutPtr->numDataCol / 2)) &&
			    (layoutPtr->numDataCol != 1)) ||
			    (asmp->parityInfo->next != NULL) ||
			    rf_CheckStripeForFailures(raidPtr, asmp)) {
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
			} else
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
		} else
			/* degraded writes: parity failed -> write data only;
			 * data failed -> degraded-write DAG, which can only
			 * handle a single full-stripe-unit failed access */
			if (asmp->numParityFailed == 1)
				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
			else
				if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
					*createFunc = NULL;
				else
					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
	}
}
948 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
949 | |