1 | /* $NetBSD: rf_reconmap.c,v 1.34 2012/02/20 22:42:05 oster Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /************************************************************************* |
30 | * rf_reconmap.c |
31 | * |
32 | * code to maintain a map of what sectors have/have not been reconstructed |
33 | * |
34 | *************************************************************************/ |
35 | |
36 | #include <sys/cdefs.h> |
37 | __KERNEL_RCSID(0, "$NetBSD: rf_reconmap.c,v 1.34 2012/02/20 22:42:05 oster Exp $" ); |
38 | |
39 | #include "rf_raid.h" |
40 | #include <sys/time.h> |
41 | #include "rf_general.h" |
42 | #include "rf_utils.h" |
43 | |
44 | /* special pointer values indicating that a reconstruction unit |
45 | * has been either totally reconstructed or not at all. Both |
46 | * are illegal pointer values, so you have to be careful not to |
47 | * dereference through them. RU_NOTHING must be zero, since |
48 | * MakeReconMap uses memset to initialize the structure. These are used |
49 | * only at the head of the list. |
50 | */ |
51 | #define RU_ALL ((RF_ReconMapListElem_t *) -1) |
52 | #define RU_NOTHING ((RF_ReconMapListElem_t *) 0) |
53 | |
54 | /* For most reconstructs we need at most 3 RF_ReconMapListElem_t's. |
55 | * Bounding the number we need is quite difficult, as it depends on how |
56 | * badly the sectors to be reconstructed get divided up. In the current |
 * code, the reconstructed sectors appear aligned on stripe boundaries,
58 | * and are always presented in stripe width units, so we're probably |
59 | * allocating quite a bit more than we'll ever need. |
60 | */ |
61 | #define RF_NUM_RECON_POOL_ELEM 100 |
62 | |
/* forward declarations for the static helpers defined below */
static void
compact_stat_entry(RF_Raid_t *, RF_ReconMap_t *, int, int);
static void crunch_list(RF_ReconMap_t *, RF_ReconMapListElem_t *);
static RF_ReconMapListElem_t *
MakeReconMapListElem(RF_ReconMap_t *, RF_SectorNum_t, RF_SectorNum_t,
    RF_ReconMapListElem_t *);
static void
FreeReconMapListElem(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t * p);
71 | |
72 | /*--------------------------------------------------------------------------- |
73 | * |
74 | * Creates and initializes new Reconstruction map |
75 | * |
76 | * ru_sectors - size of reconstruction unit in sectors |
77 | * disk_sectors - size of disk in sectors |
78 | * spareUnitsPerDisk - zero unless distributed sparing |
79 | *-------------------------------------------------------------------------*/ |
80 | |
81 | RF_ReconMap_t * |
82 | rf_MakeReconMap(RF_Raid_t *raidPtr, RF_SectorCount_t ru_sectors, |
83 | RF_SectorCount_t disk_sectors, |
84 | RF_ReconUnitCount_t spareUnitsPerDisk) |
85 | { |
86 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
87 | RF_ReconUnitCount_t num_rus = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerRU; |
88 | RF_ReconMap_t *p; |
89 | |
90 | RF_Malloc(p, sizeof(RF_ReconMap_t), (RF_ReconMap_t *)); |
91 | p->sectorsPerReconUnit = ru_sectors; |
92 | p->sectorsInDisk = disk_sectors; |
93 | |
94 | p->totalRUs = num_rus; |
95 | p->spareRUs = spareUnitsPerDisk; |
96 | p->unitsLeft = num_rus - spareUnitsPerDisk; |
97 | p->low_ru = 0; |
98 | p->status_size = RF_RECONMAP_SIZE; |
99 | p->high_ru = p->status_size - 1; |
100 | p->head = 0; |
101 | |
102 | RF_Malloc(p->status, p->status_size * sizeof(RF_ReconMapListElem_t *), (RF_ReconMapListElem_t **)); |
103 | RF_ASSERT(p->status != NULL); |
104 | |
105 | (void) memset((char *) p->status, 0, |
106 | p->status_size * sizeof(RF_ReconMapListElem_t *)); |
107 | |
108 | pool_init(&p->elem_pool, sizeof(RF_ReconMapListElem_t), 0, |
109 | 0, 0, "raidreconpl" , NULL, IPL_BIO); |
110 | pool_prime(&p->elem_pool, RF_NUM_RECON_POOL_ELEM); |
111 | |
112 | rf_init_mutex2(p->mutex, IPL_VM); |
113 | rf_init_cond2(p->cv, "reconupdate" ); |
114 | |
115 | return (p); |
116 | } |
117 | |
118 | |
119 | /*--------------------------------------------------------------------------- |
120 | * |
121 | * marks a new set of sectors as reconstructed. All the possible |
122 | * mergings get complicated. To simplify matters, the approach I take |
123 | * is to just dump something into the list, and then clean it up |
124 | * (i.e. merge elements and eliminate redundant ones) in a second pass |
125 | * over the list (compact_stat_entry()). Not 100% efficient, since a |
126 | * structure can be allocated and then immediately freed, but it keeps |
127 | * this code from becoming (more of) a nightmare of special cases. |
128 | * The only thing that compact_stat_entry() assumes is that the list |
129 | * is sorted by startSector, and so this is the only condition I |
130 | * maintain here. (MCH) |
131 | * |
132 | * This code now uses a pool instead of the previous malloc/free |
133 | * stuff. |
134 | *-------------------------------------------------------------------------*/ |
135 | |
136 | void |
137 | rf_ReconMapUpdate(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, |
138 | RF_SectorNum_t startSector, RF_SectorNum_t stopSector) |
139 | { |
140 | RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; |
141 | RF_SectorNum_t i, first_in_RU, last_in_RU, ru; |
142 | RF_ReconMapListElem_t *p, *pt; |
143 | |
144 | rf_lock_mutex2(mapPtr->mutex); |
145 | while(mapPtr->lock) { |
146 | rf_wait_cond2(mapPtr->cv, mapPtr->mutex); |
147 | } |
148 | mapPtr->lock = 1; |
149 | rf_unlock_mutex2(mapPtr->mutex); |
150 | RF_ASSERT(startSector >= 0 && stopSector < mapPtr->sectorsInDisk && |
151 | stopSector >= startSector); |
152 | |
153 | while (startSector <= stopSector) { |
154 | i = startSector / mapPtr->sectorsPerReconUnit; |
155 | first_in_RU = i * sectorsPerReconUnit; |
156 | last_in_RU = first_in_RU + sectorsPerReconUnit - 1; |
157 | |
158 | /* do we need to move the queue? */ |
159 | while (i > mapPtr->high_ru) { |
160 | #if 0 |
161 | #ifdef DIAGNOSTIC |
162 | /* XXX: The check below is not valid for |
163 | * RAID5_RS. It is valid for RAID 1 and RAID 5. |
164 | * The issue is that we can easily have |
165 | * RU_NOTHING entries here too, and those are |
166 | * quite correct. |
167 | */ |
168 | if (mapPtr->status[mapPtr->head]!=RU_ALL) { |
169 | printf("\nraid%d: reconmap incorrect -- working on i %" PRIu64 "\n" , |
170 | raidPtr->raidid, i); |
171 | printf("raid%d: ru %" PRIu64 " not completed!!!\n" , |
172 | raidPtr->raidid, mapPtr->head); |
173 | |
174 | printf("raid%d: low: %" PRIu64 " high: %" PRIu64 "\n" , |
175 | raidPtr->raidid, mapPtr->low_ru, mapPtr->high_ru); |
176 | |
177 | panic("reconmap incorrect" ); |
178 | } |
179 | #endif |
180 | #endif |
181 | mapPtr->low_ru++; |
182 | mapPtr->high_ru++; |
183 | /* initialize "highest" RU status entry, which |
184 | will take over the current head postion */ |
185 | mapPtr->status[mapPtr->head]=RU_NOTHING; |
186 | |
187 | /* move head too */ |
188 | mapPtr->head++; |
189 | if (mapPtr->head >= mapPtr->status_size) |
190 | mapPtr->head = 0; |
191 | |
192 | } |
193 | |
194 | ru = i - mapPtr->low_ru + mapPtr->head; |
195 | if (ru >= mapPtr->status_size) |
196 | ru = ru - mapPtr->status_size; |
197 | |
198 | if ((ru < 0) || (ru >= mapPtr->status_size)) { |
199 | printf("raid%d: ru is bogus %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n" , |
200 | raidPtr->raidid, i, ru, mapPtr->head, mapPtr->low_ru, mapPtr->high_ru); |
201 | panic("bogus ru in reconmap" ); |
202 | } |
203 | |
204 | p = mapPtr->status[ru]; |
205 | if (p != RU_ALL) { |
206 | if (p == RU_NOTHING || p->startSector > startSector) { |
207 | /* insert at front of list */ |
208 | |
209 | mapPtr->status[ru] = MakeReconMapListElem(mapPtr,startSector, RF_MIN(stopSector, last_in_RU), (p == RU_NOTHING) ? NULL : p); |
210 | |
211 | } else {/* general case */ |
212 | do { /* search for place to insert */ |
213 | pt = p; |
214 | p = p->next; |
215 | } while (p && (p->startSector < startSector)); |
216 | pt->next = MakeReconMapListElem(mapPtr,startSector, RF_MIN(stopSector, last_in_RU), p); |
217 | |
218 | } |
219 | compact_stat_entry(raidPtr, mapPtr, i, ru); |
220 | } |
221 | startSector = RF_MIN(stopSector, last_in_RU) + 1; |
222 | } |
223 | rf_lock_mutex2(mapPtr->mutex); |
224 | mapPtr->lock = 0; |
225 | rf_broadcast_cond2(mapPtr->cv); |
226 | rf_unlock_mutex2(mapPtr->mutex); |
227 | } |
228 | |
229 | |
230 | |
231 | /*--------------------------------------------------------------------------- |
232 | * |
233 | * performs whatever list compactions can be done, and frees any space |
234 | * that is no longer necessary. Assumes only that the list is sorted |
235 | * by startSector. crunch_list() compacts a single list as much as |
236 | * possible, and the second block of code deletes the entire list if |
237 | * possible. crunch_list() is also called from |
238 | * MakeReconMapAccessList(). |
239 | * |
240 | * When a recon unit is detected to be fully reconstructed, we set the |
241 | * corresponding bit in the parity stripe map so that the head follow |
242 | * code will not select this parity stripe again. This is redundant |
243 | * (but harmless) when compact_stat_entry is called from the |
244 | * reconstruction code, but necessary when called from the user-write |
245 | * code. |
246 | * |
247 | *-------------------------------------------------------------------------*/ |
248 | |
249 | static void |
250 | compact_stat_entry(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, int i, int j) |
251 | { |
252 | RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; |
253 | RF_ReconMapListElem_t *p = mapPtr->status[j]; |
254 | |
255 | crunch_list(mapPtr, p); |
256 | |
257 | if ((p->startSector == i * sectorsPerReconUnit) && |
258 | (p->stopSector == i * sectorsPerReconUnit + |
259 | sectorsPerReconUnit - 1)) { |
260 | mapPtr->status[j] = RU_ALL; |
261 | mapPtr->unitsLeft--; |
262 | FreeReconMapListElem(mapPtr, p); |
263 | } |
264 | } |
265 | |
266 | |
267 | static void |
268 | crunch_list(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *listPtr) |
269 | { |
270 | RF_ReconMapListElem_t *pt, *p = listPtr; |
271 | |
272 | if (!p) |
273 | return; |
274 | pt = p; |
275 | p = p->next; |
276 | while (p) { |
277 | if (pt->stopSector >= p->startSector - 1) { |
278 | pt->stopSector = RF_MAX(pt->stopSector, p->stopSector); |
279 | pt->next = p->next; |
280 | FreeReconMapListElem(mapPtr, p); |
281 | p = pt->next; |
282 | } else { |
283 | pt = p; |
284 | p = p->next; |
285 | } |
286 | } |
287 | } |
288 | /*--------------------------------------------------------------------------- |
289 | * |
290 | * Allocate and fill a new list element |
291 | * |
292 | *-------------------------------------------------------------------------*/ |
293 | |
294 | static RF_ReconMapListElem_t * |
295 | MakeReconMapListElem(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector, |
296 | RF_SectorNum_t stopSector, RF_ReconMapListElem_t *next) |
297 | { |
298 | RF_ReconMapListElem_t *p; |
299 | |
300 | p = pool_get(&mapPtr->elem_pool, PR_WAITOK); |
301 | p->startSector = startSector; |
302 | p->stopSector = stopSector; |
303 | p->next = next; |
304 | return (p); |
305 | } |
306 | /*--------------------------------------------------------------------------- |
307 | * |
308 | * Free a list element |
309 | * |
310 | *-------------------------------------------------------------------------*/ |
311 | |
/* return a list element to the map's pool */
static void
FreeReconMapListElem(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *p)
{
	pool_put(&mapPtr->elem_pool, p);
}
317 | /*--------------------------------------------------------------------------- |
318 | * |
319 | * Free an entire status structure. Inefficient, but can be called at |
320 | * any time. |
321 | * |
322 | *-------------------------------------------------------------------------*/ |
323 | void |
324 | rf_FreeReconMap(RF_ReconMap_t *mapPtr) |
325 | { |
326 | RF_ReconMapListElem_t *p, *q; |
327 | RF_ReconUnitCount_t numRUs; |
328 | RF_ReconUnitNum_t i; |
329 | |
330 | numRUs = mapPtr->sectorsInDisk / mapPtr->sectorsPerReconUnit; |
331 | if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit) |
332 | numRUs++; |
333 | |
334 | for (i = 0; i < mapPtr->status_size; i++) { |
335 | p = mapPtr->status[i]; |
336 | while (p != RU_NOTHING && p != RU_ALL) { |
337 | q = p; |
338 | p = p->next; |
339 | RF_Free(q, sizeof(*q)); |
340 | } |
341 | } |
342 | |
343 | rf_destroy_mutex2(mapPtr->mutex); |
344 | rf_destroy_cond2(mapPtr->cv); |
345 | |
346 | pool_destroy(&mapPtr->elem_pool); |
347 | RF_Free(mapPtr->status, mapPtr->status_size * |
348 | sizeof(RF_ReconMapListElem_t *)); |
349 | RF_Free(mapPtr, sizeof(RF_ReconMap_t)); |
350 | } |
351 | /*--------------------------------------------------------------------------- |
352 | * |
353 | * returns nonzero if the indicated RU has been reconstructed already |
354 | * |
355 | *-------------------------------------------------------------------------*/ |
356 | |
357 | int |
358 | rf_CheckRUReconstructed(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector) |
359 | { |
360 | RF_ReconUnitNum_t i; |
361 | int rv; |
362 | |
363 | i = startSector / mapPtr->sectorsPerReconUnit; |
364 | |
365 | if (i < mapPtr->low_ru) |
366 | rv = 1; |
367 | else if (i > mapPtr->high_ru) |
368 | rv = 0; |
369 | else { |
370 | i = i - mapPtr->low_ru + mapPtr->head; |
371 | if (i >= mapPtr->status_size) |
372 | i = i - mapPtr->status_size; |
373 | if (mapPtr->status[i] == RU_ALL) |
374 | rv = 1; |
375 | else |
376 | rv = 0; |
377 | } |
378 | |
379 | return rv; |
380 | } |
381 | |
/* return the number of reconstruction units not yet reconstructed */
RF_ReconUnitCount_t
rf_UnitsLeftToReconstruct(RF_ReconMap_t *mapPtr)
{
	RF_ASSERT(mapPtr != NULL);
	return (mapPtr->unitsLeft);
}
388 | |
389 | #if RF_DEBUG_RECON |
/*
 * Debug helper: whenever the whole-percent completion figure changes,
 * print "<pct> <seconds.micros>" where the time is the elapsed time
 * since *starttime.  State is kept in a static, so this assumes a
 * single reconstruction is reported at a time.
 */
void
rf_PrintReconSchedule(RF_ReconMap_t *mapPtr, struct timeval *starttime)
{
	/* last percentage printed; -1 forces output on first call */
	static int old_pctg = -1;
	struct timeval tv, diff;
	int new_pctg;

	new_pctg = 100 - (rf_UnitsLeftToReconstruct(mapPtr) *
			  100 / mapPtr->totalRUs);
	if (new_pctg != old_pctg) {
		RF_GETTIME(tv);
		RF_TIMEVAL_DIFF(starttime, &tv, &diff);
		printf("%d %d.%06d\n", (int) new_pctg, (int) diff.tv_sec,
		       (int) diff.tv_usec);
		old_pctg = new_pctg;
	}
}
407 | #endif |
408 | |
409 | |