1 | /* $NetBSD: rf_decluster.c,v 1.24 2014/03/23 09:30:59 christos Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /*---------------------------------------------------------------------- |
30 | * |
31 | * rf_decluster.c -- code related to the declustered layout |
32 | * |
33 | * Created 10-21-92 (MCH) |
34 | * |
35 | * Nov 93: adding support for distributed sparing. This code is a little |
36 | * complex: the basic layout used is as follows: |
37 | * let F = (v-1)/GCD(r,v-1). The spare space for each set of |
38 | * F consecutive fulltables is grouped together and placed after |
39 | * that set of tables. |
40 | * +------------------------------+ |
41 | * | F fulltables | |
42 | * | Spare Space | |
43 | * | F fulltables | |
44 | * | Spare Space | |
45 | * | ... | |
46 | * +------------------------------+ |
47 | * |
48 | *--------------------------------------------------------------------*/ |
49 | |
50 | #include <sys/cdefs.h> |
51 | __KERNEL_RCSID(0, "$NetBSD: rf_decluster.c,v 1.24 2014/03/23 09:30:59 christos Exp $" ); |
52 | |
53 | #include <dev/raidframe/raidframevar.h> |
54 | |
55 | #include "rf_archs.h" |
56 | #include "rf_raid.h" |
57 | #include "rf_decluster.h" |
58 | #include "rf_debugMem.h" |
59 | #include "rf_utils.h" |
60 | #include "rf_alloclist.h" |
61 | #include "rf_general.h" |
62 | #include "rf_kintf.h" |
63 | #include "rf_shutdown.h" |
64 | #include "rf_copyback.h" |
65 | |
66 | #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) |
67 | |
68 | /* configuration code */ |
69 | |
70 | int |
71 | rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, |
72 | RF_Config_t *cfgPtr) |
73 | { |
74 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
75 | int b, v, k, r, lambda; /* block design params */ |
76 | int i, j; |
77 | RF_RowCol_t *first_avail_slot; |
78 | RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk; |
79 | RF_DeclusteredConfigInfo_t *info; |
80 | RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, |
81 | extraPUsPerDisk; |
82 | RF_StripeCount_t totSparePUsPerDisk; |
83 | RF_SectorNum_t diskOffsetOfLastFullTableInSUs; |
84 | RF_SectorCount_t SpareSpaceInSUs; |
85 | char *cfgBuf = (char *) (cfgPtr->layoutSpecific); |
86 | RF_StripeNum_t l, SUID; |
87 | |
88 | SUID = l = 0; |
89 | numCompleteSpareRegionsPerDisk = 0; |
90 | |
91 | /* 1. create layout specific structure */ |
92 | RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList); |
93 | if (info == NULL) |
94 | return (ENOMEM); |
95 | layoutPtr->layoutSpecificInfo = (void *) info; |
96 | info->SpareTable = NULL; |
97 | |
98 | /* 2. extract parameters from the config structure */ |
99 | if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { |
100 | (void)memcpy(info->sparemap_fname, cfgBuf, RF_SPAREMAP_NAME_LEN); |
101 | } |
102 | cfgBuf += RF_SPAREMAP_NAME_LEN; |
103 | |
104 | b = *((int *) cfgBuf); |
105 | cfgBuf += sizeof(int); |
106 | v = *((int *) cfgBuf); |
107 | cfgBuf += sizeof(int); |
108 | k = *((int *) cfgBuf); |
109 | cfgBuf += sizeof(int); |
110 | r = *((int *) cfgBuf); |
111 | cfgBuf += sizeof(int); |
112 | lambda = *((int *) cfgBuf); |
113 | cfgBuf += sizeof(int); |
114 | raidPtr->noRotate = *((int *) cfgBuf); |
115 | cfgBuf += sizeof(int); |
116 | |
117 | /* the sparemaps are generated assuming that parity is rotated, so we |
118 | * issue a warning if both distributed sparing and no-rotate are on at |
119 | * the same time */ |
120 | if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) { |
121 | RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n" ); |
122 | } |
123 | if (raidPtr->numCol != v) { |
124 | RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n" , v, raidPtr->numCol); |
125 | return (EINVAL); |
126 | } |
127 | /* 3. set up the values used in the mapping code */ |
128 | info->BlocksPerTable = b; |
129 | info->Lambda = lambda; |
130 | info->NumParityReps = info->groupSize = k; |
131 | info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU; /* b blks, k-1 SUs each */ |
132 | info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */ |
133 | info->PUsPerBlock = k - 1; |
134 | info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU; |
135 | info->TableDepthInPUs = (b * k) / v; |
136 | info->FullTableDepthInPUs = info->TableDepthInPUs * k; /* k repetitions */ |
137 | |
138 | /* used only in distributed sparing case */ |
139 | info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1); /* (v-1)/gcd fulltables */ |
140 | info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion; |
141 | info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU; |
142 | |
143 | /* check to make sure the block design is sufficiently small */ |
144 | if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { |
145 | if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) { |
146 | RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n" , |
147 | (int) info->FullTableDepthInPUs, |
148 | (int) info->SpareSpaceDepthPerRegionInSUs, |
149 | (int) layoutPtr->stripeUnitsPerDisk); |
150 | return (EINVAL); |
151 | } |
152 | } else { |
153 | if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) { |
154 | RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n" , |
155 | (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU), \ |
156 | (int) layoutPtr->stripeUnitsPerDisk); |
157 | return (EINVAL); |
158 | } |
159 | } |
160 | |
161 | |
162 | /* compute the size of each disk, and the number of tables in the last |
163 | * fulltable (which need not be complete) */ |
164 | if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { |
165 | |
166 | PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU; |
167 | spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs + |
168 | (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1)); |
169 | info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU; |
170 | |
171 | numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs; |
172 | info->NumCompleteSRs = numCompleteSpareRegionsPerDisk; |
173 | extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs; |
174 | |
175 | /* assume conservatively that we need the full amount of spare |
176 | * space in one region in order to provide spares for the |
177 | * partial spare region at the end of the array. We set "i" |
178 | * to the number of tables in the partial spare region. This |
179 | * may actually include some fulltables. */ |
180 | extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); |
181 | if (extraPUsPerDisk <= 0) |
182 | i = 0; |
183 | else |
184 | i = extraPUsPerDisk / info->TableDepthInPUs; |
185 | |
186 | complete_FT_count = (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k); |
187 | info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; |
188 | info->ExtraTablesPerDisk = i % k; |
189 | |
190 | /* note that in the last spare region, the spare space is |
191 | * complete even though data/parity space is not */ |
192 | totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); |
193 | info->TotSparePUsPerDisk = totSparePUsPerDisk; |
194 | |
195 | layoutPtr->stripeUnitsPerDisk = |
196 | ((complete_FT_count) * info->FullTableDepthInPUs + /* data & parity space */ |
197 | info->ExtraTablesPerDisk * info->TableDepthInPUs + |
198 | totSparePUsPerDisk /* spare space */ |
199 | ) * layoutPtr->SUsPerPU; |
200 | layoutPtr->dataStripeUnitsPerDisk = |
201 | (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs) |
202 | * layoutPtr->SUsPerPU * (k - 1) / k; |
203 | |
204 | } else { |
205 | /* non-dist spare case: force each disk to contain an |
206 | * integral number of tables */ |
207 | layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU); |
208 | layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU); |
209 | |
210 | /* compute the number of tables in the last fulltable, which |
211 | * need not be complete */ |
212 | complete_FT_count = |
213 | ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs); |
214 | |
215 | info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; |
216 | info->ExtraTablesPerDisk = |
217 | ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k; |
218 | } |
219 | |
220 | raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; |
221 | |
222 | /* find the disk offset of the stripe unit where the last fulltable |
223 | * starts */ |
224 | numCompleteFullTablesPerDisk = complete_FT_count; |
225 | diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU; |
226 | if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { |
227 | SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs; |
228 | diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs; |
229 | info->DiskOffsetOfLastSpareSpaceChunkInSUs = |
230 | diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; |
231 | } |
232 | info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs; |
233 | info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk; |
234 | |
235 | /* 4. create and initialize the lookup tables */ |
236 | info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList); |
237 | if (info->LayoutTable == NULL) |
238 | return (ENOMEM); |
239 | info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList); |
240 | if (info->OffsetTable == NULL) |
241 | return (ENOMEM); |
242 | info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList); |
243 | if (info->BlockTable == NULL) |
244 | return (ENOMEM); |
245 | |
246 | first_avail_slot = rf_make_1d_array(v, NULL); |
247 | if (first_avail_slot == NULL) |
248 | return (ENOMEM); |
249 | |
250 | for (i = 0; i < b; i++) |
251 | for (j = 0; j < k; j++) |
252 | info->LayoutTable[i][j] = *cfgBuf++; |
253 | |
254 | /* initialize offset table */ |
255 | for (i = 0; i < b; i++) |
256 | for (j = 0; j < k; j++) { |
257 | info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]]; |
258 | first_avail_slot[info->LayoutTable[i][j]]++; |
259 | } |
260 | |
261 | /* initialize block table */ |
262 | for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) { |
263 | for (i = 0; i < b; i++) { |
264 | for (j = 0; j < k; j++) { |
265 | info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l] |
266 | [info->LayoutTable[i][j]] = SUID; |
267 | } |
268 | SUID++; |
269 | } |
270 | } |
271 | |
272 | rf_free_1d_array(first_avail_slot, v); |
273 | |
274 | /* 5. set up the remaining redundant-but-useful parameters */ |
275 | |
276 | raidPtr->totalSectors = (k * complete_FT_count + info->ExtraTablesPerDisk) * |
277 | info->SUsPerTable * layoutPtr->sectorsPerStripeUnit; |
278 | layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 1); |
279 | |
280 | /* strange evaluation order below to try and minimize overflow |
281 | * problems */ |
282 | |
283 | layoutPtr->dataSectorsPerStripe = (k - 1) * layoutPtr->sectorsPerStripeUnit; |
284 | layoutPtr->numDataCol = k - 1; |
285 | layoutPtr->numParityCol = 1; |
286 | |
287 | return (0); |
288 | } |
289 | /* declustering with distributed sparing */ |
290 | static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t); |
291 | static void |
292 | rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg) |
293 | { |
294 | RF_DeclusteredConfigInfo_t *info; |
295 | RF_Raid_t *raidPtr; |
296 | |
297 | raidPtr = (RF_Raid_t *) arg; |
298 | info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
299 | if (info->SpareTable) |
300 | rf_FreeSpareTable(raidPtr); |
301 | } |
302 | |
303 | int |
304 | rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, |
305 | RF_Config_t *cfgPtr) |
306 | { |
307 | int rc; |
308 | |
309 | rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr); |
310 | if (rc) |
311 | return (rc); |
312 | rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr); |
313 | |
314 | return (0); |
315 | } |
316 | |
317 | void |
318 | rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, |
319 | RF_RowCol_t *col, |
320 | RF_SectorNum_t *diskSector, int remap) |
321 | { |
322 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
323 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
324 | RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; |
325 | RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; |
326 | RF_StripeNum_t BlockID, BlockOffset, RepIndex; |
327 | RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; |
328 | RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; |
329 | RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0; |
330 | |
331 | rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); |
332 | |
333 | FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array |
334 | * (across rows) */ |
335 | |
336 | if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { |
337 | SpareRegion = FullTableID / info->FullTablesPerSpareRegion; |
338 | SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; |
339 | } |
340 | FullTableOffset = SUID % sus_per_fulltable; |
341 | TableID = FullTableOffset / info->SUsPerTable; |
342 | TableOffset = FullTableOffset - TableID * info->SUsPerTable; |
343 | BlockID = TableOffset / info->PUsPerBlock; |
344 | BlockOffset = TableOffset - BlockID * info->PUsPerBlock; |
345 | BlockID %= info->BlocksPerTable; |
346 | RepIndex = info->PUsPerBlock - TableID; |
347 | if (!raidPtr->noRotate) |
348 | BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0); |
349 | *col = info->LayoutTable[BlockID][BlockOffset]; |
350 | |
351 | /* remap to distributed spare space if indicated */ |
352 | if (remap) { |
353 | RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared || |
354 | (rf_copyback_in_progress && raidPtr->Disks[*col].status == rf_ds_optimal)); |
355 | rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); |
356 | } else { |
357 | |
358 | outSU = base_suid; |
359 | outSU += FullTableID * fulltable_depth; /* offs to strt of FT */ |
360 | outSU += SpareSpace; /* skip rsvd spare space */ |
361 | outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */ |
362 | outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */ |
363 | } |
364 | outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); /* offs to the SU within |
365 | * a PU */ |
366 | |
367 | /* convert SUs to sectors, and, if not aligned to SU boundary, add in |
368 | * offset to sector. */ |
369 | *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); |
370 | |
371 | RF_ASSERT(*col != -1); |
372 | } |
373 | |
374 | |
375 | /* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */ |
376 | void |
377 | rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, |
378 | RF_RowCol_t *col, |
379 | RF_SectorNum_t *diskSector, int remap) |
380 | { |
381 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
382 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
383 | RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; |
384 | RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; |
385 | RF_StripeNum_t BlockID, RepIndex; |
386 | RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; |
387 | RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; |
388 | RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0; |
389 | |
390 | rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); |
391 | |
392 | /* compute row & (possibly) spare space exactly as before */ |
393 | FullTableID = SUID / sus_per_fulltable; |
394 | |
395 | if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { |
396 | SpareRegion = FullTableID / info->FullTablesPerSpareRegion; |
397 | SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; |
398 | } |
399 | /* compute BlockID and RepIndex exactly as before */ |
400 | FullTableOffset = SUID % sus_per_fulltable; |
401 | TableID = FullTableOffset / info->SUsPerTable; |
402 | TableOffset = FullTableOffset - TableID * info->SUsPerTable; |
403 | /* TableOffset = FullTableOffset % info->SUsPerTable; */ |
404 | /* BlockID = (TableOffset / info->PUsPerBlock) % |
405 | * info->BlocksPerTable; */ |
406 | BlockID = TableOffset / info->PUsPerBlock; |
407 | BlockID %= info->BlocksPerTable; |
408 | |
409 | /* the parity block is in the position indicated by RepIndex */ |
410 | RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID; |
411 | *col = info->LayoutTable[BlockID][RepIndex]; |
412 | |
413 | if (remap) { |
414 | RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared || |
415 | (rf_copyback_in_progress && raidPtr->Disks[*col].status == rf_ds_optimal)); |
416 | rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); |
417 | } else { |
418 | |
419 | /* compute sector as before, except use RepIndex instead of |
420 | * BlockOffset */ |
421 | outSU = base_suid; |
422 | outSU += FullTableID * fulltable_depth; |
423 | outSU += SpareSpace; /* skip rsvd spare space */ |
424 | outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; |
425 | outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU; |
426 | } |
427 | |
428 | outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); |
429 | *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); |
430 | |
431 | RF_ASSERT(*col != -1); |
432 | } |
433 | /* returns an array of ints identifying the disks that comprise the stripe containing the indicated address. |
434 | * the caller must _never_ attempt to modify this array. |
435 | */ |
436 | void |
437 | rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, |
438 | RF_RowCol_t **diskids) |
439 | { |
440 | RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); |
441 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
442 | RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; |
443 | RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; |
444 | RF_StripeNum_t base_suid = 0; |
445 | RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr); |
446 | RF_StripeNum_t stripeID; |
447 | int tableOffset; |
448 | |
449 | rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); |
450 | stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset |
451 | * into array */ |
452 | tableOffset = (stripeID % info->BlocksPerTable); /* find offset into |
453 | * block design table */ |
454 | *diskids = info->LayoutTable[tableOffset]; |
455 | } |
456 | /* This returns the default head-separation limit, which is measured |
457 | * in "required units for reconstruction". Each time a disk fetches |
458 | * a unit, it bumps a counter. The head-sep code prohibits any disk |
459 | * from getting more than headSepLimit counter values ahead of any |
460 | * other. |
461 | * |
462 | * We assume here that the number of floating recon buffers is already |
463 | * set. There are r stripes to be reconstructed in each table, and so |
464 | * if we have a total of B buffers, we can have at most B/r tables |
465 | * under recon at any one time. In each table, lambda units are required |
466 | * from each disk, so given B buffers, the head sep limit has to be |
467 | * (lambda*B)/r units. We subtract one to avoid weird boundary cases. |
468 | * |
469 | * for example, suppose were given 50 buffers, r=19, and lambda=4 as in |
470 | * the 20.5 design. There are 19 stripes/table to be reconstructed, so |
471 | * we can have 50/19 tables concurrently under reconstruction, which means |
472 | * we can allow the fastest disk to get 50/19 tables ahead of the slower |
473 | * disk. There are lambda "required units" for each disk, so the fastest |
474 | * disk can get 4*50/19 = 10 counter values ahead of the slowest. |
475 | * |
476 | * If numBufsToAccumulate is not 1, we need to limit the head sep further |
477 | * because multiple bufs will be required for each stripe under recon. |
478 | */ |
479 | RF_HeadSepLimit_t |
480 | rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr) |
481 | { |
482 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
483 | |
484 | return (info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate); |
485 | } |
486 | /* returns the default number of recon buffers to use. The value |
487 | * is somewhat arbitrary...it's intended to be large enough to allow |
488 | * for a reasonably large head-sep limit, but small enough that you |
489 | * don't use up all your system memory with buffers. |
490 | */ |
491 | int |
492 | rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr) |
493 | { |
494 | return (100 * rf_numBufsToAccumulate); |
495 | } |
496 | /* sectors in the last fulltable of the array need to be handled |
497 | * specially since this fulltable can be incomplete. this function |
498 | * changes the values of certain params to handle this. |
499 | * |
500 | * the idea here is that MapSector et. al. figure out which disk the |
501 | * addressed unit lives on by computing the modulos of the unit number |
502 | * with the number of units per fulltable, table, etc. In the last |
503 | * fulltable, there are fewer units per fulltable, so we need to adjust |
504 | * the number of user data units per fulltable to reflect this. |
505 | * |
506 | * so, we (1) convert the fulltable size and depth parameters to |
507 | * the size of the partial fulltable at the end, (2) compute the |
508 | * disk sector offset where this fulltable starts, and (3) convert |
509 | * the users stripe unit number from an offset into the array to |
510 | * an offset into the last fulltable. |
511 | */ |
512 | void |
513 | rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, |
514 | RF_StripeNum_t *SUID, |
515 | RF_StripeCount_t *sus_per_fulltable, |
516 | RF_StripeCount_t *fulltable_depth, |
517 | RF_StripeNum_t *base_suid) |
518 | { |
519 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
520 | |
521 | if (*SUID >= info->FullTableLimitSUID) { |
522 | /* new full table size is size of last full table on disk */ |
523 | *sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable; |
524 | |
525 | /* new full table depth is corresponding depth */ |
526 | *fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; |
527 | |
528 | /* set up the new base offset */ |
529 | *base_suid = info->DiskOffsetOfLastFullTableInSUs; |
530 | |
531 | /* convert users array address to an offset into the last |
532 | * fulltable */ |
533 | *SUID -= info->FullTableLimitSUID; |
534 | } |
535 | } |
536 | /* |
537 | * map a stripe ID to a parity stripe ID. |
538 | * See comment above RaidAddressToParityStripeID in layout.c. |
539 | */ |
540 | void |
541 | rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, |
542 | RF_StripeNum_t stripeID, |
543 | RF_StripeNum_t *psID, |
544 | RF_ReconUnitNum_t *which_ru) |
545 | { |
546 | RF_DeclusteredConfigInfo_t *info; |
547 | |
548 | info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
549 | |
550 | *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) |
551 | * info->BlocksPerTable + (stripeID % info->BlocksPerTable); |
552 | *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) |
553 | / info->BlocksPerTable; |
554 | RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU); |
555 | } |
556 | /* |
557 | * Called from MapSector and MapParity to retarget an access at the spare unit. |
558 | * Modifies the "col" and "outSU" parameters only. |
559 | */ |
560 | void |
561 | rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr, |
562 | RF_DeclusteredConfigInfo_t *info, |
563 | RF_StripeNum_t FullTableID, |
564 | RF_StripeNum_t TableID, |
565 | RF_SectorNum_t BlockID, |
566 | RF_StripeNum_t base_suid, |
567 | RF_StripeNum_t SpareRegion, |
568 | RF_RowCol_t *outCol, |
569 | RF_StripeNum_t *outSU) |
570 | { |
571 | RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, |
572 | which_ft; |
573 | |
574 | /* |
575 | * note that FullTableID and hence SpareRegion may have gotten |
576 | * tweaked by rf_decluster_adjust_params. We detect this by |
577 | * noticing that base_suid is not 0. |
578 | */ |
579 | if (base_suid == 0) { |
580 | ftID = FullTableID; |
581 | } else { |
582 | /* |
583 | * There may be > 1.0 full tables in the last (i.e. partial) |
584 | * spare region. find out which of these we're in. |
585 | */ |
586 | lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs; |
587 | which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU); |
588 | |
589 | /* compute the actual full table ID */ |
590 | ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft; |
591 | SpareRegion = info->NumCompleteSRs; |
592 | } |
593 | TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion; |
594 | |
595 | *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk; |
596 | RF_ASSERT(*outCol != -1); |
597 | |
598 | spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ? |
599 | info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU : |
600 | (SpareRegion + 1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs; |
601 | *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs; |
602 | if (*outSU >= layoutPtr->stripeUnitsPerDisk) { |
603 | printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n" , (long) *outSU); |
604 | } |
605 | } |
606 | |
607 | #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ |
608 | |
609 | #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) |
610 | int |
611 | rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow, |
612 | RF_RowCol_t fcol) |
613 | { |
614 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
615 | RF_SparetWait_t *req; |
616 | int retcode; |
617 | |
618 | RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *)); |
619 | req->C = raidPtr->numCol; |
620 | req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; |
621 | req->fcol = fcol; |
622 | req->SUsPerPU = raidPtr->Layout.SUsPerPU; |
623 | req->TablesPerSpareRegion = info->TablesPerSpareRegion; |
624 | req->BlocksPerTable = info->BlocksPerTable; |
625 | req->TableDepthInPUs = info->TableDepthInPUs; |
626 | req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs; |
627 | |
628 | retcode = rf_GetSpareTableFromDaemon(req); |
629 | RF_ASSERT(!retcode); /* XXX -- fix this to recover gracefully -- |
630 | * XXX */ |
631 | return (retcode); |
632 | } |
633 | #endif |
634 | #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) |
635 | /* |
636 | * Invoked via ioctl to install a spare table in the kernel. |
637 | */ |
638 | int |
639 | rf_SetSpareTable(RF_Raid_t *raidPtr, void *data) |
640 | { |
641 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; |
642 | RF_SpareTableEntry_t **ptrs; |
643 | int i, retcode; |
644 | |
645 | /* what we need to copyin is a 2-d array, so first copyin the user |
646 | * pointers to the rows in the table */ |
647 | RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); |
648 | retcode = copyin((void *) data, (void *) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); |
649 | |
650 | if (retcode) |
651 | return (retcode); |
652 | |
653 | /* now allocate kernel space for the row pointers */ |
654 | RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); |
655 | |
656 | /* now allocate kernel space for each row in the table, and copy it in |
657 | * from user space */ |
658 | for (i = 0; i < info->TablesPerSpareRegion; i++) { |
659 | RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *)); |
660 | retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); |
661 | if (retcode) { |
662 | info->SpareTable = NULL; /* blow off the memory |
663 | * we've allocated */ |
664 | return (retcode); |
665 | } |
666 | } |
667 | |
668 | /* free up the temporary array we used */ |
669 | RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); |
670 | |
671 | return (0); |
672 | } |
673 | |
674 | RF_ReconUnitCount_t |
675 | rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr) |
676 | { |
677 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
678 | |
679 | return (((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk); |
680 | } |
681 | #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ |
682 | |
683 | void |
684 | rf_FreeSpareTable(RF_Raid_t *raidPtr) |
685 | { |
686 | long i; |
687 | RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; |
688 | RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; |
689 | RF_SpareTableEntry_t **table = info->SpareTable; |
690 | |
691 | for (i = 0; i < info->TablesPerSpareRegion; i++) { |
692 | RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); |
693 | } |
694 | RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); |
695 | info->SpareTable = NULL; |
696 | } |
697 | |