1 | /* $NetBSD: rf_raid.h,v 1.45 2014/10/18 08:33:28 snj Exp $ */ |
2 | /* |
3 | * Copyright (c) 1995 Carnegie-Mellon University. |
4 | * All rights reserved. |
5 | * |
6 | * Author: Mark Holland |
7 | * |
8 | * Permission to use, copy, modify and distribute this software and |
9 | * its documentation is hereby granted, provided that both the copyright |
10 | * notice and this permission notice appear in all copies of the |
11 | * software, derivative works or modified versions, and any portions |
12 | * thereof, and that both notices appear in supporting documentation. |
13 | * |
14 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
15 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND |
16 | * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
17 | * |
18 | * Carnegie Mellon requests users of this software to return to |
19 | * |
20 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
21 | * School of Computer Science |
22 | * Carnegie Mellon University |
23 | * Pittsburgh PA 15213-3890 |
24 | * |
25 | * any improvements or extensions that they make and grant Carnegie the |
26 | * rights to redistribute these changes. |
27 | */ |
28 | |
29 | /********************************************** |
30 | * rf_raid.h -- main header file for RAID driver |
31 | **********************************************/ |
32 | |
33 | |
34 | #ifndef _RF__RF_RAID_H_ |
35 | #define _RF__RF_RAID_H_ |
36 | |
37 | #include <dev/raidframe/raidframevar.h> |
38 | #include "rf_archs.h" |
39 | #include "rf_threadstuff.h" |
40 | |
41 | #include "rf_netbsd.h" |
42 | |
43 | #include <sys/disklabel.h> |
44 | #include <sys/types.h> |
45 | #include <sys/queue.h> |
46 | |
47 | #include "rf_alloclist.h" |
48 | #include "rf_stripelocks.h" |
49 | #include "rf_layout.h" |
50 | #include "rf_disks.h" |
51 | #include "rf_debugMem.h" |
52 | #include "rf_diskqueue.h" |
53 | #include "rf_reconstruct.h" |
54 | #include "rf_acctrace.h" |
55 | #include "rf_fifo.h" |
56 | |
57 | #if RF_INCLUDE_PARITYLOGGING > 0 |
58 | #include "rf_paritylog.h" |
59 | #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ |
60 | |
/*
 * Component label version numbers.
 * NOTE(review): presumably RF_COMPONENT_LABEL_VERSION is the version
 * currently written and RF_COMPONENT_LABEL_VERSION_1 is kept so that
 * older labels are still recognized -- confirm against the label code.
 */
#define RF_COMPONENT_LABEL_VERSION_1 1
#define RF_COMPONENT_LABEL_VERSION 2

/*
 * Dirty/clean flag values.
 * NOTE(review): presumably stored in a component label's clean field --
 * confirm against the users of these macros.
 */
#define RF_RAID_DIRTY 0
#define RF_RAID_CLEAN 1
65 | |
66 | |
/*
 * Row status.
 *
 * Every row of the array forms its own parity group, so a status is
 * tracked per row.  The status is always exactly one of the values
 * below.  The numeric values are written out explicitly; they are the
 * same ones the compiler would assign implicitly (0 through 3).
 */
typedef enum RF_RowStatus_e {
	rf_rs_optimal        = 0,
	rf_rs_degraded       = 1,
	rf_rs_reconstructing = 2,
	rf_rs_reconfigured   = 3
} RF_RowStatus_t;
77 | |
/*
 * Cumulative I/O statistics: the interval over which statistics were
 * gathered (start/stop) together with running totals of response time,
 * I/O count and sectors transferred.
 */
struct RF_CumulativeStats_s {
	struct timeval start;	/* the time when the stats were last started */
	struct timeval stop;	/* the time when the stats were last stopped */
	long    sum_io_us;	/* sum of all user response times, in
				 * microseconds */
	long    num_ios;	/* total number of I/Os serviced */
	long    num_sect_moved;	/* total number of sectors read or written */
};
85 | |
/*
 * Throughput statistics: accumulates the time during which at least one
 * request is outstanding, plus the number of I/Os processed.
 *
 * NOTE(review): the field comments below refer to
 * "numOutstandingRequests"; no field of that name exists here, so this
 * presumably means num_out_ios -- confirm.
 */
struct RF_ThroughputStats_s {
	rf_declare_mutex2(mutex);/* a mutex used to lock the configuration
				 * stuff.
				 * NOTE(review): this wording looks copied
				 * from elsewhere; presumably the mutex guards
				 * the fields of this structure -- confirm. */
	struct timeval start;	/* timer started when numOutstandingRequests
				 * moves from 0 to 1 */
	struct timeval stop;	/* timer stopped when numOutstandingRequests
				 * moves from 1 to 0 */
	RF_uint64 sum_io_us;	/* total time timer is enabled */
	RF_uint64 num_ios;	/* total number of ios processed by RAIDframe */
	long    num_out_ios;	/* number of outstanding ios */
};
97 | |
/*
 * Main per-array descriptor.
 *
 * The structure starts with the array geometry, layout and the disk /
 * disk-queue pointers, followed by state that changes at run time:
 * row status, failure counters, component-label bookkeeping, the
 * iodone queue, buffer free lists, reconstruction and quiescence
 * state, engine-thread control and statistics.  Fields that are only
 * needed by particular configurations are compiled in under the
 * RF_INCLUDE_* / RF_ACC_TRACE options, so the layout of this structure
 * depends on those settings.
 */
struct RF_Raid_s {
	/*
	 * This portion never changes, and can be accessed without locking.
	 * An exception is Disks[][].status, which requires locking when it
	 * is changed.  XXX this is no longer true.  numSpare and friends can
	 * change now.
	 */
	u_int   numCol;		/* number of columns of disks, typically == #
				 * of disks/rank */
	u_int   numSpare;	/* number of spare disks */
	int     maxQueueDepth;	/* max disk queue depth */
	RF_SectorCount_t totalSectors;	/* total number of sectors in the
					 * array */
	RF_SectorCount_t sectorsPerDisk;	/* number of sectors on each
						 * disk */
	u_int   logBytesPerSector;	/* base-2 log of the number of bytes
					 * in a sector */
	u_int   bytesPerSector;	/* bytes in a sector */
	RF_int32 sectorMask;	/* mask of bytes-per-sector */

	RF_RaidLayout_t Layout;	/* all information related to layout */
	RF_RaidDisk_t *Disks;	/* all information related to physical disks */
	RF_DiskQueue_t *Queues;/* all information related to disk queues */
	const RF_DiskQueueSW_t *qType;/* pointer to the DiskQueueSW used for the
					 component queues. */
	/* NOTE:  This is an anchor point via which the queues can be
	 * accessed, but the enqueue/dequeue routines in diskqueue.c use a
	 * local copy of this pointer for the actual accesses. */
	/* The remainder of the structure can change, and therefore requires
	 * locking on reads and updates */
	rf_declare_mutex2(mutex);/* mutex used to serialize access to
				 * the fields below */
	RF_RowStatus_t status;	/* the status of each row in the array */
	int     valid;		/* indicates successful configuration */
	RF_LockTableEntry_t *lockTable;	/* stripe-lock table */
	RF_LockTableEntry_t *quiesceLock;	/* quiescence table */
	int     numFailures;	/* total number of failures in the array */
	int     numNewFailures; /* number of *new* failures (that haven't
				   caused a mod_counter update) */

	int     parity_good;    /* !0 if parity is known to be correct */
	int     serial_number;  /* a "serial number" for this set */
	int     mod_counter;    /* modification counter for component labels */
	int     clean;          /* completely unused and should be removed */

	int     openings;       /* Number of I/Os which can be scheduled
				   simultaneously (high-level - not a
				   per-component limit) */

	int     maxOutstanding; /* maxOutstanding requests (per-component) */
	int     autoconfigure;  /* automatically configure this RAID set.
				   0 == no, 1 == yes */
	int     root_partition; /* Use this set as /
				   0 == no, 1 == yes */
	int     last_unit;      /* last unit number (e.g. 0 for /dev/raid0)
				   of this component.  Used for autoconfigure
				   only. */
	int     config_order;   /* 0 .. n.  The order in which the component
				   should be auto-configured.  E.g. 0 will be
				   done first (and would become raid0).
				   This may be in conflict with last_unit!!?! */
	                        /* Not currently used. */

	/* queue to gather up requests from KernelWakeupFunc() and let
	   a kernel thread deal with calling rf_DiskIOComplete and any
	   callback functions. */
	TAILQ_HEAD(iodone_q,RF_DiskQueueData_s) iodone;
	/* and a lock/cv to protect it */
	rf_declare_mutex2(iodone_lock);
	rf_declare_cond2(iodone_cv);


	RF_VoidPointerListElem_t *iobuf;       /* I/O buffer free list */
	int     iobuf_count;                   /* count of I/O buffers on the freelist */
	int     numEmergencyBuffers;           /* number of these buffers to pre-allocate */

	RF_VoidPointerListElem_t *stripebuf;   /* Full-stripe buffer free list */
	int     stripebuf_count;               /* count of full-stripe buffers on the freelist */
	int     numEmergencyStripeBuffers;     /* number of these buffers to pre-allocate */

	/*
	 * Cleanup stuff
	 */
	RF_ShutdownList_t *shutdownList;	/* shutdown activities */
	RF_AllocListElem_t *cleanupList;	/* memory to be freed at
						 * shutdown time */

	/*
	 * Recon stuff
	 */
	RF_HeadSepLimit_t headSepLimit;
	int     numFloatingReconBufs;
	int     reconInProgress;
	rf_declare_cond2(waitForReconCond);	/* goes with raidPtr->mutex */
	RF_RaidReconDesc_t *reconDesc;	/* reconstruction descriptor */
	RF_ReconCtrl_t *reconControl;	/* reconstruction control structure
					 * pointers for each row in the array */

	/*
	 * Array-quiescence stuff
	 */
	rf_declare_mutex2(access_suspend_mutex);
	rf_declare_cond2(access_suspend_cv);
	RF_IoCount_t accesses_suspended;
	RF_IoCount_t accs_in_flight;
	int     access_suspend_release;
	int     waiting_for_quiescence;
	RF_CallbackDesc_t *quiesce_wait_list;

	/*
	 * Statistics
	 */
	RF_StripeCount_t parity_rewrite_stripes_done;
	RF_StripeCount_t copyback_stripes_done;

	int     recon_in_progress;
	int     parity_rewrite_in_progress;
	int     copyback_in_progress;
	int     adding_hot_spare;

	rf_declare_cond2(adding_hot_spare_cv);

	/*
	 * Engine thread control
	 */
	rf_declare_mutex2(node_queue_mutex);
	rf_declare_cond2(node_queue_cv);
	RF_DagNode_t *node_queue;
	RF_Thread_t parity_rewrite_thread;
	RF_Thread_t copyback_thread;
	RF_Thread_t engine_thread;
	RF_Thread_t engine_helper_thread;
	RF_Thread_t recon_thread;
	int     shutdown_engine;
	int     shutdown_raidio;
	int     dags_in_flight;	/* debug */

	/*
	 * PSS (Parity Stripe Status) stuff
	 */
	long    pssTableSize;

	/*
	 * Reconstruction stuff
	 */
	int     procsInBufWait;
	int     numFullReconBuffers;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t *recon_tracerecs;
#endif
	unsigned long accumXorTimeUs;

	/*
	 * nAccOutstanding, waitShutdown protected by desc freelist lock
	 * (This may seem strange, since that's a central serialization point
	 * for a per-array piece of data, but otherwise, it'd be an extra
	 * per-array lock, and that'd only be less efficient...)
	 *
	 * NOTE(review): a per-array rad_lock and outstandingCond are
	 * declared immediately below, so the statement above may be stale;
	 * presumably rad_lock now protects these two fields -- confirm.
	 */
	rf_declare_mutex2(rad_lock);
	rf_declare_cond2(outstandingCond);
	int     waitShutdown;
	int     nAccOutstanding;

	RF_DiskId_t **diskids;

	int     raidid;
	void   *softc;
	RF_AccTotals_t acc_totals;
	int     keep_acc_totals;

	struct raidcinfo *raid_cinfo;	/* array of component info */

	int     terminate_disk_queues;

	/*
	 * XXX
	 *
	 * config-specific information should be moved
	 * somewhere else, or at least hung off this
	 * in some generic way
	 */
#if RF_INCLUDE_CHAINDECLUSTER > 0

	/* used by rf_compute_workload_shift */
	RF_RowCol_t hist_diskreq[RF_MAXCOL];
#endif
	/* used by declustering */
	int     noRotate;

#if RF_INCLUDE_PARITYLOGGING > 0
	/* used by parity logging */
	RF_SectorCount_t regionLogCapacity;
	RF_ParityLogQueue_t parityLogPool;	/* pool of unused parity logs */
	RF_RegionInfo_t *regionInfo;	/* array of region state */
	int     numParityLogs;
	int     numSectorsPerLog;
	int     regionParityRange;
	int     logsInUse;	/* debugging */
	RF_ParityLogDiskQueue_t parityLogDiskQueue;	/* state of parity
							 * logging disk work */
	RF_RegionBufferQueue_t regionBufferPool;	/* buffers for holding
							 * region log */
	RF_RegionBufferQueue_t parityBufferPool;	/* buffers for holding
							 * parity */
	void   *parityLogBufferHeap;	/* pool of unused parity logs.
					 * NOTE(review): same wording as
					 * parityLogPool above; presumably the
					 * backing memory for those logs --
					 * confirm. */
	RF_Thread_t pLogDiskThreadHandle;

#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
	struct rf_paritymap *parity_map;
};
307 | #endif /* !_RF__RF_RAID_H_ */ |
308 | |