1 | /* $NetBSD: ffs_wapbl.c,v 1.37 2016/11/10 22:19:23 jdolecek Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Wasabi Systems, Inc. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | #include <sys/cdefs.h> |
33 | __KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.37 2016/11/10 22:19:23 jdolecek Exp $" ); |
34 | |
35 | #define WAPBL_INTERNAL |
36 | |
37 | #if defined(_KERNEL_OPT) |
38 | #include "opt_ffs.h" |
39 | #endif |
40 | |
41 | #include <sys/param.h> |
42 | #include <sys/systm.h> |
43 | #include <sys/kernel.h> |
44 | #include <sys/vnode.h> |
45 | #include <sys/mount.h> |
46 | #include <sys/file.h> |
47 | #include <sys/disk.h> |
48 | #include <sys/ioctl.h> |
49 | #include <sys/errno.h> |
50 | #include <sys/kauth.h> |
51 | #include <sys/wapbl.h> |
52 | |
53 | #include <ufs/ufs/inode.h> |
54 | #include <ufs/ufs/quota.h> |
55 | #include <ufs/ufs/ufsmount.h> |
56 | #include <ufs/ufs/ufs_bswap.h> |
57 | #include <ufs/ufs/ufs_extern.h> |
58 | #include <ufs/ufs/ufs_wapbl.h> |
59 | |
60 | #include <ufs/ffs/fs.h> |
61 | #include <ufs/ffs/ffs_extern.h> |
62 | |
/*
 * Debug tracing for this file.  WAPBL_DEBUG is explicitly undefined just
 * below, so the no-op DPRINTF() variant is the one that gets compiled
 * unless that #undef is removed.  When enabled, DPRINTF() prefixes each
 * message with the calling function name and line number and is gated at
 * run time by ffs_wapbl_debug.
 */
#undef WAPBL_DEBUG
#ifdef WAPBL_DEBUG
int ffs_wapbl_debug = 1;
#define DPRINTF(fmt, args...) \
do { \
	if (ffs_wapbl_debug) \
		printf("%s:%d "fmt, __func__ , __LINE__, ##args); \
} while (/* CONSTCOND */0)
#else
#define DPRINTF(fmt, args...) \
do { \
	/* nothing */ \
} while (/* CONSTCOND */0)
#endif
77 | |
/* Forward declarations of the file-local helpers defined further down. */
static int ffs_superblock_layout(struct fs *);
static int wapbl_log_position(struct mount *, struct fs *, struct vnode *,
    daddr_t *, size_t *, size_t *, uint64_t *);
static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *,
    daddr_t *, size_t *, uint64_t *);
static void wapbl_find_log_start(struct mount *, struct vnode *, off_t,
    daddr_t *, daddr_t *, size_t *);
static int wapbl_remove_log(struct mount *);
static int wapbl_allocate_log_file(struct mount *, struct vnode *,
    daddr_t *, size_t *, uint64_t *);
88 | |
89 | /* |
90 | * Return the super block layout format - UFS1 or UFS2. |
91 | * WAPBL only works with UFS2 layout (which is still available |
92 | * with FFSv1). |
93 | * |
94 | * XXX Should this be in ufs/ffs/fs.h? Same style of check is |
95 | * also used in ffs_alloc.c in a few places. |
96 | */ |
97 | static int |
98 | ffs_superblock_layout(struct fs *fs) |
99 | { |
100 | if ((fs->fs_magic == FS_UFS1_MAGIC) && |
101 | ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) |
102 | return 1; |
103 | else |
104 | return 2; |
105 | } |
106 | |
107 | /* |
108 | * This function is invoked after a log is replayed to |
109 | * disk to perform logical cleanup actions as described by |
110 | * the log |
111 | */ |
112 | void |
113 | ffs_wapbl_replay_finish(struct mount *mp) |
114 | { |
115 | struct wapbl_replay *wr = mp->mnt_wapbl_replay; |
116 | int i; |
117 | int error; |
118 | |
119 | if (!wr) |
120 | return; |
121 | |
122 | KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0); |
123 | |
124 | for (i = 0; i < wr->wr_inodescnt; i++) { |
125 | struct vnode *vp; |
126 | struct inode *ip; |
127 | error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp); |
128 | if (error) { |
129 | printf("%s: %s: unable to cleanup inode %" PRIu32 "\n" , |
130 | __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt, |
131 | wr->wr_inodes[i].wr_inumber); |
132 | continue; |
133 | } |
134 | ip = VTOI(vp); |
135 | KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number); |
136 | #ifdef WAPBL_DEBUG |
137 | printf("%s%s: %s: cleaning inode %" PRIu64 " size=%" PRIu64 |
138 | " mode=%o nlink=%d\n" , |
139 | __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt, |
140 | ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink); |
141 | #endif |
142 | KASSERT(ip->i_nlink == 0); |
143 | |
144 | /* |
145 | * The journal may have left partially allocated inodes in mode |
146 | * zero. This may occur if a crash occurs betweeen the node |
147 | * allocation in ffs_nodeallocg and when the node is properly |
148 | * initialized in ufs_makeinode. If so, just dallocate them. |
149 | */ |
150 | if (ip->i_mode == 0) { |
151 | error = UFS_WAPBL_BEGIN(mp); |
152 | if (error) { |
153 | printf("%s: %s: " |
154 | "unable to cleanup inode %" PRIu32 "\n" , |
155 | __func__, VFSTOUFS(mp)->um_fs->fs_fsmnt, |
156 | wr->wr_inodes[i].wr_inumber); |
157 | } else { |
158 | ffs_vfree(vp, ip->i_number, |
159 | wr->wr_inodes[i].wr_imode); |
160 | UFS_WAPBL_END(mp); |
161 | } |
162 | } |
163 | vput(vp); |
164 | } |
165 | wapbl_replay_stop(wr); |
166 | wapbl_replay_free(wr); |
167 | mp->mnt_wapbl_replay = NULL; |
168 | } |
169 | |
170 | /* Callback for wapbl */ |
171 | void |
172 | ffs_wapbl_sync_metadata(struct mount *mp, struct wapbl_dealloc *fdealloc) |
173 | { |
174 | struct ufsmount *ump = VFSTOUFS(mp); |
175 | struct fs *fs = ump->um_fs; |
176 | int error __diagused; |
177 | struct wapbl_dealloc *wd; |
178 | |
179 | UFS_WAPBL_JLOCK_ASSERT(mp); |
180 | |
181 | #ifdef WAPBL_DEBUG_INODES |
182 | ufs_wapbl_verify_inodes(mp, __func__); |
183 | #endif |
184 | |
185 | for (wd = fdealloc; wd != NULL; wd = TAILQ_NEXT(wd, wd_entries)) { |
186 | /* |
187 | * blkfree errors are unreported, might silently fail |
188 | * if it cannot read the cylinder group block |
189 | */ |
190 | ffs_blkfree(fs, ump->um_devvp, |
191 | FFS_DBTOFSB(fs, wd->wd_blkno), wd->wd_len, -1); |
192 | } |
193 | |
194 | if (fs->fs_fmod != 0) { |
195 | fs->fs_fmod = 0; |
196 | fs->fs_time = time_second; |
197 | error = ffs_cgupdate(ump, 0); |
198 | KASSERT(error == 0); |
199 | } |
200 | } |
201 | |
202 | void |
203 | ffs_wapbl_abort_sync_metadata(struct mount *mp, struct wapbl_dealloc *fdealloc) |
204 | { |
205 | struct ufsmount *ump = VFSTOUFS(mp); |
206 | struct fs *fs = ump->um_fs; |
207 | struct wapbl_dealloc *wd; |
208 | |
209 | for (wd = fdealloc; wd != NULL; wd = TAILQ_NEXT(wd, wd_entries)) { |
210 | /* |
211 | * Since the above blkfree may have failed, this blkalloc might |
212 | * fail as well, so don't check its error. Note that if the |
213 | * blkfree succeeded above, then this shouldn't fail because |
214 | * the buffer will be locked in the current transaction. |
215 | */ |
216 | ffs_blkalloc_ump(ump, FFS_DBTOFSB(fs, wd->wd_blkno), |
217 | wd->wd_len); |
218 | } |
219 | } |
220 | |
221 | static int |
222 | wapbl_remove_log(struct mount *mp) |
223 | { |
224 | struct ufsmount *ump = VFSTOUFS(mp); |
225 | struct fs *fs = ump->um_fs; |
226 | struct vnode *vp; |
227 | struct inode *ip; |
228 | ino_t log_ino; |
229 | int error; |
230 | |
231 | /* If super block layout is too old to support WAPBL, return */ |
232 | if (ffs_superblock_layout(fs) < 2) |
233 | return 0; |
234 | |
235 | /* If all the log locators are 0, just clean up */ |
236 | if (fs->fs_journallocs[0] == 0 && |
237 | fs->fs_journallocs[1] == 0 && |
238 | fs->fs_journallocs[2] == 0 && |
239 | fs->fs_journallocs[3] == 0) { |
240 | DPRINTF("empty locators, just clear\n" ); |
241 | goto done; |
242 | } |
243 | |
244 | switch (fs->fs_journal_location) { |
245 | case UFS_WAPBL_JOURNALLOC_NONE: |
246 | /* nothing! */ |
247 | DPRINTF("no log\n" ); |
248 | break; |
249 | |
250 | case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: |
251 | log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; |
252 | DPRINTF("in-fs log, ino = %" PRId64 "\n" ,log_ino); |
253 | |
254 | /* if no existing log inode, just clear all fields and bail */ |
255 | if (log_ino == 0) |
256 | goto done; |
257 | error = VFS_VGET(mp, log_ino, &vp); |
258 | if (error != 0) { |
259 | printf("%s: %s: vget failed %d\n" , __func__, |
260 | fs->fs_fsmnt, error); |
261 | /* clear out log info on error */ |
262 | goto done; |
263 | } |
264 | ip = VTOI(vp); |
265 | KASSERT(log_ino == ip->i_number); |
266 | if ((ip->i_flags & SF_LOG) == 0) { |
267 | printf("%s: %s: try to clear non-log inode " |
268 | "%" PRId64 "\n" , __func__, fs->fs_fsmnt, log_ino); |
269 | vput(vp); |
270 | /* clear out log info on error */ |
271 | goto done; |
272 | } |
273 | |
274 | /* |
275 | * remove the log inode by setting its link count back |
276 | * to zero and bail. |
277 | */ |
278 | ip->i_nlink = 0; |
279 | DIP_ASSIGN(ip, nlink, 0); |
280 | vput(vp); |
281 | break; |
282 | |
283 | case UFS_WAPBL_JOURNALLOC_END_PARTITION: |
284 | DPRINTF("end-of-partition log\n" ); |
285 | /* no extra work required */ |
286 | break; |
287 | |
288 | default: |
289 | printf("%s: %s: unknown journal type %d\n" , __func__, |
290 | fs->fs_fsmnt, fs->fs_journal_location); |
291 | break; |
292 | } |
293 | |
294 | |
295 | done: |
296 | /* Clear out all previous knowledge of journal */ |
297 | fs->fs_journal_version = 0; |
298 | fs->fs_journal_location = 0; |
299 | fs->fs_journal_flags = 0; |
300 | fs->fs_journallocs[0] = 0; |
301 | fs->fs_journallocs[1] = 0; |
302 | fs->fs_journallocs[2] = 0; |
303 | fs->fs_journallocs[3] = 0; |
304 | (void) ffs_sbupdate(ump, MNT_WAIT); |
305 | |
306 | return 0; |
307 | } |
308 | |
309 | int |
310 | ffs_wapbl_start(struct mount *mp) |
311 | { |
312 | struct ufsmount *ump = VFSTOUFS(mp); |
313 | struct fs *fs = ump->um_fs; |
314 | struct vnode *devvp = ump->um_devvp; |
315 | daddr_t off; |
316 | size_t count; |
317 | size_t blksize; |
318 | uint64_t ; |
319 | int error; |
320 | |
321 | if (mp->mnt_wapbl == NULL) { |
322 | if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) { |
323 | /* Clear out any existing journal file */ |
324 | error = wapbl_remove_log(mp); |
325 | if (error != 0) |
326 | return error; |
327 | } |
328 | |
329 | if (mp->mnt_flag & MNT_LOG) { |
330 | KDASSERT(fs->fs_ronly == 0); |
331 | |
332 | /* WAPBL needs UFS2 format super block */ |
333 | if (ffs_superblock_layout(fs) < 2) { |
334 | printf("%s: %s: fs superblock in old format, " |
335 | "not journaling\n" , __func__, |
336 | VFSTOUFS(mp)->um_fs->fs_fsmnt); |
337 | mp->mnt_flag &= ~MNT_LOG; |
338 | return EINVAL; |
339 | } |
340 | |
341 | error = wapbl_log_position(mp, fs, devvp, &off, |
342 | &count, &blksize, &extradata); |
343 | if (error) |
344 | return error; |
345 | |
346 | error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off, |
347 | count, blksize, mp->mnt_wapbl_replay, |
348 | ffs_wapbl_sync_metadata, |
349 | ffs_wapbl_abort_sync_metadata); |
350 | if (error) |
351 | return error; |
352 | |
353 | mp->mnt_wapbl_op = &wapbl_ops; |
354 | |
355 | #ifdef WAPBL_DEBUG |
356 | printf("%s: %s: enabling logging\n" , __func__, |
357 | fs->fs_fsmnt); |
358 | #endif |
359 | |
360 | if ((fs->fs_flags & FS_DOWAPBL) == 0) { |
361 | fs->fs_flags |= FS_DOWAPBL; |
362 | if ((error = UFS_WAPBL_BEGIN(mp)) != 0) |
363 | goto out; |
364 | error = ffs_sbupdate(ump, MNT_WAIT); |
365 | if (error) { |
366 | UFS_WAPBL_END(mp); |
367 | goto out; |
368 | } |
369 | UFS_WAPBL_END(mp); |
370 | error = wapbl_flush(mp->mnt_wapbl, 1); |
371 | if (error) |
372 | goto out; |
373 | } |
374 | |
375 | /* |
376 | * XXX discard interferes with block deallocation |
377 | * registration and hence log consistency |
378 | */ |
379 | if (mp->mnt_flag & MNT_DISCARD) { |
380 | CLR(mp->mnt_flag, MNT_DISCARD); |
381 | printf("%s: %s: disabling discard to preserve log consistency\n" , __func__, |
382 | fs->fs_fsmnt); |
383 | |
384 | if (ump->um_discarddata != NULL) { |
385 | ffs_discard_finish(ump->um_discarddata, |
386 | 0); |
387 | ump->um_discarddata = NULL; |
388 | } |
389 | } |
390 | |
391 | } else if (fs->fs_flags & FS_DOWAPBL) { |
392 | fs->fs_fmod = 1; |
393 | fs->fs_flags &= ~FS_DOWAPBL; |
394 | } |
395 | } |
396 | |
397 | /* |
398 | * It is recommended that you finish replay with logging enabled. |
399 | * However, even if logging is not enabled, the remaining log |
400 | * replay should be safely recoverable with an fsck, so perform |
401 | * it anyway. |
402 | */ |
403 | if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) { |
404 | int saveflag = mp->mnt_flag & MNT_RDONLY; |
405 | /* |
406 | * Make sure MNT_RDONLY is not set so that the inode |
407 | * cleanup in ufs_inactive will actually do its work. |
408 | */ |
409 | mp->mnt_flag &= ~MNT_RDONLY; |
410 | ffs_wapbl_replay_finish(mp); |
411 | mp->mnt_flag |= saveflag; |
412 | KASSERT(fs->fs_ronly == 0); |
413 | } |
414 | |
415 | return 0; |
416 | out: |
417 | ffs_wapbl_stop(mp, MNT_FORCE); |
418 | return error; |
419 | } |
420 | |
421 | int |
422 | ffs_wapbl_stop(struct mount *mp, int force) |
423 | { |
424 | struct ufsmount *ump = VFSTOUFS(mp); |
425 | struct fs *fs = ump->um_fs; |
426 | int error; |
427 | |
428 | if (mp->mnt_wapbl) { |
429 | KDASSERT(fs->fs_ronly == 0); |
430 | |
431 | /* |
432 | * Make sure turning off FS_DOWAPBL is only removed |
433 | * as the only change in the final flush since otherwise |
434 | * a transaction may reorder writes. |
435 | */ |
436 | error = wapbl_flush(mp->mnt_wapbl, 1); |
437 | if (error && !force) |
438 | return error; |
439 | if (error && force) |
440 | goto forceout; |
441 | error = UFS_WAPBL_BEGIN(mp); |
442 | if (error && !force) |
443 | return error; |
444 | if (error && force) |
445 | goto forceout; |
446 | KASSERT(fs->fs_flags & FS_DOWAPBL); |
447 | |
448 | fs->fs_flags &= ~FS_DOWAPBL; |
449 | error = ffs_sbupdate(ump, MNT_WAIT); |
450 | KASSERT(error == 0); /* XXX a bit drastic! */ |
451 | UFS_WAPBL_END(mp); |
452 | forceout: |
453 | error = wapbl_stop(mp->mnt_wapbl, force); |
454 | if (error) { |
455 | KASSERT(!force); |
456 | fs->fs_flags |= FS_DOWAPBL; |
457 | return error; |
458 | } |
459 | fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */ |
460 | mp->mnt_wapbl = NULL; |
461 | |
462 | #ifdef WAPBL_DEBUG |
463 | printf("%s: %s: disabled logging\n" , __func__, fs->fs_fsmnt); |
464 | #endif |
465 | } |
466 | |
467 | return 0; |
468 | } |
469 | |
470 | int |
471 | ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp) |
472 | { |
473 | int error; |
474 | daddr_t off; |
475 | size_t count; |
476 | size_t blksize; |
477 | uint64_t ; |
478 | |
479 | /* |
480 | * WAPBL needs UFS2 format super block, if we got here with a |
481 | * UFS1 format super block something is amiss... |
482 | */ |
483 | if (ffs_superblock_layout(fs) < 2) |
484 | return EINVAL; |
485 | |
486 | error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize, |
487 | &extradata); |
488 | |
489 | if (error) |
490 | return error; |
491 | |
492 | error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off, |
493 | count, blksize); |
494 | if (error) |
495 | return error; |
496 | |
497 | mp->mnt_wapbl_op = &wapbl_ops; |
498 | |
499 | return 0; |
500 | } |
501 | |
502 | /* |
503 | * If the superblock doesn't already have a recorded journal location |
504 | * then we allocate the journal in one of two positions: |
505 | * |
506 | * - At the end of the partition after the filesystem if there's |
507 | * enough space. "Enough space" is defined as >= 1MB of journal |
508 | * per 1GB of filesystem or 64MB, whichever is smaller. |
509 | * |
510 | * - Inside the filesystem. We try to allocate a contiguous journal |
511 | * based on the total filesystem size - the target is 1MB of journal |
512 | * per 1GB of filesystem, up to a maximum journal size of 64MB. As |
513 | * a worst case allowing for fragmentation, we'll allocate a journal |
514 | * 1/4 of the desired size but never smaller than 1MB. |
515 | * |
516 | * XXX In the future if we allow for non-contiguous journal files we |
517 | * can tighten the above restrictions. |
518 | * |
519 | * XXX |
520 | * These seems like a lot of duplication both here and in some of |
521 | * the userland tools (fsck_ffs, dumpfs, tunefs) with similar |
522 | * "switch (fs_journal_location)" constructs. Can we centralise |
523 | * this sort of code somehow/somewhere? |
524 | */ |
525 | static int |
526 | wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp, |
527 | daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *) |
528 | { |
529 | struct ufsmount *ump = VFSTOUFS(mp); |
530 | daddr_t logstart, logend, desired_logsize; |
531 | uint64_t numsecs; |
532 | unsigned secsize; |
533 | int error, location; |
534 | |
535 | if (fs->fs_journal_version == UFS_WAPBL_VERSION) { |
536 | switch (fs->fs_journal_location) { |
537 | case UFS_WAPBL_JOURNALLOC_END_PARTITION: |
538 | DPRINTF("found existing end-of-partition log\n" ); |
539 | *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR]; |
540 | *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; |
541 | *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; |
542 | DPRINTF(" start = %" PRId64 ", size = %zu, " |
543 | "blksize = %zu\n" , *startp, *countp, *blksizep); |
544 | return 0; |
545 | |
546 | case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: |
547 | DPRINTF("found existing in-filesystem log\n" ); |
548 | *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; |
549 | *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; |
550 | *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; |
551 | DPRINTF(" start = %" PRId64 ", size = %zu, " |
552 | "blksize = %zu\n" , *startp, *countp, *blksizep); |
553 | return 0; |
554 | |
555 | default: |
556 | printf("%s: %s: unknown journal type %d\n" , __func__, |
557 | fs->fs_fsmnt, fs->fs_journal_location); |
558 | return EINVAL; |
559 | } |
560 | } |
561 | |
562 | desired_logsize = |
563 | ffs_lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE; |
564 | DPRINTF("desired log size = %" PRId64 " kB\n" , desired_logsize / 1024); |
565 | desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); |
566 | desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); |
567 | DPRINTF("adjusted desired log size = %" PRId64 " kB\n" , |
568 | desired_logsize / 1024); |
569 | |
570 | /* Is there space after after filesystem on partition for log? */ |
571 | logstart = FFS_FSBTODB(fs, fs->fs_size); |
572 | error = getdisksize(devvp, &numsecs, &secsize); |
573 | if (error) |
574 | return error; |
575 | KDASSERT(secsize != 0); |
576 | logend = btodb(numsecs * secsize); |
577 | |
578 | if (dbtob(logend - logstart) >= desired_logsize) { |
579 | DPRINTF("enough space, use end-of-partition log\n" ); |
580 | |
581 | location = UFS_WAPBL_JOURNALLOC_END_PARTITION; |
582 | *blksizep = secsize; |
583 | |
584 | *startp = logstart; |
585 | *countp = (logend - logstart); |
586 | *extradatap = 0; |
587 | |
588 | /* convert to physical block numbers */ |
589 | *startp = dbtob(*startp) / secsize; |
590 | *countp = dbtob(*countp) / secsize; |
591 | |
592 | fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp; |
593 | fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp; |
594 | fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep; |
595 | fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap; |
596 | } else { |
597 | DPRINTF("end-of-partition has only %" PRId64 " free\n" , |
598 | logend - logstart); |
599 | |
600 | location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM; |
601 | *blksizep = secsize; |
602 | |
603 | error = wapbl_create_infs_log(mp, fs, devvp, |
604 | startp, countp, extradatap); |
605 | ffs_sync(mp, MNT_WAIT, FSCRED); |
606 | |
607 | /* convert to physical block numbers */ |
608 | *startp = dbtob(*startp) / secsize; |
609 | *countp = dbtob(*countp) / secsize; |
610 | |
611 | fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = *startp; |
612 | fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = *countp; |
613 | fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = *blksizep; |
614 | fs->fs_journallocs[UFS_WAPBL_INFS_INO] = *extradatap; |
615 | } |
616 | |
617 | if (error == 0) { |
618 | /* update superblock with log location */ |
619 | fs->fs_journal_version = UFS_WAPBL_VERSION; |
620 | fs->fs_journal_location = location; |
621 | fs->fs_journal_flags = 0; |
622 | |
623 | error = ffs_sbupdate(ump, MNT_WAIT); |
624 | } |
625 | |
626 | return error; |
627 | } |
628 | |
629 | /* |
630 | * Try to create a journal log inside the filesystem. |
631 | */ |
632 | static int |
633 | wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp, |
634 | daddr_t *startp, size_t *countp, uint64_t *) |
635 | { |
636 | struct vnode *vp, *rvp; |
637 | struct vattr va; |
638 | struct inode *ip; |
639 | int error; |
640 | |
641 | if ((error = VFS_ROOT(mp, &rvp)) != 0) |
642 | return error; |
643 | |
644 | vattr_null(&va); |
645 | va.va_type = VREG; |
646 | va.va_mode = 0; |
647 | |
648 | error = vcache_new(mp, rvp, &va, NOCRED, &vp); |
649 | vput(rvp); |
650 | if (error) |
651 | return error; |
652 | |
653 | error = vn_lock(vp, LK_EXCLUSIVE); |
654 | if (error) { |
655 | vrele(vp); |
656 | return error; |
657 | } |
658 | |
659 | ip = VTOI(vp); |
660 | ip->i_flags = SF_LOG; |
661 | DIP_ASSIGN(ip, flags, ip->i_flags); |
662 | ip->i_nlink = 1; |
663 | DIP_ASSIGN(ip, nlink, 1); |
664 | ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; |
665 | ffs_update(vp, NULL, NULL, UPDATE_WAIT); |
666 | |
667 | if ((error = wapbl_allocate_log_file(mp, vp, |
668 | startp, countp, extradatap)) != 0) { |
669 | /* |
670 | * If we couldn't allocate the space for the log file, |
671 | * remove the inode by setting its link count back to |
672 | * zero and bail. |
673 | */ |
674 | ip->i_nlink = 0; |
675 | DIP_ASSIGN(ip, nlink, 0); |
676 | VOP_UNLOCK(vp); |
677 | vgone(vp); |
678 | |
679 | return error; |
680 | } |
681 | |
682 | /* |
683 | * Now that we have the place-holder inode for the journal, |
684 | * we don't need the vnode ever again. |
685 | */ |
686 | VOP_UNLOCK(vp); |
687 | vgone(vp); |
688 | |
689 | return 0; |
690 | } |
691 | |
692 | int |
693 | wapbl_allocate_log_file(struct mount *mp, struct vnode *vp, |
694 | daddr_t *startp, size_t *countp, uint64_t *) |
695 | { |
696 | struct ufsmount *ump = VFSTOUFS(mp); |
697 | struct fs *fs = ump->um_fs; |
698 | daddr_t addr, indir_addr; |
699 | off_t logsize; |
700 | size_t size; |
701 | int error; |
702 | |
703 | logsize = 0; |
704 | /* check if there's a suggested log size */ |
705 | if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG && |
706 | fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) |
707 | logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; |
708 | |
709 | if (vp->v_size > 0) { |
710 | printf("%s: %s: file size (%" PRId64 ") non zero\n" , __func__, |
711 | fs->fs_fsmnt, vp->v_size); |
712 | return EEXIST; |
713 | } |
714 | wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size); |
715 | if (addr == 0) { |
716 | printf("%s: %s: log not allocated, largest extent is " |
717 | "%" PRId64 "MB\n" , __func__, fs->fs_fsmnt, |
718 | ffs_lblktosize(fs, size) / (1024 * 1024)); |
719 | return ENOSPC; |
720 | } |
721 | |
722 | logsize = ffs_lblktosize(fs, size); /* final log size */ |
723 | |
724 | VTOI(vp)->i_ffs_first_data_blk = addr; |
725 | VTOI(vp)->i_ffs_first_indir_blk = indir_addr; |
726 | |
727 | error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED); |
728 | if (error) { |
729 | printf("%s: %s: GOP_ALLOC error %d\n" , __func__, fs->fs_fsmnt, |
730 | error); |
731 | return error; |
732 | } |
733 | |
734 | *startp = FFS_FSBTODB(fs, addr); |
735 | *countp = btodb(logsize); |
736 | *extradatap = VTOI(vp)->i_number; |
737 | |
738 | return 0; |
739 | } |
740 | |
741 | /* |
742 | * Find a suitable location for the journal in the filesystem. |
743 | * |
744 | * Our strategy here is to look for a contiguous block of free space |
745 | * at least "logfile" MB in size (plus room for any indirect blocks). |
746 | * We start at the middle of the filesystem and check each cylinder |
747 | * group working outwards. If "logfile" MB is not available as a |
748 | * single contigous chunk, then return the address and size of the |
749 | * largest chunk found. |
750 | * |
751 | * XXX |
752 | * At what stage does the search fail? Is if the largest space we could |
753 | * find is less than a quarter the requested space reasonable? If the |
754 | * search fails entirely, return a block address if "0" it indicate this. |
755 | */ |
756 | static void |
757 | wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize, |
758 | daddr_t *addr, daddr_t *indir_addr, size_t *size) |
759 | { |
760 | struct ufsmount *ump = VFSTOUFS(mp); |
761 | struct fs *fs = ump->um_fs; |
762 | struct vnode *devvp = ump->um_devvp; |
763 | struct cg *cgp; |
764 | struct buf *bp; |
765 | uint8_t *blksfree; |
766 | daddr_t blkno, best_addr, start_addr; |
767 | daddr_t desired_blks, min_desired_blks; |
768 | daddr_t freeblks, best_blks; |
769 | int bpcg, cg, error, fixedsize, indir_blks, n, s; |
770 | const int needswap = UFS_FSNEEDSWAP(fs); |
771 | |
772 | if (logsize == 0) { |
773 | fixedsize = 0; /* We can adjust the size if tight */ |
774 | logsize = ffs_lfragtosize(fs, fs->fs_dsize) / |
775 | UFS_WAPBL_JOURNAL_SCALE; |
776 | DPRINTF("suggested log size = %" PRId64 "\n" , logsize); |
777 | logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); |
778 | logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); |
779 | DPRINTF("adjusted log size = %" PRId64 "\n" , logsize); |
780 | } else { |
781 | fixedsize = 1; |
782 | DPRINTF("fixed log size = %" PRId64 "\n" , logsize); |
783 | } |
784 | |
785 | desired_blks = logsize / fs->fs_bsize; |
786 | DPRINTF("desired blocks = %" PRId64 "\n" , desired_blks); |
787 | |
788 | /* add in number of indirect blocks needed */ |
789 | indir_blks = 0; |
790 | if (desired_blks >= UFS_NDADDR) { |
791 | struct indir indirs[UFS_NIADDR + 2]; |
792 | int num; |
793 | |
794 | error = ufs_getlbns(vp, desired_blks, indirs, &num); |
795 | if (error) { |
796 | printf("%s: %s: ufs_getlbns failed, error %d!\n" , |
797 | __func__, fs->fs_fsmnt, error); |
798 | goto bad; |
799 | } |
800 | |
801 | switch (num) { |
802 | case 2: |
803 | indir_blks = 1; /* 1st level indirect */ |
804 | break; |
805 | case 3: |
806 | indir_blks = 1 + /* 1st level indirect */ |
807 | 1 + /* 2nd level indirect */ |
808 | indirs[1].in_off + 1; /* extra 1st level indirect */ |
809 | break; |
810 | default: |
811 | printf("%s: %s: unexpected numlevels %d from " |
812 | "ufs_getlbns\n" , __func__, fs->fs_fsmnt, num); |
813 | *size = 0; |
814 | goto bad; |
815 | } |
816 | desired_blks += indir_blks; |
817 | } |
818 | DPRINTF("desired blocks = %" PRId64 " (including indirect)\n" , |
819 | desired_blks); |
820 | |
821 | /* |
822 | * If a specific size wasn't requested, allow for a smaller log |
823 | * if we're really tight for space... |
824 | */ |
825 | min_desired_blks = desired_blks; |
826 | if (!fixedsize) |
827 | min_desired_blks = desired_blks / 4; |
828 | |
829 | /* Look at number of blocks per CG. If it's too small, bail early. */ |
830 | bpcg = ffs_fragstoblks(fs, fs->fs_fpg); |
831 | if (min_desired_blks > bpcg) { |
832 | printf("%s: %s: cylinder group size of %" PRId64 " MB " |
833 | " is not big enough for journal\n" , __func__, fs->fs_fsmnt, |
834 | ffs_lblktosize(fs, bpcg) / (1024 * 1024)); |
835 | goto bad; |
836 | } |
837 | |
838 | /* |
839 | * Start with the middle cylinder group, and search outwards in |
840 | * both directions until we either find the requested log size |
841 | * or reach the start/end of the file system. If we reach the |
842 | * start/end without finding enough space for the full requested |
843 | * log size, use the largest extent found if it is large enough |
844 | * to satisfy the our minimum size. |
845 | * |
846 | * XXX |
847 | * Can we just use the cluster contigsum stuff (esp on UFS2) |
848 | * here to simplify this search code? |
849 | */ |
850 | best_addr = 0; |
851 | best_blks = 0; |
852 | for (cg = fs->fs_ncg / 2, s = 0, n = 1; |
853 | best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg; |
854 | s++, n = -n, cg += n * s) { |
855 | DPRINTF("check cg %d of %d\n" , cg, fs->fs_ncg); |
856 | error = bread(devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), |
857 | fs->fs_cgsize, 0, &bp); |
858 | if (error) { |
859 | continue; |
860 | } |
861 | cgp = (struct cg *)bp->b_data; |
862 | if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { |
863 | brelse(bp, 0); |
864 | continue; |
865 | } |
866 | |
867 | blksfree = cg_blksfree(cgp, needswap); |
868 | |
869 | for (blkno = 0; blkno < bpcg;) { |
870 | /* look for next free block */ |
871 | /* XXX use scanc() and fragtbl[] here? */ |
872 | for (; blkno < bpcg - min_desired_blks; blkno++) |
873 | if (ffs_isblock(fs, blksfree, blkno)) |
874 | break; |
875 | |
876 | /* past end of search space in this CG? */ |
877 | if (blkno >= bpcg - min_desired_blks) |
878 | break; |
879 | |
880 | /* count how many free blocks in this extent */ |
881 | start_addr = blkno; |
882 | for (freeblks = 0; blkno < bpcg; blkno++, freeblks++) |
883 | if (!ffs_isblock(fs, blksfree, blkno)) |
884 | break; |
885 | |
886 | if (freeblks > best_blks) { |
887 | best_blks = freeblks; |
888 | best_addr = ffs_blkstofrags(fs, start_addr) + |
889 | cgbase(fs, cg); |
890 | |
891 | if (freeblks >= desired_blks) { |
892 | DPRINTF("found len %" PRId64 |
893 | " at offset %" PRId64 " in gc\n" , |
894 | freeblks, start_addr); |
895 | break; |
896 | } |
897 | } |
898 | } |
899 | brelse(bp, 0); |
900 | } |
901 | DPRINTF("best found len = %" PRId64 ", wanted %" PRId64 |
902 | " at addr %" PRId64 "\n" , best_blks, desired_blks, best_addr); |
903 | |
904 | if (best_blks < min_desired_blks) { |
905 | *addr = 0; |
906 | *indir_addr = 0; |
907 | } else { |
908 | /* put indirect blocks at start, and data blocks after */ |
909 | *addr = best_addr + ffs_blkstofrags(fs, indir_blks); |
910 | *indir_addr = best_addr; |
911 | } |
912 | *size = min(desired_blks, best_blks) - indir_blks; |
913 | return; |
914 | |
915 | bad: |
916 | *addr = 0; |
917 | *indir_addr = 0; |
918 | *size = 0; |
919 | return; |
920 | } |
921 | |