1 | /* $NetBSD: genfs_io.c,v 1.63 2016/09/29 19:08:48 christos Exp $ */ |
2 | |
3 | /* |
4 | * Copyright (c) 1982, 1986, 1989, 1993 |
5 | * The Regents of the University of California. All rights reserved. |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions |
9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * 3. Neither the name of the University nor the names of its contributors |
16 | * may be used to endorse or promote products derived from this software |
17 | * without specific prior written permission. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
29 | * SUCH DAMAGE. |
30 | * |
31 | */ |
32 | |
33 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.63 2016/09/29 19:08:48 christos Exp $");
35 | |
36 | #include <sys/param.h> |
37 | #include <sys/systm.h> |
38 | #include <sys/proc.h> |
39 | #include <sys/kernel.h> |
40 | #include <sys/mount.h> |
41 | #include <sys/vnode.h> |
42 | #include <sys/kmem.h> |
43 | #include <sys/kauth.h> |
44 | #include <sys/fstrans.h> |
45 | #include <sys/buf.h> |
46 | |
47 | #include <miscfs/genfs/genfs.h> |
48 | #include <miscfs/genfs/genfs_node.h> |
49 | #include <miscfs/specfs/specdev.h> |
50 | |
51 | #include <uvm/uvm.h> |
52 | #include <uvm/uvm_pager.h> |
53 | |
54 | static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *, |
55 | off_t, enum uio_rw); |
56 | static void genfs_dio_iodone(struct buf *); |
57 | |
58 | static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t, |
59 | off_t, bool, bool, bool, bool); |
60 | static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw, |
61 | void (*)(struct buf *)); |
62 | static void genfs_rel_pages(struct vm_page **, unsigned int); |
63 | static void genfs_markdirty(struct vnode *); |
64 | |
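/*
 * genfs_maxdio: upper bound (in bytes) on the size of each direct I/O
 * chunk handed to genfs_do_directio() by genfs_directio().
 */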
65 | int genfs_maxdio = MAXPHYS; |
66 | |
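/*
 * genfs_rel_pages: unbusy the given pages.  pages that never received
 * valid data (PG_FAKE) are marked PG_RELEASED so that uvm_page_unbusy()
 * frees them.
 */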
67 | static void |
68 | genfs_rel_pages(struct vm_page **pgs, unsigned int npages) |
69 | { |
70 | unsigned int i; |
71 | |
72 | for (i = 0; i < npages; i++) { |
73 | struct vm_page *pg = pgs[i]; |
74 | |
75 | if (pg == NULL || pg == PGO_DONTCARE) |
76 | continue; |
77 | KASSERT(uvm_page_locked_p(pg)); |
78 | if (pg->flags & PG_FAKE) { |
79 | pg->flags |= PG_RELEASED; |
80 | } |
81 | } |
82 | mutex_enter(&uvm_pageqlock); |
83 | uvm_page_unbusy(pgs, npages); |
84 | mutex_exit(&uvm_pageqlock); |
85 | } |
86 | |
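/*
 * genfs_markdirty: note that a vnode is gaining dirty pages: bump its
 * dirty generation count, put it on the syncer worklist and, if it has
 * a writable mapping, mark that mapping dirty.
 */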
87 | static void |
88 | genfs_markdirty(struct vnode *vp) |
89 | { |
90 | struct genfs_node * const gp = VTOG(vp); |
91 | |
92 | KASSERT(mutex_owned(vp->v_interlock)); |
93 | gp->g_dirtygen++; |
94 | if ((vp->v_iflag & VI_ONWORKLST) == 0) { |
95 | vn_syncer_add_to_worklist(vp, filedelay); |
96 | } |
97 | if ((vp->v_iflag & (VI_WRMAP|VI_WRMAPDIRTY)) == VI_WRMAP) { |
98 | vp->v_iflag |= VI_WRMAPDIRTY; |
99 | } |
100 | } |
101 | |
102 | /* |
103 | * generic VM getpages routine. |
104 | * Return PG_BUSY pages for the given range, |
105 | * reading from backing store if necessary. |
106 | */ |
107 | |
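/*
 * A minimal sketch of a synchronous caller (hypothetical, for
 * illustration only; the real callers are the fault and ubc paths):
 *
 *        struct vm_page *pg = NULL;
 *        int npages = 1;
 *
 *        mutex_enter(vp->v_interlock);
 *        error = VOP_GETPAGES(vp, trunc_page(offset), &pg, &npages, 0,
 *            VM_PROT_READ, UVM_ADV_NORMAL, PGO_SYNCIO);
 *
 * on success the returned page is PG_BUSY and the object lock has been
 * released.
 */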
108 | int |
109 | genfs_getpages(void *v) |
110 | { |
111 | struct vop_getpages_args /* { |
112 | struct vnode *a_vp; |
113 | voff_t a_offset; |
114 | struct vm_page **a_m; |
115 | int *a_count; |
116 | int a_centeridx; |
117 | vm_prot_t a_access_type; |
118 | int a_advice; |
119 | int a_flags; |
120 | } */ * const ap = v; |
121 | |
122 | off_t diskeof, memeof; |
123 | int i, error, npages; |
124 | const int flags = ap->a_flags; |
125 | struct vnode * const vp = ap->a_vp; |
126 | struct uvm_object * const uobj = &vp->v_uobj; |
127 | const bool async = (flags & PGO_SYNCIO) == 0; |
128 | const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0; |
129 | const bool overwrite = (flags & PGO_OVERWRITE) != 0; |
130 | const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0; |
131 | const bool glocked = (flags & PGO_GLOCKHELD) != 0; |
132 | const bool need_wapbl = blockalloc && vp->v_mount->mnt_wapbl; |
133 | bool has_trans_wapbl = false; |
        UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
135 | |
136 | UVMHIST_LOG(ubchist, "vp %p off 0x%x/%x count %d" , |
137 | vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count); |
138 | |
139 | KASSERT(vp->v_type == VREG || vp->v_type == VDIR || |
140 | vp->v_type == VLNK || vp->v_type == VBLK); |
141 | |
142 | startover: |
143 | error = 0; |
144 | const voff_t origvsize = vp->v_size; |
145 | const off_t origoffset = ap->a_offset; |
146 | const int orignpages = *ap->a_count; |
147 | |
148 | GOP_SIZE(vp, origvsize, &diskeof, 0); |
149 | if (flags & PGO_PASTEOF) { |
150 | off_t newsize; |
151 | #if defined(DIAGNOSTIC) |
152 | off_t writeeof; |
153 | #endif /* defined(DIAGNOSTIC) */ |
154 | |
155 | newsize = MAX(origvsize, |
156 | origoffset + (orignpages << PAGE_SHIFT)); |
157 | GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM); |
158 | #if defined(DIAGNOSTIC) |
159 | GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM); |
160 | if (newsize > round_page(writeeof)) { |
161 | panic("%s: past eof: %" PRId64 " vs. %" PRId64, |
162 | __func__, newsize, round_page(writeeof)); |
163 | } |
164 | #endif /* defined(DIAGNOSTIC) */ |
165 | } else { |
166 | GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM); |
167 | } |
        KASSERT(ap->a_centeridx >= 0 && ap->a_centeridx <= orignpages);
169 | KASSERT((origoffset & (PAGE_SIZE - 1)) == 0 && origoffset >= 0); |
170 | KASSERT(orignpages > 0); |
171 | |
172 | /* |
173 | * Bounds-check the request. |
174 | */ |
175 | |
176 | if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) { |
177 | if ((flags & PGO_LOCKED) == 0) { |
178 | mutex_exit(uobj->vmobjlock); |
179 | } |
180 | UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x" , |
181 | origoffset, *ap->a_count, memeof,0); |
182 | error = EINVAL; |
183 | goto out_err; |
184 | } |
185 | |
186 | /* uobj is locked */ |
187 | |
188 | if ((flags & PGO_NOTIMESTAMP) == 0 && |
189 | (vp->v_type != VBLK || |
190 | (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) { |
191 | int updflags = 0; |
192 | |
193 | if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) { |
194 | updflags = GOP_UPDATE_ACCESSED; |
195 | } |
196 | if (memwrite) { |
197 | updflags |= GOP_UPDATE_MODIFIED; |
198 | } |
199 | if (updflags != 0) { |
200 | GOP_MARKUPDATE(vp, updflags); |
201 | } |
202 | } |
203 | |
204 | /* |
205 | * For PGO_LOCKED requests, just return whatever's in memory. |
206 | */ |
207 | |
208 | if (flags & PGO_LOCKED) { |
209 | int nfound; |
210 | struct vm_page *pg; |
211 | |
212 | KASSERT(!glocked); |
213 | npages = *ap->a_count; |
214 | #if defined(DEBUG) |
215 | for (i = 0; i < npages; i++) { |
216 | pg = ap->a_m[i]; |
217 | KASSERT(pg == NULL || pg == PGO_DONTCARE); |
218 | } |
219 | #endif /* defined(DEBUG) */ |
220 | nfound = uvn_findpages(uobj, origoffset, &npages, |
221 | ap->a_m, UFP_NOWAIT|UFP_NOALLOC|(memwrite ? UFP_NORDONLY : 0)); |
222 | KASSERT(npages == *ap->a_count); |
223 | if (nfound == 0) { |
224 | error = EBUSY; |
225 | goto out_err; |
226 | } |
227 | if (!genfs_node_rdtrylock(vp)) { |
228 | genfs_rel_pages(ap->a_m, npages); |
229 | |
230 | /* |
231 | * restore the array. |
232 | */ |
233 | |
234 | for (i = 0; i < npages; i++) { |
235 | pg = ap->a_m[i]; |
236 | |
237 | if (pg != NULL && pg != PGO_DONTCARE) { |
238 | ap->a_m[i] = NULL; |
239 | } |
240 | KASSERT(ap->a_m[i] == NULL || |
241 | ap->a_m[i] == PGO_DONTCARE); |
242 | } |
243 | } else { |
244 | genfs_node_unlock(vp); |
245 | } |
246 | error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0); |
247 | if (error == 0 && memwrite) { |
248 | genfs_markdirty(vp); |
249 | } |
250 | goto out_err; |
251 | } |
252 | mutex_exit(uobj->vmobjlock); |
253 | |
254 | /* |
255 | * find the requested pages and make some simple checks. |
256 | * leave space in the page array for a whole block. |
257 | */ |
258 | |
259 | const int fs_bshift = (vp->v_type != VBLK) ? |
260 | vp->v_mount->mnt_fs_bshift : DEV_BSHIFT; |
261 | const int fs_bsize = 1 << fs_bshift; |
262 | #define blk_mask (fs_bsize - 1) |
263 | #define trunc_blk(x) ((x) & ~blk_mask) |
264 | #define round_blk(x) (((x) + blk_mask) & ~blk_mask) |
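        /*
         * e.g. with 16 KB file system blocks (fs_bshift == 14, so
         * fs_bsize == 0x4000): trunc_blk(0x5000) == 0x4000 and
         * round_blk(0x5000) == 0x8000.
         */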
265 | |
266 | const int orignmempages = MIN(orignpages, |
267 | round_page(memeof - origoffset) >> PAGE_SHIFT); |
268 | npages = orignmempages; |
269 | const off_t startoffset = trunc_blk(origoffset); |
270 | const off_t endoffset = MIN( |
271 | round_page(round_blk(origoffset + (npages << PAGE_SHIFT))), |
272 | round_page(memeof)); |
273 | const int ridx = (origoffset - startoffset) >> PAGE_SHIFT; |
274 | |
275 | const int pgs_size = sizeof(struct vm_page *) * |
276 | ((endoffset - startoffset) >> PAGE_SHIFT); |
277 | struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES]; |
278 | |
279 | if (pgs_size > sizeof(pgs_onstack)) { |
280 | pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP); |
281 | if (pgs == NULL) { |
282 | pgs = pgs_onstack; |
283 | error = ENOMEM; |
284 | goto out_err; |
285 | } |
286 | } else { |
287 | pgs = pgs_onstack; |
288 | (void)memset(pgs, 0, pgs_size); |
289 | } |
290 | |
291 | UVMHIST_LOG(ubchist, "ridx %d npages %d startoff %ld endoff %ld" , |
292 | ridx, npages, startoffset, endoffset); |
293 | |
294 | if (!has_trans_wapbl) { |
295 | fstrans_start(vp->v_mount, FSTRANS_SHARED); |
296 | /* |
297 | * XXX: This assumes that we come here only via |
298 | * the mmio path |
299 | */ |
300 | if (need_wapbl) { |
301 | error = WAPBL_BEGIN(vp->v_mount); |
302 | if (error) { |
303 | fstrans_done(vp->v_mount); |
304 | goto out_err_free; |
305 | } |
306 | } |
307 | has_trans_wapbl = true; |
308 | } |
309 | |
310 | /* |
311 | * hold g_glock to prevent a race with truncate. |
312 | * |
313 | * check if our idea of v_size is still valid. |
314 | */ |
315 | |
316 | KASSERT(!glocked || genfs_node_wrlocked(vp)); |
317 | if (!glocked) { |
318 | if (blockalloc) { |
319 | genfs_node_wrlock(vp); |
320 | } else { |
321 | genfs_node_rdlock(vp); |
322 | } |
323 | } |
324 | mutex_enter(uobj->vmobjlock); |
325 | if (vp->v_size < origvsize) { |
326 | if (!glocked) { |
327 | genfs_node_unlock(vp); |
328 | } |
329 | if (pgs != pgs_onstack) |
330 | kmem_free(pgs, pgs_size); |
331 | goto startover; |
332 | } |
333 | |
334 | if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], |
335 | async ? UFP_NOWAIT : UFP_ALL) != orignmempages) { |
336 | if (!glocked) { |
337 | genfs_node_unlock(vp); |
338 | } |
339 | KASSERT(async != 0); |
340 | genfs_rel_pages(&pgs[ridx], orignmempages); |
341 | mutex_exit(uobj->vmobjlock); |
342 | error = EBUSY; |
343 | goto out_err_free; |
344 | } |
345 | |
346 | /* |
347 | * if the pages are already resident, just return them. |
348 | */ |
349 | |
350 | for (i = 0; i < npages; i++) { |
351 | struct vm_page *pg = pgs[ridx + i]; |
352 | |
353 | if ((pg->flags & PG_FAKE) || |
354 | (blockalloc && (pg->flags & PG_RDONLY))) { |
355 | break; |
356 | } |
357 | } |
358 | if (i == npages) { |
359 | if (!glocked) { |
360 | genfs_node_unlock(vp); |
361 | } |
362 | UVMHIST_LOG(ubchist, "returning cached pages" , 0,0,0,0); |
363 | npages += ridx; |
364 | goto out; |
365 | } |
366 | |
367 | /* |
368 | * if PGO_OVERWRITE is set, don't bother reading the pages. |
369 | */ |
370 | |
371 | if (overwrite) { |
372 | if (!glocked) { |
373 | genfs_node_unlock(vp); |
374 | } |
375 | UVMHIST_LOG(ubchist, "PGO_OVERWRITE" ,0,0,0,0); |
376 | |
377 | for (i = 0; i < npages; i++) { |
378 | struct vm_page *pg = pgs[ridx + i]; |
379 | |
380 | pg->flags &= ~(PG_RDONLY|PG_CLEAN); |
381 | } |
382 | npages += ridx; |
383 | goto out; |
384 | } |
385 | |
386 | /* |
387 | * the page wasn't resident and we're not overwriting, |
388 | * so we're going to have to do some i/o. |
389 | * find any additional pages needed to cover the expanded range. |
390 | */ |
391 | |
392 | npages = (endoffset - startoffset) >> PAGE_SHIFT; |
393 | if (startoffset != origoffset || npages != orignmempages) { |
394 | int npgs; |
395 | |
396 | /* |
397 | * we need to avoid deadlocks caused by locking |
398 | * additional pages at lower offsets than pages we |
399 | * already have locked. unlock them all and start over. |
400 | */ |
401 | |
402 | genfs_rel_pages(&pgs[ridx], orignmempages); |
403 | memset(pgs, 0, pgs_size); |
404 | |
405 | UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x" , |
406 | startoffset, endoffset, 0,0); |
407 | npgs = npages; |
408 | if (uvn_findpages(uobj, startoffset, &npgs, pgs, |
409 | async ? UFP_NOWAIT : UFP_ALL) != npages) { |
410 | if (!glocked) { |
411 | genfs_node_unlock(vp); |
412 | } |
413 | KASSERT(async != 0); |
414 | genfs_rel_pages(pgs, npages); |
415 | mutex_exit(uobj->vmobjlock); |
416 | error = EBUSY; |
417 | goto out_err_free; |
418 | } |
419 | } |
420 | |
421 | mutex_exit(uobj->vmobjlock); |
422 | error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof, |
423 | async, memwrite, blockalloc, glocked); |
424 | if (error == 0 && async) |
425 | goto out_err_free; |
426 | if (!glocked) { |
427 | genfs_node_unlock(vp); |
428 | } |
429 | mutex_enter(uobj->vmobjlock); |
430 | |
431 | /* |
432 | * we're almost done! release the pages... |
433 | * for errors, we free the pages. |
434 | * otherwise we activate them and mark them as valid and clean. |
435 | * also, unbusy pages that were not actually requested. |
436 | */ |
437 | |
438 | if (error) { |
439 | genfs_rel_pages(pgs, npages); |
440 | mutex_exit(uobj->vmobjlock); |
441 | UVMHIST_LOG(ubchist, "returning error %d" , error,0,0,0); |
442 | goto out_err_free; |
443 | } |
444 | |
445 | out: |
446 | UVMHIST_LOG(ubchist, "succeeding, npages %d" , npages,0,0,0); |
447 | error = 0; |
448 | mutex_enter(&uvm_pageqlock); |
449 | for (i = 0; i < npages; i++) { |
450 | struct vm_page *pg = pgs[i]; |
451 | if (pg == NULL) { |
452 | continue; |
453 | } |
454 | UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x" , |
455 | pg, pg->flags, 0,0); |
456 | if (pg->flags & PG_FAKE && !overwrite) { |
457 | pg->flags &= ~(PG_FAKE); |
458 | pmap_clear_modify(pgs[i]); |
459 | } |
460 | KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0); |
461 | if (i < ridx || i >= ridx + orignmempages || async) { |
462 | UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x" , |
463 | pg, pg->offset,0,0); |
464 | if (pg->flags & PG_WANTED) { |
465 | wakeup(pg); |
466 | } |
467 | if (pg->flags & PG_FAKE) { |
468 | KASSERT(overwrite); |
469 | uvm_pagezero(pg); |
470 | } |
471 | if (pg->flags & PG_RELEASED) { |
472 | uvm_pagefree(pg); |
473 | continue; |
474 | } |
475 | uvm_pageenqueue(pg); |
476 | pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE); |
477 | UVM_PAGE_OWN(pg, NULL); |
478 | } |
479 | } |
480 | mutex_exit(&uvm_pageqlock); |
481 | if (memwrite) { |
482 | genfs_markdirty(vp); |
483 | } |
484 | mutex_exit(uobj->vmobjlock); |
485 | if (ap->a_m != NULL) { |
486 | memcpy(ap->a_m, &pgs[ridx], |
487 | orignmempages * sizeof(struct vm_page *)); |
488 | } |
489 | |
490 | out_err_free: |
491 | if (pgs != NULL && pgs != pgs_onstack) |
492 | kmem_free(pgs, pgs_size); |
493 | out_err: |
494 | if (has_trans_wapbl) { |
495 | if (need_wapbl) |
496 | WAPBL_END(vp->v_mount); |
497 | fstrans_done(vp->v_mount); |
498 | } |
499 | return error; |
500 | } |
501 | |
502 | /* |
503 | * genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY. |
504 | */ |
505 | static int |
506 | genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages, |
507 | off_t startoffset, off_t diskeof, |
508 | bool async, bool memwrite, bool blockalloc, bool glocked) |
509 | { |
510 | struct uvm_object * const uobj = &vp->v_uobj; |
511 | const int fs_bshift = (vp->v_type != VBLK) ? |
512 | vp->v_mount->mnt_fs_bshift : DEV_BSHIFT; |
513 | const int dev_bshift = (vp->v_type != VBLK) ? |
514 | vp->v_mount->mnt_dev_bshift : DEV_BSHIFT; |
515 | kauth_cred_t const cred = curlwp->l_cred; /* XXXUBC curlwp */ |
516 | size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes; |
517 | vaddr_t kva; |
518 | struct buf *bp, *mbp; |
519 | bool sawhole = false; |
520 | int i; |
521 | int error = 0; |
522 | |
523 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
524 | |
525 | /* |
526 | * read the desired page(s). |
527 | */ |
528 | |
529 | totalbytes = npages << PAGE_SHIFT; |
530 | bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0)); |
531 | tailbytes = totalbytes - bytes; |
532 | skipbytes = 0; |
533 | |
534 | kva = uvm_pagermapin(pgs, npages, |
535 | UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK)); |
536 | if (kva == 0) |
537 | return EBUSY; |
538 | |
539 | mbp = getiobuf(vp, true); |
540 | mbp->b_bufsize = totalbytes; |
541 | mbp->b_data = (void *)kva; |
542 | mbp->b_resid = mbp->b_bcount = bytes; |
543 | mbp->b_cflags = BC_BUSY; |
544 | if (async) { |
545 | mbp->b_flags = B_READ | B_ASYNC; |
546 | mbp->b_iodone = uvm_aio_biodone; |
547 | } else { |
548 | mbp->b_flags = B_READ; |
549 | mbp->b_iodone = NULL; |
550 | } |
551 | if (async) |
552 | BIO_SETPRIO(mbp, BPRIO_TIMELIMITED); |
553 | else |
554 | BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL); |
555 | |
556 | /* |
557 | * if EOF is in the middle of the range, zero the part past EOF. |
558 | * skip over pages which are not PG_FAKE since in that case they have |
559 | * valid data that we need to preserve. |
560 | */ |
561 | |
562 | tailstart = bytes; |
563 | while (tailbytes > 0) { |
564 | const int len = PAGE_SIZE - (tailstart & PAGE_MASK); |
565 | |
566 | KASSERT(len <= tailbytes); |
567 | if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) { |
568 | memset((void *)(kva + tailstart), 0, len); |
569 | UVMHIST_LOG(ubchist, "tailbytes %p 0x%x 0x%x" , |
570 | kva, tailstart, len, 0); |
571 | } |
572 | tailstart += len; |
573 | tailbytes -= len; |
574 | } |
575 | |
576 | /* |
577 | * now loop over the pages, reading as needed. |
578 | */ |
579 | |
580 | bp = NULL; |
581 | off_t offset; |
582 | for (offset = startoffset; |
583 | bytes > 0; |
584 | offset += iobytes, bytes -= iobytes) { |
585 | int run; |
586 | daddr_t lbn, blkno; |
587 | int pidx; |
588 | struct vnode *devvp; |
589 | |
590 | /* |
591 | * skip pages which don't need to be read. |
592 | */ |
593 | |
594 | pidx = (offset - startoffset) >> PAGE_SHIFT; |
595 | while ((pgs[pidx]->flags & PG_FAKE) == 0) { |
596 | size_t b; |
597 | |
598 | KASSERT((offset & (PAGE_SIZE - 1)) == 0); |
599 | if ((pgs[pidx]->flags & PG_RDONLY)) { |
600 | sawhole = true; |
601 | } |
602 | b = MIN(PAGE_SIZE, bytes); |
603 | offset += b; |
604 | bytes -= b; |
605 | skipbytes += b; |
606 | pidx++; |
607 | UVMHIST_LOG(ubchist, "skipping, new offset 0x%x" , |
608 | offset, 0,0,0); |
609 | if (bytes == 0) { |
610 | goto loopdone; |
611 | } |
612 | } |
613 | |
614 | /* |
615 | * bmap the file to find out the blkno to read from and |
616 | * how much we can read in one i/o. if bmap returns an error, |
617 | * skip the rest of the top-level i/o. |
618 | */ |
619 | |
620 | lbn = offset >> fs_bshift; |
621 | error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run); |
622 | if (error) { |
623 | UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n" , |
624 | lbn,error,0,0); |
625 | skipbytes += bytes; |
626 | bytes = 0; |
627 | goto loopdone; |
628 | } |
629 | |
630 | /* |
631 | * see how many pages can be read with this i/o. |
632 | * reduce the i/o size if necessary to avoid |
633 | * overwriting pages with valid data. |
634 | */ |
635 | |
636 | iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, |
637 | bytes); |
638 | if (offset + iobytes > round_page(offset)) { |
639 | int pcount; |
640 | |
641 | pcount = 1; |
642 | while (pidx + pcount < npages && |
643 | pgs[pidx + pcount]->flags & PG_FAKE) { |
644 | pcount++; |
645 | } |
646 | iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) - |
647 | (offset - trunc_page(offset))); |
648 | } |
649 | |
650 | /* |
651 | * if this block isn't allocated, zero it instead of |
652 | * reading it. unless we are going to allocate blocks, |
653 | * mark the pages we zeroed PG_RDONLY. |
654 | */ |
655 | |
656 | if (blkno == (daddr_t)-1) { |
657 | int holepages = (round_page(offset + iobytes) - |
658 | trunc_page(offset)) >> PAGE_SHIFT; |
659 | UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE" , lbn,0,0,0); |
660 | |
661 | sawhole = true; |
662 | memset((char *)kva + (offset - startoffset), 0, |
663 | iobytes); |
664 | skipbytes += iobytes; |
665 | |
666 | mutex_enter(uobj->vmobjlock); |
667 | for (i = 0; i < holepages; i++) { |
668 | if (memwrite) { |
669 | pgs[pidx + i]->flags &= ~PG_CLEAN; |
670 | } |
671 | if (!blockalloc) { |
672 | pgs[pidx + i]->flags |= PG_RDONLY; |
673 | } |
674 | } |
675 | mutex_exit(uobj->vmobjlock); |
676 | continue; |
677 | } |
678 | |
679 | /* |
680 | * allocate a sub-buf for this piece of the i/o |
681 | * (or just use mbp if there's only 1 piece), |
682 | * and start it going. |
683 | */ |
684 | |
685 | if (offset == startoffset && iobytes == bytes) { |
686 | bp = mbp; |
687 | } else { |
688 | UVMHIST_LOG(ubchist, "vp %p bp %p num now %d" , |
689 | vp, bp, vp->v_numoutput, 0); |
690 | bp = getiobuf(vp, true); |
691 | nestiobuf_setup(mbp, bp, offset - startoffset, iobytes); |
692 | } |
693 | bp->b_lblkno = 0; |
694 | |
695 | /* adjust physical blkno for partial blocks */ |
696 | bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> |
697 | dev_bshift); |
698 | |
699 | UVMHIST_LOG(ubchist, |
700 | "bp %p offset 0x%x bcount 0x%x blkno 0x%x" , |
701 | bp, offset, bp->b_bcount, bp->b_blkno); |
702 | |
703 | VOP_STRATEGY(devvp, bp); |
704 | } |
705 | |
706 | loopdone: |
707 | nestiobuf_done(mbp, skipbytes, error); |
708 | if (async) { |
709 | UVMHIST_LOG(ubchist, "returning 0 (async)" ,0,0,0,0); |
710 | if (!glocked) { |
711 | genfs_node_unlock(vp); |
712 | } |
713 | return 0; |
714 | } |
715 | if (bp != NULL) { |
716 | error = biowait(mbp); |
717 | } |
718 | |
719 | /* Remove the mapping (make KVA available as soon as possible) */ |
720 | uvm_pagermapout(kva, npages); |
721 | |
        /*
         * if we encountered a hole then we have to do a little more work.
         * for read faults, we marked the page PG_RDONLY so that future
         * write accesses to the page will fault again.
         * for write faults, we must make sure that the backing store for
         * the page is completely allocated while the pages are locked.
         */
729 | |
730 | if (!error && sawhole && blockalloc) { |
731 | error = GOP_ALLOC(vp, startoffset, |
732 | npages << PAGE_SHIFT, 0, cred); |
733 | UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d" , |
734 | startoffset, npages << PAGE_SHIFT, error,0); |
735 | if (!error) { |
736 | mutex_enter(uobj->vmobjlock); |
737 | for (i = 0; i < npages; i++) { |
738 | struct vm_page *pg = pgs[i]; |
739 | |
740 | if (pg == NULL) { |
741 | continue; |
742 | } |
743 | pg->flags &= ~(PG_CLEAN|PG_RDONLY); |
744 | UVMHIST_LOG(ubchist, "mark dirty pg %p" , |
745 | pg,0,0,0); |
746 | } |
747 | mutex_exit(uobj->vmobjlock); |
748 | } |
749 | } |
750 | |
751 | putiobuf(mbp); |
752 | return error; |
753 | } |
754 | |
755 | /* |
756 | * generic VM putpages routine. |
757 | * Write the given range of pages to backing store. |
758 | * |
759 | * => "offhi == 0" means flush all pages at or after "offlo". |
760 | * => object should be locked by caller. we return with the |
761 | * object unlocked. |
762 | * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O). |
763 | * thus, a caller might want to unlock higher level resources |
764 | * (e.g. vm_map) before calling flush. |
765 | * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block |
766 | * => if PGO_ALLPAGES is set, then all pages in the object will be processed. |
767 | * => NOTE: we rely on the fact that the object's memq is a TAILQ and |
768 | * that new pages are inserted on the tail end of the list. thus, |
769 | * we can make a complete pass through the object in one go by starting |
770 | * at the head and working towards the tail (new pages are put in |
771 | * front of us). |
772 | * => NOTE: we are allowed to lock the page queues, so the caller |
773 | * must not be holding the page queue lock. |
774 | * |
 * note on "cleaning" object and PG_BUSY pages:
 *	this routine is holding the lock on the object.  the only time
 *	that it can run into a PG_BUSY page that it does not own is if
 *	some other process has started I/O on the page (e.g. either
 *	a pagein, or a pageout).  if the PG_BUSY page is being paged
 *	in, then it cannot be dirty (!PG_CLEAN) because no one has
 *	had a chance to modify it yet.  if the PG_BUSY page is being
 *	paged out then it means that someone else has already started
 *	cleaning the page for us (how nice!).  in this case, if we
 *	have syncio specified, then after we make our pass through the
 *	object we need to wait for the other PG_BUSY pages to clear
 *	off (i.e. we need to do an iosync).  also note that once a
 *	page is PG_BUSY it must stay in its object until it is un-busied.
 *
 * note on page traversal:
 *	we can traverse the pages in an object either by going down the
 *	linked list in "uobj->memq", or we can go over the address range
 *	page by page, doing a lookup (uvm_pagelookup()) for each address.
 *	depending on how many pages are in the object it may be cheaper
 *	to do one or the other.  we set "by_list" to true if we are using
 *	memq.  if the cost of a single lookup were equal to the cost of
 *	the list traversal we could compare the number of pages in the
 *	start->stop range to the total number of pages in the object.
 *	however, it seems that a lookup is more expensive than the linked
 *	list traversal, so we multiply the number of pages in the range
 *	by an estimate of the relatively higher cost of the lookup.
801 | */ |
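
/*
 * A minimal sketch of a typical "flush everything synchronously" caller
 * (hypothetical, for illustration only):
 *
 *        mutex_enter(vp->v_interlock);
 *        error = VOP_PUTPAGES(vp, 0, 0,
 *            PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO);
 *
 * the object lock is taken by the caller and released by genfs_putpages().
 */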
802 | |
803 | int |
804 | genfs_putpages(void *v) |
805 | { |
806 | struct vop_putpages_args /* { |
807 | struct vnode *a_vp; |
808 | voff_t a_offlo; |
809 | voff_t a_offhi; |
810 | int a_flags; |
811 | } */ * const ap = v; |
812 | |
813 | return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi, |
814 | ap->a_flags, NULL); |
815 | } |
816 | |
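/*
 * genfs_do_putpages: the implementation behind genfs_putpages().  if a
 * PGO_BUSYFAIL scan stops on a busy page, that page is returned via
 * "busypg" (when busypg is not NULL).
 */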
817 | int |
818 | genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff, |
819 | int origflags, struct vm_page **busypg) |
820 | { |
821 | struct uvm_object * const uobj = &vp->v_uobj; |
822 | kmutex_t * const slock = uobj->vmobjlock; |
823 | off_t off; |
824 | int i, error, npages, nback; |
825 | int freeflag; |
	/*
	 * This array is larger than it needs to be so that its size is a
	 * compile-time constant.  The right size is MAXPAGES.
	 */
830 | struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE]; |
831 | #define MAXPAGES (MAXPHYS / PAGE_SIZE) |
832 | struct vm_page *pg, *nextpg, *tpg, curmp, endmp; |
833 | bool wasclean, by_list, needs_clean, yld; |
834 | bool async = (origflags & PGO_SYNCIO) == 0; |
835 | bool pagedaemon = curlwp == uvm.pagedaemon_lwp; |
836 | struct lwp * const l = curlwp ? curlwp : &lwp0; |
837 | struct genfs_node * const gp = VTOG(vp); |
838 | int flags; |
839 | int dirtygen; |
840 | bool modified; |
841 | bool need_wapbl; |
842 | bool has_trans; |
843 | bool cleanall; |
844 | bool onworklst; |
845 | |
	UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
847 | |
848 | KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)); |
849 | KASSERT((startoff & PAGE_MASK) == 0 && (endoff & PAGE_MASK) == 0); |
850 | KASSERT(startoff < endoff || endoff == 0); |
851 | |
852 | UVMHIST_LOG(ubchist, "vp %p pages %d off 0x%x len 0x%x" , |
853 | vp, uobj->uo_npages, startoff, endoff - startoff); |
854 | |
855 | has_trans = false; |
856 | need_wapbl = (!pagedaemon && vp->v_mount && vp->v_mount->mnt_wapbl && |
857 | (origflags & PGO_JOURNALLOCKED) == 0); |
858 | |
859 | retry: |
860 | modified = false; |
861 | flags = origflags; |
862 | KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || |
863 | (vp->v_iflag & VI_WRMAPDIRTY) == 0); |
864 | if (uobj->uo_npages == 0) { |
865 | if (vp->v_iflag & VI_ONWORKLST) { |
866 | vp->v_iflag &= ~VI_WRMAPDIRTY; |
867 | if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) |
868 | vn_syncer_remove_from_worklist(vp); |
869 | } |
870 | if (has_trans) { |
871 | if (need_wapbl) |
872 | WAPBL_END(vp->v_mount); |
873 | fstrans_done(vp->v_mount); |
874 | } |
875 | mutex_exit(slock); |
876 | return (0); |
877 | } |
878 | |
879 | /* |
880 | * the vnode has pages, set up to process the request. |
881 | */ |
882 | |
883 | if (!has_trans && (flags & PGO_CLEANIT) != 0) { |
884 | mutex_exit(slock); |
885 | if (pagedaemon) { |
886 | error = fstrans_start_nowait(vp->v_mount, FSTRANS_LAZY); |
887 | if (error) |
888 | return error; |
889 | } else |
890 | fstrans_start(vp->v_mount, FSTRANS_LAZY); |
891 | if (need_wapbl) { |
892 | error = WAPBL_BEGIN(vp->v_mount); |
893 | if (error) { |
894 | fstrans_done(vp->v_mount); |
895 | return error; |
896 | } |
897 | } |
898 | has_trans = true; |
899 | mutex_enter(slock); |
900 | goto retry; |
901 | } |
902 | |
903 | error = 0; |
904 | wasclean = (vp->v_numoutput == 0); |
905 | off = startoff; |
906 | if (endoff == 0 || flags & PGO_ALLPAGES) { |
907 | endoff = trunc_page(LLONG_MAX); |
908 | } |
909 | by_list = (uobj->uo_npages <= |
910 | ((endoff - startoff) >> PAGE_SHIFT) * UVM_PAGE_TREE_PENALTY); |
911 | |
912 | /* |
913 | * if this vnode is known not to have dirty pages, |
914 | * don't bother to clean it out. |
915 | */ |
916 | |
917 | if ((vp->v_iflag & VI_ONWORKLST) == 0) { |
918 | #if !defined(DEBUG) |
919 | if ((flags & (PGO_FREE|PGO_DEACTIVATE)) == 0) { |
920 | goto skip_scan; |
921 | } |
922 | #endif /* !defined(DEBUG) */ |
923 | flags &= ~PGO_CLEANIT; |
924 | } |
925 | |
926 | /* |
927 | * start the loop. when scanning by list, hold the last page |
928 | * in the list before we start. pages allocated after we start |
929 | * will be added to the end of the list, so we can stop at the |
930 | * current last page. |
931 | */ |
932 | |
933 | cleanall = (flags & PGO_CLEANIT) != 0 && wasclean && |
934 | startoff == 0 && endoff == trunc_page(LLONG_MAX) && |
935 | (vp->v_iflag & VI_ONWORKLST) != 0; |
936 | dirtygen = gp->g_dirtygen; |
937 | freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED; |
938 | if (by_list) { |
939 | curmp.flags = PG_MARKER; |
940 | endmp.flags = PG_MARKER; |
941 | pg = TAILQ_FIRST(&uobj->memq); |
942 | TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq.queue); |
943 | } else { |
944 | pg = uvm_pagelookup(uobj, off); |
945 | } |
946 | nextpg = NULL; |
947 | while (by_list || off < endoff) { |
948 | |
949 | /* |
950 | * if the current page is not interesting, move on to the next. |
951 | */ |
952 | |
953 | KASSERT(pg == NULL || pg->uobject == uobj || |
954 | (pg->flags & PG_MARKER) != 0); |
955 | KASSERT(pg == NULL || |
956 | (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 || |
957 | (pg->flags & (PG_BUSY|PG_MARKER)) != 0); |
958 | if (by_list) { |
959 | if (pg == &endmp) { |
960 | break; |
961 | } |
962 | if (pg->flags & PG_MARKER) { |
963 | pg = TAILQ_NEXT(pg, listq.queue); |
964 | continue; |
965 | } |
966 | if (pg->offset < startoff || pg->offset >= endoff || |
967 | pg->flags & (PG_RELEASED|PG_PAGEOUT)) { |
968 | if (pg->flags & (PG_RELEASED|PG_PAGEOUT)) { |
969 | wasclean = false; |
970 | } |
971 | pg = TAILQ_NEXT(pg, listq.queue); |
972 | continue; |
973 | } |
974 | off = pg->offset; |
975 | } else if (pg == NULL || pg->flags & (PG_RELEASED|PG_PAGEOUT)) { |
976 | if (pg != NULL) { |
977 | wasclean = false; |
978 | } |
979 | off += PAGE_SIZE; |
980 | if (off < endoff) { |
981 | pg = uvm_pagelookup(uobj, off); |
982 | } |
983 | continue; |
984 | } |
985 | |
986 | /* |
987 | * if the current page needs to be cleaned and it's busy, |
988 | * wait for it to become unbusy. |
989 | */ |
990 | |
991 | yld = (l->l_cpu->ci_schedstate.spc_flags & |
992 | SPCF_SHOULDYIELD) && !pagedaemon; |
993 | if (pg->flags & PG_BUSY || yld) { |
994 | UVMHIST_LOG(ubchist, "busy %p" , pg,0,0,0); |
995 | if (flags & PGO_BUSYFAIL && pg->flags & PG_BUSY) { |
996 | UVMHIST_LOG(ubchist, "busyfail %p" , pg, 0,0,0); |
997 | error = EDEADLK; |
998 | if (busypg != NULL) |
999 | *busypg = pg; |
1000 | break; |
1001 | } |
1002 | if (pagedaemon) { |
1003 | /* |
1004 | * someone has taken the page while we |
1005 | * dropped the lock for fstrans_start. |
1006 | */ |
1007 | break; |
1008 | } |
1009 | if (by_list) { |
1010 | TAILQ_INSERT_BEFORE(pg, &curmp, listq.queue); |
1011 | UVMHIST_LOG(ubchist, "curmp next %p" , |
1012 | TAILQ_NEXT(&curmp, listq.queue), 0,0,0); |
1013 | } |
1014 | if (yld) { |
1015 | mutex_exit(slock); |
1016 | preempt(); |
1017 | mutex_enter(slock); |
1018 | } else { |
1019 | pg->flags |= PG_WANTED; |
1020 | UVM_UNLOCK_AND_WAIT(pg, slock, 0, "genput" , 0); |
1021 | mutex_enter(slock); |
1022 | } |
1023 | if (by_list) { |
1024 | UVMHIST_LOG(ubchist, "after next %p" , |
1025 | TAILQ_NEXT(&curmp, listq.queue), 0,0,0); |
1026 | pg = TAILQ_NEXT(&curmp, listq.queue); |
1027 | TAILQ_REMOVE(&uobj->memq, &curmp, listq.queue); |
1028 | } else { |
1029 | pg = uvm_pagelookup(uobj, off); |
1030 | } |
1031 | continue; |
1032 | } |
1033 | |
		/*
		 * if we're freeing, remove all mappings of the page now.
		 * if we're cleaning, check whether the page needs to be
		 * cleaned.
		 */
1038 | |
1039 | if (flags & PGO_FREE) { |
1040 | pmap_page_protect(pg, VM_PROT_NONE); |
1041 | } else if (flags & PGO_CLEANIT) { |
1042 | |
1043 | /* |
1044 | * if we still have some hope to pull this vnode off |
1045 | * from the syncer queue, write-protect the page. |
1046 | */ |
1047 | |
1048 | if (cleanall && wasclean && |
1049 | gp->g_dirtygen == dirtygen) { |
1050 | |
1051 | /* |
1052 | * uobj pages get wired only by uvm_fault |
1053 | * where uobj is locked. |
1054 | */ |
1055 | |
1056 | if (pg->wire_count == 0) { |
1057 | pmap_page_protect(pg, |
1058 | VM_PROT_READ|VM_PROT_EXECUTE); |
1059 | } else { |
1060 | cleanall = false; |
1061 | } |
1062 | } |
1063 | } |
1064 | |
1065 | if (flags & PGO_CLEANIT) { |
1066 | needs_clean = pmap_clear_modify(pg) || |
1067 | (pg->flags & PG_CLEAN) == 0; |
1068 | pg->flags |= PG_CLEAN; |
1069 | } else { |
1070 | needs_clean = false; |
1071 | } |
1072 | |
1073 | /* |
1074 | * if we're cleaning, build a cluster. |
1075 | * the cluster will consist of pages which are currently dirty, |
1076 | * but they will be returned to us marked clean. |
1077 | * if not cleaning, just operate on the one page. |
1078 | */ |
1079 | |
1080 | if (needs_clean) { |
1081 | KDASSERT((vp->v_iflag & VI_ONWORKLST)); |
1082 | wasclean = false; |
1083 | memset(pgs, 0, sizeof(pgs)); |
1084 | pg->flags |= PG_BUSY; |
1085 | UVM_PAGE_OWN(pg, "genfs_putpages" ); |
1086 | |
1087 | /* |
1088 | * first look backward. |
1089 | */ |
1090 | |
1091 | npages = MIN(MAXPAGES >> 1, off >> PAGE_SHIFT); |
1092 | nback = npages; |
1093 | uvn_findpages(uobj, off - PAGE_SIZE, &nback, &pgs[0], |
1094 | UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD); |
1095 | if (nback) { |
1096 | memmove(&pgs[0], &pgs[npages - nback], |
1097 | nback * sizeof(pgs[0])); |
1098 | if (npages - nback < nback) |
1099 | memset(&pgs[nback], 0, |
1100 | (npages - nback) * sizeof(pgs[0])); |
1101 | else |
1102 | memset(&pgs[npages - nback], 0, |
1103 | nback * sizeof(pgs[0])); |
1104 | } |
1105 | |
1106 | /* |
1107 | * then plug in our page of interest. |
1108 | */ |
1109 | |
1110 | pgs[nback] = pg; |
1111 | |
1112 | /* |
1113 | * then look forward to fill in the remaining space in |
1114 | * the array of pages. |
1115 | */ |
1116 | |
1117 | npages = MAXPAGES - nback - 1; |
1118 | uvn_findpages(uobj, off + PAGE_SIZE, &npages, |
1119 | &pgs[nback + 1], |
1120 | UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY); |
1121 | npages += nback + 1; |
1122 | } else { |
1123 | pgs[0] = pg; |
1124 | npages = 1; |
1125 | nback = 0; |
1126 | } |
1127 | |
1128 | /* |
1129 | * apply FREE or DEACTIVATE options if requested. |
1130 | */ |
1131 | |
1132 | if (flags & (PGO_DEACTIVATE|PGO_FREE)) { |
1133 | mutex_enter(&uvm_pageqlock); |
1134 | } |
1135 | for (i = 0; i < npages; i++) { |
1136 | tpg = pgs[i]; |
1137 | KASSERT(tpg->uobject == uobj); |
1138 | if (by_list && tpg == TAILQ_NEXT(pg, listq.queue)) |
1139 | pg = tpg; |
1140 | if (tpg->offset < startoff || tpg->offset >= endoff) |
1141 | continue; |
1142 | if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) { |
1143 | uvm_pagedeactivate(tpg); |
1144 | } else if (flags & PGO_FREE) { |
1145 | pmap_page_protect(tpg, VM_PROT_NONE); |
1146 | if (tpg->flags & PG_BUSY) { |
1147 | tpg->flags |= freeflag; |
1148 | if (pagedaemon) { |
1149 | uvm_pageout_start(1); |
1150 | uvm_pagedequeue(tpg); |
1151 | } |
1152 | } else { |
1153 | |
1154 | /* |
1155 | * ``page is not busy'' |
1156 | * implies that npages is 1 |
1157 | * and needs_clean is false. |
1158 | */ |
1159 | |
1160 | nextpg = TAILQ_NEXT(tpg, listq.queue); |
1161 | uvm_pagefree(tpg); |
1162 | if (pagedaemon) |
1163 | uvmexp.pdfreed++; |
1164 | } |
1165 | } |
1166 | } |
1167 | if (flags & (PGO_DEACTIVATE|PGO_FREE)) { |
1168 | mutex_exit(&uvm_pageqlock); |
1169 | } |
1170 | if (needs_clean) { |
1171 | modified = true; |
1172 | |
1173 | /* |
1174 | * start the i/o. if we're traversing by list, |
1175 | * keep our place in the list with a marker page. |
1176 | */ |
1177 | |
1178 | if (by_list) { |
1179 | TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp, |
1180 | listq.queue); |
1181 | } |
1182 | mutex_exit(slock); |
1183 | error = GOP_WRITE(vp, pgs, npages, flags); |
1184 | mutex_enter(slock); |
1185 | if (by_list) { |
1186 | pg = TAILQ_NEXT(&curmp, listq.queue); |
1187 | TAILQ_REMOVE(&uobj->memq, &curmp, listq.queue); |
1188 | } |
1189 | if (error) { |
1190 | break; |
1191 | } |
1192 | if (by_list) { |
1193 | continue; |
1194 | } |
1195 | } |
1196 | |
1197 | /* |
1198 | * find the next page and continue if there was no error. |
1199 | */ |
1200 | |
1201 | if (by_list) { |
1202 | if (nextpg) { |
1203 | pg = nextpg; |
1204 | nextpg = NULL; |
1205 | } else { |
1206 | pg = TAILQ_NEXT(pg, listq.queue); |
1207 | } |
1208 | } else { |
1209 | off += (npages - nback) << PAGE_SHIFT; |
1210 | if (off < endoff) { |
1211 | pg = uvm_pagelookup(uobj, off); |
1212 | } |
1213 | } |
1214 | } |
1215 | if (by_list) { |
1216 | TAILQ_REMOVE(&uobj->memq, &endmp, listq.queue); |
1217 | } |
1218 | |
1219 | if (modified && (vp->v_iflag & VI_WRMAPDIRTY) != 0 && |
1220 | (vp->v_type != VBLK || |
1221 | (vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) { |
1222 | GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED); |
1223 | } |
1224 | |
1225 | /* |
1226 | * if we're cleaning and there was nothing to clean, |
1227 | * take us off the syncer list. if we started any i/o |
1228 | * and we're doing sync i/o, wait for all writes to finish. |
1229 | */ |
1230 | |
1231 | if (cleanall && wasclean && gp->g_dirtygen == dirtygen && |
1232 | (vp->v_iflag & VI_ONWORKLST) != 0) { |
1233 | #if defined(DEBUG) |
1234 | TAILQ_FOREACH(pg, &uobj->memq, listq.queue) { |
1235 | if ((pg->flags & (PG_FAKE | PG_MARKER)) != 0) { |
1236 | continue; |
1237 | } |
1238 | if ((pg->flags & PG_CLEAN) == 0) { |
1239 | printf("%s: %p: !CLEAN\n" , __func__, pg); |
1240 | } |
1241 | if (pmap_is_modified(pg)) { |
1242 | printf("%s: %p: modified\n" , __func__, pg); |
1243 | } |
1244 | } |
1245 | #endif /* defined(DEBUG) */ |
1246 | vp->v_iflag &= ~VI_WRMAPDIRTY; |
1247 | if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) |
1248 | vn_syncer_remove_from_worklist(vp); |
1249 | } |
1250 | |
1251 | #if !defined(DEBUG) |
1252 | skip_scan: |
1253 | #endif /* !defined(DEBUG) */ |
1254 | |
1255 | /* Wait for output to complete. */ |
1256 | if (!wasclean && !async && vp->v_numoutput != 0) { |
1257 | while (vp->v_numoutput != 0) |
1258 | cv_wait(&vp->v_cv, slock); |
1259 | } |
1260 | onworklst = (vp->v_iflag & VI_ONWORKLST) != 0; |
1261 | mutex_exit(slock); |
1262 | |
1263 | if ((flags & PGO_RECLAIM) != 0 && onworklst) { |
		/*
		 * in the case of PGO_RECLAIM, make sure the vnode ends up
		 * clean.  retrying is not a big deal because, in many cases,
		 * uobj->uo_npages is already 0 here.
		 */
1269 | mutex_enter(slock); |
1270 | goto retry; |
1271 | } |
1272 | |
1273 | if (has_trans) { |
1274 | if (need_wapbl) |
1275 | WAPBL_END(vp->v_mount); |
1276 | fstrans_done(vp->v_mount); |
1277 | } |
1278 | |
1279 | return (error); |
1280 | } |
1281 | |
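/*
 * genfs_gop_write: map the given pages into kernel virtual address space
 * and write them to backing store via genfs_do_io().
 */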
1282 | int |
1283 | genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) |
1284 | { |
1285 | off_t off; |
1286 | vaddr_t kva; |
1287 | size_t len; |
1288 | int error; |
1289 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
1290 | |
1291 | UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x" , |
1292 | vp, pgs, npages, flags); |
1293 | |
1294 | off = pgs[0]->offset; |
1295 | kva = uvm_pagermapin(pgs, npages, |
1296 | UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); |
1297 | len = npages << PAGE_SHIFT; |
1298 | |
1299 | error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, |
1300 | uvm_aio_biodone); |
1301 | |
1302 | return error; |
1303 | } |
1304 | |
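/*
 * genfs_gop_write_rwmap: like genfs_gop_write(), except that the pages are
 * mapped with UVMPAGER_MAPIN_READ (a kernel mapping through which the page
 * contents may be modified) rather than UVMPAGER_MAPIN_WRITE, presumably
 * for file systems that need to touch the data while the write is in
 * flight.
 */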
1305 | int |
1306 | genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages, int flags) |
1307 | { |
1308 | off_t off; |
1309 | vaddr_t kva; |
1310 | size_t len; |
1311 | int error; |
1312 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
1313 | |
1314 | UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x" , |
1315 | vp, pgs, npages, flags); |
1316 | |
1317 | off = pgs[0]->offset; |
1318 | kva = uvm_pagermapin(pgs, npages, |
1319 | UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK); |
1320 | len = npages << PAGE_SHIFT; |
1321 | |
1322 | error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, |
1323 | uvm_aio_biodone); |
1324 | |
1325 | return error; |
1326 | } |
1327 | |
1328 | /* |
1329 | * Backend routine for doing I/O to vnode pages. Pages are already locked |
1330 | * and mapped into kernel memory. Here we just look up the underlying |
1331 | * device block addresses and call the strategy routine. |
1332 | */ |
1333 | |
1334 | static int |
1335 | genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags, |
1336 | enum uio_rw rw, void (*iodone)(struct buf *)) |
1337 | { |
1338 | int s, error; |
1339 | int fs_bshift, dev_bshift; |
1340 | off_t eof, offset, startoffset; |
1341 | size_t bytes, iobytes, skipbytes; |
1342 | struct buf *mbp, *bp; |
1343 | const bool async = (flags & PGO_SYNCIO) == 0; |
1344 | const bool lazy = (flags & PGO_LAZY) == 0; |
1345 | const bool iowrite = rw == UIO_WRITE; |
1346 | const int brw = iowrite ? B_WRITE : B_READ; |
1347 | UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); |
1348 | |
1349 | UVMHIST_LOG(ubchist, "vp %p kva %p len 0x%x flags 0x%x" , |
1350 | vp, kva, len, flags); |
1351 | |
1352 | KASSERT(vp->v_size <= vp->v_writesize); |
1353 | GOP_SIZE(vp, vp->v_writesize, &eof, 0); |
1354 | if (vp->v_type != VBLK) { |
1355 | fs_bshift = vp->v_mount->mnt_fs_bshift; |
1356 | dev_bshift = vp->v_mount->mnt_dev_bshift; |
1357 | } else { |
1358 | fs_bshift = DEV_BSHIFT; |
1359 | dev_bshift = DEV_BSHIFT; |
1360 | } |
1361 | error = 0; |
1362 | startoffset = off; |
1363 | bytes = MIN(len, eof - startoffset); |
1364 | skipbytes = 0; |
1365 | KASSERT(bytes != 0); |
1366 | |
1367 | if (iowrite) { |
1368 | mutex_enter(vp->v_interlock); |
1369 | vp->v_numoutput += 2; |
1370 | mutex_exit(vp->v_interlock); |
1371 | } |
1372 | mbp = getiobuf(vp, true); |
1373 | UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x" , |
1374 | vp, mbp, vp->v_numoutput, bytes); |
1375 | mbp->b_bufsize = len; |
1376 | mbp->b_data = (void *)kva; |
1377 | mbp->b_resid = mbp->b_bcount = bytes; |
1378 | mbp->b_cflags = BC_BUSY | BC_AGE; |
1379 | if (async) { |
1380 | mbp->b_flags = brw | B_ASYNC; |
1381 | mbp->b_iodone = iodone; |
1382 | } else { |
1383 | mbp->b_flags = brw; |
1384 | mbp->b_iodone = NULL; |
1385 | } |
1386 | if (curlwp == uvm.pagedaemon_lwp) |
1387 | BIO_SETPRIO(mbp, BPRIO_TIMELIMITED); |
1388 | else if (async || lazy) |
1389 | BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL); |
1390 | else |
1391 | BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL); |
1392 | |
1393 | bp = NULL; |
1394 | for (offset = startoffset; |
1395 | bytes > 0; |
1396 | offset += iobytes, bytes -= iobytes) { |
1397 | int run; |
1398 | daddr_t lbn, blkno; |
1399 | struct vnode *devvp; |
1400 | |
1401 | /* |
1402 | * bmap the file to find out the blkno to read from and |
1403 | * how much we can read in one i/o. if bmap returns an error, |
1404 | * skip the rest of the top-level i/o. |
1405 | */ |
1406 | |
1407 | lbn = offset >> fs_bshift; |
1408 | error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run); |
1409 | if (error) { |
1410 | UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n" , |
1411 | lbn,error,0,0); |
1412 | skipbytes += bytes; |
1413 | bytes = 0; |
1414 | goto loopdone; |
1415 | } |
1416 | |
		/*
		 * see how much of this request can be transferred in
		 * one i/o given the block mapping we just got.
		 */
1422 | |
1423 | iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, |
1424 | bytes); |
1425 | |
		/*
		 * if this block isn't allocated, skip it (zeroing the
		 * corresponding data first when reading).
		 */
1431 | |
1432 | if (blkno == (daddr_t)-1) { |
1433 | if (!iowrite) { |
1434 | memset((char *)kva + (offset - startoffset), 0, |
1435 | iobytes); |
1436 | } |
1437 | skipbytes += iobytes; |
1438 | continue; |
1439 | } |
1440 | |
1441 | /* |
1442 | * allocate a sub-buf for this piece of the i/o |
1443 | * (or just use mbp if there's only 1 piece), |
1444 | * and start it going. |
1445 | */ |
1446 | |
1447 | if (offset == startoffset && iobytes == bytes) { |
1448 | bp = mbp; |
1449 | } else { |
1450 | UVMHIST_LOG(ubchist, "vp %p bp %p num now %d" , |
1451 | vp, bp, vp->v_numoutput, 0); |
1452 | bp = getiobuf(vp, true); |
1453 | nestiobuf_setup(mbp, bp, offset - startoffset, iobytes); |
1454 | } |
1455 | bp->b_lblkno = 0; |
1456 | |
1457 | /* adjust physical blkno for partial blocks */ |
1458 | bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> |
1459 | dev_bshift); |
1460 | |
1461 | UVMHIST_LOG(ubchist, |
1462 | "bp %p offset 0x%x bcount 0x%x blkno 0x%x" , |
1463 | bp, offset, bp->b_bcount, bp->b_blkno); |
1464 | |
1465 | VOP_STRATEGY(devvp, bp); |
1466 | } |
1467 | |
1468 | loopdone: |
1469 | if (skipbytes) { |
1470 | UVMHIST_LOG(ubchist, "skipbytes %d" , skipbytes, 0,0,0); |
1471 | } |
1472 | nestiobuf_done(mbp, skipbytes, error); |
1473 | if (async) { |
1474 | UVMHIST_LOG(ubchist, "returning 0 (async)" , 0,0,0,0); |
1475 | return (0); |
1476 | } |
1477 | UVMHIST_LOG(ubchist, "waiting for mbp %p" , mbp,0,0,0); |
1478 | error = biowait(mbp); |
1479 | s = splbio(); |
1480 | (*iodone)(mbp); |
1481 | splx(s); |
1482 | UVMHIST_LOG(ubchist, "returning, error %d" , error,0,0,0); |
1483 | return (error); |
1484 | } |
1485 | |
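/*
 * genfs_compat_getpages: getpages for file systems without a bmap/strategy
 * based backend: any page still marked PG_FAKE is filled in with VOP_READ(),
 * one page at a time.
 */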
1486 | int |
1487 | genfs_compat_getpages(void *v) |
1488 | { |
1489 | struct vop_getpages_args /* { |
1490 | struct vnode *a_vp; |
1491 | voff_t a_offset; |
1492 | struct vm_page **a_m; |
1493 | int *a_count; |
1494 | int a_centeridx; |
1495 | vm_prot_t a_access_type; |
1496 | int a_advice; |
1497 | int a_flags; |
1498 | } */ *ap = v; |
1499 | |
1500 | off_t origoffset; |
1501 | struct vnode *vp = ap->a_vp; |
1502 | struct uvm_object *uobj = &vp->v_uobj; |
1503 | struct vm_page *pg, **pgs; |
1504 | vaddr_t kva; |
1505 | int i, error, orignpages, npages; |
1506 | struct iovec iov; |
1507 | struct uio uio; |
1508 | kauth_cred_t cred = curlwp->l_cred; |
1509 | const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0; |
1510 | |
1511 | error = 0; |
1512 | origoffset = ap->a_offset; |
1513 | orignpages = *ap->a_count; |
1514 | pgs = ap->a_m; |
1515 | |
1516 | if (ap->a_flags & PGO_LOCKED) { |
1517 | uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, |
1518 | UFP_NOWAIT|UFP_NOALLOC| (memwrite ? UFP_NORDONLY : 0)); |
1519 | |
1520 | error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0; |
1521 | if (error == 0 && memwrite) { |
1522 | genfs_markdirty(vp); |
1523 | } |
1524 | return error; |
1525 | } |
1526 | if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) { |
1527 | mutex_exit(uobj->vmobjlock); |
1528 | return EINVAL; |
1529 | } |
1530 | if ((ap->a_flags & PGO_SYNCIO) == 0) { |
1531 | mutex_exit(uobj->vmobjlock); |
1532 | return 0; |
1533 | } |
1534 | npages = orignpages; |
1535 | uvn_findpages(uobj, origoffset, &npages, pgs, UFP_ALL); |
1536 | mutex_exit(uobj->vmobjlock); |
1537 | kva = uvm_pagermapin(pgs, npages, |
1538 | UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK); |
1539 | for (i = 0; i < npages; i++) { |
1540 | pg = pgs[i]; |
1541 | if ((pg->flags & PG_FAKE) == 0) { |
1542 | continue; |
1543 | } |
1544 | iov.iov_base = (char *)kva + (i << PAGE_SHIFT); |
1545 | iov.iov_len = PAGE_SIZE; |
1546 | uio.uio_iov = &iov; |
1547 | uio.uio_iovcnt = 1; |
1548 | uio.uio_offset = origoffset + (i << PAGE_SHIFT); |
1549 | uio.uio_rw = UIO_READ; |
1550 | uio.uio_resid = PAGE_SIZE; |
1551 | UIO_SETUP_SYSSPACE(&uio); |
1552 | /* XXX vn_lock */ |
1553 | error = VOP_READ(vp, &uio, 0, cred); |
1554 | if (error) { |
1555 | break; |
1556 | } |
1557 | if (uio.uio_resid) { |
1558 | memset(iov.iov_base, 0, uio.uio_resid); |
1559 | } |
1560 | } |
1561 | uvm_pagermapout(kva, npages); |
1562 | mutex_enter(uobj->vmobjlock); |
1563 | mutex_enter(&uvm_pageqlock); |
1564 | for (i = 0; i < npages; i++) { |
1565 | pg = pgs[i]; |
1566 | if (error && (pg->flags & PG_FAKE) != 0) { |
1567 | pg->flags |= PG_RELEASED; |
1568 | } else { |
1569 | pmap_clear_modify(pg); |
1570 | uvm_pageactivate(pg); |
1571 | } |
1572 | } |
1573 | if (error) { |
1574 | uvm_page_unbusy(pgs, npages); |
1575 | } |
1576 | mutex_exit(&uvm_pageqlock); |
1577 | if (error == 0 && memwrite) { |
1578 | genfs_markdirty(vp); |
1579 | } |
1580 | mutex_exit(uobj->vmobjlock); |
1581 | return error; |
1582 | } |
1583 | |
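/*
 * genfs_compat_gop_write: the matching putpages backend: write the pages
 * with VOP_WRITE() and complete them through a fake buf handed to
 * uvm_aio_aiodone().
 */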
1584 | int |
1585 | genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, |
1586 | int flags) |
1587 | { |
1588 | off_t offset; |
1589 | struct iovec iov; |
1590 | struct uio uio; |
1591 | kauth_cred_t cred = curlwp->l_cred; |
1592 | struct buf *bp; |
1593 | vaddr_t kva; |
1594 | int error; |
1595 | |
1596 | offset = pgs[0]->offset; |
1597 | kva = uvm_pagermapin(pgs, npages, |
1598 | UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); |
1599 | |
1600 | iov.iov_base = (void *)kva; |
1601 | iov.iov_len = npages << PAGE_SHIFT; |
1602 | uio.uio_iov = &iov; |
1603 | uio.uio_iovcnt = 1; |
1604 | uio.uio_offset = offset; |
1605 | uio.uio_rw = UIO_WRITE; |
1606 | uio.uio_resid = npages << PAGE_SHIFT; |
1607 | UIO_SETUP_SYSSPACE(&uio); |
1608 | /* XXX vn_lock */ |
1609 | error = VOP_WRITE(vp, &uio, 0, cred); |
1610 | |
1611 | mutex_enter(vp->v_interlock); |
1612 | vp->v_numoutput++; |
1613 | mutex_exit(vp->v_interlock); |
1614 | |
1615 | bp = getiobuf(vp, true); |
1616 | bp->b_cflags = BC_BUSY | BC_AGE; |
1617 | bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift; |
1618 | bp->b_data = (char *)kva; |
1619 | bp->b_bcount = npages << PAGE_SHIFT; |
1620 | bp->b_bufsize = npages << PAGE_SHIFT; |
1621 | bp->b_resid = 0; |
1622 | bp->b_error = error; |
1623 | uvm_aio_aiodone(bp); |
1624 | return (error); |
1625 | } |
1626 | |
1627 | /* |
1628 | * Process a uio using direct I/O. If we reach a part of the request |
1629 | * which cannot be processed in this fashion for some reason, just return. |
1630 | * The caller must handle some additional part of the request using |
1631 | * buffered I/O before trying direct I/O again. |
1632 | */ |
1633 | |
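/*
 * A minimal sketch of the expected call site (hypothetical, for
 * illustration only): the file system's read/write vnode op tries direct
 * I/O first and then finishes whatever is left of the uio through its
 * buffered path, e.g.
 *
 *        if ((ioflag & IO_DIRECT) != 0)
 *                genfs_directio(vp, uio, ioflag);
 *        if (uio->uio_resid == 0)
 *                return 0;
 *        ... continue with the buffered path ...
 */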
1634 | void |
1635 | genfs_directio(struct vnode *vp, struct uio *uio, int ioflag) |
1636 | { |
1637 | struct vmspace *vs; |
1638 | struct iovec *iov; |
1639 | vaddr_t va; |
1640 | size_t len; |
1641 | const int mask = DEV_BSIZE - 1; |
1642 | int error; |
1643 | bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl && |
1644 | (ioflag & IO_JOURNALLOCKED) == 0); |
1645 | |
1646 | /* |
1647 | * We only support direct I/O to user space for now. |
1648 | */ |
1649 | |
1650 | if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) { |
1651 | return; |
1652 | } |
1653 | |
1654 | /* |
1655 | * If the vnode is mapped, we would need to get the getpages lock |
1656 | * to stabilize the bmap, but then we would get into trouble while |
1657 | * locking the pages if the pages belong to this same vnode (or a |
1658 | * multi-vnode cascade to the same effect). Just fall back to |
1659 | * buffered I/O if the vnode is mapped to avoid this mess. |
1660 | */ |
1661 | |
1662 | if (vp->v_vflag & VV_MAPPED) { |
1663 | return; |
1664 | } |
1665 | |
1666 | if (need_wapbl) { |
1667 | error = WAPBL_BEGIN(vp->v_mount); |
1668 | if (error) |
1669 | return; |
1670 | } |
1671 | |
1672 | /* |
1673 | * Do as much of the uio as possible with direct I/O. |
1674 | */ |
1675 | |
1676 | vs = uio->uio_vmspace; |
1677 | while (uio->uio_resid) { |
1678 | iov = uio->uio_iov; |
1679 | if (iov->iov_len == 0) { |
1680 | uio->uio_iov++; |
1681 | uio->uio_iovcnt--; |
1682 | continue; |
1683 | } |
1684 | va = (vaddr_t)iov->iov_base; |
1685 | len = MIN(iov->iov_len, genfs_maxdio); |
1686 | len &= ~mask; |
1687 | |
1688 | /* |
1689 | * If the next chunk is smaller than DEV_BSIZE or extends past |
1690 | * the current EOF, then fall back to buffered I/O. |
1691 | */ |
1692 | |
1693 | if (len == 0 || uio->uio_offset + len > vp->v_size) { |
1694 | break; |
1695 | } |
1696 | |
1697 | /* |
1698 | * Check alignment. The file offset must be at least |
1699 | * sector-aligned. The exact constraint on memory alignment |
1700 | * is very hardware-dependent, but requiring sector-aligned |
1701 | * addresses there too is safe. |
1702 | */ |
1703 | |
1704 | if (uio->uio_offset & mask || va & mask) { |
1705 | break; |
1706 | } |
1707 | error = genfs_do_directio(vs, va, len, vp, uio->uio_offset, |
1708 | uio->uio_rw); |
1709 | if (error) { |
1710 | break; |
1711 | } |
1712 | iov->iov_base = (char *)iov->iov_base + len; |
1713 | iov->iov_len -= len; |
1714 | uio->uio_offset += len; |
1715 | uio->uio_resid -= len; |
1716 | } |
1717 | |
1718 | if (need_wapbl) |
1719 | WAPBL_END(vp->v_mount); |
1720 | } |
1721 | |
1722 | /* |
1723 | * Iodone routine for direct I/O. We don't do much here since the request is |
1724 | * always synchronous, so the caller will do most of the work after biowait(). |
1725 | */ |
1726 | |
1727 | static void |
1728 | genfs_dio_iodone(struct buf *bp) |
1729 | { |
1730 | |
1731 | KASSERT((bp->b_flags & B_ASYNC) == 0); |
1732 | if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) { |
1733 | mutex_enter(bp->b_objlock); |
1734 | vwakeup(bp); |
1735 | mutex_exit(bp->b_objlock); |
1736 | } |
1737 | putiobuf(bp); |
1738 | } |
1739 | |
1740 | /* |
1741 | * Process one chunk of a direct I/O request. |
1742 | */ |
1743 | |
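/*
 * For example (assuming 4 KB pages): a 3000-byte chunk starting at user
 * address 0x201800 needs klen = round_page(0x201800 + 3000) -
 * trunc_page(0x201800) = 2 pages of kernel VA, and the transfer starts
 * at offset koff = 0x800 into that mapping.
 */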
1744 | static int |
1745 | genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp, |
1746 | off_t off, enum uio_rw rw) |
1747 | { |
1748 | struct vm_map *map; |
1749 | struct pmap *upm, *kpm __unused; |
1750 | size_t klen = round_page(uva + len) - trunc_page(uva); |
1751 | off_t spoff, epoff; |
1752 | vaddr_t kva, puva; |
1753 | paddr_t pa; |
1754 | vm_prot_t prot; |
1755 | int error, rv __diagused, poff, koff; |
1756 | const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED | |
1757 | (rw == UIO_WRITE ? PGO_FREE : 0); |
1758 | |
1759 | /* |
1760 | * For writes, verify that this range of the file already has fully |
1761 | * allocated backing store. If there are any holes, just punt and |
1762 | * make the caller take the buffered write path. |
1763 | */ |
1764 | |
1765 | if (rw == UIO_WRITE) { |
1766 | daddr_t lbn, elbn, blkno; |
1767 | int bsize, bshift, run; |
1768 | |
1769 | bshift = vp->v_mount->mnt_fs_bshift; |
1770 | bsize = 1 << bshift; |
1771 | lbn = off >> bshift; |
1772 | elbn = (off + len + bsize - 1) >> bshift; |
1773 | while (lbn < elbn) { |
1774 | error = VOP_BMAP(vp, lbn, NULL, &blkno, &run); |
1775 | if (error) { |
1776 | return error; |
1777 | } |
1778 | if (blkno == (daddr_t)-1) { |
1779 | return ENOSPC; |
1780 | } |
1781 | lbn += 1 + run; |
1782 | } |
1783 | } |
1784 | |
1785 | /* |
1786 | * Flush any cached pages for parts of the file that we're about to |
1787 | * access. If we're writing, invalidate pages as well. |
1788 | */ |
1789 | |
1790 | spoff = trunc_page(off); |
1791 | epoff = round_page(off + len); |
1792 | mutex_enter(vp->v_interlock); |
1793 | error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags); |
1794 | if (error) { |
1795 | return error; |
1796 | } |
1797 | |
1798 | /* |
1799 | * Wire the user pages and remap them into kernel memory. |
1800 | */ |
1801 | |
1802 | prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ; |
1803 | error = uvm_vslock(vs, (void *)uva, len, prot); |
1804 | if (error) { |
1805 | return error; |
1806 | } |
1807 | |
1808 | map = &vs->vm_map; |
1809 | upm = vm_map_pmap(map); |
1810 | kpm = vm_map_pmap(kernel_map); |
1811 | puva = trunc_page(uva); |
1812 | kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask, |
1813 | UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH); |
1814 | for (poff = 0; poff < klen; poff += PAGE_SIZE) { |
1815 | rv = pmap_extract(upm, puva + poff, &pa); |
1816 | KASSERT(rv); |
1817 | pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED); |
1818 | } |
1819 | pmap_update(kpm); |
1820 | |
1821 | /* |
1822 | * Do the I/O. |
1823 | */ |
1824 | |
1825 | koff = uva - trunc_page(uva); |
1826 | error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw, |
1827 | genfs_dio_iodone); |
1828 | |
1829 | /* |
1830 | * Tear down the kernel mapping. |
1831 | */ |
1832 | |
1833 | pmap_kremove(kva, klen); |
1834 | pmap_update(kpm); |
1835 | uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY); |
1836 | |
1837 | /* |
1838 | * Unwire the user pages. |
1839 | */ |
1840 | |
1841 | uvm_vsunlock(vs, (void *)uva, len); |
1842 | return error; |
1843 | } |
1844 | |