1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/vdev_file.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 #include <sys/fs/zfs.h>
34
35 /*
36 * Virtual device vector for files.
37 */
38
39 static int
40 vdev_file_open_common(vdev_t *vd)
41 {
42 vdev_file_t *vf;
43 vnode_t *vp;
44 int error;
45
46 /*
47 * We must have a pathname, and it must be absolute.
48 */
49 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
50 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
51 return (EINVAL);
52 }
53
54 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
55
56 /*
57 * We always open the files from the root of the global zone, even if
58 * we're in a local zone. If the user has gotten to this point, the
59 * administrator has already decided that the pool should be available
60 * to local zone users, so the underlying devices should be as well.
61 */
62 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
63 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
64 spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
65
66 if (error) {
67 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
68 return (error);
69 }
70
71 vf->vf_vnode = vp;
72
73 #ifdef _KERNEL
74 /*
75 * Make sure it's a regular file.
76 */
77 if (vp->v_type != VREG) {
78 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
79 return (ENODEV);
80 }
81 #endif
82
83 return (0);
84 }
85
86 static int
87 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
88 {
89 vdev_file_t *vf;
90 vattr_t vattr;
91 int error;
92
93 if ((error = vdev_file_open_common(vd)) != 0)
94 return (error);
95
96 vf = vd->vdev_tsd;
97
98 /*
99 * Determine the physical size of the file.
100 */
101 vattr.va_mask = AT_SIZE;
102 error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
103 if (error) {
104 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
105 return (error);
106 }
107
108 *psize = vattr.va_size;
109 *ashift = SPA_MINBLOCKSHIFT;
110
111 return (0);
112 }
113
114 static void
115 vdev_file_close(vdev_t *vd)
116 {
117 vdev_file_t *vf = vd->vdev_tsd;
118
119 if (vf == NULL)
120 return;
121
122 if (vf->vf_vnode != NULL) {
123 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
124 (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
125 VN_RELE(vf->vf_vnode);
126 }
127
128 kmem_free(vf, sizeof (vdev_file_t));
129 vd->vdev_tsd = NULL;
130 }
131
132 static int
133 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
134 enum uio_rw rw)
135 {
136 vdev_file_t *vf = vd->vdev_tsd;
137 ssize_t resid;
138 int error = 0;
139
140 if (vd == NULL || vf == NULL || vf->vf_vnode == NULL)
141 return (EINVAL);
142
143 ASSERT(rw == UIO_READ || rw == UIO_WRITE);
144
145 error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE,
146 0, RLIM64_INFINITY, kcred, &resid);
147 if (error || resid != 0)
148 return (EIO);
149 return (0);
150 }
151
152 /*
153 * Determine if the underlying device is accessible by reading and writing
154 * to a known location. We must be able to do this during syncing context
155 * and thus we cannot set the vdev state directly.
156 */
157 static int
158 vdev_file_probe(vdev_t *vd)
159 {
160 vdev_t *nvd;
161 char *vl_boot;
162 uint64_t offset;
163 int l, error = 0, retries = 0;
164
165 if (vd == NULL)
166 return (EINVAL);
167
168 /* Hijack the current vdev */
169 nvd = vd;
170
171 /*
172 * Pick a random label to rewrite.
173 */
174 l = spa_get_random(VDEV_LABELS);
175 ASSERT(l < VDEV_LABELS);
176
177 offset = vdev_label_offset(vd->vdev_psize, l,
178 offsetof(vdev_label_t, vl_boot_header));
179
180 vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);
181
182 while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
183 offset, UIO_READ)) != 0 && retries == 0) {
184
185 /*
186 * If we failed with the vdev that was passed in then
187 * try allocating a new one and try again.
188 */
189 nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
190 if (vd->vdev_path)
191 nvd->vdev_path = spa_strdup(vd->vdev_path);
192 retries++;
193
194 error = vdev_file_open_common(nvd);
195 if (error)
196 break;
197 }
198
199 if ((spa_mode & FWRITE) && !error) {
200 error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
201 offset, UIO_WRITE);
202 }
203
204 if (retries) {
205 vdev_file_close(nvd);
206 if (nvd->vdev_path)
207 spa_strfree(nvd->vdev_path);
208 kmem_free(nvd, sizeof (vdev_t));
209 }
210 kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);
211
212 if (!error)
213 vd->vdev_is_failing = B_FALSE;
214
215 return (error);
216 }
217
218 static int
219 vdev_file_io_start(zio_t *zio)
220 {
221 vdev_t *vd = zio->io_vd;
222 vdev_file_t *vf = vd->vdev_tsd;
223 ssize_t resid;
224 int error;
225
226 if (zio->io_type == ZIO_TYPE_IOCTL) {
227 zio_vdev_io_bypass(zio);
228
229 /* XXPOLICY */
230 if (!vdev_readable(vd)) {
231 zio->io_error = ENXIO;
232 return (ZIO_PIPELINE_CONTINUE);
233 }
234
235 switch (zio->io_cmd) {
236 case DKIOCFLUSHWRITECACHE:
237 zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
238 kcred, NULL);
239 dprintf("fsync(%s) = %d\n", vdev_description(vd),
240 zio->io_error);
241 break;
242 default:
243 zio->io_error = ENOTSUP;
244 }
245
246 return (ZIO_PIPELINE_CONTINUE);
247 }
248
249 /*
250 * In the kernel, don't bother double-caching, but in userland,
251 * we want to test the vdev_cache code.
252 */
253 #ifndef _KERNEL
254 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
255 return (ZIO_PIPELINE_STOP);
256 #endif
257
258 if ((zio = vdev_queue_io(zio)) == NULL)
259 return (ZIO_PIPELINE_STOP);
260
261 /* XXPOLICY */
262 if (zio->io_type == ZIO_TYPE_WRITE)
263 error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
264 else
265 error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
266 error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
267 if (error) {
268 zio->io_error = error;
269 zio_interrupt(zio);
270 return (ZIO_PIPELINE_STOP);
271 }
272
273 zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
274 UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
275 zio->io_size, zio->io_offset, UIO_SYSSPACE,
276 0, RLIM64_INFINITY, kcred, &resid);
277
278 if (resid != 0 && zio->io_error == 0)
279 zio->io_error = ENOSPC;
280
281 zio_interrupt(zio);
282
283 return (ZIO_PIPELINE_STOP);
284 }
285
286 static int
287 vdev_file_io_done(zio_t *zio)
288 {
289 vdev_t *vd = zio->io_vd;
290
291 if (zio_injection_enabled && zio->io_error == 0)
292 zio->io_error = zio_handle_device_injection(vd, EIO);
293
294 /*
295 * If an error has been encountered then attempt to probe the device
296 * to determine if it's still accessible.
297 */
298 if (zio->io_error == EIO && vdev_probe(vd) != 0)
299 vd->vdev_is_failing = B_TRUE;
300
301 vdev_queue_io_done(zio);
302
303 #ifndef _KERNEL
304 if (zio->io_type == ZIO_TYPE_WRITE)
305 vdev_cache_write(zio);
306 #endif
307
308 return (ZIO_PIPELINE_CONTINUE);
309 }
310
311 vdev_ops_t vdev_file_ops = {
312 vdev_file_open,
313 vdev_file_close,
314 vdev_file_probe,
315 vdev_default_asize,
316 vdev_file_io_start,
317 vdev_file_io_done,
318 NULL,
319 NULL,
320 VDEV_TYPE_FILE, /* name of this vdev type */
321 B_TRUE /* leaf vdev */
322 };
323
324 /*
325 * From userland we access disks just like files.
326 */
327 #ifndef _KERNEL
328
329 vdev_ops_t vdev_disk_ops = {
330 vdev_file_open,
331 vdev_file_close,
332 vdev_file_probe,
333 vdev_default_asize,
334 vdev_file_io_start,
335 vdev_file_io_done,
336 NULL,
337 VDEV_TYPE_DISK, /* name of this vdev type */
338 B_TRUE /* leaf vdev */
339 };
340
341 #endif