1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/vdev_file.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/zio.h>
  33 #include <sys/fs/zfs.h>
  34 
  35 /*
  36  * Virtual device vector for files.
  37  */
  38 
  39 static int
  40 vdev_file_open_common(vdev_t *vd)
  41 {
  42         vdev_file_t *vf;
  43         vnode_t *vp;
  44         int error;
  45 
  46         /*
  47          * We must have a pathname, and it must be absolute.
  48          */
  49         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
  50                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  51                 return (EINVAL);
  52         }
  53 
  54         vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
  55 
  56         /*
  57          * We always open the files from the root of the global zone, even if
  58          * we're in a local zone.  If the user has gotten to this point, the
  59          * administrator has already decided that the pool should be available
  60          * to local zone users, so the underlying devices should be as well.
  61          */
  62         ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
  63         error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
  64             spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
  65 
  66         if (error) {
  67                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
  68                 return (error);
  69         }
  70 
  71         vf->vf_vnode = vp;
  72 
  73 #ifdef _KERNEL
  74         /*
  75          * Make sure it's a regular file.
  76          */
  77         if (vp->v_type != VREG) {
  78                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
  79                 return (ENODEV);
  80         }
  81 #endif
  82 
  83         return (0);
  84 }
  85 
  86 static int
  87 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
  88 {
  89         vdev_file_t *vf;
  90         vattr_t vattr;
  91         int error;
  92 
  93         if ((error = vdev_file_open_common(vd)) != 0)
  94                 return (error);
  95 
  96         vf = vd->vdev_tsd;
  97 
  98         /*
  99          * Determine the physical size of the file.
 100          */
 101         vattr.va_mask = AT_SIZE;
 102         error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
 103         if (error) {
 104                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 105                 return (error);
 106         }
 107 
 108         *psize = vattr.va_size;
 109         *ashift = SPA_MINBLOCKSHIFT;
 110 
 111         return (0);
 112 }
 113 
 114 static void
 115 vdev_file_close(vdev_t *vd)
 116 {
 117         vdev_file_t *vf = vd->vdev_tsd;
 118 
 119         if (vf == NULL)
 120                 return;
 121 
 122         if (vf->vf_vnode != NULL) {
 123                 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
 124                 (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
 125                 VN_RELE(vf->vf_vnode);
 126         }
 127 
 128         kmem_free(vf, sizeof (vdev_file_t));
 129         vd->vdev_tsd = NULL;
 130 }
 131 
 132 static int
 133 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
 134     enum uio_rw rw)
 135 {
 136         vdev_file_t *vf = vd->vdev_tsd;
 137         ssize_t resid;
 138         int error = 0;
 139 
 140         if (vd == NULL || vf == NULL || vf->vf_vnode == NULL)
 141                 return (EINVAL);
 142 
 143         ASSERT(rw == UIO_READ || rw ==  UIO_WRITE);
 144 
 145         error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE,
 146             0, RLIM64_INFINITY, kcred, &resid);
 147         if (error || resid != 0)
 148                 return (EIO);
 149         return (0);
 150 }
 151 
 152 /*
 153  * Determine if the underlying device is accessible by reading and writing
 154  * to a known location. We must be able to do this during syncing context
 155  * and thus we cannot set the vdev state directly.
 156  */
 157 static int
 158 vdev_file_probe(vdev_t *vd)
 159 {
 160         vdev_t *nvd;
 161         char *vl_boot;
 162         uint64_t offset;
 163         int l, error = 0, retries = 0;
 164 
 165         if (vd == NULL)
 166                 return (EINVAL);
 167 
 168         /* Hijack the current vdev */
 169         nvd = vd;
 170 
 171         /*
 172          * Pick a random label to rewrite.
 173          */
 174         l = spa_get_random(VDEV_LABELS);
 175         ASSERT(l < VDEV_LABELS);
 176 
 177         offset = vdev_label_offset(vd->vdev_psize, l,
 178             offsetof(vdev_label_t, vl_boot_header));
 179 
 180         vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);
 181 
 182         while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
 183             offset, UIO_READ)) != 0 && retries == 0) {
 184 
 185                 /*
 186                  * If we failed with the vdev that was passed in then
 187                  * try allocating a new one and try again.
 188                  */
 189                 nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 190                 if (vd->vdev_path)
 191                         nvd->vdev_path = spa_strdup(vd->vdev_path);
 192                 retries++;
 193 
 194                 error = vdev_file_open_common(nvd);
 195                 if (error)
 196                         break;
 197         }
 198 
 199         if ((spa_mode & FWRITE) && !error) {
 200                 error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
 201                     offset, UIO_WRITE);
 202         }
 203 
 204         if (retries) {
 205                 vdev_file_close(nvd);
 206                 if (nvd->vdev_path)
 207                         spa_strfree(nvd->vdev_path);
 208                 kmem_free(nvd, sizeof (vdev_t));
 209         }
 210         kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);
 211 
 212         if (!error)
 213                 vd->vdev_is_failing = B_FALSE;
 214 
 215         return (error);
 216 }
 217 
 218 static int
 219 vdev_file_io_start(zio_t *zio)
 220 {
 221         vdev_t *vd = zio->io_vd;
 222         vdev_file_t *vf = vd->vdev_tsd;
 223         ssize_t resid;
 224         int error;
 225 
 226         if (zio->io_type == ZIO_TYPE_IOCTL) {
 227                 zio_vdev_io_bypass(zio);
 228 
 229                 /* XXPOLICY */
 230                 if (!vdev_readable(vd)) {
 231                         zio->io_error = ENXIO;
 232                         return (ZIO_PIPELINE_CONTINUE);
 233                 }
 234 
 235                 switch (zio->io_cmd) {
 236                 case DKIOCFLUSHWRITECACHE:
 237                         zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
 238                             kcred, NULL);
 239                         dprintf("fsync(%s) = %d\n", vdev_description(vd),
 240                             zio->io_error);
 241                         break;
 242                 default:
 243                         zio->io_error = ENOTSUP;
 244                 }
 245 
 246                 return (ZIO_PIPELINE_CONTINUE);
 247         }
 248 
 249         /*
 250          * In the kernel, don't bother double-caching, but in userland,
 251          * we want to test the vdev_cache code.
 252          */
 253 #ifndef _KERNEL
 254         if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
 255                 return (ZIO_PIPELINE_STOP);
 256 #endif
 257 
 258         if ((zio = vdev_queue_io(zio)) == NULL)
 259                 return (ZIO_PIPELINE_STOP);
 260 
 261         /* XXPOLICY */
 262         if (zio->io_type == ZIO_TYPE_WRITE)
 263                 error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
 264         else
 265                 error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
 266         error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
 267         if (error) {
 268                 zio->io_error = error;
 269                 zio_interrupt(zio);
 270                 return (ZIO_PIPELINE_STOP);
 271         }
 272 
 273         zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
 274             UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
 275             zio->io_size, zio->io_offset, UIO_SYSSPACE,
 276             0, RLIM64_INFINITY, kcred, &resid);
 277 
 278         if (resid != 0 && zio->io_error == 0)
 279                 zio->io_error = ENOSPC;
 280 
 281         zio_interrupt(zio);
 282 
 283         return (ZIO_PIPELINE_STOP);
 284 }
 285 
 286 static int
 287 vdev_file_io_done(zio_t *zio)
 288 {
 289         vdev_t *vd = zio->io_vd;
 290 
 291         if (zio_injection_enabled && zio->io_error == 0)
 292                 zio->io_error = zio_handle_device_injection(vd, EIO);
 293 
 294         /*
 295          * If an error has been encountered then attempt to probe the device
 296          * to determine if it's still accessible.
 297          */
 298         if (zio->io_error == EIO && vdev_probe(vd) != 0)
 299                 vd->vdev_is_failing = B_TRUE;
 300 
 301         vdev_queue_io_done(zio);
 302 
 303 #ifndef _KERNEL
 304         if (zio->io_type == ZIO_TYPE_WRITE)
 305                 vdev_cache_write(zio);
 306 #endif
 307 
 308         return (ZIO_PIPELINE_CONTINUE);
 309 }
 310 
 311 vdev_ops_t vdev_file_ops = {
 312         vdev_file_open,
 313         vdev_file_close,
 314         vdev_file_probe,
 315         vdev_default_asize,
 316         vdev_file_io_start,
 317         vdev_file_io_done,
 318         NULL,
 319         VDEV_TYPE_FILE,         /* name of this vdev type */
 320         B_TRUE                  /* leaf vdev */
 321 };
 322 
 323 /*
 324  * From userland we access disks just like files.
 325  */
 326 #ifndef _KERNEL
 327 
 328 vdev_ops_t vdev_disk_ops = {
 329         vdev_file_open,
 330         vdev_file_close,
 331         vdev_file_probe,
 332         vdev_default_asize,
 333         vdev_file_io_start,
 334         vdev_file_io_done,
 335         NULL,
 336         VDEV_TYPE_DISK,         /* name of this vdev type */
 337         B_TRUE                  /* leaf vdev */
 338 };
 339 
 340 #endif