1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/vdev_disk.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/zio.h>
  34 #include <sys/sunldi.h>
  35 
  36 /*
  37  * Virtual device vector for disks.
  38  */
  39 
  40 extern ldi_ident_t zfs_li;
  41 
/*
 * Per-I/O wrapper pairing the buf_t handed to ldi_strategy() with the zio
 * that issued it.  vdb_buf must remain the first member: the completion
 * handler vdev_disk_io_intr() receives a buf_t pointer and casts it straight
 * back to a vdev_disk_buf_t pointer.
 */
typedef struct vdev_disk_buf {
        buf_t   vdb_buf;        /* buffer submitted to the LDI */
        zio_t   *vdb_io;        /* zio to complete when the buffer is done */
} vdev_disk_buf_t;
  46 
  47 static int
  48 vdev_disk_open_common(vdev_t *vd)
  49 {
  50         vdev_disk_t *dvd;
  51         dev_t dev;
  52         int error;
  53 
  54         /*
  55          * We must have a pathname, and it must be absolute.
  56          */
  57         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
  58                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  59                 return (EINVAL);
  60         }
  61 
  62         dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
  63 
  64         /*
  65          * When opening a disk device, we want to preserve the user's original
  66          * intent.  We always want to open the device by the path the user gave
  67          * us, even if it is one of multiple paths to the save device.  But we
  68          * also want to be able to survive disks being removed/recabled.
  69          * Therefore the sequence of opening devices is:
  70          *
  71          * 1. Try opening the device by path.  For legacy pools without the
  72          *    'whole_disk' property, attempt to fix the path by appending 's0'.
  73          *
  74          * 2. If the devid of the device matches the stored value, return
  75          *    success.
  76          *
  77          * 3. Otherwise, the device may have moved.  Try opening the device
  78          *    by the devid instead.
  79          *
  80          */
  81         if (vd->vdev_devid != NULL) {
  82                 if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
  83                     &dvd->vd_minor) != 0) {
  84                         vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  85                         return (EINVAL);
  86                 }
  87         }
  88 
  89         error = EINVAL;         /* presume failure */
  90 
  91         if (vd->vdev_path != NULL) {
  92                 ddi_devid_t devid;
  93 
  94                 if (vd->vdev_wholedisk == -1ULL) {
  95                         size_t len = strlen(vd->vdev_path) + 3;
  96                         char *buf = kmem_alloc(len, KM_SLEEP);
  97                         ldi_handle_t lh;
  98 
  99                         (void) snprintf(buf, len, "%ss0", vd->vdev_path);
 100 
 101                         if (ldi_open_by_name(buf, spa_mode, kcred,
 102                             &lh, zfs_li) == 0) {
 103                                 spa_strfree(vd->vdev_path);
 104                                 vd->vdev_path = buf;
 105                                 vd->vdev_wholedisk = 1ULL;
 106                                 (void) ldi_close(lh, spa_mode, kcred);
 107                         } else {
 108                                 kmem_free(buf, len);
 109                         }
 110                 }
 111 
 112                 error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
 113                     &dvd->vd_lh, zfs_li);
 114 
 115                 /*
 116                  * Compare the devid to the stored value.
 117                  */
 118                 if (error == 0 && vd->vdev_devid != NULL &&
 119                     ldi_get_devid(dvd->vd_lh, &devid) == 0) {
 120                         if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
 121                                 error = EINVAL;
 122                                 (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
 123                                 dvd->vd_lh = NULL;
 124                         }
 125                         ddi_devid_free(devid);
 126                 }
 127 
 128                 /*
 129                  * If we succeeded in opening the device, but 'vdev_wholedisk'
 130                  * is not yet set, then this must be a slice.
 131                  */
 132                 if (error == 0 && vd->vdev_wholedisk == -1ULL)
 133                         vd->vdev_wholedisk = 0;
 134         }
 135 
 136         /*
 137          * If we were unable to open by path, or the devid check fails, open by
 138          * devid instead.
 139          */
 140         if (error != 0 && vd->vdev_devid != NULL)
 141                 error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
 142                     spa_mode, kcred, &dvd->vd_lh, zfs_li);
 143 
 144         /*
 145          * If all else fails, then try opening by physical path (if available)
 146          * or the logical path (if we failed due to the devid check).  While not
 147          * as reliable as the devid, this will give us something, and the higher
 148          * level vdev validation will prevent us from opening the wrong device.
 149          */
 150         if (error) {
 151                 if (vd->vdev_physpath != NULL &&
 152                     (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV)
 153                         error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode,
 154                             kcred, &dvd->vd_lh, zfs_li);
 155 
 156                 /*
 157                  * Note that we don't support the legacy auto-wholedisk support
 158                  * as above.  This hasn't been used in a very long time and we
 159                  * don't need to propagate its oddities to this edge condition.
 160                  */
 161                 if (error && vd->vdev_path != NULL)
 162                         error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
 163                             &dvd->vd_lh, zfs_li);
 164         }
 165 
 166         if (error)
 167                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 168 
 169         return (error);
 170 }
 171 
 172 static int
 173 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 174 {
 175         vdev_disk_t *dvd;
 176         struct dk_minfo dkm;
 177         int error;
 178         dev_t dev;
 179         int otyp;
 180 
 181         error = vdev_disk_open_common(vd);
 182         if (error)
 183                 return (error);
 184 
 185         dvd = vd->vdev_tsd;
 186         /*
 187          * Once a device is opened, verify that the physical device path (if
 188          * available) is up to date.
 189          */
 190         if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
 191             ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
 192                 char *physpath, *minorname;
 193 
 194                 physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 195                 minorname = NULL;
 196                 if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
 197                     ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
 198                     (vd->vdev_physpath == NULL ||
 199                     strcmp(vd->vdev_physpath, physpath) != 0)) {
 200                         if (vd->vdev_physpath)
 201                                 spa_strfree(vd->vdev_physpath);
 202                         (void) strlcat(physpath, ":", MAXPATHLEN);
 203                         (void) strlcat(physpath, minorname, MAXPATHLEN);
 204                         vd->vdev_physpath = spa_strdup(physpath);
 205                 }
 206                 if (minorname)
 207                         kmem_free(minorname, strlen(minorname) + 1);
 208                 kmem_free(physpath, MAXPATHLEN);
 209         }
 210 
 211         /*
 212          * Determine the actual size of the device.
 213          */
 214         if (ldi_get_size(dvd->vd_lh, psize) != 0) {
 215                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 216                 return (EINVAL);
 217         }
 218 
 219         /*
 220          * If we own the whole disk, try to enable disk write caching.
 221          * We ignore errors because it's OK if we can't do it.
 222          */
 223         if (vd->vdev_wholedisk == 1) {
 224                 int wce = 1;
 225                 (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
 226                     FKIOCTL, kcred, NULL);
 227         }
 228 
 229         /*
 230          * Determine the device's minimum transfer size.
 231          * If the ioctl isn't supported, assume DEV_BSIZE.
 232          */
 233         if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
 234             FKIOCTL, kcred, NULL) != 0)
 235                 dkm.dki_lbsize = DEV_BSIZE;
 236 
 237         *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
 238 
 239         /*
 240          * Clear the nowritecache bit, so that on a vdev_reopen() we will
 241          * try again.
 242          */
 243         vd->vdev_nowritecache = B_FALSE;
 244 
 245         return (0);
 246 }
 247 
 248 static void
 249 vdev_disk_close(vdev_t *vd)
 250 {
 251         vdev_disk_t *dvd = vd->vdev_tsd;
 252 
 253         if (dvd == NULL)
 254                 return;
 255 
 256         if (dvd->vd_minor != NULL)
 257                 ddi_devid_str_free(dvd->vd_minor);
 258 
 259         if (dvd->vd_devid != NULL)
 260                 ddi_devid_free(dvd->vd_devid);
 261 
 262         if (dvd->vd_lh != NULL)
 263                 (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
 264 
 265         kmem_free(dvd, sizeof (vdev_disk_t));
 266         vd->vdev_tsd = NULL;
 267 }
 268 
 269 static int
 270 vdev_disk_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
 271     int flags)
 272 {
 273         buf_t buf;
 274         int error = 0;
 275         vdev_disk_t *dvd = vd->vdev_tsd;
 276 
 277         if (vd == NULL || dvd == NULL || dvd->vd_lh == NULL)
 278                 return (EINVAL);
 279 
 280         ASSERT(flags & B_READ || flags & B_WRITE);
 281 
 282         bioinit(&buf);
 283         buf.b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
 284         buf.b_bcount = size;
 285         buf.b_un.b_addr = (void *)data;
 286         buf.b_lblkno = lbtodb(offset);
 287         buf.b_bufsize = size;
 288 
 289         error = ldi_strategy(dvd->vd_lh, &buf);
 290         ASSERT(error == 0);
 291         error = biowait(&buf);
 292 
 293         if (zio_injection_enabled && error == 0)
 294                 error = zio_handle_device_injection(vd, EIO);
 295 
 296         return (error);
 297 }
 298 
 299 /*
 300  * Determine if the underlying device is accessible by reading and writing
 301  * to a known location. We must be able to do this during syncing context
 302  * and thus we cannot set the vdev state directly.
 303  */
 304 static int
 305 vdev_disk_probe(vdev_t *vd)
 306 {
 307         uint64_t offset;
 308         vdev_t *nvd;
 309         int l, error = 0, retries = 0;
 310         char *vl_pad;
 311 
 312         if (vd == NULL)
 313                 return (EINVAL);
 314 
 315         /* Hijack the current vdev */
 316         nvd = vd;
 317 
 318         /*
 319          * Pick a random label to rewrite.
 320          */
 321         l = spa_get_random(VDEV_LABELS);
 322         ASSERT(l < VDEV_LABELS);
 323 
 324         offset = vdev_label_offset(vd->vdev_psize, l,
 325             offsetof(vdev_label_t, vl_pad));
 326 
 327         vl_pad = kmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP);
 328 
 329         /*
 330          * Try to read and write to a special location on the
 331          * label. We use the existing vdev initially and only
 332          * try to create and reopen it if we encounter a failure.
 333          */
 334         while ((error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
 335             offset, B_READ)) != 0 && retries == 0) {
 336 
 337                 nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 338                 if (vd->vdev_path)
 339                         nvd->vdev_path = spa_strdup(vd->vdev_path);
 340                 if (vd->vdev_physpath)
 341                         nvd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 342                 if (vd->vdev_devid)
 343                         nvd->vdev_devid = spa_strdup(vd->vdev_devid);
 344                 nvd->vdev_wholedisk = vd->vdev_wholedisk;
 345                 nvd->vdev_guid = vd->vdev_guid;
 346                 retries++;
 347 
 348                 error = vdev_disk_open_common(nvd);
 349                 if (error)
 350                         break;
 351         }
 352 
 353         if (!error) {
 354                 error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
 355                     offset, B_WRITE);
 356         }
 357 
 358         /* Clean up if we allocated a new vdev */
 359         if (retries) {
 360                 vdev_disk_close(nvd);
 361                 if (nvd->vdev_path)
 362                         spa_strfree(nvd->vdev_path);
 363                 if (nvd->vdev_physpath)
 364                         spa_strfree(nvd->vdev_physpath);
 365                 if (nvd->vdev_devid)
 366                         spa_strfree(nvd->vdev_devid);
 367                 kmem_free(nvd, sizeof (vdev_t));
 368         }
 369         kmem_free(vl_pad, VDEV_SKIP_SIZE);
 370 
 371         /* Reset the failing flag */
 372         if (!error)
 373                 vd->vdev_is_failing = B_FALSE;
 374 
 375         return (error);
 376 }
 377 
 378 static void
 379 vdev_disk_io_intr(buf_t *bp)
 380 {
 381         vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
 382         zio_t *zio = vdb->vdb_io;
 383 
 384         if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
 385                 zio->io_error = EIO;
 386 
 387         kmem_free(vdb, sizeof (vdev_disk_buf_t));
 388 
 389         zio_interrupt(zio);
 390 }
 391 
 392 static void
 393 vdev_disk_ioctl_done(void *zio_arg, int error)
 394 {
 395         zio_t *zio = zio_arg;
 396 
 397         zio->io_error = error;
 398 
 399         zio_interrupt(zio);
 400 }
 401 
 402 static int
 403 vdev_disk_io_start(zio_t *zio)
 404 {
 405         vdev_t *vd = zio->io_vd;
 406         vdev_disk_t *dvd = vd->vdev_tsd;
 407         vdev_disk_buf_t *vdb;
 408         buf_t *bp;
 409         int flags, error;
 410 
 411         if (zio->io_type == ZIO_TYPE_IOCTL) {
 412                 zio_vdev_io_bypass(zio);
 413 
 414                 /* XXPOLICY */
 415                 if (!vdev_readable(vd)) {
 416                         zio->io_error = ENXIO;
 417                         return (ZIO_PIPELINE_CONTINUE);
 418                 }
 419 
 420                 switch (zio->io_cmd) {
 421 
 422                 case DKIOCFLUSHWRITECACHE:
 423 
 424                         if (zfs_nocacheflush)
 425                                 break;
 426 
 427                         if (vd->vdev_nowritecache) {
 428                                 zio->io_error = ENOTSUP;
 429                                 break;
 430                         }
 431 
 432                         zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
 433                         zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE;
 434                         zio->io_dk_callback.dkc_cookie = zio;
 435 
 436                         error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
 437                             (uintptr_t)&zio->io_dk_callback,
 438                             FKIOCTL, kcred, NULL);
 439 
 440                         if (error == 0) {
 441                                 /*
 442                                  * The ioctl will be done asychronously,
 443                                  * and will call vdev_disk_ioctl_done()
 444                                  * upon completion.
 445                                  */
 446                                 return (ZIO_PIPELINE_STOP);
 447                         }
 448 
 449                         if (error == ENOTSUP || error == ENOTTY) {
 450                                 /*
 451                                  * If we get ENOTSUP or ENOTTY, we know that
 452                                  * no future attempts will ever succeed.
 453                                  * In this case we set a persistent bit so
 454                                  * that we don't bother with the ioctl in the
 455                                  * future.
 456                                  */
 457                                 vd->vdev_nowritecache = B_TRUE;
 458                         }
 459                         zio->io_error = error;
 460 
 461                         break;
 462 
 463                 default:
 464                         zio->io_error = ENOTSUP;
 465                 }
 466 
 467                 return (ZIO_PIPELINE_CONTINUE);
 468         }
 469 
 470         if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
 471                 return (ZIO_PIPELINE_STOP);
 472 
 473         if ((zio = vdev_queue_io(zio)) == NULL)
 474                 return (ZIO_PIPELINE_STOP);
 475 
 476         if (zio->io_type == ZIO_TYPE_WRITE)
 477                 error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
 478         else
 479                 error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
 480         error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
 481 
 482         if (error) {
 483                 zio->io_error = error;
 484                 zio_interrupt(zio);
 485                 return (ZIO_PIPELINE_STOP);
 486         }
 487 
 488         flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 489         flags |= B_BUSY | B_NOCACHE;
 490         if (zio->io_flags & ZIO_FLAG_FAILFAST)
 491                 flags |= B_FAILFAST;
 492 
 493         vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
 494 
 495         vdb->vdb_io = zio;
 496         bp = &vdb->vdb_buf;
 497 
 498         bioinit(bp);
 499         bp->b_flags = flags;
 500         bp->b_bcount = zio->io_size;
 501         bp->b_un.b_addr = zio->io_data;
 502         bp->b_lblkno = lbtodb(zio->io_offset);
 503         bp->b_bufsize = zio->io_size;
 504         bp->b_iodone = (int (*)())vdev_disk_io_intr;
 505 
 506         error = ldi_strategy(dvd->vd_lh, bp);
 507         /* ldi_strategy() will return non-zero only on programming errors */
 508         ASSERT(error == 0);
 509 
 510         return (ZIO_PIPELINE_STOP);
 511 }
 512 
 513 static int
 514 vdev_disk_io_done(zio_t *zio)
 515 {
 516         vdev_queue_io_done(zio);
 517 
 518         if (zio->io_type == ZIO_TYPE_WRITE)
 519                 vdev_cache_write(zio);
 520 
 521         if (zio_injection_enabled && zio->io_error == 0)
 522                 zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
 523 
 524         /*
 525          * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
 526          * the device has been removed.  If this is the case, then we trigger an
 527          * asynchronous removal of the device. Otherwise, probe the device and
 528          * make sure it's still accessible.
 529          */
 530         if (zio->io_error == EIO) {
 531                 vdev_t *vd = zio->io_vd;
 532                 vdev_disk_t *dvd = vd->vdev_tsd;
 533                 int state;
 534 
 535                 state = DKIO_NONE;
 536                 if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
 537                     FKIOCTL, kcred, NULL) == 0 &&
 538                     state != DKIO_INSERTED) {
 539                         vd->vdev_remove_wanted = B_TRUE;
 540                         spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 541                 } else if (vdev_probe(vd) != 0) {
 542                         ASSERT(vd->vdev_ops->vdev_op_leaf);
 543                         vd->vdev_is_failing = B_TRUE;
 544                 }
 545         }
 546 
 547         return (ZIO_PIPELINE_CONTINUE);
 548 }
 549 
/*
 * Operations vector for disk vdevs.  The initializer is positional, so the
 * order of the entries must match the member order of vdev_ops_t (declared
 * elsewhere).
 */
vdev_ops_t vdev_disk_ops = {
        vdev_disk_open,         /* open the device, report size and ashift */
        vdev_disk_close,        /* release the LDI handle and disk state */
        vdev_disk_probe,        /* read/write a label pad to test access */
        vdev_default_asize,     /* generic helper, defined outside this file */
        vdev_disk_io_start,     /* issue an ioctl/read/write zio */
        vdev_disk_io_done,      /* post-completion handling, EIO reaction */
        NULL,                   /* NOTE(review): unused slot; see vdev_ops_t */
        NULL,                   /* NOTE(review): unused slot; see vdev_ops_t */
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};