1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 /*
  29  * lofi (loopback file) driver - allows you to attach a file to a device,
  30  * which can then be accessed through that device. The simple model is that
  31  * you tell lofi to open a file, and then use the block device you get as
  32  * you would any block device. lofi translates access to the block device
  33  * into I/O on the underlying file. This is mostly useful for
  34  * mounting images of filesystems.
  35  *
  36  * lofi is controlled through /dev/lofictl - this is the only device exported
  37  * during attach, and is minor number 0. lofiadm communicates with lofi through
  38  * ioctls on this device. When a file is attached to lofi, block and character
  39  * devices are exported in /dev/lofi and /dev/rlofi. Currently, these devices
  40  * are identified by their minor number, and the minor number is also used
  41  * as the name in /dev/lofi. If we ever decide to support virtual disks,
  42  * we'll have to divide the minor number space to identify fdisk partitions
  43  * and slices, and the name will then be the minor number shifted down a
  44  * few bits. Minor devices are tracked with state structures handled with
  45  * ddi_soft_state(9F) for simplicity.
  46  *
  47  * A file attached to lofi is opened when attached and not closed until
  48  * explicitly detached from lofi. This seems more sensible than deferring
  49  * the open until the /dev/lofi device is opened, for a number of reasons.
  50  * One is that any failure is likely to be noticed by the person (or script)
  51  * running lofiadm. Another is that it would be a security problem if the
  52  * file was replaced by another one after being added but before being opened.
  53  *
  54  * The only hard part about lofi is the ioctls. In order to support things
  55  * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
  56  * So it has to fake disk geometry and partition information. More may need
  57  * to be faked if your favorite utility doesn't work and you think it should
  58  * (fdformat doesn't work because it really wants to know the type of floppy
  59  * controller to talk to, and that didn't seem easy to fake. Or possibly even
  60  * necessary, since we have mkfs_pcfs now).
  61  *
  62  * Normally, a lofi device cannot be detached if it is open (i.e. busy).  To
  63  * support simulation of hotplug events, an optional force flag is provided.
  64  * If a lofi device is open when a force detach is requested, then the
  65  * underlying file is closed and any subsequent operations return EIO.  When the
  66  * device is closed for the last time, it will be cleaned up at that time.  In
  67  * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
  68  * detached but not removed.
  69  *
  70  * Known problems:
  71  *
  72  *      UFS logging. Mounting a UFS filesystem image "logging"
  73  *      works for basic copy testing but wedges during a build of ON through
  74  *      that image. Some deadlock in lufs holding the log mutex and then
  75  *      getting stuck on a buf. So for now, don't do that.
  76  *
  77  *      Direct I/O. Since the filesystem data is being cached in the buffer
  78  *      cache, _and_ again in the underlying filesystem, it's tempting to
  79  *      enable direct I/O on the underlying file. Don't, because that deadlocks.
  80  *      I think to fix the cache-twice problem we might need filesystem support.
  81  *
  82  *      lofi on itself. The simple lock strategy (lofi_lock) precludes this
  83  *      because you'll be in lofi_ioctl, holding the lock when you open the
  84  *      file, which, if it's lofi, will grab lofi_lock. We prevent this for
  85  *      now, though not using ddi_soft_state(9F) would make it possible to
  86  *      do. Though it would still be silly.
  87  *
  88  * Interesting things to do:
  89  *
  90  *      Allow multiple files for each device. A poor-man's metadisk, basically.
  91  *
  92  *      Pass-through ioctls on block devices. You can (though it's not
  93  *      documented), give lofi a block device as a file name. Then we shouldn't
  94  *      need to fake a geometry. But this is also silly unless you're replacing
  95  *      metadisk.
  96  *
  97  *      Encryption. tpm would like this. Apparently Windows 2000 has it, and
  98  *      so does Linux.
  99  */
 100 
 101 #include <sys/types.h>
 102 #include <netinet/in.h>
 103 #include <sys/sysmacros.h>
 104 #include <sys/cmn_err.h>
 105 #include <sys/uio.h>
 106 #include <sys/kmem.h>
 107 #include <sys/cred.h>
 108 #include <sys/mman.h>
 109 #include <sys/errno.h>
 110 #include <sys/aio_req.h>
 111 #include <sys/stat.h>
 112 #include <sys/file.h>
 113 #include <sys/modctl.h>
 114 #include <sys/conf.h>
 115 #include <sys/debug.h>
 116 #include <sys/vnode.h>
 117 #include <sys/lofi.h>
 118 #include <sys/fcntl.h>
 119 #include <sys/pathname.h>
 120 #include <sys/filio.h>
 121 #include <sys/fdio.h>
 122 #include <sys/open.h>
 123 #include <sys/disp.h>
 124 #include <vm/seg_map.h>
 125 #include <sys/ddi.h>
 126 #include <sys/sunddi.h>
 127 #include <sys/zmod.h>
 128 
/*
 * Names of the two properties attached to each mapping's dev_t;
 * lofi_free_handle() removes both when a mapping is torn down.
 */
#define NBLOCKS_PROP_NAME       "Nblocks"
#define SIZE_PROP_NAME          "Size"

/* devinfo node of the driver; set by lofi_attach, cleared by lofi_detach */
static dev_info_t *lofi_dip;
/* ddi_soft_state(9F) anchor; every minor's lofi_state is looked up here */
static void     *lofi_statep;
static kmutex_t lofi_lock;              /* state lock */
 135 
/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high.  If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc().  That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size.  We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;     /* # of taskq threads per device */

/*
 * Highest minor number lofi_busy() examines when looking for existing
 * mappings; starts out as LOFI_MAX_FILES.
 */
uint32_t lofi_max_files = LOFI_MAX_FILES;
 152 
/* Declared ahead of its definition so the table below can reference it. */
static int gzip_decompress(void *src, size_t srclen, void *dst,
        size_t *destlen, int level);

/*
 * Compression algorithms known to the driver.  lofi_strategy_task() picks
 * an entry with ls_comp_algorithm_index and calls its l_decompress routine,
 * passing l_level.  Each initializer lists a decompress routine, a NULL
 * slot (presumably the matching compress routine, unused here -- confirm
 * against lofi_compress_info_t in <sys/lofi.h>), a numeric level and a
 * name string.
 */
lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
        {gzip_decompress,       NULL,   6,      "gzip"}, /* default */
        {gzip_decompress,       NULL,   6,      "gzip-6"},
        {gzip_decompress,       NULL,   9,      "gzip-9"}
};
 161 
 162 static int
 163 lofi_busy(void)
 164 {
 165         minor_t minor;
 166 
 167         /*
 168          * We need to make sure no mappings exist - mod_remove won't
 169          * help because the device isn't open.
 170          */
 171         mutex_enter(&lofi_lock);
 172         for (minor = 1; minor <= lofi_max_files; minor++) {
 173                 if (ddi_get_soft_state(lofi_statep, minor) != NULL) {
 174                         mutex_exit(&lofi_lock);
 175                         return (EBUSY);
 176                 }
 177         }
 178         mutex_exit(&lofi_lock);
 179         return (0);
 180 }
 181 
 182 static int
 183 is_opened(struct lofi_state *lsp)
 184 {
 185         ASSERT(mutex_owned(&lofi_lock));
 186         return (lsp->ls_chr_open || lsp->ls_blk_open || lsp->ls_lyr_open_count);
 187 }
 188 
 189 static int
 190 mark_opened(struct lofi_state *lsp, int otyp)
 191 {
 192         ASSERT(mutex_owned(&lofi_lock));
 193         switch (otyp) {
 194         case OTYP_CHR:
 195                 lsp->ls_chr_open = 1;
 196                 break;
 197         case OTYP_BLK:
 198                 lsp->ls_blk_open = 1;
 199                 break;
 200         case OTYP_LYR:
 201                 lsp->ls_lyr_open_count++;
 202                 break;
 203         default:
 204                 return (-1);
 205         }
 206         return (0);
 207 }
 208 
 209 static void
 210 mark_closed(struct lofi_state *lsp, int otyp)
 211 {
 212         ASSERT(mutex_owned(&lofi_lock));
 213         switch (otyp) {
 214         case OTYP_CHR:
 215                 lsp->ls_chr_open = 0;
 216                 break;
 217         case OTYP_BLK:
 218                 lsp->ls_blk_open = 0;
 219                 break;
 220         case OTYP_LYR:
 221                 lsp->ls_lyr_open_count--;
 222                 break;
 223         default:
 224                 break;
 225         }
 226 }
 227 
 228 static void
 229 lofi_free_handle(dev_t dev, minor_t minor, struct lofi_state *lsp,
 230     cred_t *credp)
 231 {
 232         dev_t   newdev;
 233         char    namebuf[50];
 234 
 235         if (lsp->ls_vp) {
 236                 (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp);
 237                 VN_RELE(lsp->ls_vp);
 238                 lsp->ls_vp = NULL;
 239         }
 240 
 241         newdev = makedevice(getmajor(dev), minor);
 242         (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
 243         (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
 244 
 245         (void) snprintf(namebuf, sizeof (namebuf), "%d", minor);
 246         ddi_remove_minor_node(lofi_dip, namebuf);
 247         (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor);
 248         ddi_remove_minor_node(lofi_dip, namebuf);
 249 
 250         kmem_free(lsp->ls_filename, lsp->ls_filename_sz);
 251         taskq_destroy(lsp->ls_taskq);
 252         if (lsp->ls_kstat) {
 253                 kstat_delete(lsp->ls_kstat);
 254                 mutex_destroy(&lsp->ls_kstat_lock);
 255         }
 256         ddi_soft_state_free(lofi_statep, minor);
 257 }
 258 
 259 /*ARGSUSED*/
 260 static int
 261 lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
 262 {
 263         minor_t minor;
 264         struct lofi_state *lsp;
 265 
 266         mutex_enter(&lofi_lock);
 267         minor = getminor(*devp);
 268         if (minor == 0) {
 269                 /* master control device */
 270                 /* must be opened exclusively */
 271                 if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR)) {
 272                         mutex_exit(&lofi_lock);
 273                         return (EINVAL);
 274                 }
 275                 lsp = ddi_get_soft_state(lofi_statep, 0);
 276                 if (lsp == NULL) {
 277                         mutex_exit(&lofi_lock);
 278                         return (ENXIO);
 279                 }
 280                 if (is_opened(lsp)) {
 281                         mutex_exit(&lofi_lock);
 282                         return (EBUSY);
 283                 }
 284                 (void) mark_opened(lsp, OTYP_CHR);
 285                 mutex_exit(&lofi_lock);
 286                 return (0);
 287         }
 288 
 289         /* otherwise, the mapping should already exist */
 290         lsp = ddi_get_soft_state(lofi_statep, minor);
 291         if (lsp == NULL) {
 292                 mutex_exit(&lofi_lock);
 293                 return (EINVAL);
 294         }
 295 
 296         if (lsp->ls_vp == NULL) {
 297                 mutex_exit(&lofi_lock);
 298                 return (ENXIO);
 299         }
 300 
 301         if (mark_opened(lsp, otyp) == -1) {
 302                 mutex_exit(&lofi_lock);
 303                 return (EINVAL);
 304         }
 305 
 306         mutex_exit(&lofi_lock);
 307         return (0);
 308 }
 309 
 310 /*ARGSUSED*/
 311 static int
 312 lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
 313 {
 314         minor_t minor;
 315         struct lofi_state *lsp;
 316 
 317         mutex_enter(&lofi_lock);
 318         minor = getminor(dev);
 319         lsp = ddi_get_soft_state(lofi_statep, minor);
 320         if (lsp == NULL) {
 321                 mutex_exit(&lofi_lock);
 322                 return (EINVAL);
 323         }
 324         mark_closed(lsp, otyp);
 325 
 326         /*
 327          * If we have forcibly closed the underlying device, and this is the
 328          * last close, then tear down the rest of the device.
 329          */
 330         if (minor != 0 && lsp->ls_vp == NULL && !is_opened(lsp))
 331                 lofi_free_handle(dev, minor, lsp, credp);
 332         mutex_exit(&lofi_lock);
 333         return (0);
 334 }
 335 
/*
 * Copy data between bufaddr and the backing file through segmap, one
 * MAXBSIZE window at a time.
 *
 *      bufaddr kernel address to copy to (read) or from (write)
 *      offset  byte offset within the backing file at which to start
 *      bp      supplies the direction (B_READ in b_flags) and the byte
 *              count (b_bcount); b_resid is set to b_bcount on entry and
 *              reduced by every chunk transferred
 *      lsp     mapping state; provides the vnode and ls_vp_comp_size,
 *              which caps the transfer
 *
 * Returns 0 on success.  A failed segmap_fault() is reported as the errno
 * carried by the fault code when it is FC_OBJERR and as EIO otherwise; a
 * non-zero return from segmap_release() is passed back unchanged.
 */
static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
        struct lofi_state *lsp)
{
        int error;
        offset_t alignedoffset, mapoffset;
        size_t  xfersize;
        int     isread;
        int     smflags;
        caddr_t mapaddr;
        size_t  len;
        enum seg_rw srw;

        /*
         * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
         * an 8K boundary, but the buf transfer address may not be
         * aligned on more than a 512-byte boundary (we don't enforce
         * that even though we could). This matters since the initial
         * part of the transfer may not start at offset 0 within the
         * segmap'd chunk. So we have to compensate for that with
         * 'mapoffset'. Subsequent chunks always start off at the
         * beginning, and the last is capped by b_resid.
         */
        mapoffset = offset & MAXBOFFSET;
        alignedoffset = offset - mapoffset;
        bp->b_resid = bp->b_bcount;
        isread = bp->b_flags & B_READ;
        srw = isread ? S_READ : S_WRITE;
        do {
                /*
                 * This chunk is limited by what is left of the file, what
                 * is left of the current window, and what is left of the
                 * request, whichever is smallest.
                 */
                xfersize = MIN(lsp->ls_vp_comp_size - offset,
                    MIN(MAXBSIZE - mapoffset, bp->b_resid));
                /* fault length covers the touched bytes, in whole pages */
                len = roundup(mapoffset + xfersize, PAGESIZE);
                mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
                    alignedoffset, MAXBSIZE, 1, srw);
                /*
                 * Now fault in the pages. This lets us check
                 * for errors before we reference mapaddr and
                 * try to resolve the fault in bcopy (which would
                 * panic instead). And this can easily happen,
                 * particularly if you've lofi'd a file over NFS
                 * and someone deletes the file on the server.
                 */
                error = segmap_fault(kas.a_hat, segkmap, mapaddr,
                    len, F_SOFTLOCK, srw);
                if (error) {
                        /* nothing was locked; just drop the mapping */
                        (void) segmap_release(segkmap, mapaddr, 0);
                        if (FC_CODE(error) == FC_OBJERR)
                                error = FC_ERRNO(error);
                        else
                                error = EIO;
                        break;
                }
                smflags = 0;
                if (isread) {
                        smflags |= SM_FREE;
                        /*
                         * If we're reading an entire page starting
                         * at a page boundary, there's a good chance
                         * we won't need it again. Put it on the
                         * head of the freelist.
                         */
                        if (mapoffset == 0 && xfersize == PAGESIZE)
                                smflags |= SM_DONTNEED;
                        bcopy(mapaddr + mapoffset, bufaddr, xfersize);
                } else {
                        smflags |= SM_WRITE;
                        bcopy(bufaddr, mapaddr + mapoffset, xfersize);
                }
                bp->b_resid -= xfersize;
                bufaddr += xfersize;
                offset += xfersize;
                /* undo the F_SOFTLOCK above before releasing the window */
                (void) segmap_fault(kas.a_hat, segkmap, mapaddr,
                    len, F_SOFTUNLOCK, srw);
                error = segmap_release(segkmap, mapaddr, smflags);
                /* only the first map may start partial */
                mapoffset = 0;
                alignedoffset += MAXBSIZE;
        } while ((error == 0) && (bp->b_resid > 0) &&
            (offset < lsp->ls_vp_comp_size));

        return (error);
}
 418 
 419 /*ARGSUSED*/
 420 static int gzip_decompress(void *src, size_t srclen, void *dst,
 421     size_t *dstlen, int level)
 422 {
 423         ASSERT(*dstlen >= srclen);
 424 
 425         if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
 426                 return (-1);
 427         return (0);
 428 }
 429 
/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 *
 * Task queue worker dispatched by lofi_strategy(); arg is the struct buf
 * to service.  The request takes one of four paths:
 *
 *   - the backing vnode is gone or a close has been requested: EIO;
 *   - the vnode can be mapped and is not a character device, and the file
 *     is not compressed (ls_uncomp_seg_sz == 0): lofi_mapped_rdwr()
 *     directly on the caller's buffer;
 *   - same, but the file is compressed: writes fail with EROFS; reads
 *     fetch the compressed bytes covering the request and decompress
 *     them one segment at a time into the caller's buffer;
 *   - anything else: vn_rdwr().
 *
 * In every case the temporary buffers are freed, the I/O kstat (if any) is
 * updated, ls_vp_iocount is dropped (waking ls_vp_cv waiters when it
 * reaches zero) and the buf is completed with the resulting error.
 */
static void
lofi_strategy_task(void *arg)
{
        struct buf *bp = (struct buf *)arg;
        int error;
        struct lofi_state *lsp;
        uint64_t sblkno, eblkno, cmpbytes;
        offset_t offset, sblkoff, eblkoff;
        offset_t salign, ealign;
        offset_t sdiff;
        uint32_t comp_data_sz;
        caddr_t bufaddr;
        unsigned char *compressed_seg = NULL, *cmpbuf;
        unsigned char *uncompressed_seg = NULL;
        lofi_compress_info_t *li;
        size_t oblkcount, xfersize;
        unsigned long seglen;

        lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
        /* the request leaves the wait queue and starts running */
        if (lsp->ls_kstat) {
                mutex_enter(lsp->ls_kstat->ks_lock);
                kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
                mutex_exit(lsp->ls_kstat->ks_lock);
        }
        bp_mapin(bp);
        bufaddr = bp->b_un.b_addr;
        offset = bp->b_lblkno * DEV_BSIZE;   /* offset within file */

        /*
         * We used to always use vn_rdwr here, but we cannot do that because
         * we might decide to read or write from the underlying
         * file during this call, which would be a deadlock because
         * we have the rw_lock. So instead we page, unless it's not
         * mappable or it's a character device.
         */
        if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
                error = EIO;
        } else if (((lsp->ls_vp->v_flag & VNOMAP) == 0) &&
            (lsp->ls_vp->v_type != VCHR)) {
                uint64_t i;

                /*
                 * Handle uncompressed files with a regular read
                 */
                if (lsp->ls_uncomp_seg_sz == 0) {
                        error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
                        goto done;
                }

                /*
                 * From here on we're dealing primarily with compressed files
                 */

                /*
                 * Compressed files can only be read from and
                 * not written to
                 */
                if (!(bp->b_flags & B_READ)) {
                        bp->b_resid = bp->b_bcount;
                        error = EROFS;
                        goto done;
                }

                ASSERT(lsp->ls_comp_algorithm_index >= 0);
                li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
                /*
                 * Compute starting and ending compressed segment numbers
                 * We use only bitwise operations avoiding division and
                 * modulus because we enforce the compression segment size
                 * to a power of 2
                 */
                sblkno = offset >> lsp->ls_comp_seg_shift;
                sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
                eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
                eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);

                /*
                 * Align start offset to block boundary for segmap.
                 * sdiff remembers how far into the aligned read the
                 * first segment's data actually begins.
                 */
                salign = lsp->ls_comp_seg_index[sblkno];
                sdiff = salign & (DEV_BSIZE - 1);
                salign -= sdiff;
                if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
                        /*
                         * We're dealing with the last segment of
                         * the compressed file -- the size of this
                         * segment *may not* be the same as the
                         * segment size for the file
                         *
                         * NOTE(review): the mask below only yields the
                         * offset within the last segment if
                         * ls_uncomp_last_seg_sz is a power of 2 --
                         * confirm where that field is set.
                         */
                        eblkoff = (offset + bp->b_bcount) &
                            (lsp->ls_uncomp_last_seg_sz - 1);
                        ealign = lsp->ls_vp_comp_size;
                } else {
                        ealign = lsp->ls_comp_seg_index[eblkno + 1];
                }

                /*
                 * Preserve original request parameters
                 */
                oblkcount = bp->b_bcount;

                /*
                 * Assign the calculated parameters; b_bcount is borrowed
                 * to tell lofi_mapped_rdwr() how many compressed bytes
                 * to fetch and is restored right after the call.
                 */
                comp_data_sz = ealign - salign;
                bp->b_bcount = comp_data_sz;

                /*
                 * Allocate one buffer for all the compressed data
                 * covering the request and one for a single
                 * uncompressed segment, since we uncompress segments
                 * one at a time
                 */
                compressed_seg = kmem_alloc(bp->b_bcount, KM_SLEEP);
                uncompressed_seg = kmem_alloc(lsp->ls_uncomp_seg_sz, KM_SLEEP);
                /*
                 * Map in the calculated number of blocks
                 */
                error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
                    bp, lsp);

                bp->b_bcount = oblkcount;
                bp->b_resid = oblkcount;
                if (error != 0)
                        goto done;

                /*
                 * We have the compressed blocks, now uncompress them
                 */
                cmpbuf = compressed_seg + sdiff;
                for (i = sblkno; i < (eblkno + 1) && i < lsp->ls_comp_index_sz;
                    i++) {
                        /*
                         * Each of the segment index entries contains
                         * the starting offset of that segment within
                         * the compressed file (this code uses the
                         * values as byte offsets).  The number of
                         * compressed bytes in a segment is thus the
                         * difference between the start of this segment
                         * and the start of the next one; the final
                         * segment runs to the end of the file instead.
                         */
                        if ((i == eblkno) &&
                            (i == lsp->ls_comp_index_sz - 1)) {
                                cmpbytes = lsp->ls_vp_comp_size -
                                    lsp->ls_comp_seg_index[i];
                        } else {
                                cmpbytes = lsp->ls_comp_seg_index[i + 1] -
                                    lsp->ls_comp_seg_index[i];
                        }

                        /*
                         * The first byte in a compressed segment is a flag
                         * that indicates whether this segment is
                         * compressed at all
                         *
                         * NOTE(review): the UNCOMPRESSED copy trusts
                         * cmpbytes; nothing here checks that
                         * cmpbytes - SEGHDR fits in ls_uncomp_seg_sz.
                         * Presumably the index is validated when the
                         * file is mapped -- confirm.
                         */
                        if (*cmpbuf == UNCOMPRESSED) {
                                bcopy((cmpbuf + SEGHDR), uncompressed_seg,
                                    (cmpbytes - SEGHDR));
                        } else {
                                seglen = lsp->ls_uncomp_seg_sz;

                                if (li->l_decompress((cmpbuf + SEGHDR),
                                    (cmpbytes - SEGHDR), uncompressed_seg,
                                    &seglen, li->l_level) != 0) {
                                        error = EIO;
                                        goto done;
                                }
                        }

                        /*
                         * Determine how much uncompressed data we
                         * have to copy and copy it.  Start from
                         * "everything after sblkoff" and, for the
                         * segment in which the request ends, trim off
                         * the part that lies beyond eblkoff.
                         */
                        xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
                        if (i == eblkno) {
                                if (i == (lsp->ls_comp_index_sz - 1))
                                        xfersize -= (lsp->ls_uncomp_last_seg_sz
                                            - eblkoff);
                                else
                                        xfersize -=
                                            (lsp->ls_uncomp_seg_sz - eblkoff);
                        }

                        bcopy((uncompressed_seg + sblkoff), bufaddr, xfersize);

                        cmpbuf += cmpbytes;
                        bufaddr += xfersize;
                        bp->b_resid -= xfersize;
                        /* only the first segment is entered part-way */
                        sblkoff = 0;

                        if (bp->b_resid == 0)
                                break;
                }
        } else {
                ssize_t resid;
                enum uio_rw rw;

                if (bp->b_flags & B_READ)
                        rw = UIO_READ;
                else
                        rw = UIO_WRITE;
                error = vn_rdwr(rw, lsp->ls_vp, bufaddr, bp->b_bcount,
                    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
                bp->b_resid = resid;
        }

done:
        /*
         * compressed_seg is only non-NULL once comp_data_sz has been
         * computed, so the size passed to kmem_free() is always valid.
         */
        if (compressed_seg != NULL)
                kmem_free(compressed_seg, comp_data_sz);
        if (uncompressed_seg != NULL)
                kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);

        if (lsp->ls_kstat) {
                size_t n_done = bp->b_bcount - bp->b_resid;
                kstat_io_t *kioptr;

                mutex_enter(lsp->ls_kstat->ks_lock);
                kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
                if (bp->b_flags & B_READ) {
                        kioptr->nread += n_done;
                        kioptr->reads++;
                } else {
                        kioptr->nwritten += n_done;
                        kioptr->writes++;
                }
                kstat_runq_exit(kioptr);
                mutex_exit(lsp->ls_kstat->ks_lock);
        }

        /*
         * This request no longer counts as in flight (lofi_strategy()
         * took the reference); wake anyone waiting for the count to
         * reach zero.
         */
        mutex_enter(&lsp->ls_vp_lock);
        if (--lsp->ls_vp_iocount == 0)
                cv_broadcast(&lsp->ls_vp_cv);
        mutex_exit(&lsp->ls_vp_lock);

        bioerror(bp, error);
        biodone(bp);
}
 669 
 670 static int
 671 lofi_strategy(struct buf *bp)
 672 {
 673         struct lofi_state *lsp;
 674         offset_t        offset;
 675 
 676         /*
 677          * We cannot just do I/O here, because the current thread
 678          * _might_ end up back in here because the underlying filesystem
 679          * wants a buffer, which eventually gets into bio_recycle and
 680          * might call into lofi to write out a delayed-write buffer.
 681          * This is bad if the filesystem above lofi is the same as below.
 682          *
 683          * We could come up with a complex strategy using threads to
 684          * do the I/O asynchronously, or we could use task queues. task
 685          * queues were incredibly easy so they win.
 686          */
 687         lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
 688         mutex_enter(&lsp->ls_vp_lock);
 689         if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
 690                 bioerror(bp, EIO);
 691                 biodone(bp);
 692                 mutex_exit(&lsp->ls_vp_lock);
 693                 return (0);
 694         }
 695 
 696         offset = bp->b_lblkno * DEV_BSIZE;   /* offset within file */
 697         if (offset == lsp->ls_vp_size) {
 698                 /* EOF */
 699                 if ((bp->b_flags & B_READ) != 0) {
 700                         bp->b_resid = bp->b_bcount;
 701                         bioerror(bp, 0);
 702                 } else {
 703                         /* writes should fail */
 704                         bioerror(bp, ENXIO);
 705                 }
 706                 biodone(bp);
 707                 mutex_exit(&lsp->ls_vp_lock);
 708                 return (0);
 709         }
 710         if (offset > lsp->ls_vp_size) {
 711                 bioerror(bp, ENXIO);
 712                 biodone(bp);
 713                 mutex_exit(&lsp->ls_vp_lock);
 714                 return (0);
 715         }
 716         lsp->ls_vp_iocount++;
 717         mutex_exit(&lsp->ls_vp_lock);
 718 
 719         if (lsp->ls_kstat) {
 720                 mutex_enter(lsp->ls_kstat->ks_lock);
 721                 kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
 722                 mutex_exit(lsp->ls_kstat->ks_lock);
 723         }
 724         (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
 725         return (0);
 726 }
 727 
 728 /*ARGSUSED2*/
 729 static int
 730 lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
 731 {
 732         if (getminor(dev) == 0)
 733                 return (EINVAL);
 734         return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
 735 }
 736 
 737 /*ARGSUSED2*/
 738 static int
 739 lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
 740 {
 741         if (getminor(dev) == 0)
 742                 return (EINVAL);
 743         return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
 744 }
 745 
 746 /*ARGSUSED2*/
 747 static int
 748 lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
 749 {
 750         if (getminor(dev) == 0)
 751                 return (EINVAL);
 752         return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
 753 }
 754 
 755 /*ARGSUSED2*/
 756 static int
 757 lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
 758 {
 759         if (getminor(dev) == 0)
 760                 return (EINVAL);
 761         return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
 762 }
 763 
 764 /*ARGSUSED*/
 765 static int
 766 lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 767 {
 768         switch (infocmd) {
 769         case DDI_INFO_DEVT2DEVINFO:
 770                 *result = lofi_dip;
 771                 return (DDI_SUCCESS);
 772         case DDI_INFO_DEVT2INSTANCE:
 773                 *result = 0;
 774                 return (DDI_SUCCESS);
 775         }
 776         return (DDI_FAILURE);
 777 }
 778 
 779 static int
 780 lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 781 {
 782         int     error;
 783 
 784         if (cmd != DDI_ATTACH)
 785                 return (DDI_FAILURE);
 786         error = ddi_soft_state_zalloc(lofi_statep, 0);
 787         if (error == DDI_FAILURE) {
 788                 return (DDI_FAILURE);
 789         }
 790         error = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
 791             DDI_PSEUDO, NULL);
 792         if (error == DDI_FAILURE) {
 793                 ddi_soft_state_free(lofi_statep, 0);
 794                 return (DDI_FAILURE);
 795         }
 796         /* driver handles kernel-issued IOCTLs */
 797         if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
 798             DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
 799                 ddi_remove_minor_node(dip, NULL);
 800                 ddi_soft_state_free(lofi_statep, 0);
 801                 return (DDI_FAILURE);
 802         }
 803         lofi_dip = dip;
 804         ddi_report_dev(dip);
 805         return (DDI_SUCCESS);
 806 }
 807 
 808 static int
 809 lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 810 {
 811         if (cmd != DDI_DETACH)
 812                 return (DDI_FAILURE);
 813         if (lofi_busy())
 814                 return (DDI_FAILURE);
 815         lofi_dip = NULL;
 816         ddi_remove_minor_node(dip, NULL);
 817         ddi_prop_remove_all(dip);
 818         ddi_soft_state_free(lofi_statep, 0);
 819         return (DDI_SUCCESS);
 820 }
 821 
 822 /*
 823  * These two just simplify the rest of the ioctls that need to copyin/out
 824  * the lofi_ioctl structure.
 825  */
 826 struct lofi_ioctl *
 827 copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, int flag)
 828 {
 829         struct lofi_ioctl *klip;
 830         int     error;
 831 
 832         klip = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
 833         error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
 834         if (error) {
 835                 kmem_free(klip, sizeof (struct lofi_ioctl));
 836                 return (NULL);
 837         }
 838 
 839         /* make sure filename is always null-terminated */
 840         klip->li_filename[MAXPATHLEN - 1] = '\0';
 841 
 842         /* validate minor number */
 843         if (klip->li_minor > lofi_max_files) {
 844                 kmem_free(klip, sizeof (struct lofi_ioctl));
 845                 return (NULL);
 846         }
 847         return (klip);
 848 }
 849 
 850 int
 851 copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
 852         int flag)
 853 {
 854         int     error;
 855 
 856         error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
 857         if (error)
 858                 return (EFAULT);
 859         return (0);
 860 }
 861 
 862 void
 863 free_lofi_ioctl(struct lofi_ioctl *klip)
 864 {
 865         kmem_free(klip, sizeof (struct lofi_ioctl));
 866 }
 867 
 868 /*
 869  * Return the minor number 'filename' is mapped to, if it is.
 870  */
 871 static int
 872 file_to_minor(char *filename)
 873 {
 874         minor_t minor;
 875         struct lofi_state *lsp;
 876 
 877         ASSERT(mutex_owned(&lofi_lock));
 878         for (minor = 1; minor <= lofi_max_files; minor++) {
 879                 lsp = ddi_get_soft_state(lofi_statep, minor);
 880                 if (lsp == NULL)
 881                         continue;
 882                 if (strcmp(lsp->ls_filename, filename) == 0)
 883                         return (minor);
 884         }
 885         return (0);
 886 }
 887 
 888 /*
 889  * lofiadm does some validation, but since Joe Random (or crashme) could
 890  * do our ioctls, we need to do some validation too.
 891  */
 892 static int
 893 valid_filename(const char *filename)
 894 {
 895         static char *blkprefix = "/dev/" LOFI_BLOCK_NAME "/";
 896         static char *charprefix = "/dev/" LOFI_CHAR_NAME "/";
 897 
 898         /* must be absolute path */
 899         if (filename[0] != '/')
 900                 return (0);
 901         /* must not be lofi */
 902         if (strncmp(filename, blkprefix, strlen(blkprefix)) == 0)
 903                 return (0);
 904         if (strncmp(filename, charprefix, strlen(charprefix)) == 0)
 905                 return (0);
 906         return (1);
 907 }
 908 
 909 /*
 910  * Fakes up a disk geometry, and one big partition, based on the size
 911  * of the file. This is needed because we allow newfs'ing the device,
 912  * and newfs will do several disk ioctls to figure out the geometry and
 913  * partition information. It uses that information to determine the parameters
 914  * to pass to mkfs. Geometry is pretty much irrelevant these days, but we
 915  * have to support it.
 916  */
 917 static void
 918 fake_disk_geometry(struct lofi_state *lsp)
 919 {
 920         /* dk_geom - see dkio(7I) */
 921         /*
 922          * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
 923          * of sectors), but that breaks programs like fdisk which want to
 924          * partition a disk by cylinder. With one cylinder, you can't create
 925          * an fdisk partition and put pcfs on it for testing (hard to pick
 926          * a number between one and one).
 927          *
 928          * The cheezy floppy test is an attempt to not have too few cylinders
 929          * for a small file, or so many on a big file that you waste space
 930          * for backup superblocks or cylinder group structures.
 931          */
 932         if (lsp->ls_vp_size < (2 * 1024 * 1024)) /* floppy? */
 933                 lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (100 * 1024);
 934         else
 935                 lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (300 * 1024);
 936         /* in case file file is < 100k */
 937         if (lsp->ls_dkg.dkg_ncyl == 0)
 938                 lsp->ls_dkg.dkg_ncyl = 1;
 939         lsp->ls_dkg.dkg_acyl = 0;
 940         lsp->ls_dkg.dkg_bcyl = 0;
 941         lsp->ls_dkg.dkg_nhead = 1;
 942         lsp->ls_dkg.dkg_obs1 = 0;
 943         lsp->ls_dkg.dkg_intrlv = 0;
 944         lsp->ls_dkg.dkg_obs2 = 0;
 945         lsp->ls_dkg.dkg_obs3 = 0;
 946         lsp->ls_dkg.dkg_apc = 0;
 947         lsp->ls_dkg.dkg_rpm = 7200;
 948         lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl + lsp->ls_dkg.dkg_acyl;
 949         lsp->ls_dkg.dkg_nsect = lsp->ls_vp_size /
 950             (DEV_BSIZE * lsp->ls_dkg.dkg_ncyl);
 951         lsp->ls_dkg.dkg_write_reinstruct = 0;
 952         lsp->ls_dkg.dkg_read_reinstruct = 0;
 953 
 954         /* vtoc - see dkio(7I) */
 955         bzero(&lsp->ls_vtoc, sizeof (struct vtoc));
 956         lsp->ls_vtoc.v_sanity = VTOC_SANE;
 957         lsp->ls_vtoc.v_version = V_VERSION;
 958         bcopy(LOFI_DRIVER_NAME, lsp->ls_vtoc.v_volume, 7);
 959         lsp->ls_vtoc.v_sectorsz = DEV_BSIZE;
 960         lsp->ls_vtoc.v_nparts = 1;
 961         lsp->ls_vtoc.v_part[0].p_tag = V_UNASSIGNED;
 962 
 963         /*
 964          * A compressed file is read-only, other files can
 965          * be read-write
 966          */
 967         if (lsp->ls_uncomp_seg_sz > 0) {
 968                 lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT | V_RONLY;
 969         } else {
 970                 lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT;
 971         }
 972         lsp->ls_vtoc.v_part[0].p_start = (daddr_t)0;
 973         /*
 974          * The partition size cannot just be the number of sectors, because
 975          * that might not end on a cylinder boundary. And if that's the case,
 976          * newfs/mkfs will print a scary warning. So just figure the size
 977          * based on the number of cylinders and sectors/cylinder.
 978          */
 979         lsp->ls_vtoc.v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
 980             lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
 981 
 982         /* dk_cinfo - see dkio(7I) */
 983         bzero(&lsp->ls_ci, sizeof (struct dk_cinfo));
 984         (void) strcpy(lsp->ls_ci.dki_cname, LOFI_DRIVER_NAME);
 985         lsp->ls_ci.dki_ctype = DKC_MD;
 986         lsp->ls_ci.dki_flags = 0;
 987         lsp->ls_ci.dki_cnum = 0;
 988         lsp->ls_ci.dki_addr = 0;
 989         lsp->ls_ci.dki_space = 0;
 990         lsp->ls_ci.dki_prio = 0;
 991         lsp->ls_ci.dki_vec = 0;
 992         (void) strcpy(lsp->ls_ci.dki_dname, LOFI_DRIVER_NAME);
 993         lsp->ls_ci.dki_unit = 0;
 994         lsp->ls_ci.dki_slave = 0;
 995         lsp->ls_ci.dki_partition = 0;
 996         /*
 997          * newfs uses this to set maxcontig. Must not be < 16, or it
 998          * will be 0 when newfs multiplies it by DEV_BSIZE and divides
 999          * it by the block size. Then tunefs doesn't work because
1000          * maxcontig is 0.
1001          */
1002         lsp->ls_ci.dki_maxtransfer = 16;
1003 }
1004 
1005 /*
1006  * map in a compressed file
1007  *
1008  * Read in the header and the index that follows.
1009  *
1010  * The header is as follows -
1011  *
1012  * Signature (name of the compression algorithm)
1013  * Compression segment size (a multiple of 512)
1014  * Number of index entries
1015  * Size of the last block
1016  * The array containing the index entries
1017  *
1018  * The header information is always stored in
1019  * network byte order on disk.
1020  */
1021 static int
1022 lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
1023 {
1024         uint32_t index_sz, header_len, i;
1025         ssize_t resid;
1026         enum uio_rw rw;
1027         char *tbuf = buf;
1028         int error;
1029 
1030         /* The signature has already been read */
1031         tbuf += lsp->ls_comp_algorithm_len;
1032         bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
1033         lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);
1034 
1035         /*
1036          * The compressed segment size must be a power of 2
1037          */
1038         if (lsp->ls_uncomp_seg_sz % 2)
1039                 return (EINVAL);
1040 
1041         for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
1042                 ;
1043 
1044         lsp->ls_comp_seg_shift = i;
1045 
1046         tbuf += sizeof (lsp->ls_uncomp_seg_sz);
1047         bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
1048         lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);
1049 
1050         tbuf += sizeof (lsp->ls_comp_index_sz);
1051         bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
1052             sizeof (lsp->ls_uncomp_last_seg_sz));
1053         lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);
1054 
1055         /*
1056          * Compute the total size of the uncompressed data
1057          * for use in fake_disk_geometry and other calculations.
1058          * Disk geometry has to be faked with respect to the
1059          * actual uncompressed data size rather than the
1060          * compressed file size.
1061          */
1062         /* XXX '2' shouldn't subtracted here - should be '1' */
1063         lsp->ls_vp_size = (lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
1064             + lsp->ls_uncomp_last_seg_sz;
1065 
1066         /*
1067          * Index size is rounded up to a 512 byte boundary for ease
1068          * of segmapping
1069          */
1070         index_sz = sizeof (lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
1071         header_len = lsp->ls_comp_algorithm_len +
1072             sizeof (lsp->ls_uncomp_seg_sz) +
1073             sizeof (lsp->ls_comp_index_sz) +
1074             sizeof (lsp->ls_uncomp_last_seg_sz);
1075         lsp->ls_comp_offbase = header_len + index_sz;
1076 
1077         index_sz += header_len;
1078         index_sz = roundup(index_sz, DEV_BSIZE);
1079 
1080         lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
1081         lsp->ls_comp_index_data_sz = index_sz;
1082 
1083         /*
1084          * Read in the index -- this has a side-effect
1085          * of reading in the header as well
1086          */
1087         rw = UIO_READ;
1088         error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
1089             0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
1090 
1091         if (error != 0)
1092                 return (error);
1093 
1094         /* Skip the header, this is where the index really begins */
1095         lsp->ls_comp_seg_index =
1096             /*LINTED*/
1097             (uint64_t *)(lsp->ls_comp_index_data + header_len);
1098 
1099         /* Now map the index into memory */
1100         for (i = 0; i < lsp->ls_comp_index_sz; i++)
1101                 lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
1102                     lsp->ls_comp_seg_index[i];
1103 
1104         return (error);
1105 }
1106 
1107 /*
1108  * Check to see if the passed in signature is a valid
1109  * one. If it is valid, return the index into
1110  * lofi_compress_table.
1111  *
1112  * Return -1 if it is invalid
1113  */
1114 static int lofi_compress_select(char *signature)
1115 {
1116         int i;
1117 
1118         for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
1119                 if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
1120                         return (i);
1121         }
1122 
1123         return (-1);
1124 }
1125 
1126 /*
1127  * map a file to a minor number. Return the minor number.
1128  */
1129 static int
1130 lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
1131     int *rvalp, struct cred *credp, int ioctl_flag)
1132 {
1133         minor_t newminor;
1134         struct lofi_state *lsp;
1135         struct lofi_ioctl *klip;
1136         int     error;
1137         struct vnode *vp;
1138         int64_t Nblocks_prop_val;
1139         int64_t Size_prop_val;
1140         int     compress_index;
1141         vattr_t vattr;
1142         int     flag;
1143         enum vtype v_type;
1144         int zalloced = 0;
1145         dev_t   newdev;
1146         char    namebuf[50];
1147         char    buf[DEV_BSIZE];
1148         char    *tbuf;
1149         ssize_t resid;
1150         enum uio_rw rw;
1151 
1152         klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
1153         if (klip == NULL)
1154                 return (EFAULT);
1155 
1156         mutex_enter(&lofi_lock);
1157 
1158         if (!valid_filename(klip->li_filename)) {
1159                 error = EINVAL;
1160                 goto out;
1161         }
1162 
1163         if (file_to_minor(klip->li_filename) != 0) {
1164                 error = EBUSY;
1165                 goto out;
1166         }
1167 
1168         if (pickminor) {
1169                 /* Find a free one */
1170                 for (newminor = 1; newminor <= lofi_max_files; newminor++)
1171                         if (ddi_get_soft_state(lofi_statep, newminor) == NULL)
1172                                 break;
1173                 if (newminor >= lofi_max_files) {
1174                         error = EAGAIN;
1175                         goto out;
1176                 }
1177         } else {
1178                 newminor = klip->li_minor;
1179                 if (ddi_get_soft_state(lofi_statep, newminor) != NULL) {
1180                         error = EEXIST;
1181                         goto out;
1182                 }
1183         }
1184 
1185         /* make sure it's valid */
1186         error = lookupname(klip->li_filename, UIO_SYSSPACE, FOLLOW,
1187             NULLVPP, &vp);
1188         if (error) {
1189                 goto out;
1190         }
1191         v_type = vp->v_type;
1192         VN_RELE(vp);
1193         if (!V_ISLOFIABLE(v_type)) {
1194                 error = EINVAL;
1195                 goto out;
1196         }
1197         flag = FREAD | FWRITE | FOFFMAX | FEXCL;
1198         error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
1199         if (error) {
1200                 /* try read-only */
1201                 flag &= ~FWRITE;
1202                 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
1203                     &vp, 0, 0);
1204                 if (error) {
1205                         goto out;
1206                 }
1207         }
1208         vattr.va_mask = AT_SIZE;
1209         error = VOP_GETATTR(vp, &vattr, 0, credp);
1210         if (error) {
1211                 goto closeout;
1212         }
1213         /* the file needs to be a multiple of the block size */
1214         if ((vattr.va_size % DEV_BSIZE) != 0) {
1215                 error = EINVAL;
1216                 goto closeout;
1217         }
1218         newdev = makedevice(getmajor(dev), newminor);
1219         Size_prop_val = vattr.va_size;
1220         if ((ddi_prop_update_int64(newdev, lofi_dip,
1221             SIZE_PROP_NAME, Size_prop_val)) != DDI_PROP_SUCCESS) {
1222                 error = EINVAL;
1223                 goto closeout;
1224         }
1225         Nblocks_prop_val = vattr.va_size / DEV_BSIZE;
1226         if ((ddi_prop_update_int64(newdev, lofi_dip,
1227             NBLOCKS_PROP_NAME, Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
1228                 error = EINVAL;
1229                 goto propout;
1230         }
1231         error = ddi_soft_state_zalloc(lofi_statep, newminor);
1232         if (error == DDI_FAILURE) {
1233                 error = ENOMEM;
1234                 goto propout;
1235         }
1236         zalloced = 1;
1237         (void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
1238         (void) ddi_create_minor_node(lofi_dip, namebuf, S_IFBLK, newminor,
1239             DDI_PSEUDO, NULL);
1240         if (error != DDI_SUCCESS) {
1241                 error = ENXIO;
1242                 goto propout;
1243         }
1244         (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", newminor);
1245         error = ddi_create_minor_node(lofi_dip, namebuf, S_IFCHR, newminor,
1246             DDI_PSEUDO, NULL);
1247         if (error != DDI_SUCCESS) {
1248                 /* remove block node */
1249                 (void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
1250                 ddi_remove_minor_node(lofi_dip, namebuf);
1251                 error = ENXIO;
1252                 goto propout;
1253         }
1254         lsp = ddi_get_soft_state(lofi_statep, newminor);
1255         lsp->ls_filename_sz = strlen(klip->li_filename) + 1;
1256         lsp->ls_filename = kmem_alloc(lsp->ls_filename_sz, KM_SLEEP);
1257         (void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
1258             LOFI_DRIVER_NAME, newminor);
1259         lsp->ls_taskq = taskq_create(namebuf, lofi_taskq_nthreads,
1260             minclsyspri, 1, lofi_taskq_maxalloc, 0);
1261         lsp->ls_kstat = kstat_create(LOFI_DRIVER_NAME, newminor,
1262             NULL, "disk", KSTAT_TYPE_IO, 1, 0);
1263         if (lsp->ls_kstat) {
1264                 mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
1265                 lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
1266                 kstat_install(lsp->ls_kstat);
1267         }
1268         cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
1269         mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);
1270 
1271         /*
1272          * save open mode so file can be closed properly and vnode counts
1273          * updated correctly.
1274          */
1275         lsp->ls_openflag = flag;
1276 
1277         /*
1278          * Try to handle stacked lofs vnodes.
1279          */
1280         if (vp->v_type == VREG) {
1281                 if (VOP_REALVP(vp, &lsp->ls_vp) != 0) {
1282                         lsp->ls_vp = vp;
1283                 } else {
1284                         /*
1285                          * Even though vp was obtained via vn_open(), we
1286                          * can't call vn_close() on it, since lofs will
1287                          * pass the VOP_CLOSE() on down to the realvp
1288                          * (which we are about to use). Hence we merely
1289                          * drop the reference to the lofs vnode and hold
1290                          * the realvp so things behave as if we've
1291                          * opened the realvp without any interaction
1292                          * with lofs.
1293                          */
1294                         VN_HOLD(lsp->ls_vp);
1295                         VN_RELE(vp);
1296                 }
1297         } else {
1298                 lsp->ls_vp = vp;
1299         }
1300         lsp->ls_vp_size = vattr.va_size;
1301         (void) strcpy(lsp->ls_filename, klip->li_filename);
1302         if (rvalp)
1303                 *rvalp = (int)newminor;
1304         klip->li_minor = newminor;
1305 
1306         /*
1307          * Read the file signature to check if it is compressed.
1308          * 'rw' is set to read since only reads are allowed to
1309          * a compressed file.
1310          */
1311         rw = UIO_READ;
1312         error = vn_rdwr(rw, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
1313             0, RLIM64_INFINITY, kcred, &resid);
1314 
1315         if (error != 0)
1316                 goto propout;
1317 
1318         tbuf = buf;
1319         lsp->ls_uncomp_seg_sz = 0;
1320         lsp->ls_vp_comp_size = lsp->ls_vp_size;
1321         lsp->ls_comp_algorithm_len = 0;
1322 
1323         compress_index = lofi_compress_select(tbuf);
1324         if (compress_index != -1) {
1325                 lsp->ls_comp_algorithm_index = compress_index;
1326                 lsp->ls_comp_algorithm_len =
1327                     strlen(lofi_compress_table[compress_index].l_name);
1328                 error = lofi_map_compressed_file(lsp, buf);
1329                 if (error != 0)
1330                         goto propout;
1331 
1332                 /* update DDI properties */
1333                 Size_prop_val = lsp->ls_vp_size;
1334                 if ((ddi_prop_update_int64(newdev, lofi_dip, SIZE_PROP_NAME,
1335                     Size_prop_val)) != DDI_PROP_SUCCESS) {
1336                         error = EINVAL;
1337                         goto propout;
1338                 }
1339 
1340                 Nblocks_prop_val = lsp->ls_vp_size / DEV_BSIZE;
1341                 if ((ddi_prop_update_int64(newdev, lofi_dip, NBLOCKS_PROP_NAME,
1342                     Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
1343                         error = EINVAL;
1344                         goto propout;
1345                 }
1346         }
1347 
1348         fake_disk_geometry(lsp);
1349         mutex_exit(&lofi_lock);
1350         (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1351         free_lofi_ioctl(klip);
1352         return (0);
1353 
1354 propout:
1355         (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
1356         (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
1357 closeout:
1358         (void) VOP_CLOSE(vp, flag, 1, 0, credp);
1359         VN_RELE(vp);
1360 out:
1361         if (zalloced)
1362                 ddi_soft_state_free(lofi_statep, newminor);
1363         mutex_exit(&lofi_lock);
1364         free_lofi_ioctl(klip);
1365         return (error);
1366 }
1367 
1368 /*
1369  * unmap a file.
1370  */
1371 static int
1372 lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename,
1373     struct cred *credp, int ioctl_flag)
1374 {
1375         struct lofi_state *lsp;
1376         struct lofi_ioctl *klip;
1377         minor_t minor;
1378 
1379         klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
1380         if (klip == NULL)
1381                 return (EFAULT);
1382 
1383         mutex_enter(&lofi_lock);
1384         if (byfilename) {
1385                 minor = file_to_minor(klip->li_filename);
1386         } else {
1387                 minor = klip->li_minor;
1388         }
1389         if (minor == 0) {
1390                 mutex_exit(&lofi_lock);
1391                 free_lofi_ioctl(klip);
1392                 return (ENXIO);
1393         }
1394         lsp = ddi_get_soft_state(lofi_statep, minor);
1395         if (lsp == NULL || lsp->ls_vp == NULL) {
1396                 mutex_exit(&lofi_lock);
1397                 free_lofi_ioctl(klip);
1398                 return (ENXIO);
1399         }
1400 
1401         if (is_opened(lsp)) {
1402                 /*
1403                  * If the 'force' flag is set, then we forcibly close the
1404                  * underlying file.  Subsequent operations will fail, and the
1405                  * DKIOCSTATE ioctl will return DKIO_DEV_GONE.  When the device
1406                  * is last closed, the device will be cleaned up appropriately.
1407                  *
1408                  * This is complicated by the fact that we may have outstanding
1409                  * dispatched I/Os.  Rather than having a single mutex to
1410                  * serialize all I/O, we keep a count of the number of
1411                  * outstanding I/O requests, as well as a flag to indicate that
1412                  * no new I/Os should be dispatched.  We set the flag, wait for
1413                  * the number of outstanding I/Os to reach 0, and then close the
1414                  * underlying vnode.
1415                  */
1416                 if (klip->li_force) {
1417                         mutex_enter(&lsp->ls_vp_lock);
1418                         lsp->ls_vp_closereq = B_TRUE;
1419                         while (lsp->ls_vp_iocount > 0)
1420                                 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
1421                         (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0,
1422                             credp);
1423                         VN_RELE(lsp->ls_vp);
1424                         lsp->ls_vp = NULL;
1425                         cv_broadcast(&lsp->ls_vp_cv);
1426                         mutex_exit(&lsp->ls_vp_lock);
1427                         mutex_exit(&lofi_lock);
1428                         klip->li_minor = minor;
1429                         (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1430                         free_lofi_ioctl(klip);
1431                         return (0);
1432                 }
1433                 mutex_exit(&lofi_lock);
1434                 free_lofi_ioctl(klip);
1435                 return (EBUSY);
1436         }
1437 
1438         if (lsp->ls_uncomp_seg_sz > 0) {
1439                 kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
1440                 lsp->ls_uncomp_seg_sz = 0;
1441         }
1442 
1443         lofi_free_handle(dev, minor, lsp, credp);
1444 
1445         klip->li_minor = minor;
1446         mutex_exit(&lofi_lock);
1447         (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1448         free_lofi_ioctl(klip);
1449         return (0);
1450 }
1451 
1452 /*
1453  * get the filename given the minor number, or the minor number given
1454  * the name.
1455  */
1456 /*ARGSUSED*/
1457 static int
1458 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
1459     struct cred *credp, int ioctl_flag)
1460 {
1461         struct lofi_state *lsp;
1462         struct lofi_ioctl *klip;
1463         int     error;
1464         minor_t minor;
1465 
1466         klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
1467         if (klip == NULL)
1468                 return (EFAULT);
1469 
1470         switch (which) {
1471         case LOFI_GET_FILENAME:
1472                 minor = klip->li_minor;
1473                 if (minor == 0) {
1474                         free_lofi_ioctl(klip);
1475                         return (EINVAL);
1476                 }
1477 
1478                 mutex_enter(&lofi_lock);
1479                 lsp = ddi_get_soft_state(lofi_statep, minor);
1480                 if (lsp == NULL) {
1481                         mutex_exit(&lofi_lock);
1482                         free_lofi_ioctl(klip);
1483                         return (ENXIO);
1484                 }
1485                 (void) strcpy(klip->li_filename, lsp->ls_filename);
1486                 if (lsp->ls_comp_algorithm_len == 0)
1487                         klip->li_algorithm[0] = '\0';
1488                 else
1489                         (void) strlcpy(klip->li_algorithm, lofi_compress_table[
1490                             lsp->ls_comp_algorithm_index].l_name,
1491                             lsp->ls_comp_algorithm_len + 1);
1492                 mutex_exit(&lofi_lock);
1493                 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1494                 free_lofi_ioctl(klip);
1495                 return (error);
1496         case LOFI_GET_MINOR:
1497                 mutex_enter(&lofi_lock);
1498                 klip->li_minor = file_to_minor(klip->li_filename);
1499                 mutex_exit(&lofi_lock);
1500                 if (klip->li_minor == 0) {
1501                         free_lofi_ioctl(klip);
1502                         return (ENOENT);
1503                 }
1504                 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1505                 free_lofi_ioctl(klip);
1506                 return (error);
1507         case LOFI_CHECK_COMPRESSED:
1508                 mutex_enter(&lofi_lock);
1509                 klip->li_minor = file_to_minor(klip->li_filename);
1510                 mutex_exit(&lofi_lock);
1511                 if (klip->li_minor == 0) {
1512                         free_lofi_ioctl(klip);
1513                         return (ENOENT);
1514                 }
1515                 mutex_enter(&lofi_lock);
1516                 lsp = ddi_get_soft_state(lofi_statep, klip->li_minor);
1517                 if (lsp == NULL) {
1518                         mutex_exit(&lofi_lock);
1519                         free_lofi_ioctl(klip);
1520                         return (ENXIO);
1521                 }
1522                 ASSERT(strcmp(klip->li_filename, lsp->ls_filename) == 0);
1523 
1524                 if (lsp->ls_comp_algorithm_len == 0)
1525                         klip->li_algorithm[0] = '\0';
1526                 else
1527                         (void) strlcpy(klip->li_algorithm, lofi_compress_table[
1528                             lsp->ls_comp_algorithm_index].l_name,
1529                             lsp->ls_comp_algorithm_len + 1);
1530 
1531                 mutex_exit(&lofi_lock);
1532                 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1533                 free_lofi_ioctl(klip);
1534                 return (error);
1535         default:
1536                 free_lofi_ioctl(klip);
1537                 return (EINVAL);
1538         }
1539 
1540 }
1541 
1542 static int
1543 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
1544     int *rvalp)
1545 {
1546         int     error;
1547         enum dkio_state dkstate;
1548         struct lofi_state *lsp;
1549         minor_t minor;
1550 
1551 #ifdef lint
1552         credp = credp;
1553 #endif
1554 
1555         minor = getminor(dev);
1556         /* lofi ioctls only apply to the master device */
1557         if (minor == 0) {
1558                 struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;
1559 
1560                 /*
1561                  * the query command only need read-access - i.e., normal
1562                  * users are allowed to do those on the ctl device as
1563                  * long as they can open it read-only.
1564                  */
1565                 switch (cmd) {
1566                 case LOFI_MAP_FILE:
1567                         if ((flag & FWRITE) == 0)
1568                                 return (EPERM);
1569                         return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
1570                 case LOFI_MAP_FILE_MINOR:
1571                         if ((flag & FWRITE) == 0)
1572                                 return (EPERM);
1573                         return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
1574                 case LOFI_UNMAP_FILE:
1575                         if ((flag & FWRITE) == 0)
1576                                 return (EPERM);
1577                         return (lofi_unmap_file(dev, lip, 1, credp, flag));
1578                 case LOFI_UNMAP_FILE_MINOR:
1579                         if ((flag & FWRITE) == 0)
1580                                 return (EPERM);
1581                         return (lofi_unmap_file(dev, lip, 0, credp, flag));
1582                 case LOFI_GET_FILENAME:
1583                         return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
1584                             credp, flag));
1585                 case LOFI_GET_MINOR:
1586                         return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
1587                             credp, flag));
1588                 case LOFI_GET_MAXMINOR:
1589                         error = ddi_copyout(&lofi_max_files, &lip->li_minor,
1590                             sizeof (lofi_max_files), flag);
1591                         if (error)
1592                                 return (EFAULT);
1593                         return (0);
1594                 case LOFI_CHECK_COMPRESSED:
1595                         return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
1596                             credp, flag));
1597                 default:
1598                         break;
1599                 }
1600         }
1601 
1602         lsp = ddi_get_soft_state(lofi_statep, minor);
1603         if (lsp == NULL)
1604                 return (ENXIO);
1605 
1606         /*
1607          * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
1608          * EIO as if the device was no longer present.
1609          */
1610         if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
1611                 return (EIO);
1612 
1613         /* these are for faking out utilities like newfs */
1614         switch (cmd) {
1615         case DKIOCGVTOC:
1616                 switch (ddi_model_convert_from(flag & FMODELS)) {
1617                 case DDI_MODEL_ILP32: {
1618                         struct vtoc32 vtoc32;
1619 
1620                         vtoctovtoc32(lsp->ls_vtoc, vtoc32);
1621                         if (ddi_copyout(&vtoc32, (void *)arg,
1622                             sizeof (struct vtoc32), flag))
1623                                 return (EFAULT);
1624                                 break;
1625                         }
1626 
1627                 case DDI_MODEL_NONE:
1628                         if (ddi_copyout(&lsp->ls_vtoc, (void *)arg,
1629                             sizeof (struct vtoc), flag))
1630                                 return (EFAULT);
1631                         break;
1632                 }
1633                 return (0);
1634         case DKIOCINFO:
1635                 error = ddi_copyout(&lsp->ls_ci, (void *)arg,
1636                     sizeof (struct dk_cinfo), flag);
1637                 if (error)
1638                         return (EFAULT);
1639                 return (0);
1640         case DKIOCG_VIRTGEOM:
1641         case DKIOCG_PHYGEOM:
1642         case DKIOCGGEOM:
1643                 error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
1644                     sizeof (struct dk_geom), flag);
1645                 if (error)
1646                         return (EFAULT);
1647                 return (0);
1648         case DKIOCSTATE:
1649                 /*
1650                  * Normally, lofi devices are always in the INSERTED state.  If
1651                  * a device is forcefully unmapped, then the device transitions
1652                  * to the DKIO_DEV_GONE state.
1653                  */
1654                 if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
1655                     flag) != 0)
1656                         return (EFAULT);
1657 
1658                 mutex_enter(&lsp->ls_vp_lock);
1659                 while ((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
1660                     (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) {
1661                         /*
1662                          * By virtue of having the device open, we know that
1663                          * 'lsp' will remain valid when we return.
1664                          */
1665                         if (!cv_wait_sig(&lsp->ls_vp_cv,
1666                             &lsp->ls_vp_lock)) {
1667                                 mutex_exit(&lsp->ls_vp_lock);
1668                                 return (EINTR);
1669                         }
1670                 }
1671 
1672                 dkstate = (lsp->ls_vp != NULL ? DKIO_INSERTED : DKIO_DEV_GONE);
1673                 mutex_exit(&lsp->ls_vp_lock);
1674 
1675                 if (ddi_copyout(&dkstate, (void *)arg,
1676                     sizeof (dkstate), flag) != 0)
1677                         return (EFAULT);
1678                 return (0);
1679         default:
1680                 return (ENOTTY);
1681         }
1682 }
1683 
1684 static struct cb_ops lofi_cb_ops = {
1685         lofi_open,              /* open */
1686         lofi_close,             /* close */
1687         lofi_strategy,          /* strategy */
1688         nodev,                  /* print */
1689         nodev,                  /* dump */
1690         lofi_read,              /* read */
1691         lofi_write,             /* write */
1692         lofi_ioctl,             /* ioctl */
1693         nodev,                  /* devmap */
1694         nodev,                  /* mmap */
1695         nodev,                  /* segmap */
1696         nochpoll,               /* poll */
1697         ddi_prop_op,            /* prop_op */
1698         0,                      /* streamtab  */
1699         D_64BIT | D_NEW | D_MP, /* Driver compatibility flag */
1700         CB_REV,
1701         lofi_aread,
1702         lofi_awrite
1703 };
1704 
1705 static struct dev_ops lofi_ops = {
1706         DEVO_REV,               /* devo_rev, */
1707         0,                      /* refcnt  */
1708         lofi_info,              /* info */
1709         nulldev,                /* identify */
1710         nulldev,                /* probe */
1711         lofi_attach,            /* attach */
1712         lofi_detach,            /* detach */
1713         nodev,                  /* reset */
1714         &lofi_cb_ops,               /* driver operations */
1715         NULL                    /* no bus operations */
1716 };
1717 
1718 static struct modldrv modldrv = {
1719         &mod_driverops,
1720         "loopback file driver (%I%)",
1721         &lofi_ops,
1722 };
1723 
1724 static struct modlinkage modlinkage = {
1725         MODREV_1,
1726         &modldrv,
1727         NULL
1728 };
1729 
1730 int
1731 _init(void)
1732 {
1733         int error;
1734 
1735         error = ddi_soft_state_init(&lofi_statep,
1736             sizeof (struct lofi_state), 0);
1737         if (error)
1738                 return (error);
1739 
1740         mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);
1741         error = mod_install(&modlinkage);
1742         if (error) {
1743                 mutex_destroy(&lofi_lock);
1744                 ddi_soft_state_fini(&lofi_statep);
1745         }
1746 
1747         return (error);
1748 }
1749 
1750 int
1751 _fini(void)
1752 {
1753         int     error;
1754 
1755         if (lofi_busy())
1756                 return (EBUSY);
1757 
1758         error = mod_remove(&modlinkage);
1759         if (error)
1760                 return (error);
1761 
1762         mutex_destroy(&lofi_lock);
1763         ddi_soft_state_fini(&lofi_statep);
1764 
1765         return (error);
1766 }
1767 
1768 int
1769 _info(struct modinfo *modinfop)
1770 {
1771         return (mod_info(&modlinkage, modinfop));
1772 }