1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "@(#)vplat.c    1.60    08/04/08 SMI"
  28 
  29 /*
  30  * This module contains functions used to bring up and tear down the
  31  * Virtual Platform: [un]mounting file-systems, [un]plumbing network
  32  * interfaces, [un]configuring devices, establishing resource controls,
  33  * and creating/destroying the zone in the kernel.  These actions, on
  34  * the way up, ready the zone; on the way down, they halt the zone.
  35  * See the much longer block comment at the beginning of zoneadmd.c
  36  * for a bigger picture of how the whole program functions.
  37  *
  38  * This module also has primary responsibility for the layout of "scratch
  39  * zones."  These are mounted, but inactive, zones that are used during
  40  * operating system upgrade and potentially other administrative action.  The
  41  * scratch zone environment is similar to the miniroot environment.  The zone's
  42  * actual root is mounted read-write on /a, and the standard paths (/usr,
  43  * /sbin, /lib) all lead to read-only copies of the running system's binaries.
  44  * This allows the administrative tools to manipulate the zone using "-R /a"
  45  * without relying on any binaries in the zone itself.
  46  *
  47  * If the scratch zone is on an alternate root (Live Upgrade [LU] boot
  48  * environment), then we must resolve the lofs mounts used there to uncover
  49  * writable (unshared) resources.  Shared resources, though, are always
  50  * read-only.  In addition, if the "same" zone with a different root path is
  51  * currently running, then "/b" inside the zone points to the running zone's
  52  * root.  This allows LU to synchronize configuration files during the upgrade
  53  * process.
  54  *
  55  * To construct this environment, this module creates a tmpfs mount on
  56  * $ZONEPATH/lu.  Inside this scratch area, the miniroot-like environment as
  57  * described above is constructed on the fly.  The zone is then created using
  58  * $ZONEPATH/lu as the root.
  59  *
  60  * Note that scratch zones are inactive.  The zone's bits are not running and
  61  * likely cannot be run correctly until upgrade is done.  Init is not running
  62  * there, nor is SMF.  Because of this, the "mounted" state of a scratch zone
  63  * is not a part of the usual halt/ready/boot state machine.
  64  */
  65 
  66 #include <sys/param.h>
  67 #include <sys/mount.h>
  68 #include <sys/mntent.h>
  69 #include <sys/socket.h>
  70 #include <sys/utsname.h>
  71 #include <sys/types.h>
  72 #include <sys/stat.h>
  73 #include <sys/sockio.h>
  74 #include <sys/stropts.h>
  75 #include <sys/conf.h>
  76 
  77 #include <sys/dlpi.h>
  78 #include <libdlpi.h>
  79 #include <libdllink.h>
  80 #include <libdlvlan.h>
  81 
  82 #include <inet/tcp.h>
  83 #include <arpa/inet.h>
  84 #include <netinet/in.h>
  85 #include <net/route.h>
  86 
  87 #include <stdio.h>
  88 #include <errno.h>
  89 #include <fcntl.h>
  90 #include <unistd.h>
  91 #include <rctl.h>
  92 #include <stdlib.h>
  93 #include <string.h>
  94 #include <strings.h>
  95 #include <wait.h>
  96 #include <limits.h>
  97 #include <libgen.h>
  98 #include <libzfs.h>
  99 #include <libdevinfo.h>
 100 #include <zone.h>
 101 #include <assert.h>
 102 #include <libcontract.h>
 103 #include <libcontract_priv.h>
 104 #include <uuid/uuid.h>
 105 
 106 #include <sys/mntio.h>
 107 #include <sys/mnttab.h>
 108 #include <sys/fs/autofs.h>        /* for _autofssys() */
 109 #include <sys/fs/lofs_info.h>
 110 #include <sys/fs/zfs.h>
 111 
 112 #include <pool.h>
 113 #include <sys/pool.h>
 114 #include <sys/priocntl.h>
 115 
 116 #include <libbrand.h>
 117 #include <sys/brand.h>
 118 #include <libzonecfg.h>
 119 #include <synch.h>
 120 
 121 #include "zoneadmd.h"
 122 #include <tsol/label.h>
 123 #include <libtsnet.h>
 124 #include <sys/priv.h>
 125 
 126 #define V4_ADDR_LEN     32
 127 #define V6_ADDR_LEN     128
 128 
 129 #define IPD_DEFAULT_OPTS \
 130         MNTOPT_RO "," MNTOPT_LOFS_NOSUB "," MNTOPT_NODEVICES
 131 
 132 #define DFSTYPES        "/etc/dfs/fstypes"
 133 #define MAXTNZLEN       2048
 134 
 135 #define ALT_MOUNT(mount_cmd)    ((mount_cmd) != Z_MNT_BOOT)
 136 
 137 /* for routing socket */
 138 static int rts_seqno = 0;
 139 
 140 /* mangled zone name when mounting in an alternate root environment */
 141 static char kernzone[ZONENAME_MAX];
 142 
 143 /* array of cached mount entries for resolve_lofs */
 144 static struct mnttab *resolve_lofs_mnts, *resolve_lofs_mnt_max;
 145 
 146 /* for Trusted Extensions */
 147 static tsol_zcent_t *get_zone_label(zlog_t *, priv_set_t *);
 148 static int tsol_mounts(zlog_t *, char *, char *);
 149 static void tsol_unmounts(zlog_t *, char *);
 150 
 151 static m_label_t *zlabel = NULL;
 152 static m_label_t *zid_label = NULL;
 153 static priv_set_t *zprivs = NULL;
 154 
 155 /* from libsocket, not in any header file */
 156 extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);
 157 
 158 /*
 159  * An optimization for build_mnttable: reallocate (and potentially copy the
 160  * data) only once every N times through the loop.
 161  */
 162 #define MNTTAB_HUNK     32
 163 
 164 /*
 165  * Private autofs system call
 166  */
 167 extern int _autofssys(int, void *);
 168 
 169 static int
 170 autofs_cleanup(zoneid_t zoneid)
 171 {
 172         /*
 173          * Ask autofs to unmount all trigger nodes in the given zone.
 174          */
 175         return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
 176 }
 177 
 178 static void
 179 free_mnttable(struct mnttab *mnt_array, uint_t nelem)
 180 {
 181         uint_t i;
 182 
 183         if (mnt_array == NULL)
 184                 return;
 185         for (i = 0; i < nelem; i++) {
 186                 free(mnt_array[i].mnt_mountp);
 187                 free(mnt_array[i].mnt_fstype);
 188                 free(mnt_array[i].mnt_special);
 189                 free(mnt_array[i].mnt_mntopts);
 190                 assert(mnt_array[i].mnt_time == NULL);
 191         }
 192         free(mnt_array);
 193 }
 194 
 195 /*
 196  * Build the mount table for the zone rooted at "zroot", storing the resulting
 197  * array of struct mnttabs in "mnt_arrayp" and the number of elements in the
 198  * array in "nelemp".
 199  */
 200 static int
 201 build_mnttable(zlog_t *zlogp, const char *zroot, size_t zrootlen, FILE *mnttab,
 202     struct mnttab **mnt_arrayp, uint_t *nelemp)
 203 {
 204         struct mnttab mnt;
 205         struct mnttab *mnts;
 206         struct mnttab *mnp;
 207         uint_t nmnt;
 208 
 209         rewind(mnttab);
 210         resetmnttab(mnttab);
 211         nmnt = 0;
 212         mnts = NULL;
 213         while (getmntent(mnttab, &mnt) == 0) {
 214                 struct mnttab *tmp_array;
 215 
 216                 if (strncmp(mnt.mnt_mountp, zroot, zrootlen) != 0)
 217                         continue;
 218                 if (nmnt % MNTTAB_HUNK == 0) {
 219                         tmp_array = realloc(mnts,
 220                             (nmnt + MNTTAB_HUNK) * sizeof (*mnts));
 221                         if (tmp_array == NULL) {
 222                                 free_mnttable(mnts, nmnt);
 223                                 return (-1);
 224                         }
 225                         mnts = tmp_array;
 226                 }
 227                 mnp = &mnts[nmnt++];
 228 
 229                 /*
 230                  * Zero out any fields we're not using.
 231                  */
 232                 (void) memset(mnp, 0, sizeof (*mnp));
 233 
 234                 if (mnt.mnt_special != NULL)
 235                         mnp->mnt_special = strdup(mnt.mnt_special);
 236                 if (mnt.mnt_mntopts != NULL)
 237                         mnp->mnt_mntopts = strdup(mnt.mnt_mntopts);
 238                 mnp->mnt_mountp = strdup(mnt.mnt_mountp);
 239                 mnp->mnt_fstype = strdup(mnt.mnt_fstype);
 240                 if ((mnt.mnt_special != NULL && mnp->mnt_special == NULL) ||
 241                     (mnt.mnt_mntopts != NULL && mnp->mnt_mntopts == NULL) ||
 242                     mnp->mnt_mountp == NULL || mnp->mnt_fstype == NULL) {
 243                         zerror(zlogp, B_TRUE, "memory allocation failed");
 244                         free_mnttable(mnts, nmnt);
 245                         return (-1);
 246                 }
 247         }
 248         *mnt_arrayp = mnts;
 249         *nelemp = nmnt;
 250         return (0);
 251 }
 252 
 253 /*
 254  * This is an optimization.  The resolve_lofs function is used quite frequently
 255  * to manipulate file paths, and on a machine with a large number of zones,
 256  * there will be a huge number of mounted file systems.  Thus, we trigger a
 257  * reread of the list of mount points
 258  */
 259 static void
 260 lofs_discard_mnttab(void)
 261 {
 262         free_mnttable(resolve_lofs_mnts,
 263             resolve_lofs_mnt_max - resolve_lofs_mnts);
 264         resolve_lofs_mnts = resolve_lofs_mnt_max = NULL;
 265 }
 266 
 267 static int
 268 lofs_read_mnttab(zlog_t *zlogp)
 269 {
 270         FILE *mnttab;
 271         uint_t nmnts;
 272 
 273         if ((mnttab = fopen(MNTTAB, "r")) == NULL)
 274                 return (-1);
 275         if (build_mnttable(zlogp, "", 0, mnttab, &resolve_lofs_mnts,
 276             &nmnts) == -1) {
 277                 (void) fclose(mnttab);
 278                 return (-1);
 279         }
 280         (void) fclose(mnttab);
 281         resolve_lofs_mnt_max = resolve_lofs_mnts + nmnts;
 282         return (0);
 283 }
 284 
 285 /*
 286  * This function loops over potential loopback mounts and symlinks in a given
 287  * path and resolves them all down to an absolute path.
 288  */
 289 void
 290 resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen)
 291 {
 292         int len, arlen;
 293         const char *altroot;
 294         char tmppath[MAXPATHLEN];
 295         boolean_t outside_altroot;
 296 
 297         if ((len = resolvepath(path, tmppath, sizeof (tmppath))) == -1)
 298                 return;
 299         tmppath[len] = '\0';
 300         (void) strlcpy(path, tmppath, sizeof (tmppath));
 301 
 302         /* This happens once per zoneadmd operation. */
 303         if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
 304                 return;
 305 
 306         altroot = zonecfg_get_root();
 307         arlen = strlen(altroot);
 308         outside_altroot = B_FALSE;
 309         for (;;) {
 310                 struct mnttab *mnp;
 311 
 312                 /* Search in reverse order to find longest match */
 313                 for (mnp = resolve_lofs_mnt_max - 1; mnp >= resolve_lofs_mnts;
 314                     mnp--) {
 315                         if (mnp->mnt_fstype == NULL ||
 316                             mnp->mnt_mountp == NULL ||
 317                             mnp->mnt_special == NULL)
 318                                 continue;
 319                         len = strlen(mnp->mnt_mountp);
 320                         if (strncmp(mnp->mnt_mountp, path, len) == 0 &&
 321                             (path[len] == '/' || path[len] == '\0'))
 322                                 break;
 323                 }
 324                 if (mnp < resolve_lofs_mnts)
 325                         break;
 326                 /* If it's not a lofs then we're done */
 327                 if (strcmp(mnp->mnt_fstype, MNTTYPE_LOFS) != 0)
 328                         break;
 329                 if (outside_altroot) {
 330                         char *cp;
 331                         int olen = sizeof (MNTOPT_RO) - 1;
 332 
 333                         /*
 334                          * If we run into a read-only mount outside of the
 335                          * alternate root environment, then the user doesn't
 336                          * want this path to be made read-write.
 337                          */
 338                         if (mnp->mnt_mntopts != NULL &&
 339                             (cp = strstr(mnp->mnt_mntopts, MNTOPT_RO)) !=
 340                             NULL &&
 341                             (cp == mnp->mnt_mntopts || cp[-1] == ',') &&
 342                             (cp[olen] == '\0' || cp[olen] == ',')) {
 343                                 break;
 344                         }
 345                 } else if (arlen > 0 &&
 346                     (strncmp(mnp->mnt_special, altroot, arlen) != 0 ||
 347                     (mnp->mnt_special[arlen] != '\0' &&
 348                     mnp->mnt_special[arlen] != '/'))) {
 349                         outside_altroot = B_TRUE;
 350                 }
 351                 /* use temporary buffer because new path might be longer */
 352                 (void) snprintf(tmppath, sizeof (tmppath), "%s%s",
 353                     mnp->mnt_special, path + len);
 354                 if ((len = resolvepath(tmppath, path, pathlen)) == -1)
 355                         break;
 356                 path[len] = '\0';
 357         }
 358 }
 359 
 360 /*
 361  * For a regular mount, check if a replacement lofs mount is needed because the
 362  * referenced device is already mounted somewhere.
 363  */
 364 static int
 365 check_lofs_needed(zlog_t *zlogp, struct zone_fstab *fsptr)
 366 {
 367         struct mnttab *mnp;
 368         zone_fsopt_t *optptr, *onext;
 369 
 370         /* This happens once per zoneadmd operation. */
 371         if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
 372                 return (-1);
 373 
 374         /*
 375          * If this special node isn't already in use, then it's ours alone;
 376          * no need to worry about conflicting mounts.
 377          */
 378         for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
 379             mnp++) {
 380                 if (strcmp(mnp->mnt_special, fsptr->zone_fs_special) == 0)
 381                         break;
 382         }
 383         if (mnp >= resolve_lofs_mnt_max)
 384                 return (0);
 385 
 386         /*
 387          * Convert this duplicate mount into a lofs mount.
 388          */
 389         (void) strlcpy(fsptr->zone_fs_special, mnp->mnt_mountp,
 390             sizeof (fsptr->zone_fs_special));
 391         (void) strlcpy(fsptr->zone_fs_type, MNTTYPE_LOFS,
 392             sizeof (fsptr->zone_fs_type));
 393         fsptr->zone_fs_raw[0] = '\0';
 394 
 395         /*
 396          * Discard all but one of the original options and set that to be the
 397          * same set of options used for inherit package directory resources.
 398          */
 399         optptr = fsptr->zone_fs_options;
 400         if (optptr == NULL) {
 401                 optptr = malloc(sizeof (*optptr));
 402                 if (optptr == NULL) {
 403                         zerror(zlogp, B_TRUE, "cannot mount %s",
 404                             fsptr->zone_fs_dir);
 405                         return (-1);
 406                 }
 407         } else {
 408                 while ((onext = optptr->zone_fsopt_next) != NULL) {
 409                         optptr->zone_fsopt_next = onext->zone_fsopt_next;
 410                         free(onext);
 411                 }
 412         }
 413         (void) strcpy(optptr->zone_fsopt_opt, IPD_DEFAULT_OPTS);
 414         optptr->zone_fsopt_next = NULL;
 415         fsptr->zone_fs_options = optptr;
 416         return (0);
 417 }
 418 
 419 int
 420 make_one_dir(zlog_t *zlogp, const char *prefix, const char *subdir, mode_t mode,
 421     uid_t userid, gid_t groupid)
 422 {
 423         char path[MAXPATHLEN];
 424         struct stat st;
 425 
 426         if (snprintf(path, sizeof (path), "%s%s", prefix, subdir) >
 427             sizeof (path)) {
 428                 zerror(zlogp, B_FALSE, "pathname %s%s is too long", prefix,
 429                     subdir);
 430                 return (-1);
 431         }
 432 
 433         if (lstat(path, &st) == 0) {
 434                 /*
 435                  * We don't check the file mode since presumably the zone
 436                  * administrator may have had good reason to change the mode,
 437                  * and we don't need to second guess him.
 438                  */
 439                 if (!S_ISDIR(st.st_mode)) {
 440                         if (is_system_labeled() &&
 441                             S_ISREG(st.st_mode)) {
 442                                 /*
 443                                  * The need to mount readonly copies of
 444                                  * global zone /etc/ files is unique to
 445                                  * Trusted Extensions.
 446                                  */
 447                                 if (strncmp(subdir, "/etc/",
 448                                     strlen("/etc/")) != 0) {
 449                                         zerror(zlogp, B_FALSE,
 450                                             "%s is not in /etc", path);
 451                                         return (-1);
 452                                 }
 453                         } else {
 454                                 zerror(zlogp, B_FALSE,
 455                                     "%s is not a directory", path);
 456                                 return (-1);
 457                         }
 458                 }
 459                 return (0);
 460         }
 461 
 462         if (mkdirp(path, mode) != 0) {
 463                 if (errno == EROFS)
 464                         zerror(zlogp, B_FALSE, "Could not mkdir %s.\nIt is on "
 465                             "a read-only file system in this local zone.\nMake "
 466                             "sure %s exists in the global zone.", path, subdir);
 467                 else
 468                         zerror(zlogp, B_TRUE, "mkdirp of %s failed", path);
 469                 return (-1);
 470         }
 471 
 472         (void) chown(path, userid, groupid);
 473         return (0);
 474 }
 475 
 476 static void
 477 free_remote_fstypes(char **types)
 478 {
 479         uint_t i;
 480 
 481         if (types == NULL)
 482                 return;
 483         for (i = 0; types[i] != NULL; i++)
 484                 free(types[i]);
 485         free(types);
 486 }
 487 
 488 static char **
 489 get_remote_fstypes(zlog_t *zlogp)
 490 {
 491         char **types = NULL;
 492         FILE *fp;
 493         char buf[MAXPATHLEN];
 494         char fstype[MAXPATHLEN];
 495         uint_t lines = 0;
 496         uint_t i;
 497 
 498         if ((fp = fopen(DFSTYPES, "r")) == NULL) {
 499                 zerror(zlogp, B_TRUE, "failed to open %s", DFSTYPES);
 500                 return (NULL);
 501         }
 502         /*
 503          * Count the number of lines
 504          */
 505         while (fgets(buf, sizeof (buf), fp) != NULL)
 506                 lines++;
 507         if (lines == 0) /* didn't read anything; empty file */
 508                 goto out;
 509         rewind(fp);
 510         /*
 511          * Allocate enough space for a NULL-terminated array.
 512          */
 513         types = calloc(lines + 1, sizeof (char *));
 514         if (types == NULL) {
 515                 zerror(zlogp, B_TRUE, "memory allocation failed");
 516                 goto out;
 517         }
 518         i = 0;
 519         while (fgets(buf, sizeof (buf), fp) != NULL) {
 520                 /* LINTED - fstype is big enough to hold buf */
 521                 if (sscanf(buf, "%s", fstype) == 0) {
 522                         zerror(zlogp, B_FALSE, "unable to parse %s", DFSTYPES);
 523                         free_remote_fstypes(types);
 524                         types = NULL;
 525                         goto out;
 526                 }
 527                 types[i] = strdup(fstype);
 528                 if (types[i] == NULL) {
 529                         zerror(zlogp, B_TRUE, "memory allocation failed");
 530                         free_remote_fstypes(types);
 531                         types = NULL;
 532                         goto out;
 533                 }
 534                 i++;
 535         }
 536 out:
 537         (void) fclose(fp);
 538         return (types);
 539 }
 540 
 541 static boolean_t
 542 is_remote_fstype(const char *fstype, char *const *remote_fstypes)
 543 {
 544         uint_t i;
 545 
 546         if (remote_fstypes == NULL)
 547                 return (B_FALSE);
 548         for (i = 0; remote_fstypes[i] != NULL; i++) {
 549                 if (strcmp(remote_fstypes[i], fstype) == 0)
 550                         return (B_TRUE);
 551         }
 552         return (B_FALSE);
 553 }
 554 
 555 /*
 556  * This converts a zone root path (normally of the form .../root) to a Live
 557  * Upgrade scratch zone root (of the form .../lu).
 558  */
 559 static void
 560 root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
 561 {
 562         assert(zone_isnative || zone_iscluster);
 563 
 564         if (!isresolved && zonecfg_in_alt_root())
 565                 resolve_lofs(zlogp, zroot, zrootlen);
 566         (void) strcpy(strrchr(zroot, '/') + 1, "lu");
 567 }
 568 
 569 /*
 570  * The general strategy for unmounting filesystems is as follows:
 571  *
 572  * - Remote filesystems may be dead, and attempting to contact them as
 573  * part of a regular unmount may hang forever; we want to always try to
 574  * forcibly unmount such filesystems and only fall back to regular
 575  * unmounts if the filesystem doesn't support forced unmounts.
 576  *
 577  * - We don't want to unnecessarily corrupt metadata on local
 578  * filesystems (ie UFS), so we want to start off with graceful unmounts,
 579  * and only escalate to doing forced unmounts if we get stuck.
 580  *
 581  * We start off walking backwards through the mount table.  This doesn't
 582  * give us strict ordering but ensures that we try to unmount submounts
 583  * first.  We thus limit the number of failed umount2(2) calls.
 584  *
 585  * The mechanism for determining if we're stuck is to count the number
 586  * of failed unmounts each iteration through the mount table.  This
 587  * gives us an upper bound on the number of filesystems which remain
 588  * mounted (autofs trigger nodes are dealt with separately).  If at the
 589  * end of one unmount+autofs_cleanup cycle we still have the same number
 590  * of mounts that we started out with, we're stuck and try a forced
 591  * unmount.  If that fails (filesystem doesn't support forced unmounts)
 592  * then we bail and are unable to teardown the zone.  If it succeeds,
 593  * we're no longer stuck so we continue with our policy of trying
 594  * graceful mounts first.
 595  *
 596  * Zone must be down (ie, no processes or threads active).
 597  */
 598 static int
 599 unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
 600 {
 601         int error = 0;
 602         FILE *mnttab;
 603         struct mnttab *mnts;
 604         uint_t nmnt;
 605         char zroot[MAXPATHLEN + 1];
 606         size_t zrootlen;
 607         uint_t oldcount = UINT_MAX;
 608         boolean_t stuck = B_FALSE;
 609         char **remote_fstypes = NULL;
 610 
 611         if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
 612                 zerror(zlogp, B_FALSE, "unable to determine zone root");
 613                 return (-1);
 614         }
 615         if (unmount_cmd)
 616                 root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
 617 
 618         (void) strcat(zroot, "/");
 619         zrootlen = strlen(zroot);
 620 
 621         /*
 622          * For Trusted Extensions unmount each higher level zone's mount
 623          * of our zone's /export/home
 624          */
 625         if (!unmount_cmd)
 626                 tsol_unmounts(zlogp, zone_name);
 627 
 628         if ((mnttab = fopen(MNTTAB, "r")) == NULL) {
 629                 zerror(zlogp, B_TRUE, "failed to open %s", MNTTAB);
 630                 return (-1);
 631         }
 632         /*
 633          * Use our hacky mntfs ioctl so we see everything, even mounts with
 634          * MS_NOMNTTAB.
 635          */
 636         if (ioctl(fileno(mnttab), MNTIOC_SHOWHIDDEN, NULL) < 0) {
 637                 zerror(zlogp, B_TRUE, "unable to configure %s", MNTTAB);
 638                 error++;
 639                 goto out;
 640         }
 641 
 642         /*
 643          * Build the list of remote fstypes so we know which ones we
 644          * should forcibly unmount.
 645          */
 646         remote_fstypes = get_remote_fstypes(zlogp);
 647         for (; /* ever */; ) {
 648                 uint_t newcount = 0;
 649                 boolean_t unmounted;
 650                 struct mnttab *mnp;
 651                 char *path;
 652                 uint_t i;
 653 
 654                 mnts = NULL;
 655                 nmnt = 0;
 656                 /*
 657                  * MNTTAB gives us a way to walk through mounted
 658                  * filesystems; we need to be able to walk them in
 659                  * reverse order, so we build a list of all mounted
 660                  * filesystems.
 661                  */
 662                 if (build_mnttable(zlogp, zroot, zrootlen, mnttab, &mnts,
 663                     &nmnt) != 0) {
 664                         error++;
 665                         goto out;
 666                 }
 667                 for (i = 0; i < nmnt; i++) {
 668                         mnp = &mnts[nmnt - i - 1]; /* access in reverse order */
 669                         path = mnp->mnt_mountp;
 670                         unmounted = B_FALSE;
 671                         /*
 672                          * Try forced unmount first for remote filesystems.
 673                          *
 674                          * Not all remote filesystems support forced unmounts,
 675                          * so if this fails (ENOTSUP) we'll continue on
 676                          * and try a regular unmount.
 677                          */
 678                         if (is_remote_fstype(mnp->mnt_fstype, remote_fstypes)) {
 679                                 if (umount2(path, MS_FORCE) == 0)
 680                                         unmounted = B_TRUE;
 681                         }
 682                         /*
 683                          * Try forced unmount if we're stuck.
 684                          */
 685                         if (stuck) {
 686                                 if (umount2(path, MS_FORCE) == 0) {
 687                                         unmounted = B_TRUE;
 688                                         stuck = B_FALSE;
 689                                 } else {
 690                                         /*
 691                                          * The first failure indicates a
 692                                          * mount we won't be able to get
 693                                          * rid of automatically, so we
 694                                          * bail.
 695                                          */
 696                                         error++;
 697                                         zerror(zlogp, B_FALSE,
 698                                             "unable to unmount '%s'", path);
 699                                         free_mnttable(mnts, nmnt);
 700                                         goto out;
 701                                 }
 702                         }
 703                         /*
 704                          * Try regular unmounts for everything else.
 705                          */
 706                         if (!unmounted && umount2(path, 0) != 0)
 707                                 newcount++;
 708                 }
 709                 free_mnttable(mnts, nmnt);
 710 
 711                 if (newcount == 0)
 712                         break;
 713                 if (newcount >= oldcount) {
 714                         /*
 715                          * Last round didn't unmount anything; we're stuck and
 716                          * should start trying forced unmounts.
 717                          */
 718                         stuck = B_TRUE;
 719                 }
 720                 oldcount = newcount;
 721 
 722                 /*
 723                  * Autofs doesn't let you unmount its trigger nodes from
 724                  * userland so we have to tell the kernel to cleanup for us.
 725                  */
 726                 if (autofs_cleanup(zoneid) != 0) {
 727                         zerror(zlogp, B_TRUE, "unable to remove autofs nodes");
 728                         error++;
 729                         goto out;
 730                 }
 731         }
 732 
 733 out:
 734         free_remote_fstypes(remote_fstypes);
 735         (void) fclose(mnttab);
 736         return (error ? -1 : 0);
 737 }
 738 
 739 static int
 740 fs_compare(const void *m1, const void *m2)
 741 {
 742         struct zone_fstab *i = (struct zone_fstab *)m1;
 743         struct zone_fstab *j = (struct zone_fstab *)m2;
 744 
 745         return (strcmp(i->zone_fs_dir, j->zone_fs_dir));
 746 }
 747 
 748 /*
 749  * Fork and exec (and wait for) the mentioned binary with the provided
 750  * arguments.  Returns (-1) if something went wrong with fork(2) or exec(2),
 751  * returns the exit status otherwise.
 752  *
 753  * If we were unable to exec the provided pathname (for whatever
 754  * reason), we return the special token ZEXIT_EXEC.  The current value
 755  * of ZEXIT_EXEC doesn't conflict with legitimate exit codes of the
 756  * consumers of this function; any future consumers must make sure this
 757  * remains the case.
 758  */
 759 static int
 760 forkexec(zlog_t *zlogp, const char *path, char *const argv[])
 761 {
 762         pid_t child_pid;
 763         int child_status = 0;
 764 
 765         /*
 766          * Do not let another thread localize a message while we are forking.
 767          */
 768         (void) mutex_lock(&msglock);
 769         child_pid = fork();
 770         (void) mutex_unlock(&msglock);
 771         if (child_pid == -1) {
 772                 zerror(zlogp, B_TRUE, "could not fork for %s", argv[0]);
 773                 return (-1);
 774         } else if (child_pid == 0) {
 775                 closefrom(0);
 776                 /* redirect stdin, stdout & stderr to /dev/null */
 777                 (void) open("/dev/null", O_RDONLY);     /* stdin */
 778                 (void) open("/dev/null", O_WRONLY);     /* stdout */
 779                 (void) open("/dev/null", O_WRONLY);     /* stderr */
 780                 (void) execv(path, argv);
 781                 /*
 782                  * Since we are in the child, there is no point calling zerror()
 783                  * since there is nobody waiting to consume it.  So exit with a
 784                  * special code that the parent will recognize and call zerror()
 785                  * accordingly.
 786                  */
 787 
 788                 _exit(ZEXIT_EXEC);
 789         } else {
 790                 (void) waitpid(child_pid, &child_status, 0);
 791         }
 792 
 793         if (WIFSIGNALED(child_status)) {
 794                 zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
 795                     "signal %d", path, WTERMSIG(child_status));
 796                 return (-1);
 797         }
 798         assert(WIFEXITED(child_status));
 799         if (WEXITSTATUS(child_status) == ZEXIT_EXEC) {
 800                 zerror(zlogp, B_FALSE, "failed to exec %s", path);
 801                 return (-1);
 802         }
 803         return (WEXITSTATUS(child_status));
 804 }
 805 
 806 static int
 807 dofsck(zlog_t *zlogp, const char *fstype, const char *rawdev)
 808 {
 809         char cmdbuf[MAXPATHLEN];
 810         char *argv[4];
 811         int status;
 812 
 813         /*
 814          * We could alternatively have called /usr/sbin/fsck -F <fstype>, but
 815          * that would cost us an extra fork/exec without buying us anything.
 816          */
 817         if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/fsck", fstype)
 818             >= sizeof (cmdbuf)) {
 819                 zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
 820                 return (-1);
 821         }
 822 
 823         argv[0] = "fsck";
 824         argv[1] = "-m";
 825         argv[2] = (char *)rawdev;
 826         argv[3] = NULL;
 827 
 828         status = forkexec(zlogp, cmdbuf, argv);
 829         if (status == 0 || status == -1)
 830                 return (status);
 831         zerror(zlogp, B_FALSE, "fsck of '%s' failed with exit status %d; "
 832             "run fsck manually", rawdev, status);
 833         return (-1);
 834 }
 835 
 836 static int
 837 domount(zlog_t *zlogp, const char *fstype, const char *opts,
 838     const char *special, const char *directory)
 839 {
 840         char cmdbuf[MAXPATHLEN];
 841         char *argv[6];
 842         int status;
 843 
 844         /*
 845          * We could alternatively have called /usr/sbin/mount -F <fstype>, but
 846          * that would cost us an extra fork/exec without buying us anything.
 847          */
 848         if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/mount", fstype)
 849             >= sizeof (cmdbuf)) {
 850                 zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
 851                 return (-1);
 852         }
 853         argv[0] = "mount";
 854         if (opts[0] == '\0') {
 855                 argv[1] = (char *)special;
 856                 argv[2] = (char *)directory;
 857                 argv[3] = NULL;
 858         } else {
 859                 argv[1] = "-o";
 860                 argv[2] = (char *)opts;
 861                 argv[3] = (char *)special;
 862                 argv[4] = (char *)directory;
 863                 argv[5] = NULL;
 864         }
 865 
 866         status = forkexec(zlogp, cmdbuf, argv);
 867         if (status == 0 || status == -1)
 868                 return (status);
 869         if (opts[0] == '\0')
 870                 zerror(zlogp, B_FALSE, "\"%s %s %s\" "
 871                     "failed with exit code %d",
 872                     cmdbuf, special, directory, status);
 873         else
 874                 zerror(zlogp, B_FALSE, "\"%s -o %s %s %s\" "
 875                     "failed with exit code %d",
 876                     cmdbuf, opts, special, directory, status);
 877         return (-1);
 878 }
 879 
 880 /*
 881  * Check if a given mount point path exists.
 882  * If it does, make sure it doesn't contain any symlinks.
 883  * Note that if "leaf" is false we're checking an intermediate
 884  * component of the mount point path, so it must be a directory.
 885  * If "leaf" is true, then we're checking the entire mount point
 886  * path, so the mount point itself can be anything aside from a
 887  * symbolic link.
 888  *
 889  * If the path is invalid then a negative value is returned.  If the
 890  * path exists and is a valid mount point path then 0 is returned.
 891  * If the path doesn't exist return a positive value.
 892  */
 893 static int
 894 valid_mount_point(zlog_t *zlogp, const char *path, const boolean_t leaf)
 895 {
 896         struct stat statbuf;
 897         char respath[MAXPATHLEN];
 898         int res;
 899 
 900         if (lstat(path, &statbuf) != 0) {
 901                 if (errno == ENOENT)
 902                         return (1);
 903                 zerror(zlogp, B_TRUE, "can't stat %s", path);
 904                 return (-1);
 905         }
 906         if (S_ISLNK(statbuf.st_mode)) {
 907                 zerror(zlogp, B_FALSE, "%s is a symlink", path);
 908                 return (-1);
 909         }
 910         if (!leaf && !S_ISDIR(statbuf.st_mode)) {
 911                 zerror(zlogp, B_FALSE, "%s is not a directory", path);
 912                 return (-1);
 913         }
 914         if ((res = resolvepath(path, respath, sizeof (respath))) == -1) {
 915                 zerror(zlogp, B_TRUE, "unable to resolve path %s", path);
 916                 return (-1);
 917         }
 918         respath[res] = '\0';
 919         if (strcmp(path, respath) != 0) {
 920                 /*
 921                  * We don't like ".."s, "."s, or "//"s throwing us off
 922                  */
 923                 zerror(zlogp, B_FALSE, "%s is not a canonical path", path);
 924                 return (-1);
 925         }
 926         return (0);
 927 }
 928 
 929 /*
 930  * Validate a mount point path.  A valid mount point path is an
 931  * absolute path that either doesn't exist, or, if it does exists it
 932  * must be an absolute canonical path that doesn't have any symbolic
 933  * links in it.  The target of a mount point path can be any filesystem
 934  * object.  (Different filesystems can support different mount points,
 935  * for example "lofs" and "mntfs" both support files and directories
 936  * while "ufs" just supports directories.)
 937  *
 938  * If the path is invalid then a negative value is returned.  If the
 939  * path exists and is a valid mount point path then 0 is returned.
 940  * If the path doesn't exist return a positive value.
 941  */
 942 int
 943 valid_mount_path(zlog_t *zlogp, const char *rootpath, const char *spec,
 944     const char *dir, const char *fstype)
 945 {
 946         char abspath[MAXPATHLEN], *slashp, *slashp_next;
 947         int rv;
 948 
 949         /*
 950          * Sanity check the target mount point path.
 951          * It must be a non-null string that starts with a '/'.
 952          */
 953         if (dir[0] != '/') {
 954                 if (spec[0] == '\0') {
 955                         /*
 956                          * This must be an invalid ipd entry (see comments
 957                          * in mount_filesystems_ipdent()).
 958                          */
 959                         zerror(zlogp, B_FALSE,
 960                             "invalid inherit-pkg-dir entry: \"%s\"", dir);
 961                 } else {
 962                         /* Something went wrong. */
 963                         zerror(zlogp, B_FALSE, "invalid mount directory, "
 964                             "type: \"%s\", special: \"%s\", dir: \"%s\"",
 965                             fstype, spec, dir);
 966                 }
 967                 return (-1);
 968         }
 969 
 970         /*
 971          * Join rootpath and dir.  Make sure abspath ends with '/', this
 972          * is added to all paths (even non-directory paths) to allow us
 973          * to detect the end of paths below.  If the path already ends
 974          * in a '/', then that's ok too (although we'll fail the
 975          * cannonical path check in valid_mount_point()).
 976          */
 977         if (snprintf(abspath, sizeof (abspath),
 978             "%s%s/", rootpath, dir) >= sizeof (abspath)) {
 979                 zerror(zlogp, B_FALSE, "pathname %s%s is too long",
 980                     rootpath, dir);
 981                 return (-1);
 982         }
 983 
 984         /*
 985          * Starting with rootpath, verify the mount path one component
 986          * at a time.  Continue until we've evaluated all of abspath.
 987          */
 988         slashp = &abspath[strlen(rootpath)];
 989         assert(*slashp == '/');
 990         do {
 991                 slashp_next = strchr(slashp + 1, '/');
 992                 *slashp = '\0';
 993                 if (slashp_next != NULL) {
 994                         /* This is an intermediary mount path component. */
 995                         rv = valid_mount_point(zlogp, abspath, B_FALSE);
 996                 } else {
 997                         /* This is the last component of the mount path. */
 998                         rv = valid_mount_point(zlogp, abspath, B_TRUE);
 999                 }
1000                 if (rv < 0)
1001                         return (rv);
1002                 *slashp = '/';
1003         } while ((slashp = slashp_next) != NULL);
1004         return (rv);
1005 }
1006 
1007 static int
1008 mount_one_dev_device_cb(void *arg, const char *match, const char *name)
1009 {
1010         di_prof_t prof = arg;
1011 
1012         if (name == NULL)
1013                 return (di_prof_add_dev(prof, match));
1014         return (di_prof_add_map(prof, match, name));
1015 }
1016 
1017 static int
1018 mount_one_dev_symlink_cb(void *arg, const char *source, const char *target)
1019 {
1020         di_prof_t prof = arg;
1021 
1022         return (di_prof_add_symlink(prof, source, target));
1023 }
1024 
1025 static int
1026 get_iptype(zlog_t *zlogp, zone_iptype_t *iptypep)
1027 {
1028         zone_dochandle_t handle;
1029 
1030         if ((handle = zonecfg_init_handle()) == NULL) {
1031                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
1032                 return (-1);
1033         }
1034         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
1035                 zerror(zlogp, B_FALSE, "invalid configuration");
1036                 zonecfg_fini_handle(handle);
1037                 return (-1);
1038         }
1039         if (zonecfg_get_iptype(handle, iptypep) != Z_OK) {
1040                 zerror(zlogp, B_FALSE, "invalid ip-type configuration");
1041                 zonecfg_fini_handle(handle);
1042                 return (-1);
1043         }
1044         zonecfg_fini_handle(handle);
1045         return (0);
1046 }
1047 
1048 /*
1049  * Apply the standard lists of devices/symlinks/mappings and the user-specified
1050  * list of devices (via zonecfg) to the /dev filesystem.  The filesystem will
1051  * use these as a profile/filter to determine what exists in /dev.
1052  */
1053 static int
1054 mount_one_dev(zlog_t *zlogp, char *devpath)
1055 {
1056         char                    brand[MAXNAMELEN];
1057         zone_dochandle_t        handle = NULL;
1058         brand_handle_t          bh = NULL;
1059         struct zone_devtab      ztab;
1060         di_prof_t               prof = NULL;
1061         int                     err;
1062         int                     retval = -1;
1063         zone_iptype_t           iptype;
1064         const char              *curr_iptype;
1065 
1066         if (di_prof_init(devpath, &prof)) {
1067                 zerror(zlogp, B_TRUE, "failed to initialize profile");
1068                 goto cleanup;
1069         }
1070 
1071         /* Get a handle to the brand info for this zone */
1072         if ((zone_get_brand(zone_name, brand, sizeof (brand)) != Z_OK) ||
1073             (bh = brand_open(brand)) == NULL) {
1074                 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1075                 goto cleanup;
1076         }
1077 
1078         if (get_iptype(zlogp, &iptype) < 0) {
1079                 zerror(zlogp, B_TRUE, "unable to determine ip-type");
1080                 goto cleanup;
1081         }
1082         switch (iptype) {
1083         case ZS_SHARED:
1084                 curr_iptype = "shared";
1085                 break;
1086         case ZS_EXCLUSIVE:
1087                 curr_iptype = "exclusive";
1088                 break;
1089         }
1090 
1091         if (brand_platform_iter_devices(bh, zone_name,
1092             mount_one_dev_device_cb, prof, curr_iptype) != 0) {
1093                 zerror(zlogp, B_TRUE, "failed to add standard device");
1094                 goto cleanup;
1095         }
1096 
1097         if (brand_platform_iter_link(bh,
1098             mount_one_dev_symlink_cb, prof) != 0) {
1099                 zerror(zlogp, B_TRUE, "failed to add standard symlink");
1100                 goto cleanup;
1101         }
1102 
1103         /* Add user-specified devices and directories */
1104         if ((handle = zonecfg_init_handle()) == NULL) {
1105                 zerror(zlogp, B_FALSE, "can't initialize zone handle");
1106                 goto cleanup;
1107         }
1108         if (err = zonecfg_get_handle(zone_name, handle)) {
1109                 zerror(zlogp, B_FALSE, "can't get handle for zone "
1110                     "%s: %s", zone_name, zonecfg_strerror(err));
1111                 goto cleanup;
1112         }
1113         if (err = zonecfg_setdevent(handle)) {
1114                 zerror(zlogp, B_FALSE, "%s: %s", zone_name,
1115                     zonecfg_strerror(err));
1116                 goto cleanup;
1117         }
1118         while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
1119                 if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
1120                         zerror(zlogp, B_TRUE, "failed to add "
1121                             "user-specified device");
1122                         goto cleanup;
1123                 }
1124         }
1125         (void) zonecfg_enddevent(handle);
1126 
1127         /* Send profile to kernel */
1128         if (di_prof_commit(prof)) {
1129                 zerror(zlogp, B_TRUE, "failed to commit profile");
1130                 goto cleanup;
1131         }
1132 
1133         retval = 0;
1134 
1135 cleanup:
1136         if (bh != NULL)
1137                 brand_close(bh);
1138         if (handle != NULL)
1139                 zonecfg_fini_handle(handle);
1140         if (prof)
1141                 di_prof_fini(prof);
1142         return (retval);
1143 }
1144 
1145 static int
1146 mount_one(zlog_t *zlogp, struct zone_fstab *fsptr, const char *rootpath)
1147 {
1148         char path[MAXPATHLEN];
1149         char specpath[MAXPATHLEN];
1150         char optstr[MAX_MNTOPT_STR];
1151         zone_fsopt_t *optptr;
1152         int rv;
1153 
1154         if ((rv = valid_mount_path(zlogp, rootpath, fsptr->zone_fs_special,
1155             fsptr->zone_fs_dir, fsptr->zone_fs_type)) < 0) {
1156                 zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
1157                     rootpath, fsptr->zone_fs_dir);
1158                 return (-1);
1159         } else if (rv > 0) {
1160                 /* The mount point path doesn't exist, create it now. */
1161                 if (make_one_dir(zlogp, rootpath, fsptr->zone_fs_dir,
1162                     DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
1163                     DEFAULT_DIR_GROUP) != 0) {
1164                         zerror(zlogp, B_FALSE, "failed to create mount point");
1165                         return (-1);
1166                 }
1167 
1168                 /*
1169                  * Now this might seem weird, but we need to invoke
1170                  * valid_mount_path() again.  Why?  Because it checks
1171                  * to make sure that the mount point path is canonical,
1172                  * which it can only do if the path exists, so now that
1173                  * we've created the path we have to verify it again.
1174                  */
1175                 if ((rv = valid_mount_path(zlogp, rootpath,
1176                     fsptr->zone_fs_special, fsptr->zone_fs_dir,
1177                     fsptr->zone_fs_type)) < 0) {
1178                         zerror(zlogp, B_FALSE,
1179                             "%s%s is not a valid mount point",
1180                             rootpath, fsptr->zone_fs_dir);
1181                         return (-1);
1182                 }
1183         }
1184 
1185         (void) snprintf(path, sizeof (path), "%s%s", rootpath,
1186             fsptr->zone_fs_dir);
1187 
1188         if (strlen(fsptr->zone_fs_special) == 0) {
1189                 /*
1190                  * A zero-length special is how we distinguish IPDs from
1191                  * general-purpose FSs.  Make sure it mounts from a place that
1192                  * can be seen via the alternate zone's root.
1193                  */
1194                 if (snprintf(specpath, sizeof (specpath), "%s%s",
1195                     zonecfg_get_root(), fsptr->zone_fs_dir) >=
1196                     sizeof (specpath)) {
1197                         zerror(zlogp, B_FALSE, "cannot mount %s: path too "
1198                             "long in alternate root", fsptr->zone_fs_dir);
1199                         return (-1);
1200                 }
1201                 if (zonecfg_in_alt_root())
1202                         resolve_lofs(zlogp, specpath, sizeof (specpath));
1203                 if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS,
1204                     specpath, path) != 0) {
1205                         zerror(zlogp, B_TRUE, "failed to loopback mount %s",
1206                             specpath);
1207                         return (-1);
1208                 }
1209                 return (0);
1210         }
1211 
1212         /*
1213          * In general the strategy here is to do just as much verification as
1214          * necessary to avoid crashing or otherwise doing something bad; if the
1215          * administrator initiated the operation via zoneadm(1m), he'll get
1216          * auto-verification which will let him know what's wrong.  If he
1217          * modifies the zone configuration of a running zone and doesn't attempt
1218          * to verify that it's OK we won't crash but won't bother trying to be
1219          * too helpful either.  zoneadm verify is only a couple keystrokes away.
1220          */
1221         if (!zonecfg_valid_fs_type(fsptr->zone_fs_type)) {
1222                 zerror(zlogp, B_FALSE, "cannot mount %s on %s: "
1223                     "invalid file-system type %s", fsptr->zone_fs_special,
1224                     fsptr->zone_fs_dir, fsptr->zone_fs_type);
1225                 return (-1);
1226         }
1227 
1228         /*
1229          * If we're looking at an alternate root environment, then construct
1230          * read-only loopback mounts as necessary.  Note that any special
1231          * paths for lofs zone mounts in an alternate root must have
1232          * already been pre-pended with any alternate root path by the
1233          * time we get here.
1234          */
1235         if (zonecfg_in_alt_root()) {
1236                 struct stat64 st;
1237 
1238                 if (stat64(fsptr->zone_fs_special, &st) != -1 &&
1239                     S_ISBLK(st.st_mode)) {
1240                         /*
1241                          * If we're going to mount a block device we need
1242                          * to check if that device is already mounted
1243                          * somewhere else, and if so, do a lofs mount
1244                          * of the device instead of a direct mount
1245                          */
1246                         if (check_lofs_needed(zlogp, fsptr) == -1)
1247                                 return (-1);
1248                 } else if (strcmp(fsptr->zone_fs_type, MNTTYPE_LOFS) == 0) {
1249                         /*
1250                          * For lofs mounts, the special node is inside the
1251                          * alternate root.  We need lofs resolution for
1252                          * this case in order to get at the underlying
1253                          * read-write path.
1254                          */
1255                         resolve_lofs(zlogp, fsptr->zone_fs_special,
1256                             sizeof (fsptr->zone_fs_special));
1257                 }
1258         }
1259 
1260         /*
1261          * Run 'fsck -m' if there's a device to fsck.
1262          */
1263         if (fsptr->zone_fs_raw[0] != '\0' &&
1264             dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_raw) != 0)
1265                 return (-1);
1266 
1267         /*
1268          * Build up mount option string.
1269          */
1270         optstr[0] = '\0';
1271         if (fsptr->zone_fs_options != NULL) {
1272                 (void) strlcpy(optstr, fsptr->zone_fs_options->zone_fsopt_opt,
1273                     sizeof (optstr));
1274                 for (optptr = fsptr->zone_fs_options->zone_fsopt_next;
1275                     optptr != NULL; optptr = optptr->zone_fsopt_next) {
1276                         (void) strlcat(optstr, ",", sizeof (optstr));
1277                         (void) strlcat(optstr, optptr->zone_fsopt_opt,
1278                             sizeof (optstr));
1279                 }
1280         }
1281 
1282         if ((rv = domount(zlogp, fsptr->zone_fs_type, optstr,
1283             fsptr->zone_fs_special, path)) != 0)
1284                 return (rv);
1285 
1286         /*
1287          * The mount succeeded.  If this was not a mount of /dev then
1288          * we're done.
1289          */
1290         if (strcmp(fsptr->zone_fs_type, MNTTYPE_DEV) != 0)
1291                 return (0);
1292 
1293         /*
1294          * We just mounted an instance of a /dev filesystem, so now we
1295          * need to configure it.
1296          */
1297         return (mount_one_dev(zlogp, path));
1298 }
1299 
1300 static void
1301 free_fs_data(struct zone_fstab *fsarray, uint_t nelem)
1302 {
1303         uint_t i;
1304 
1305         if (fsarray == NULL)
1306                 return;
1307         for (i = 0; i < nelem; i++)
1308                 zonecfg_free_fs_option_list(fsarray[i].zone_fs_options);
1309         free(fsarray);
1310 }
1311 
1312 /*
1313  * This function initiates the creation of a small Solaris Environment for
1314  * scratch zone. The Environment creation process is split up into two
1315  * functions(build_mounted_pre_var() and build_mounted_post_var()). It
1316  * is done this way because:
1317  *      We need to have both /etc and /var in the root of the scratchzone.
1318  *      We loopback mount zone's own /etc and /var into the root of the
1319  *      scratch zone. Unlike /etc, /var can be a seperate filesystem. So we
1320  *      need to delay the mount of /var till the zone's root gets populated.
1321  *      So mounting of localdirs[](/etc and /var) have been moved to the
1322  *      build_mounted_post_var() which gets called only after the zone
1323  *      specific filesystems are mounted.
1324  *
1325  * Note that the scratch zone we set up for updating the zone (Z_MNT_UPDATE)
1326  * does not loopback mount the zone's own /etc and /var into the root of the
1327  * scratch zone.
1328  */
1329 static boolean_t
1330 build_mounted_pre_var(zlog_t *zlogp, char *rootpath,
1331     size_t rootlen, const char *zonepath, char *luroot, size_t lurootlen)
1332 {
1333         char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1334         const char **cpp;
1335         static const char *mkdirs[] = {
1336                 "/system", "/system/contract", "/system/object", "/proc",
1337                 "/dev", "/tmp", "/a", NULL
1338         };
1339         char *altstr;
1340         FILE *fp;
1341         uuid_t uuid;
1342 
1343         assert(zone_isnative || zone_iscluster);
1344 
1345         resolve_lofs(zlogp, rootpath, rootlen);
1346         (void) snprintf(luroot, lurootlen, "%s/lu", zonepath);
1347         resolve_lofs(zlogp, luroot, lurootlen);
1348         (void) snprintf(tmp, sizeof (tmp), "%s/bin", luroot);
1349         (void) symlink("./usr/bin", tmp);
1350 
1351         /*
1352          * These are mostly special mount points; not handled here.  (See
1353          * zone_mount_early.)
1354          */
1355         for (cpp = mkdirs; *cpp != NULL; cpp++) {
1356                 (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1357                 if (mkdir(tmp, 0755) != 0) {
1358                         zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1359                         return (B_FALSE);
1360                 }
1361         }
1362         /*
1363          * This is here to support lucopy.  If there's an instance of this same
1364          * zone on the current running system, then we mount its root up as
1365          * read-only inside the scratch zone.
1366          */
1367         (void) zonecfg_get_uuid(zone_name, uuid);
1368         altstr = strdup(zonecfg_get_root());
1369         if (altstr == NULL) {
1370                 zerror(zlogp, B_TRUE, "memory allocation failed");
1371                 return (B_FALSE);
1372         }
1373         zonecfg_set_root("");
1374         (void) strlcpy(tmp, zone_name, sizeof (tmp));
1375         (void) zonecfg_get_name_by_uuid(uuid, tmp, sizeof (tmp));
1376         if (zone_get_rootpath(tmp, fromdir, sizeof (fromdir)) == Z_OK &&
1377             strcmp(fromdir, rootpath) != 0) {
1378                 (void) snprintf(tmp, sizeof (tmp), "%s/b", luroot);
1379                 if (mkdir(tmp, 0755) != 0) {
1380                         zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1381                         return (B_FALSE);
1382                 }
1383                 if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, fromdir,
1384                     tmp) != 0) {
1385                         zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1386                             fromdir);
1387                         return (B_FALSE);
1388                 }
1389         }
1390         zonecfg_set_root(altstr);
1391         free(altstr);
1392 
1393         if ((fp = zonecfg_open_scratch(luroot, B_TRUE)) == NULL) {
1394                 zerror(zlogp, B_TRUE, "cannot open zone mapfile");
1395                 return (B_FALSE);
1396         }
1397         (void) ftruncate(fileno(fp), 0);
1398         if (zonecfg_add_scratch(fp, zone_name, kernzone, "/") == -1) {
1399                 zerror(zlogp, B_TRUE, "cannot add zone mapfile entry");
1400         }
1401         zonecfg_close_scratch(fp);
1402         (void) snprintf(tmp, sizeof (tmp), "%s/a", luroot);
1403         if (domount(zlogp, MNTTYPE_LOFS, "", rootpath, tmp) != 0)
1404                 return (B_FALSE);
1405         (void) strlcpy(rootpath, tmp, rootlen);
1406         return (B_TRUE);
1407 }
1408 
1409 
1410 static boolean_t
1411 build_mounted_post_var(zlog_t *zlogp, zone_mnt_t mount_cmd, char *rootpath,
1412     const char *luroot)
1413 {
1414         char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1415         const char **cpp;
1416         const char **loopdirs;
1417         const char **tmpdirs;
1418         static const char *localdirs[] = {
1419                 "/etc", "/var", NULL
1420         };
1421         static const char *scr_loopdirs[] = {
1422                 "/etc/lib", "/etc/fs", "/lib", "/sbin", "/platform",
1423                 "/usr", NULL
1424         };
1425         static const char *upd_loopdirs[] = {
1426                 "/etc", "/kernel", "/lib", "/opt", "/platform", "/sbin",
1427                 "/usr", "/var", NULL
1428         };
1429         static const char *scr_tmpdirs[] = {
1430                 "/tmp", "/var/run", NULL
1431         };
1432         static const char *upd_tmpdirs[] = {
1433                 "/tmp", "/var/run", "/var/tmp", NULL
1434         };
1435         struct stat st;
1436 
1437         if (mount_cmd == Z_MNT_SCRATCH) {
1438                 /*
1439                  * These are mounted read-write from the zone undergoing
1440                  * upgrade.  We must be careful not to 'leak' things from the
1441                  * main system into the zone, and this accomplishes that goal.
1442                  */
1443                 for (cpp = localdirs; *cpp != NULL; cpp++) {
1444                         (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot,
1445                             *cpp);
1446                         (void) snprintf(fromdir, sizeof (fromdir), "%s%s",
1447                             rootpath, *cpp);
1448                         if (mkdir(tmp, 0755) != 0) {
1449                                 zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1450                                 return (B_FALSE);
1451                         }
1452                         if (domount(zlogp, MNTTYPE_LOFS, "", fromdir, tmp)
1453                             != 0) {
1454                                 zerror(zlogp, B_TRUE, "cannot mount %s on %s",
1455                                     tmp, *cpp);
1456                                 return (B_FALSE);
1457                         }
1458                 }
1459         }
1460 
1461         if (mount_cmd == Z_MNT_UPDATE)
1462                 loopdirs = upd_loopdirs;
1463         else
1464                 loopdirs = scr_loopdirs;
1465 
1466         /*
1467          * These are things mounted read-only from the running system because
1468          * they contain binaries that must match system.
1469          */
1470         for (cpp = loopdirs; *cpp != NULL; cpp++) {
1471                 (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1472                 if (mkdir(tmp, 0755) != 0) {
1473                         if (errno != EEXIST) {
1474                                 zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1475                                 return (B_FALSE);
1476                         }
1477                         if (lstat(tmp, &st) != 0) {
1478                                 zerror(zlogp, B_TRUE, "cannot stat %s", tmp);
1479                                 return (B_FALSE);
1480                         }
1481                         /*
1482                          * Ignore any non-directories encountered.  These are
1483                          * things that have been converted into symlinks
1484                          * (/etc/fs and /etc/lib) and no longer need a lofs
1485                          * fixup.
1486                          */
1487                         if (!S_ISDIR(st.st_mode))
1488                                 continue;
1489                 }
1490                 if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, *cpp,
1491                     tmp) != 0) {
1492                         zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1493                             *cpp);
1494                         return (B_FALSE);
1495                 }
1496         }
1497 
1498         if (mount_cmd == Z_MNT_UPDATE)
1499                 tmpdirs = upd_tmpdirs;
1500         else
1501                 tmpdirs = scr_tmpdirs;
1502 
1503         /*
1504          * These are things with tmpfs mounted inside.
1505          */
1506         for (cpp = tmpdirs; *cpp != NULL; cpp++) {
1507                 (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1508                 if (mount_cmd == Z_MNT_SCRATCH && mkdir(tmp, 0755) != 0 &&
1509                     errno != EEXIST) {
1510                         zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1511                         return (B_FALSE);
1512                 }
1513 
1514                 /*
1515                  * We could set the mode for /tmp when we do the mkdir but
1516                  * since that can be modified by the umask we will just set
1517                  * the correct mode for /tmp now.
1518                  */
1519                 if (strcmp(*cpp, "/tmp") == 0 && chmod(tmp, 01777) != 0) {
1520                         zerror(zlogp, B_TRUE, "cannot chmod %s", tmp);
1521                         return (B_FALSE);
1522                 }
1523 
1524                 if (domount(zlogp, MNTTYPE_TMPFS, "", "swap", tmp) != 0) {
1525                         zerror(zlogp, B_TRUE, "cannot mount swap on %s", *cpp);
1526                         return (B_FALSE);
1527                 }
1528         }
1529         return (B_TRUE);
1530 }
1531 
/*
 * State handed to plat_gmount_cb() through its opaque 'data' argument.
 * The table and count are held by reference so that the callback can
 * publish the reallocated array and the new element count to its caller.
 */
typedef struct plat_gmount_cb_data {
        zlog_t                  *pgcd_zlogp;    /* log used for errors */
        struct zone_fstab       **pgcd_fs_tab;  /* caller's fs table pointer */
        int                     *pgcd_num_fs;   /* caller's entry count */
} plat_gmount_cb_data_t;
1537 
1538 /*
1539  * plat_gmount_cb() is a callback function invoked by libbrand to iterate
1540  * through all global brand platform mounts.
1541  */
1542 int
1543 plat_gmount_cb(void *data, const char *spec, const char *dir,
1544     const char *fstype, const char *opt)
1545 {
1546         plat_gmount_cb_data_t   *cp = data;
1547         zlog_t                  *zlogp = cp->pgcd_zlogp;
1548         struct zone_fstab       *fs_ptr = *cp->pgcd_fs_tab;
1549         int                     num_fs = *cp->pgcd_num_fs;
1550         struct zone_fstab       *fsp, *tmp_ptr;
1551 
1552         num_fs++;
1553         if ((tmp_ptr = realloc(fs_ptr, num_fs * sizeof (*tmp_ptr))) == NULL) {
1554                 zerror(zlogp, B_TRUE, "memory allocation failed");
1555                 return (-1);
1556         }
1557 
1558         fs_ptr = tmp_ptr;
1559         fsp = &fs_ptr[num_fs - 1];
1560 
1561         /* update the callback struct passed in */
1562         *cp->pgcd_fs_tab = fs_ptr;
1563         *cp->pgcd_num_fs = num_fs;
1564 
1565         fsp->zone_fs_raw[0] = '\0';
1566         (void) strlcpy(fsp->zone_fs_special, spec,
1567             sizeof (fsp->zone_fs_special));
1568         (void) strlcpy(fsp->zone_fs_dir, dir, sizeof (fsp->zone_fs_dir));
1569         (void) strlcpy(fsp->zone_fs_type, fstype, sizeof (fsp->zone_fs_type));
1570         fsp->zone_fs_options = NULL;
1571         if ((opt != NULL) &&
1572             (zonecfg_add_fs_option(fsp, (char *)opt) != Z_OK)) {
1573                 zerror(zlogp, B_FALSE, "error adding property");
1574                 return (-1);
1575         }
1576 
1577         return (0);
1578 }
1579 
1580 static int
1581 mount_filesystems_ipdent(zone_dochandle_t handle, zlog_t *zlogp,
1582     struct zone_fstab **fs_tabp, int *num_fsp)
1583 {
1584         struct zone_fstab *tmp_ptr, *fs_ptr, *fsp, fstab;
1585         int num_fs;
1586 
1587         num_fs = *num_fsp;
1588         fs_ptr = *fs_tabp;
1589 
1590         if (zonecfg_setipdent(handle) != Z_OK) {
1591                 zerror(zlogp, B_FALSE, "invalid configuration");
1592                 return (-1);
1593         }
1594         while (zonecfg_getipdent(handle, &fstab) == Z_OK) {
1595                 num_fs++;
1596                 if ((tmp_ptr = realloc(fs_ptr,
1597                     num_fs * sizeof (*tmp_ptr))) == NULL) {
1598                         zerror(zlogp, B_TRUE, "memory allocation failed");
1599                         (void) zonecfg_endipdent(handle);
1600                         return (-1);
1601                 }
1602 
1603                 /* update the pointers passed in */
1604                 *fs_tabp = tmp_ptr;
1605                 *num_fsp = num_fs;
1606 
1607                 /*
1608                  * IPDs logically only have a mount point; all other properties
1609                  * are implied.
1610                  */
1611                 fs_ptr = tmp_ptr;
1612                 fsp = &fs_ptr[num_fs - 1];
1613                 (void) strlcpy(fsp->zone_fs_dir,
1614                     fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1615                 fsp->zone_fs_special[0] = '\0';
1616                 fsp->zone_fs_raw[0] = '\0';
1617                 fsp->zone_fs_type[0] = '\0';
1618                 fsp->zone_fs_options = NULL;
1619         }
1620         (void) zonecfg_endipdent(handle);
1621         return (0);
1622 }
1623 
1624 static int
1625 mount_filesystems_fsent(zone_dochandle_t handle, zlog_t *zlogp,
1626     struct zone_fstab **fs_tabp, int *num_fsp, zone_mnt_t mount_cmd)
1627 {
1628         struct zone_fstab *tmp_ptr, *fs_ptr, *fsp, fstab;
1629         int num_fs;
1630 
1631         num_fs = *num_fsp;
1632         fs_ptr = *fs_tabp;
1633 
1634         if (zonecfg_setfsent(handle) != Z_OK) {
1635                 zerror(zlogp, B_FALSE, "invalid configuration");
1636                 return (-1);
1637         }
1638         while (zonecfg_getfsent(handle, &fstab) == Z_OK) {
1639                 /*
1640                  * ZFS filesystems will not be accessible under an alternate
1641                  * root, since the pool will not be known.  Ignore them in this
1642                  * case.
1643                  */
1644                 if (ALT_MOUNT(mount_cmd) &&
1645                     strcmp(fstab.zone_fs_type, MNTTYPE_ZFS) == 0)
1646                         continue;
1647 
1648                 num_fs++;
1649                 if ((tmp_ptr = realloc(fs_ptr,
1650                     num_fs * sizeof (*tmp_ptr))) == NULL) {
1651                         zerror(zlogp, B_TRUE, "memory allocation failed");
1652                         (void) zonecfg_endfsent(handle);
1653                         return (-1);
1654                 }
1655                 /* update the pointers passed in */
1656                 *fs_tabp = tmp_ptr;
1657                 *num_fsp = num_fs;
1658 
1659                 fs_ptr = tmp_ptr;
1660                 fsp = &fs_ptr[num_fs - 1];
1661                 (void) strlcpy(fsp->zone_fs_dir,
1662                     fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1663                 (void) strlcpy(fsp->zone_fs_raw, fstab.zone_fs_raw,
1664                     sizeof (fsp->zone_fs_raw));
1665                 (void) strlcpy(fsp->zone_fs_type, fstab.zone_fs_type,
1666                     sizeof (fsp->zone_fs_type));
1667                 fsp->zone_fs_options = fstab.zone_fs_options;
1668 
1669                 /*
1670                  * For all lofs mounts, make sure that the 'special'
1671                  * entry points inside the alternate root.  The
1672                  * source path for a lofs mount in a given zone needs
1673                  * to be relative to the root of the boot environment
1674                  * that contains the zone.  Note that we don't do this
1675                  * for non-lofs mounts since they will have a device
1676                  * as a backing store and device paths must always be
1677                  * specified relative to the current boot environment.
1678                  */
1679                 fsp->zone_fs_special[0] = '\0';
1680                 if (strcmp(fsp->zone_fs_type, MNTTYPE_LOFS) == 0) {
1681                         (void) strlcat(fsp->zone_fs_special, zonecfg_get_root(),
1682                             sizeof (fsp->zone_fs_special));
1683                 }
1684                 (void) strlcat(fsp->zone_fs_special, fstab.zone_fs_special,
1685                     sizeof (fsp->zone_fs_special));
1686         }
1687         (void) zonecfg_endfsent(handle);
1688         return (0);
1689 }
1690 
1691 static int
1692 mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
1693 {
1694         char rootpath[MAXPATHLEN];
1695         char zonepath[MAXPATHLEN];
1696         char brand[MAXNAMELEN];
1697         char luroot[MAXPATHLEN];
1698         int i, num_fs = 0;
1699         struct zone_fstab *fs_ptr = NULL;
1700         zone_dochandle_t handle = NULL;
1701         zone_state_t zstate;
1702         brand_handle_t bh;
1703         plat_gmount_cb_data_t cb;
1704 
1705         if (zone_get_state(zone_name, &zstate) != Z_OK ||
1706             (zstate != ZONE_STATE_READY && zstate != ZONE_STATE_MOUNTED)) {
1707                 zerror(zlogp, B_FALSE,
1708                     "zone must be in '%s' or '%s' state to mount file-systems",
1709                     zone_state_str(ZONE_STATE_READY),
1710                     zone_state_str(ZONE_STATE_MOUNTED));
1711                 goto bad;
1712         }
1713 
1714         if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
1715                 zerror(zlogp, B_TRUE, "unable to determine zone path");
1716                 goto bad;
1717         }
1718 
1719         if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
1720                 zerror(zlogp, B_TRUE, "unable to determine zone root");
1721                 goto bad;
1722         }
1723 
1724         if ((handle = zonecfg_init_handle()) == NULL) {
1725                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
1726                 goto bad;
1727         }
1728         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
1729             zonecfg_setfsent(handle) != Z_OK) {
1730                 zerror(zlogp, B_FALSE, "invalid configuration");
1731                 goto bad;
1732         }
1733 
1734         /* Get a handle to the brand info for this zone */
1735         if ((zone_get_brand(zone_name, brand, sizeof (brand)) != Z_OK) ||
1736             (bh = brand_open(brand)) == NULL) {
1737                 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1738                 zonecfg_fini_handle(handle);
1739                 return (-1);
1740         }
1741 
1742         /*
1743          * Get the list of global filesystems to mount from the brand
1744          * configuration.
1745          */
1746         cb.pgcd_zlogp = zlogp;
1747         cb.pgcd_fs_tab = &fs_ptr;
1748         cb.pgcd_num_fs = &num_fs;
1749         if (brand_platform_iter_gmounts(bh, zonepath,
1750             plat_gmount_cb, &cb) != 0) {
1751                 zerror(zlogp, B_FALSE, "unable to mount filesystems");
1752                 brand_close(bh);
1753                 zonecfg_fini_handle(handle);
1754                 return (-1);
1755         }
1756         brand_close(bh);
1757 
1758         /*
1759          * Iterate through the rest of the filesystems, first the IPDs, then
1760          * the general FSs.  Sort them all, then mount them in sorted order.
1761          * This is to make sure the higher level directories (e.g., /usr)
1762          * get mounted before any beneath them (e.g., /usr/local).
1763          */
1764         if (mount_filesystems_ipdent(handle, zlogp, &fs_ptr, &num_fs) != 0)
1765                 goto bad;
1766 
1767         if (mount_filesystems_fsent(handle, zlogp, &fs_ptr, &num_fs,
1768             mount_cmd) != 0)
1769                 goto bad;
1770 
1771         zonecfg_fini_handle(handle);
1772         handle = NULL;
1773 
1774         /*
1775          * Normally when we mount a zone all the zone filesystems
1776          * get mounted relative to rootpath, which is usually
1777          * <zonepath>/root.  But when mounting a zone for administration
1778          * purposes via the zone "mount" state, build_mounted_pre_var()
1779          * updates rootpath to be <zonepath>/lu/a so we'll mount all
1780          * the zones filesystems there instead.
1781          *
1782          * build_mounted_pre_var() and build_mounted_post_var() will
1783          * also do some extra work to create directories and lofs mount
1784          * a bunch of global zone file system paths into <zonepath>/lu.
1785          *
1786          * This allows us to be able to enter the zone (now rooted at
1787          * <zonepath>/lu) and run the upgrade/patch tools that are in the
1788          * global zone and have them upgrade the to-be-modified zone's
1789          * files mounted on /a.  (Which mirrors the existing standard
1790          * upgrade environment.)
1791          *
1792          * There is of course one catch.  When doing the upgrade
1793          * we need <zoneroot>/lu/dev to be the /dev filesystem
1794          * for the zone and we don't want to have any /dev filesystem
1795          * mounted at <zoneroot>/lu/a/dev.  Since /dev is specified
1796          * as a normal zone filesystem by default we'll try to mount
1797          * it at <zoneroot>/lu/a/dev, so we have to detect this
1798          * case and instead mount it at <zoneroot>/lu/dev.
1799          *
1800          * All this work is done in three phases:
1801          *   1) Create and populate lu directory (build_mounted_pre_var()).
1802          *   2) Mount the required filesystems as per the zone configuration.
1803          *   3) Set up the rest of the scratch zone environment
1804          *      (build_mounted_post_var()).
1805          */
1806         if (ALT_MOUNT(mount_cmd) && !build_mounted_pre_var(zlogp,
1807             rootpath, sizeof (rootpath), zonepath, luroot, sizeof (luroot)))
1808                 goto bad;
1809 
1810         qsort(fs_ptr, num_fs, sizeof (*fs_ptr), fs_compare);
1811 
1812         for (i = 0; i < num_fs; i++) {
1813                 if (ALT_MOUNT(mount_cmd) &&
1814                     strcmp(fs_ptr[i].zone_fs_dir, "/dev") == 0) {
1815                         size_t slen = strlen(rootpath) - 2;
1816 
1817                         /*
1818                          * By default we'll try to mount /dev as /a/dev
1819                          * but /dev is special and always goes at the top
1820                          * so strip the trailing '/a' from the rootpath.
1821                          */
1822                         assert(zone_isnative || zone_iscluster);
1823                         assert(strcmp(&rootpath[slen], "/a") == 0);
1824                         rootpath[slen] = '\0';
1825                         if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1826                                 goto bad;
1827                         rootpath[slen] = '/';
1828                         continue;
1829                 }
1830                 if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1831                         goto bad;
1832         }
1833         if (ALT_MOUNT(mount_cmd) &&
1834             !build_mounted_post_var(zlogp, mount_cmd, rootpath, luroot))
1835                 goto bad;
1836 
1837         /*
1838          * For Trusted Extensions cross-mount each lower level /export/home
1839          */
1840         if (mount_cmd == Z_MNT_BOOT &&
1841             tsol_mounts(zlogp, zone_name, rootpath) != 0)
1842                 goto bad;
1843 
1844         free_fs_data(fs_ptr, num_fs);
1845 
1846         /*
1847          * Everything looks fine.
1848          */
1849         return (0);
1850 
1851 bad:
1852         if (handle != NULL)
1853                 zonecfg_fini_handle(handle);
1854         free_fs_data(fs_ptr, num_fs);
1855         return (-1);
1856 }
1857 
/*
 * Convert a decimal prefix length string into a netmask.
 *
 * prefixstr    - the prefix length in decimal; must not be NULL.
 * maxprefixlen - the largest acceptable prefix length, in bits.
 * maskstr      - output buffer; must not be NULL.  Only the bits covered
 *                by the prefix are written (whole bytes are stored, the
 *                trailing partial byte is OR-ed in), so the caller must
 *                pass a zero-filled buffer that can hold maxprefixlen bits.
 *
 * Returns 0 on success.  Returns 1, leaving maskstr untouched, when the
 * string is empty, is not entirely a decimal number, or is outside the
 * range 0..maxprefixlen.
 */
static int
addr2netmask(char *prefixstr, int maxprefixlen, uchar_t *maskstr)
{
        char *endp;
        long parsed;
        int bits;

        /*
         * strtol() rather than atoi(): atoi() cannot report garbage (it
         * would quietly turn "abc" or "" into a /0 prefix) and is undefined
         * on overflow.  An overflowing value comes back as LONG_MIN or
         * LONG_MAX and is rejected by the range check.
         */
        parsed = strtol(prefixstr, &endp, 10);
        if (endp == prefixstr || *endp != '\0')
                return (1);
        if (parsed < 0 || parsed > maxprefixlen)
                return (1);
        bits = (int)parsed;

        /* whole bytes first ... */
        for (; bits >= 8; bits -= 8)
                *maskstr++ = 0xFF;

        /* ... then the high-order bits of the remaining partial byte */
        if (bits > 0)
                *maskstr |= (uchar_t)(0xFF << (8 - bits));

        return (0);
}
1878 
1879 /*
1880  * Tear down all interfaces belonging to the given zone.  This should
1881  * be called with the zone in a state other than "running", so that
1882  * interfaces can't be assigned to the zone after this returns.
1883  *
1884  * If anything goes wrong, log an error message and return an error.
1885  */
1886 static int
1887 unconfigure_shared_network_interfaces(zlog_t *zlogp, zoneid_t zone_id)
1888 {
1889         struct lifnum lifn;
1890         struct lifconf lifc;
1891         struct lifreq *lifrp, lifrl;
1892         int64_t lifc_flags = LIFC_NOXMIT | LIFC_ALLZONES;
1893         int num_ifs, s, i, ret_code = 0;
1894         uint_t bufsize;
1895         char *buf = NULL;
1896 
1897         if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
1898                 zerror(zlogp, B_TRUE, "could not get socket");
1899                 ret_code = -1;
1900                 goto bad;
1901         }
1902         lifn.lifn_family = AF_UNSPEC;
1903         lifn.lifn_flags = (int)lifc_flags;
1904         if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) {
1905                 zerror(zlogp, B_TRUE,
1906                     "could not determine number of network interfaces");
1907                 ret_code = -1;
1908                 goto bad;
1909         }
1910         num_ifs = lifn.lifn_count;
1911         bufsize = num_ifs * sizeof (struct lifreq);
1912         if ((buf = malloc(bufsize)) == NULL) {
1913                 zerror(zlogp, B_TRUE, "memory allocation failed");
1914                 ret_code = -1;
1915                 goto bad;
1916         }
1917         lifc.lifc_family = AF_UNSPEC;
1918         lifc.lifc_flags = (int)lifc_flags;
1919         lifc.lifc_len = bufsize;
1920         lifc.lifc_buf = buf;
1921         if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) {
1922                 zerror(zlogp, B_TRUE, "could not get configured network "
1923                     "interfaces");
1924                 ret_code = -1;
1925                 goto bad;
1926         }
1927         lifrp = lifc.lifc_req;
1928         for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--, lifrp++) {
1929                 (void) close(s);
1930                 if ((s = socket(lifrp->lifr_addr.ss_family, SOCK_DGRAM, 0)) <
1931                     0) {
1932                         zerror(zlogp, B_TRUE, "%s: could not get socket",
1933                             lifrl.lifr_name);
1934                         ret_code = -1;
1935                         continue;
1936                 }
1937                 (void) memset(&lifrl, 0, sizeof (lifrl));
1938                 (void) strncpy(lifrl.lifr_name, lifrp->lifr_name,
1939                     sizeof (lifrl.lifr_name));
1940                 if (ioctl(s, SIOCGLIFZONE, (caddr_t)&lifrl) < 0) {
1941                         if (errno == ENXIO)
1942                                 /*
1943                                  * Interface may have been removed by admin or
1944                                  * another zone halting.
1945                                  */
1946                                 continue;
1947                         zerror(zlogp, B_TRUE,
1948                             "%s: could not determine the zone to which this "
1949                             "network interface is bound", lifrl.lifr_name);
1950                         ret_code = -1;
1951                         continue;
1952                 }
1953                 if (lifrl.lifr_zoneid == zone_id) {
1954                         if (ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifrl) < 0) {
1955                                 zerror(zlogp, B_TRUE,
1956                                     "%s: could not remove network interface",
1957                                     lifrl.lifr_name);
1958                                 ret_code = -1;
1959                                 continue;
1960                         }
1961                 }
1962         }
1963 bad:
1964         if (s > 0)
1965                 (void) close(s);
1966         if (buf)
1967                 free(buf);
1968         return (ret_code);
1969 }
1970 
/*
 * Scratch socket addresses used by who_is_using() when it builds its
 * routing socket request: so_dst carries the address being looked up and
 * so_ifp is an AF_LINK placeholder for the interface (RTA_IFP) slot.
 */
static union    sockunion {
        struct  sockaddr sa;
        struct  sockaddr_in sin;
        struct  sockaddr_dl sdl;
        struct  sockaddr_in6 sin6;
} so_dst, so_ifp;
1977 
/*
 * Routing socket message buffer used by who_is_using(), both for the
 * request it writes and for the reply it reads back: the message header
 * followed by room for the socket addresses that accompany it.
 */
static struct {
        struct  rt_msghdr hdr;
        char    space[512];
} rtmsg;
1982 
/*
 * Return the size of the address structure that corresponds to sa's
 * address family.  Families other than AF_INET, AF_LINK and AF_INET6 are
 * sized as a plain struct sockaddr.
 */
static int
salen(struct sockaddr *sa)
{
        int len = sizeof (struct sockaddr);

        if (sa->sa_family == AF_INET)
                len = sizeof (struct sockaddr_in);
        else if (sa->sa_family == AF_LINK)
                len = sizeof (struct sockaddr_dl);
        else if (sa->sa_family == AF_INET6)
                len = sizeof (struct sockaddr_in6);

        return (len);
}
1997 
/*
 * Round a length up to the next multiple of sizeof (long).  A length that
 * is zero or negative still yields sizeof (long).  Note that the argument
 * is evaluated more than once.
 */
#define ROUNDUP_LONG(a) \
        (((a) > 0) ? ((((a) - 1) | (sizeof (long) - 1)) + 1) : sizeof (long))
2000 
2001 /*
2002  * Look up which zone is using a given IP address.  The address in question
2003  * is expected to have been stuffed into the structure to which lifr points
2004  * via a previous SIOCGLIFADDR ioctl().
2005  *
2006  * This is done using black router socket magic.
2007  *
2008  * Return the name of the zone on success or NULL on failure.
2009  *
2010  * This is a lot of code for a simple task; a new ioctl request to take care
2011  * of this might be a useful RFE.
2012  */
2013 
2014 static char *
2015 who_is_using(zlog_t *zlogp, struct lifreq *lifr)
2016 {
2017         static char answer[ZONENAME_MAX];
2018         pid_t pid;
2019         int s, rlen, l, i;
2020         char *cp = rtmsg.space;
2021         struct sockaddr_dl *ifp = NULL;
2022         struct sockaddr *sa;
2023         char save_if_name[LIFNAMSIZ];
2024 
2025         answer[0] = '\0';
2026 
2027         pid = getpid();
2028         if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) < 0) {
2029                 zerror(zlogp, B_TRUE, "could not get routing socket");
2030                 return (NULL);
2031         }
2032 
2033         if (lifr->lifr_addr.ss_family == AF_INET) {
2034                 struct sockaddr_in *sin4;
2035 
2036                 so_dst.sa.sa_family = AF_INET;
2037                 sin4 = (struct sockaddr_in *)&lifr->lifr_addr;
2038                 so_dst.sin.sin_addr = sin4->sin_addr;
2039         } else {
2040                 struct sockaddr_in6 *sin6;
2041 
2042                 so_dst.sa.sa_family = AF_INET6;
2043                 sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
2044                 so_dst.sin6.sin6_addr = sin6->sin6_addr;
2045         }
2046 
2047         so_ifp.sa.sa_family = AF_LINK;
2048 
2049         (void) memset(&rtmsg, 0, sizeof (rtmsg));
2050         rtmsg.hdr.rtm_type = RTM_GET;
2051         rtmsg.hdr.rtm_flags = RTF_UP | RTF_HOST;
2052         rtmsg.hdr.rtm_version = RTM_VERSION;
2053         rtmsg.hdr.rtm_seq = ++rts_seqno;
2054         rtmsg.hdr.rtm_addrs = RTA_IFP | RTA_DST;
2055 
2056         l = ROUNDUP_LONG(salen(&so_dst.sa));
2057         (void) memmove(cp, &(so_dst), l);
2058         cp += l;
2059         l = ROUNDUP_LONG(salen(&so_ifp.sa));
2060         (void) memmove(cp, &(so_ifp), l);
2061         cp += l;
2062 
2063         rtmsg.hdr.rtm_msglen = l = cp - (char *)&rtmsg;
2064 
2065         if ((rlen = write(s, &rtmsg, l)) < 0) {
2066                 zerror(zlogp, B_TRUE, "writing to routing socket");
2067                 return (NULL);
2068         } else if (rlen < (int)rtmsg.hdr.rtm_msglen) {
2069                 zerror(zlogp, B_TRUE,
2070                     "write to routing socket got only %d for len\n", rlen);
2071                 return (NULL);
2072         }
2073         do {
2074                 l = read(s, &rtmsg, sizeof (rtmsg));
2075         } while (l > 0 && (rtmsg.hdr.rtm_seq != rts_seqno ||
2076             rtmsg.hdr.rtm_pid != pid));
2077         if (l < 0) {
2078                 zerror(zlogp, B_TRUE, "reading from routing socket");
2079                 return (NULL);
2080         }
2081 
2082         if (rtmsg.hdr.rtm_version != RTM_VERSION) {
2083                 zerror(zlogp, B_FALSE,
2084                     "routing message version %d not understood",
2085                     rtmsg.hdr.rtm_version);
2086                 return (NULL);
2087         }
2088         if (rtmsg.hdr.rtm_msglen != (ushort_t)l) {
2089                 zerror(zlogp, B_FALSE, "message length mismatch, "
2090                     "expected %d bytes, returned %d bytes",
2091                     rtmsg.hdr.rtm_msglen, l);
2092                 return (NULL);
2093         }
2094         if (rtmsg.hdr.rtm_errno != 0)  {
2095                 errno = rtmsg.hdr.rtm_errno;
2096                 zerror(zlogp, B_TRUE, "RTM_GET routing socket message");
2097                 return (NULL);
2098         }
2099         if ((rtmsg.hdr.rtm_addrs & RTA_IFP) == 0) {
2100                 zerror(zlogp, B_FALSE, "network interface not found");
2101                 return (NULL);
2102         }
2103         cp = ((char *)(&rtmsg.hdr + 1));
2104         for (i = 1; i != 0; i <<= 1) {
2105                 /* LINTED E_BAD_PTR_CAST_ALIGN */
2106                 sa = (struct sockaddr *)cp;
2107                 if (i != RTA_IFP) {
2108                         if ((i & rtmsg.hdr.rtm_addrs) != 0)
2109                                 cp += ROUNDUP_LONG(salen(sa));
2110                         continue;
2111                 }
2112                 if (sa->sa_family == AF_LINK &&
2113                     ((struct sockaddr_dl *)sa)->sdl_nlen != 0)
2114                         ifp = (struct sockaddr_dl *)sa;
2115                 break;
2116         }
2117         if (ifp == NULL) {
2118                 zerror(zlogp, B_FALSE, "network interface could not be "
2119                     "determined");
2120                 return (NULL);
2121         }
2122 
2123         /*
2124          * We need to set the I/F name to what we got above, then do the
2125          * appropriate ioctl to get its zone name.  But lifr->lifr_name is
2126          * used by the calling function to do a REMOVEIF, so if we leave the
2127          * "good" zone's I/F name in place, *that* I/F will be removed instead
2128          * of the bad one.  So we save the old (bad) I/F name before over-
2129          * writing it and doing the ioctl, then restore it after the ioctl.
2130          */
2131         (void) strlcpy(save_if_name, lifr->lifr_name, sizeof (save_if_name));
2132         (void) strncpy(lifr->lifr_name, ifp->sdl_data, ifp->sdl_nlen);
2133         lifr->lifr_name[ifp->sdl_nlen] = '\0';
2134         i = ioctl(s, SIOCGLIFZONE, lifr);
2135         (void) strlcpy(lifr->lifr_name, save_if_name, sizeof (save_if_name));
2136         if (i < 0) {
2137                 zerror(zlogp, B_TRUE,
2138                     "%s: could not determine the zone network interface "
2139                     "belongs to", lifr->lifr_name);
2140                 return (NULL);
2141         }
2142         if (getzonenamebyid(lifr->lifr_zoneid, answer, sizeof (answer)) < 0)
2143                 (void) snprintf(answer, sizeof (answer), "%d",
2144                     lifr->lifr_zoneid);
2145 
2146         if (strlen(answer) > 0)
2147                 return (answer);
2148         return (NULL);
2149 }
2150 
/*
 * Layout of a routing socket message that carries three addresses after
 * the header -- destination, gateway and netmask -- in either their IPv4
 * or their IPv6 form.
 *
 * NOTE(review): presumably this is the message used for the default
 * multicast route that configure_one_interface() describes; confirm
 * against its use there.
 */
typedef struct mcast_rtmsg_s {
        struct rt_msghdr        m_rtm;
        union {
                struct {
                        struct sockaddr_in      m_dst;
                        struct sockaddr_in      m_gw;
                        struct sockaddr_in      m_netmask;
                } m_v4;
                struct {
                        struct sockaddr_in6     m_dst;
                        struct sockaddr_in6     m_gw;
                        struct sockaddr_in6     m_netmask;
                } m_v6;
        } m_u;
} mcast_rtmsg_t;
/* Shorthand for the per-family members of m_u. */
#define m_dst4          m_u.m_v4.m_dst
#define m_dst6          m_u.m_v6.m_dst
#define m_gw4           m_u.m_v4.m_gw
#define m_gw6           m_u.m_v6.m_gw
#define m_netmask4      m_u.m_v4.m_netmask
#define m_netmask6      m_u.m_v6.m_netmask
2172 
2173 /*
2174  * Configures a single interface: a new virtual interface is added, based on
2175  * the physical interface nwiftabptr->zone_nwif_physical, with the address
2176  * specified in nwiftabptr->zone_nwif_address, for zone zone_id.  Note that
2177  * the "address" can be an IPv6 address (with a /prefixlength required), an
2178  * IPv4 address (with a /prefixlength optional), or a name; for the latter,
2179  * an IPv4 name-to-address resolution will be attempted.
2180  *
2181  * A default interface route for multicast is created on the first IPv4 and
2182  * IPv6 interfaces (that have the IFF_MULTICAST flag set), respectively.
2183  * This should really be done in the init scripts if we ever allow zones to
2184  * modify the routing tables.
2185  *
2186  * If anything goes wrong, we log an detailed error message, attempt to tear
2187  * down whatever we set up and return an error.
2188  */
2189 static int
2190 configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
2191     struct zone_nwiftab *nwiftabptr, boolean_t *mcast_rt_v4_setp,
2192     boolean_t *mcast_rt_v6_setp)
2193 {
2194         struct lifreq lifr;
2195         struct sockaddr_in netmask4;
2196         struct sockaddr_in6 netmask6;
2197         struct in_addr in4;
2198         struct in6_addr in6;
2199         sa_family_t af;
2200         char *slashp = strchr(nwiftabptr->zone_nwif_address, '/');
2201         mcast_rtmsg_t mcast_rtmsg;
2202         int s;
2203         int rs;
2204         int rlen;
2205         boolean_t got_netmask = B_FALSE;
2206         char addrstr4[INET_ADDRSTRLEN];
2207         int res;
2208 
2209         res = zonecfg_valid_net_address(nwiftabptr->zone_nwif_address, &lifr);
2210         if (res != Z_OK) {
2211                 zerror(zlogp, B_FALSE, "%s: %s", zonecfg_strerror(res),
2212                     nwiftabptr->zone_nwif_address);
2213                 return (-1);
2214         }
2215         af = lifr.lifr_addr.ss_family;
2216         if (af == AF_INET)
2217                 in4 = ((struct sockaddr_in *)(&lifr.lifr_addr))->sin_addr;
2218         else
2219                 in6 = ((struct sockaddr_in6 *)(&lifr.lifr_addr))->sin6_addr;
2220 
2221         if ((s = socket(af, SOCK_DGRAM, 0)) < 0) {
2222                 zerror(zlogp, B_TRUE, "could not get socket");
2223                 return (-1);
2224         }
2225 
2226         (void) strlcpy(lifr.lifr_name, nwiftabptr->zone_nwif_physical,
2227             sizeof (lifr.lifr_name));
2228         if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
2229                 /*
2230                  * Here, we know that the interface can't be brought up.
2231                  * A similar warning message was already printed out to
2232                  * the console by zoneadm(1M) so instead we log the
2233                  * message to syslog and continue.
2234                  */
2235                 zerror(&logsys, B_TRUE, "WARNING: skipping network interface "
2236                     "'%s' which may not be present/plumbed in the "
2237                     "global zone.", lifr.lifr_name);
2238                 (void) close(s);
2239                 return (Z_OK);
2240         }
2241 
2242         if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
2243                 zerror(zlogp, B_TRUE,
2244                     "%s: could not set IP address to %s",
2245                     lifr.lifr_name, nwiftabptr->zone_nwif_address);
2246                 goto bad;
2247         }
2248 
2249         /* Preserve literal IPv4 address for later potential printing. */
2250         if (af == AF_INET)
2251                 (void) inet_ntop(AF_INET, &in4, addrstr4, INET_ADDRSTRLEN);
2252 
2253         lifr.lifr_zoneid = zone_id;
2254         if (ioctl(s, SIOCSLIFZONE, (caddr_t)&lifr) < 0) {
2255                 zerror(zlogp, B_TRUE, "%s: could not place network interface "
2256                     "into zone", lifr.lifr_name);
2257                 goto bad;
2258         }
2259 
2260         if (strcmp(nwiftabptr->zone_nwif_physical, "lo0") == 0) {
2261                 got_netmask = B_TRUE;   /* default setting will be correct */
2262         } else {
2263                 if (af == AF_INET) {
2264                         /*
2265                          * The IPv4 netmask can be determined either
2266                          * directly if a prefix length was supplied with
2267                          * the address or via the netmasks database.  Not
2268                          * being able to determine it is a common failure,
2269                          * but it often is not fatal to operation of the
2270                          * interface.  In that case, a warning will be
2271                          * printed after the rest of the interface's
2272                          * parameters have been configured.
2273                          */
2274                         (void) memset(&netmask4, 0, sizeof (netmask4));
2275                         if (slashp != NULL) {
2276                                 if (addr2netmask(slashp + 1, V4_ADDR_LEN,
2277                                     (uchar_t *)&netmask4.sin_addr) != 0) {
2278                                         *slashp = '/';
2279                                         zerror(zlogp, B_FALSE,
2280                                             "%s: invalid prefix length in %s",
2281                                             lifr.lifr_name,
2282                                             nwiftabptr->zone_nwif_address);
2283                                         goto bad;
2284                                 }
2285                                 got_netmask = B_TRUE;
2286                         } else if (getnetmaskbyaddr(in4,
2287                             &netmask4.sin_addr) == 0) {
2288                                 got_netmask = B_TRUE;
2289                         }
2290                         if (got_netmask) {
2291                                 netmask4.sin_family = af;
2292                                 (void) memcpy(&lifr.lifr_addr, &netmask4,
2293                                     sizeof (netmask4));
2294                         }
2295                 } else {
2296                         (void) memset(&netmask6, 0, sizeof (netmask6));
2297                         if (addr2netmask(slashp + 1, V6_ADDR_LEN,
2298                             (uchar_t *)&netmask6.sin6_addr) != 0) {
2299                                 *slashp = '/';
2300                                 zerror(zlogp, B_FALSE,
2301                                     "%s: invalid prefix length in %s",
2302                                     lifr.lifr_name,
2303                                     nwiftabptr->zone_nwif_address);
2304                                 goto bad;
2305                         }
2306                         got_netmask = B_TRUE;
2307                         netmask6.sin6_family = af;
2308                         (void) memcpy(&lifr.lifr_addr, &netmask6,
2309                             sizeof (netmask6));
2310                 }
2311                 if (got_netmask &&
2312                     ioctl(s, SIOCSLIFNETMASK, (caddr_t)&lifr) < 0) {
2313                         zerror(zlogp, B_TRUE, "%s: could not set netmask",
2314                             lifr.lifr_name);
2315                         goto bad;
2316                 }
2317 
2318                 /*
2319                  * This doesn't set the broadcast address at all. Rather, it
2320                  * gets, then sets the interface's address, relying on the fact
2321                  * that resetting the address will reset the broadcast address.
2322                  */
2323                 if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
2324                         zerror(zlogp, B_TRUE, "%s: could not get address",
2325                             lifr.lifr_name);
2326                         goto bad;
2327                 }
2328                 if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
2329                         zerror(zlogp, B_TRUE,
2330                             "%s: could not reset broadcast address",
2331                             lifr.lifr_name);
2332                         goto bad;
2333                 }
2334         }
2335 
2336         if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) {
2337                 zerror(zlogp, B_TRUE, "%s: could not get flags",
2338                     lifr.lifr_name);
2339                 goto bad;
2340         }
2341         lifr.lifr_flags |= IFF_UP;
2342         if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
2343                 int save_errno = errno;
2344                 char *zone_using;
2345 
2346                 /*
2347                  * If we failed with something other than EADDRNOTAVAIL,
2348                  * then skip to the end.  Otherwise, look up our address,
2349                  * then call a function to determine which zone is already
2350                  * using that address.
2351                  */
2352                 if (errno != EADDRNOTAVAIL) {
2353                         zerror(zlogp, B_TRUE,
2354                             "%s: could not bring network interface up",
2355                             lifr.lifr_name);
2356                         goto bad;
2357                 }
2358                 if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
2359                         zerror(zlogp, B_TRUE, "%s: could not get address",
2360                             lifr.lifr_name);
2361                         goto bad;
2362                 }
2363                 zone_using = who_is_using(zlogp, &lifr);
2364                 errno = save_errno;
2365                 if (zone_using == NULL)
2366                         zerror(zlogp, B_TRUE,
2367                             "%s: could not bring network interface up",
2368                             lifr.lifr_name);
2369                 else
2370                         zerror(zlogp, B_TRUE, "%s: could not bring network "
2371                             "interface up: address in use by zone '%s'",
2372                             lifr.lifr_name, zone_using);
2373                 goto bad;
2374         }
2375         if ((lifr.lifr_flags & IFF_MULTICAST) && ((af == AF_INET &&
2376             mcast_rt_v4_setp != NULL && *mcast_rt_v4_setp == B_FALSE) ||
2377             (af == AF_INET6 &&
2378             mcast_rt_v6_setp != NULL && *mcast_rt_v6_setp == B_FALSE))) {
2379                 rs = socket(PF_ROUTE, SOCK_RAW, 0);
2380                 if (rs < 0) {
2381                         zerror(zlogp, B_TRUE, "%s: could not create "
2382                             "routing socket", lifr.lifr_name);
2383                         goto bad;
2384                 }
2385                 (void) shutdown(rs, 0);
2386                 (void) memset((void *)&mcast_rtmsg, 0, sizeof (mcast_rtmsg_t));
2387                 mcast_rtmsg.m_rtm.rtm_msglen =  sizeof (struct rt_msghdr) +
2388                     3 * (af == AF_INET ? sizeof (struct sockaddr_in) :
2389                     sizeof (struct sockaddr_in6));
2390                 mcast_rtmsg.m_rtm.rtm_version = RTM_VERSION;
2391                 mcast_rtmsg.m_rtm.rtm_type = RTM_ADD;
2392                 mcast_rtmsg.m_rtm.rtm_flags = RTF_UP;
2393                 mcast_rtmsg.m_rtm.rtm_addrs =
2394                     RTA_DST | RTA_GATEWAY | RTA_NETMASK;
2395                 mcast_rtmsg.m_rtm.rtm_seq = ++rts_seqno;
2396                 if (af == AF_INET) {
2397                         mcast_rtmsg.m_dst4.sin_family = AF_INET;
2398                         mcast_rtmsg.m_dst4.sin_addr.s_addr =
2399                             htonl(INADDR_UNSPEC_GROUP);
2400                         mcast_rtmsg.m_gw4.sin_family = AF_INET;
2401                         mcast_rtmsg.m_gw4.sin_addr = in4;
2402                         mcast_rtmsg.m_netmask4.sin_family = AF_INET;
2403                         mcast_rtmsg.m_netmask4.sin_addr.s_addr =
2404                             htonl(IN_CLASSD_NET);
2405                 } else {
2406                         mcast_rtmsg.m_dst6.sin6_family = AF_INET6;
2407                         mcast_rtmsg.m_dst6.sin6_addr.s6_addr[0] = 0xffU;
2408                         mcast_rtmsg.m_gw6.sin6_family = AF_INET6;
2409                         mcast_rtmsg.m_gw6.sin6_addr = in6;
2410                         mcast_rtmsg.m_netmask6.sin6_family = AF_INET6;
2411                         mcast_rtmsg.m_netmask6.sin6_addr.s6_addr[0] = 0xffU;
2412                 }
2413                 rlen = write(rs, (char *)&mcast_rtmsg,
2414                     mcast_rtmsg.m_rtm.rtm_msglen);
2415                 /*
2416                  * The write to the multicast socket will fail if the
2417                  * interface belongs to a failed IPMP group. This is a
2418                  * non-fatal error and the zone will continue booting.
2419                  * While the zone is running, if any interface in the
2420                  * failed IPMP group recovers, the zone will fallback to
2421                  * using that interface.
2422                  */
2423                 if (rlen < mcast_rtmsg.m_rtm.rtm_msglen) {
2424                         if (rlen < 0) {
2425                                 zerror(zlogp, B_TRUE, "WARNING: network "
2426                                     "interface '%s' not available as default "
2427                                     "for multicast.", lifr.lifr_name);
2428                         } else {
2429                                 zerror(zlogp, B_FALSE, "WARNING: network "
2430                                     "interface '%s' not available as default "
2431                                     "for multicast; routing socket returned "
2432                                     "unexpected %d bytes.",
2433                                     lifr.lifr_name, rlen);
2434                         }
2435                 } else {
2436 
2437                         if (af == AF_INET) {
2438                                 *mcast_rt_v4_setp = B_TRUE;
2439                         } else {
2440                                 *mcast_rt_v6_setp = B_TRUE;
2441                         }
2442                 }
2443                 (void) close(rs);
2444         }
2445 
2446         if (!got_netmask) {
2447                 /*
2448                  * A common, but often non-fatal problem, is that the system
2449                  * cannot find the netmask for an interface address. This is
2450                  * often caused by it being only in /etc/inet/netmasks, but
2451                  * /etc/nsswitch.conf says to use NIS or NIS+ and it's not
2452                  * in that. This doesn't show up at boot because the netmask
2453                  * is obtained from /etc/inet/netmasks when no network
2454                  * interfaces are up, but isn't consulted when NIS/NIS+ is
2455                  * available. We warn the user here that something like this
2456                  * has happened and we're just running with a default and
2457                  * possible incorrect netmask.
2458                  */
2459                 char buffer[INET6_ADDRSTRLEN];
2460                 void  *addr;
2461 
2462                 if (af == AF_INET)
2463                         addr = &((struct sockaddr_in *)
2464                             (&lifr.lifr_addr))->sin_addr;
2465                 else
2466                         addr = &((struct sockaddr_in6 *)
2467                             (&lifr.lifr_addr))->sin6_addr;
2468 
2469                 /* Find out what netmask interface is going to be using */
2470                 if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 ||
2471                     inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL)
2472                         goto bad;
2473                 zerror(zlogp, B_FALSE,
2474                     "WARNING: %s: no matching subnet found in netmasks(4) for "
2475                     "%s; using default of %s.",
2476                     lifr.lifr_name, addrstr4, buffer);
2477         }
2478 
2479         /*
2480          * If a default router was specified for this interface
2481          * set the route now. Ignore if already set.
2482          */
2483         if (strlen(nwiftabptr->zone_nwif_defrouter) > 0) {
2484                 int status;
2485                 char *argv[7];
2486 
2487                 argv[0] = "route";
2488                 argv[1] = "add";
2489                 argv[2] = "-ifp";
2490                 argv[3] = nwiftabptr->zone_nwif_physical;
2491                 argv[4] = "default";
2492                 argv[5] = nwiftabptr->zone_nwif_defrouter;
2493                 argv[6] = NULL;
2494 
2495                 status = forkexec(zlogp, "/usr/sbin/route", argv);
2496                 if (status != 0 && status != EEXIST)
2497                         zerror(zlogp, B_FALSE, "Unable to set route for "
2498                             "interface %s to %s\n",
2499                             nwiftabptr->zone_nwif_physical,
2500                             nwiftabptr->zone_nwif_defrouter);
2501         }
2502 
2503         (void) close(s);
2504         return (Z_OK);
2505 bad:
2506         (void) ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifr);
2507         (void) close(s);
2508         return (-1);
2509 }
2510 
2511 /*
2512  * Sets up network interfaces based on information from the zone configuration.
2513  * An IPv4 loopback interface is set up "for free", modeling the global system.
2514  * If any of the configuration interfaces were IPv6, then an IPv6 loopback
2515  * address is set up as well.
2516  *
2517  * If anything goes wrong, we log a general error message, attempt to tear down
2518  * whatever we set up, and return an error.
2519  */
2520 static int
2521 configure_shared_network_interfaces(zlog_t *zlogp)
2522 {
2523         zone_dochandle_t handle;
2524         struct zone_nwiftab nwiftab, loopback_iftab;
2525         boolean_t saw_v6 = B_FALSE;
2526         boolean_t mcast_rt_v4_set = B_FALSE;
2527         boolean_t mcast_rt_v6_set = B_FALSE;
2528         zoneid_t zoneid;
2529 
2530         if ((zoneid = getzoneidbyname(zone_name)) == ZONE_ID_UNDEFINED) {
2531                 zerror(zlogp, B_TRUE, "unable to get zoneid");
2532                 return (-1);
2533         }
2534 
2535         if ((handle = zonecfg_init_handle()) == NULL) {
2536                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
2537                 return (-1);
2538         }
2539         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2540                 zerror(zlogp, B_FALSE, "invalid configuration");
2541                 zonecfg_fini_handle(handle);
2542                 return (-1);
2543         }
2544         if (zonecfg_setnwifent(handle) == Z_OK) {
2545                 for (;;) {
2546                         struct in6_addr in6;
2547 
2548                         if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2549                                 break;
2550                         if (configure_one_interface(zlogp, zoneid,
2551                             &nwiftab, &mcast_rt_v4_set, &mcast_rt_v6_set) !=
2552                             Z_OK) {
2553                                 (void) zonecfg_endnwifent(handle);
2554                                 zonecfg_fini_handle(handle);
2555                                 return (-1);
2556                         }
2557                         if (inet_pton(AF_INET6, nwiftab.zone_nwif_address,
2558                             &in6) == 1)
2559                                 saw_v6 = B_TRUE;
2560                 }
2561                 (void) zonecfg_endnwifent(handle);
2562         }
2563         zonecfg_fini_handle(handle);
2564         if (is_system_labeled()) {
2565                 /*
2566                  * Labeled zones share the loopback interface
2567                  * so it is not plumbed for shared stack instances.
2568                  */
2569                 return (0);
2570         }
2571         (void) strlcpy(loopback_iftab.zone_nwif_physical, "lo0",
2572             sizeof (loopback_iftab.zone_nwif_physical));
2573         (void) strlcpy(loopback_iftab.zone_nwif_address, "127.0.0.1",
2574             sizeof (loopback_iftab.zone_nwif_address));
2575         loopback_iftab.zone_nwif_defrouter[0] = '\0';
2576         if (configure_one_interface(zlogp, zoneid, &loopback_iftab, NULL, NULL)
2577             != Z_OK) {
2578                 return (-1);
2579         }
2580         if (saw_v6) {
2581                 (void) strlcpy(loopback_iftab.zone_nwif_address, "::1/128",
2582                     sizeof (loopback_iftab.zone_nwif_address));
2583                 if (configure_one_interface(zlogp, zoneid,
2584                     &loopback_iftab, NULL, NULL) != Z_OK) {
2585                         return (-1);
2586                 }
2587         }
2588         return (0);
2589 }
2590 
2591 static void
2592 show_owner(zlog_t *zlogp, char *dlname)
2593 {
2594         zoneid_t dl_owner_zid;
2595         char dl_owner_zname[ZONENAME_MAX];
2596 
2597         dl_owner_zid = ALL_ZONES;
2598         if (zone_check_datalink(&dl_owner_zid, dlname) != 0)
2599                 (void) snprintf(dl_owner_zname, ZONENAME_MAX, "<unknown>");
2600         else if (getzonenamebyid(dl_owner_zid, dl_owner_zname, ZONENAME_MAX)
2601             < 0)
2602                 (void) snprintf(dl_owner_zname, ZONENAME_MAX, "<%d>",
2603                     dl_owner_zid);
2604 
2605         errno = EPERM;
2606         zerror(zlogp, B_TRUE, "WARNING: skipping network interface '%s' "
2607             "which is used by the non-global zone '%s'.\n",
2608             dlname, dl_owner_zname);
2609 }
2610 
2611 static int
2612 add_datalink(zlog_t *zlogp, zoneid_t zoneid, char *dlname)
2613 {
2614         /* First check if it's in use by global zone. */
2615         if (zonecfg_ifname_exists(AF_INET, dlname) ||
2616             zonecfg_ifname_exists(AF_INET6, dlname)) {
2617                 errno = EPERM;
2618                 zerror(zlogp, B_TRUE, "WARNING: skipping network interface "
2619                     "'%s' which is used in the global zone.", dlname);
2620                 return (-1);
2621         }
2622 
2623         /* Add access control information */
2624         if (zone_add_datalink(zoneid, dlname) != 0) {
2625                 /* If someone got this link before us, show its name */
2626                 if (errno == EPERM)
2627                         show_owner(zlogp, dlname);
2628                 else
2629                         zerror(zlogp, B_TRUE, "WARNING: unable to add network "
2630                             "interface '%s'.", dlname);
2631                 return (-1);
2632         }
2633 
2634         /* Set zoneid of this link. */
2635         if (dladm_setzid(dlname, zoneid) != DLADM_STATUS_OK) {
2636                 zerror(zlogp, B_TRUE, "WARNING: unable to add network "
2637                     "interface '%s'.", dlname);
2638                 (void) zone_remove_datalink(zoneid, dlname);
2639                 return (-1);
2640         }
2641 
2642         return (0);
2643 }
2644 
2645 static int
2646 remove_datalink(zlog_t *zlogp, zoneid_t zoneid, char *dlname)
2647 {
2648         /*
2649          * Remove access control information.
2650          * If the errno is ENXIO, the interface is not added yet,
2651          * nothing to report then.
2652          */
2653         if (zone_remove_datalink(zoneid, dlname) != 0) {
2654                 if (errno == ENXIO)
2655                         return (0);
2656                 zerror(zlogp, B_TRUE, "unable to remove network interface '%s'",
2657                     dlname);
2658                 return (-1);
2659         }
2660 
2661         if (dladm_setzid(dlname, GLOBAL_ZONEID) != DLADM_STATUS_OK) {
2662                 zerror(zlogp, B_TRUE, "unable to release network "
2663                     "interface '%s'", dlname);
2664                 return (-1);
2665         }
2666         return (0);
2667 }
2668 
2669 /*
2670  * Add the kernel access control information for the interface names.
2671  * If anything goes wrong, we log a general error message, attempt to tear down
2672  * whatever we set up, and return an error.
2673  */
2674 static int
2675 configure_exclusive_network_interfaces(zlog_t *zlogp)
2676 {
2677         zone_dochandle_t handle;
2678         struct zone_nwiftab nwiftab;
2679         zoneid_t zoneid;
2680         char rootpath[MAXPATHLEN];
2681         char path[MAXPATHLEN];
2682         di_prof_t prof = NULL;
2683         boolean_t added = B_FALSE;
2684 
2685         if ((zoneid = getzoneidbyname(zone_name)) == -1) {
2686                 zerror(zlogp, B_TRUE, "unable to get zoneid");
2687                 return (-1);
2688         }
2689 
2690         if ((handle = zonecfg_init_handle()) == NULL) {
2691                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
2692                 return (-1);
2693         }
2694         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2695                 zerror(zlogp, B_FALSE, "invalid configuration");
2696                 zonecfg_fini_handle(handle);
2697                 return (-1);
2698         }
2699 
2700         if (zonecfg_setnwifent(handle) != Z_OK) {
2701                 zonecfg_fini_handle(handle);
2702                 return (0);
2703         }
2704 
2705         for (;;) {
2706                 if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2707                         break;
2708 
2709                 if (prof == NULL) {
2710                         if (zone_get_devroot(zone_name, rootpath,
2711                             sizeof (rootpath)) != Z_OK) {
2712                                 (void) zonecfg_endnwifent(handle);
2713                                 zonecfg_fini_handle(handle);
2714                                 zerror(zlogp, B_TRUE,
2715                                     "unable to determine dev root");
2716                                 return (-1);
2717                         }
2718                         (void) snprintf(path, sizeof (path), "%s%s", rootpath,
2719                             "/dev");
2720                         if (di_prof_init(path, &prof) != 0) {
2721                                 (void) zonecfg_endnwifent(handle);
2722                                 zonecfg_fini_handle(handle);
2723                                 zerror(zlogp, B_TRUE,
2724                                     "failed to initialize profile");
2725                                 return (-1);
2726                         }
2727                 }
2728 
2729                 /*
2730                  * Create the /dev entry for backward compatibility.
2731                  * Only create the /dev entry if it's not in use.
2732                  * Note that the zone still boots when the assigned
2733                  * interface is inaccessible, used by others, etc.
2734                  * Also, when vanity naming is used, some interface do
2735                  * do not have corresponding /dev node names (for example,
2736                  * vanity named aggregations).  The /dev entry is not
2737                  * created in that case.  The /dev/net entry is always
2738                  * accessible.
2739                  */
2740                 if (add_datalink(zlogp, zoneid, nwiftab.zone_nwif_physical)
2741                     == 0) {
2742                         char            name[DLPI_LINKNAME_MAX];
2743                         datalink_id_t   linkid;
2744 
2745                         if (dladm_name2info(nwiftab.zone_nwif_physical,
2746                             &linkid, NULL, NULL, NULL) == DLADM_STATUS_OK &&
2747                             dladm_linkid2legacyname(linkid, name,
2748                             sizeof (name)) == DLADM_STATUS_OK) {
2749                                 if (di_prof_add_dev(prof, name) != 0) {
2750                                         (void) zonecfg_endnwifent(handle);
2751                                         zonecfg_fini_handle(handle);
2752                                         zerror(zlogp, B_TRUE,
2753                                             "failed to add network device");
2754                                         return (-1);
2755                                 }
2756                                 added = B_TRUE;
2757                         }
2758                 }
2759         }
2760         (void) zonecfg_endnwifent(handle);
2761         zonecfg_fini_handle(handle);
2762 
2763         if (prof != NULL && added) {
2764                 if (di_prof_commit(prof) != 0) {
2765                         zerror(zlogp, B_TRUE, "failed to commit profile");
2766                         return (-1);
2767                 }
2768         }
2769         if (prof != NULL)
2770                 di_prof_fini(prof);
2771 
2772         return (0);
2773 }
2774 
2775 /*
2776  * Get the list of the data-links from kernel, and try to remove it
2777  */
2778 static int
2779 unconfigure_exclusive_network_interfaces_run(zlog_t *zlogp, zoneid_t zoneid)
2780 {
2781         char *dlnames, *ptr;
2782         int dlnum, dlnum_saved, i;
2783 
2784         dlnum = 0;
2785         if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
2786                 zerror(zlogp, B_TRUE, "unable to list network interfaces");
2787                 return (-1);
2788         }
2789 again:
2790         /* this zone doesn't have any data-links */
2791         if (dlnum == 0)
2792                 return (0);
2793 
2794         dlnames = malloc(dlnum * LIFNAMSIZ);
2795         if (dlnames == NULL) {
2796                 zerror(zlogp, B_TRUE, "memory allocation failed");
2797                 return (-1);
2798         }
2799         dlnum_saved = dlnum;
2800 
2801         if (zone_list_datalink(zoneid, &dlnum, dlnames) != 0) {
2802                 zerror(zlogp, B_TRUE, "unable to list network interfaces");
2803                 free(dlnames);
2804                 return (-1);
2805         }
2806         if (dlnum_saved < dlnum) {
2807                 /* list increased, try again */
2808                 free(dlnames);
2809                 goto again;
2810         }
2811         ptr = dlnames;
2812         for (i = 0; i < dlnum; i++) {
2813                 /* Remove access control information */
2814                 if (remove_datalink(zlogp, zoneid, ptr) != 0) {
2815                         free(dlnames);
2816                         return (-1);
2817                 }
2818                 ptr += LIFNAMSIZ;
2819         }
2820         free(dlnames);
2821         return (0);
2822 }
2823 
2824 /*
2825  * Get the list of the data-links from configuration, and try to remove it
2826  */
2827 static int
2828 unconfigure_exclusive_network_interfaces_static(zlog_t *zlogp, zoneid_t zoneid)
2829 {
2830         zone_dochandle_t handle;
2831         struct zone_nwiftab nwiftab;
2832 
2833         if ((handle = zonecfg_init_handle()) == NULL) {
2834                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
2835                 return (-1);
2836         }
2837         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2838                 zerror(zlogp, B_FALSE, "invalid configuration");
2839                 zonecfg_fini_handle(handle);
2840                 return (-1);
2841         }
2842         if (zonecfg_setnwifent(handle) != Z_OK) {
2843                 zonecfg_fini_handle(handle);
2844                 return (0);
2845         }
2846         for (;;) {
2847                 if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2848                         break;
2849                 /* Remove access control information */
2850                 if (remove_datalink(zlogp, zoneid, nwiftab.zone_nwif_physical)
2851                     != 0) {
2852                         (void) zonecfg_endnwifent(handle);
2853                         zonecfg_fini_handle(handle);
2854                         return (-1);
2855                 }
2856         }
2857         (void) zonecfg_endnwifent(handle);
2858         zonecfg_fini_handle(handle);
2859         return (0);
2860 }
2861 
2862 /*
2863  * Remove the access control information from the kernel for the exclusive
2864  * network interfaces.
2865  */
2866 static int
2867 unconfigure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
2868 {
2869         if (unconfigure_exclusive_network_interfaces_run(zlogp, zoneid) != 0) {
2870                 return (unconfigure_exclusive_network_interfaces_static(zlogp,
2871                     zoneid));
2872         }
2873 
2874         return (0);
2875 }
2876 
2877 static int
2878 tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
2879     const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
2880 {
2881         int fd;
2882         struct strioctl ioc;
2883         tcp_ioc_abort_conn_t conn;
2884         int error;
2885 
2886         conn.ac_local = *local;
2887         conn.ac_remote = *remote;
2888         conn.ac_start = TCPS_SYN_SENT;
2889         conn.ac_end = TCPS_TIME_WAIT;
2890         conn.ac_zoneid = zoneid;
2891 
2892         ioc.ic_cmd = TCP_IOC_ABORT_CONN;
2893         ioc.ic_timout = -1; /* infinite timeout */
2894         ioc.ic_len = sizeof (conn);
2895         ioc.ic_dp = (char *)&conn;
2896 
2897         if ((fd = open("/dev/tcp", O_RDONLY)) < 0) {
2898                 zerror(zlogp, B_TRUE, "unable to open %s", "/dev/tcp");
2899                 return (-1);
2900         }
2901 
2902         error = ioctl(fd, I_STR, &ioc);
2903         (void) close(fd);
2904         if (error == 0 || errno == ENOENT)      /* ENOENT is not an error */
2905                 return (0);
2906         return (-1);
2907 }
2908 
2909 static int
2910 tcp_abort_connections(zlog_t *zlogp, zoneid_t zoneid)
2911 {
2912         struct sockaddr_storage l, r;
2913         struct sockaddr_in *local, *remote;
2914         struct sockaddr_in6 *local6, *remote6;
2915         int error;
2916 
2917         /*
2918          * Abort IPv4 connections.
2919          */
2920         bzero(&l, sizeof (*local));
2921         local = (struct sockaddr_in *)&l;
2922         local->sin_family = AF_INET;
2923         local->sin_addr.s_addr = INADDR_ANY;
2924         local->sin_port = 0;
2925 
2926         bzero(&r, sizeof (*remote));
2927         remote = (struct sockaddr_in *)&r;
2928         remote->sin_family = AF_INET;
2929         remote->sin_addr.s_addr = INADDR_ANY;
2930         remote->sin_port = 0;
2931 
2932         if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2933                 return (error);
2934 
2935         /*
2936          * Abort IPv6 connections.
2937          */
2938         bzero(&l, sizeof (*local6));
2939         local6 = (struct sockaddr_in6 *)&l;
2940         local6->sin6_family = AF_INET6;
2941         local6->sin6_port = 0;
2942         local6->sin6_addr = in6addr_any;
2943 
2944         bzero(&r, sizeof (*remote6));
2945         remote6 = (struct sockaddr_in6 *)&r;
2946         remote6->sin6_family = AF_INET6;
2947         remote6->sin6_port = 0;
2948         remote6->sin6_addr = in6addr_any;
2949 
2950         if ((error = tcp_abort_conn(zlogp, zoneid, &l, &am