1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "@(#)vplat.c    1.61    08/05/07 SMI"
  28 
  29 /*
  30  * This module contains functions used to bring up and tear down the
  31  * Virtual Platform: [un]mounting file-systems, [un]plumbing network
  32  * interfaces, [un]configuring devices, establishing resource controls,
  33  * and creating/destroying the zone in the kernel.  These actions, on
  34  * the way up, ready the zone; on the way down, they halt the zone.
  35  * See the much longer block comment at the beginning of zoneadmd.c
  36  * for a bigger picture of how the whole program functions.
  37  *
  38  * This module also has primary responsibility for the layout of "scratch
  39  * zones."  These are mounted, but inactive, zones that are used during
  40  * operating system upgrade and potentially other administrative action.  The
  41  * scratch zone environment is similar to the miniroot environment.  The zone's
  42  * actual root is mounted read-write on /a, and the standard paths (/usr,
  43  * /sbin, /lib) all lead to read-only copies of the running system's binaries.
  44  * This allows the administrative tools to manipulate the zone using "-R /a"
  45  * without relying on any binaries in the zone itself.
  46  *
  47  * If the scratch zone is on an alternate root (Live Upgrade [LU] boot
  48  * environment), then we must resolve the lofs mounts used there to uncover
  49  * writable (unshared) resources.  Shared resources, though, are always
  50  * read-only.  In addition, if the "same" zone with a different root path is
  51  * currently running, then "/b" inside the zone points to the running zone's
  52  * root.  This allows LU to synchronize configuration files during the upgrade
  53  * process.
  54  *
  55  * To construct this environment, this module creates a tmpfs mount on
  56  * $ZONEPATH/lu.  Inside this scratch area, the miniroot-like environment as
  57  * described above is constructed on the fly.  The zone is then created using
  58  * $ZONEPATH/lu as the root.
  59  *
  60  * Note that scratch zones are inactive.  The zone's bits are not running and
  61  * likely cannot be run correctly until upgrade is done.  Init is not running
  62  * there, nor is SMF.  Because of this, the "mounted" state of a scratch zone
  63  * is not a part of the usual halt/ready/boot state machine.
  64  */
  65 
  66 #include <sys/param.h>
  67 #include <sys/mount.h>
  68 #include <sys/mntent.h>
  69 #include <sys/socket.h>
  70 #include <sys/utsname.h>
  71 #include <sys/types.h>
  72 #include <sys/stat.h>
  73 #include <sys/sockio.h>
  74 #include <sys/stropts.h>
  75 #include <sys/conf.h>
  76 
  77 #include <sys/dlpi.h>
  78 #include <libdlpi.h>
  79 #include <libdllink.h>
  80 #include <libdlvlan.h>
  81 
  82 #include <inet/tcp.h>
  83 #include <arpa/inet.h>
  84 #include <netinet/in.h>
  85 #include <net/route.h>
  86 
  87 #include <stdio.h>
  88 #include <errno.h>
  89 #include <fcntl.h>
  90 #include <unistd.h>
  91 #include <rctl.h>
  92 #include <stdlib.h>
  93 #include <string.h>
  94 #include <strings.h>
  95 #include <wait.h>
  96 #include <limits.h>
  97 #include <libgen.h>
  98 #include <libzfs.h>
  99 #include <libdevinfo.h>
 100 #include <zone.h>
 101 #include <assert.h>
 102 #include <libcontract.h>
 103 #include <libcontract_priv.h>
 104 #include <uuid/uuid.h>
 105 
 106 #include <sys/mntio.h>
 107 #include <sys/mnttab.h>
 108 #include <sys/fs/autofs.h>        /* for _autofssys() */
 109 #include <sys/fs/lofs_info.h>
 110 #include <sys/fs/zfs.h>
 111 
 112 #include <pool.h>
 113 #include <sys/pool.h>
 114 #include <sys/priocntl.h>
 115 
 116 #include <libbrand.h>
 117 #include <sys/brand.h>
 118 #include <libzonecfg.h>
 119 #include <synch.h>
 120 
 121 #include "zoneadmd.h"
 122 #include <tsol/label.h>
 123 #include <libtsnet.h>
 124 #include <sys/priv.h>
 125 
/*
 * NOTE(review): presumably the widths, in bits, of IPv4 and IPv6 addresses;
 * the code that uses these is outside this part of the file -- confirm.
 */
#define V4_ADDR_LEN     32
#define V6_ADDR_LEN     128

/*
 * The MNTOPT_RO, MNTOPT_LOFS_NOSUB and MNTOPT_NODEVICES mount options joined
 * into one comma-separated option string.  check_lofs_needed() installs this
 * string as the only option of a mount it converts to lofs.
 */
#define IPD_DEFAULT_OPTS \
        MNTOPT_RO "," MNTOPT_LOFS_NOSUB "," MNTOPT_NODEVICES

/* File read by get_remote_fstypes() to list the remote file-system types. */
#define DFSTYPES        "/etc/dfs/fstypes"
/*
 * NOTE(review): no user of MAXTNZLEN is visible here; presumably a buffer
 * size for Trusted Extensions zone configuration entries -- confirm.
 */
#define MAXTNZLEN       2048

/* True for every mount command other than Z_MNT_BOOT. */
#define ALT_MOUNT(mount_cmd)    ((mount_cmd) != Z_MNT_BOOT)

/* for routing socket */
static int rts_seqno = 0;

/* mangled zone name when mounting in an alternate root environment */
static char kernzone[ZONENAME_MAX];

/*
 * array of cached mount entries for resolve_lofs; resolve_lofs_mnts is the
 * first element and resolve_lofs_mnt_max points one past the last.  Both are
 * NULL while nothing is cached.
 */
static struct mnttab *resolve_lofs_mnts, *resolve_lofs_mnt_max;

/* for Trusted Extensions */
static tsol_zcent_t *get_zone_label(zlog_t *, priv_set_t *);
static int tsol_mounts(zlog_t *, char *, char *);
static void tsol_unmounts(zlog_t *, char *);

static m_label_t *zlabel = NULL;
static m_label_t *zid_label = NULL;
static priv_set_t *zprivs = NULL;

/* from libsocket, not in any header file */
extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);

/*
 * An optimization for build_mnttable: reallocate (and potentially copy the
 * data) only once every N times through the loop.
 */
#define MNTTAB_HUNK     32

/*
 * Private autofs system call
 */
extern int _autofssys(int, void *);
 168 
 169 static int
 170 autofs_cleanup(zoneid_t zoneid)
 171 {
 172         /*
 173          * Ask autofs to unmount all trigger nodes in the given zone.
 174          */
 175         return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
 176 }
 177 
 178 static void
 179 free_mnttable(struct mnttab *mnt_array, uint_t nelem)
 180 {
 181         uint_t i;
 182 
 183         if (mnt_array == NULL)
 184                 return;
 185         for (i = 0; i < nelem; i++) {
 186                 free(mnt_array[i].mnt_mountp);
 187                 free(mnt_array[i].mnt_fstype);
 188                 free(mnt_array[i].mnt_special);
 189                 free(mnt_array[i].mnt_mntopts);
 190                 assert(mnt_array[i].mnt_time == NULL);
 191         }
 192         free(mnt_array);
 193 }
 194 
 195 /*
 196  * Build the mount table for the zone rooted at "zroot", storing the resulting
 197  * array of struct mnttabs in "mnt_arrayp" and the number of elements in the
 198  * array in "nelemp".
 199  */
 200 static int
 201 build_mnttable(zlog_t *zlogp, const char *zroot, size_t zrootlen, FILE *mnttab,
 202     struct mnttab **mnt_arrayp, uint_t *nelemp)
 203 {
 204         struct mnttab mnt;
 205         struct mnttab *mnts;
 206         struct mnttab *mnp;
 207         uint_t nmnt;
 208 
 209         rewind(mnttab);
 210         resetmnttab(mnttab);
 211         nmnt = 0;
 212         mnts = NULL;
 213         while (getmntent(mnttab, &mnt) == 0) {
 214                 struct mnttab *tmp_array;
 215 
 216                 if (strncmp(mnt.mnt_mountp, zroot, zrootlen) != 0)
 217                         continue;
 218                 if (nmnt % MNTTAB_HUNK == 0) {
 219                         tmp_array = realloc(mnts,
 220                             (nmnt + MNTTAB_HUNK) * sizeof (*mnts));
 221                         if (tmp_array == NULL) {
 222                                 free_mnttable(mnts, nmnt);
 223                                 return (-1);
 224                         }
 225                         mnts = tmp_array;
 226                 }
 227                 mnp = &mnts[nmnt++];
 228 
 229                 /*
 230                  * Zero out any fields we're not using.
 231                  */
 232                 (void) memset(mnp, 0, sizeof (*mnp));
 233 
 234                 if (mnt.mnt_special != NULL)
 235                         mnp->mnt_special = strdup(mnt.mnt_special);
 236                 if (mnt.mnt_mntopts != NULL)
 237                         mnp->mnt_mntopts = strdup(mnt.mnt_mntopts);
 238                 mnp->mnt_mountp = strdup(mnt.mnt_mountp);
 239                 mnp->mnt_fstype = strdup(mnt.mnt_fstype);
 240                 if ((mnt.mnt_special != NULL && mnp->mnt_special == NULL) ||
 241                     (mnt.mnt_mntopts != NULL && mnp->mnt_mntopts == NULL) ||
 242                     mnp->mnt_mountp == NULL || mnp->mnt_fstype == NULL) {
 243                         zerror(zlogp, B_TRUE, "memory allocation failed");
 244                         free_mnttable(mnts, nmnt);
 245                         return (-1);
 246                 }
 247         }
 248         *mnt_arrayp = mnts;
 249         *nelemp = nmnt;
 250         return (0);
 251 }
 252 
 253 /*
 254  * This is an optimization.  The resolve_lofs function is used quite frequently
 255  * to manipulate file paths, and on a machine with a large number of zones,
 256  * there will be a huge number of mounted file systems.  Thus, we trigger a
 257  * reread of the list of mount points
 258  */
 259 static void
 260 lofs_discard_mnttab(void)
 261 {
 262         free_mnttable(resolve_lofs_mnts,
 263             resolve_lofs_mnt_max - resolve_lofs_mnts);
 264         resolve_lofs_mnts = resolve_lofs_mnt_max = NULL;
 265 }
 266 
 267 static int
 268 lofs_read_mnttab(zlog_t *zlogp)
 269 {
 270         FILE *mnttab;
 271         uint_t nmnts;
 272 
 273         if ((mnttab = fopen(MNTTAB, "r")) == NULL)
 274                 return (-1);
 275         if (build_mnttable(zlogp, "", 0, mnttab, &resolve_lofs_mnts,
 276             &nmnts) == -1) {
 277                 (void) fclose(mnttab);
 278                 return (-1);
 279         }
 280         (void) fclose(mnttab);
 281         resolve_lofs_mnt_max = resolve_lofs_mnts + nmnts;
 282         return (0);
 283 }
 284 
 285 /*
 286  * This function loops over potential loopback mounts and symlinks in a given
 287  * path and resolves them all down to an absolute path.
 288  */
 289 void
 290 resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen)
 291 {
 292         int len, arlen;
 293         const char *altroot;
 294         char tmppath[MAXPATHLEN];
 295         boolean_t outside_altroot;
 296 
 297         if ((len = resolvepath(path, tmppath, sizeof (tmppath))) == -1)
 298                 return;
 299         tmppath[len] = '\0';
 300         (void) strlcpy(path, tmppath, sizeof (tmppath));
 301 
 302         /* This happens once per zoneadmd operation. */
 303         if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
 304                 return;
 305 
 306         altroot = zonecfg_get_root();
 307         arlen = strlen(altroot);
 308         outside_altroot = B_FALSE;
 309         for (;;) {
 310                 struct mnttab *mnp;
 311 
 312                 /* Search in reverse order to find longest match */
 313                 for (mnp = resolve_lofs_mnt_max - 1; mnp >= resolve_lofs_mnts;
 314                     mnp--) {
 315                         if (mnp->mnt_fstype == NULL ||
 316                             mnp->mnt_mountp == NULL ||
 317                             mnp->mnt_special == NULL)
 318                                 continue;
 319                         len = strlen(mnp->mnt_mountp);
 320                         if (strncmp(mnp->mnt_mountp, path, len) == 0 &&
 321                             (path[len] == '/' || path[len] == '\0'))
 322                                 break;
 323                 }
 324                 if (mnp < resolve_lofs_mnts)
 325                         break;
 326                 /* If it's not a lofs then we're done */
 327                 if (strcmp(mnp->mnt_fstype, MNTTYPE_LOFS) != 0)
 328                         break;
 329                 if (outside_altroot) {
 330                         char *cp;
 331                         int olen = sizeof (MNTOPT_RO) - 1;
 332 
 333                         /*
 334                          * If we run into a read-only mount outside of the
 335                          * alternate root environment, then the user doesn't
 336                          * want this path to be made read-write.
 337                          */
 338                         if (mnp->mnt_mntopts != NULL &&
 339                             (cp = strstr(mnp->mnt_mntopts, MNTOPT_RO)) !=
 340                             NULL &&
 341                             (cp == mnp->mnt_mntopts || cp[-1] == ',') &&
 342                             (cp[olen] == '\0' || cp[olen] == ',')) {
 343                                 break;
 344                         }
 345                 } else if (arlen > 0 &&
 346                     (strncmp(mnp->mnt_special, altroot, arlen) != 0 ||
 347                     (mnp->mnt_special[arlen] != '\0' &&
 348                     mnp->mnt_special[arlen] != '/'))) {
 349                         outside_altroot = B_TRUE;
 350                 }
 351                 /* use temporary buffer because new path might be longer */
 352                 (void) snprintf(tmppath, sizeof (tmppath), "%s%s",
 353                     mnp->mnt_special, path + len);
 354                 if ((len = resolvepath(tmppath, path, pathlen)) == -1)
 355                         break;
 356                 path[len] = '\0';
 357         }
 358 }
 359 
 360 /*
 361  * For a regular mount, check if a replacement lofs mount is needed because the
 362  * referenced device is already mounted somewhere.
 363  */
 364 static int
 365 check_lofs_needed(zlog_t *zlogp, struct zone_fstab *fsptr)
 366 {
 367         struct mnttab *mnp;
 368         zone_fsopt_t *optptr, *onext;
 369 
 370         /* This happens once per zoneadmd operation. */
 371         if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
 372                 return (-1);
 373 
 374         /*
 375          * If this special node isn't already in use, then it's ours alone;
 376          * no need to worry about conflicting mounts.
 377          */
 378         for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
 379             mnp++) {
 380                 if (strcmp(mnp->mnt_special, fsptr->zone_fs_special) == 0)
 381                         break;
 382         }
 383         if (mnp >= resolve_lofs_mnt_max)
 384                 return (0);
 385 
 386         /*
 387          * Convert this duplicate mount into a lofs mount.
 388          */
 389         (void) strlcpy(fsptr->zone_fs_special, mnp->mnt_mountp,
 390             sizeof (fsptr->zone_fs_special));
 391         (void) strlcpy(fsptr->zone_fs_type, MNTTYPE_LOFS,
 392             sizeof (fsptr->zone_fs_type));
 393         fsptr->zone_fs_raw[0] = '\0';
 394 
 395         /*
 396          * Discard all but one of the original options and set that to be the
 397          * same set of options used for inherit package directory resources.
 398          */
 399         optptr = fsptr->zone_fs_options;
 400         if (optptr == NULL) {
 401                 optptr = malloc(sizeof (*optptr));
 402                 if (optptr == NULL) {
 403                         zerror(zlogp, B_TRUE, "cannot mount %s",
 404                             fsptr->zone_fs_dir);
 405                         return (-1);
 406                 }
 407         } else {
 408                 while ((onext = optptr->zone_fsopt_next) != NULL) {
 409                         optptr->zone_fsopt_next = onext->zone_fsopt_next;
 410                         free(onext);
 411                 }
 412         }
 413         (void) strcpy(optptr->zone_fsopt_opt, IPD_DEFAULT_OPTS);
 414         optptr->zone_fsopt_next = NULL;
 415         fsptr->zone_fs_options = optptr;
 416         return (0);
 417 }
 418 
 419 int
 420 make_one_dir(zlog_t *zlogp, const char *prefix, const char *subdir, mode_t mode,
 421     uid_t userid, gid_t groupid)
 422 {
 423         char path[MAXPATHLEN];
 424         struct stat st;
 425 
 426         if (snprintf(path, sizeof (path), "%s%s", prefix, subdir) >
 427             sizeof (path)) {
 428                 zerror(zlogp, B_FALSE, "pathname %s%s is too long", prefix,
 429                     subdir);
 430                 return (-1);
 431         }
 432 
 433         if (lstat(path, &st) == 0) {
 434                 /*
 435                  * We don't check the file mode since presumably the zone
 436                  * administrator may have had good reason to change the mode,
 437                  * and we don't need to second guess him.
 438                  */
 439                 if (!S_ISDIR(st.st_mode)) {
 440                         if (is_system_labeled() &&
 441                             S_ISREG(st.st_mode)) {
 442                                 /*
 443                                  * The need to mount readonly copies of
 444                                  * global zone /etc/ files is unique to
 445                                  * Trusted Extensions.
 446                                  */
 447                                 if (strncmp(subdir, "/etc/",
 448                                     strlen("/etc/")) != 0) {
 449                                         zerror(zlogp, B_FALSE,
 450                                             "%s is not in /etc", path);
 451                                         return (-1);
 452                                 }
 453                         } else {
 454                                 zerror(zlogp, B_FALSE,
 455                                     "%s is not a directory", path);
 456                                 return (-1);
 457                         }
 458                 }
 459                 return (0);
 460         }
 461 
 462         if (mkdirp(path, mode) != 0) {
 463                 if (errno == EROFS)
 464                         zerror(zlogp, B_FALSE, "Could not mkdir %s.\nIt is on "
 465                             "a read-only file system in this local zone.\nMake "
 466                             "sure %s exists in the global zone.", path, subdir);
 467                 else
 468                         zerror(zlogp, B_TRUE, "mkdirp of %s failed", path);
 469                 return (-1);
 470         }
 471 
 472         (void) chown(path, userid, groupid);
 473         return (0);
 474 }
 475 
 476 static void
 477 free_remote_fstypes(char **types)
 478 {
 479         uint_t i;
 480 
 481         if (types == NULL)
 482                 return;
 483         for (i = 0; types[i] != NULL; i++)
 484                 free(types[i]);
 485         free(types);
 486 }
 487 
 488 static char **
 489 get_remote_fstypes(zlog_t *zlogp)
 490 {
 491         char **types = NULL;
 492         FILE *fp;
 493         char buf[MAXPATHLEN];
 494         char fstype[MAXPATHLEN];
 495         uint_t lines = 0;
 496         uint_t i;
 497 
 498         if ((fp = fopen(DFSTYPES, "r")) == NULL) {
 499                 zerror(zlogp, B_TRUE, "failed to open %s", DFSTYPES);
 500                 return (NULL);
 501         }
 502         /*
 503          * Count the number of lines
 504          */
 505         while (fgets(buf, sizeof (buf), fp) != NULL)
 506                 lines++;
 507         if (lines == 0) /* didn't read anything; empty file */
 508                 goto out;
 509         rewind(fp);
 510         /*
 511          * Allocate enough space for a NULL-terminated array.
 512          */
 513         types = calloc(lines + 1, sizeof (char *));
 514         if (types == NULL) {
 515                 zerror(zlogp, B_TRUE, "memory allocation failed");
 516                 goto out;
 517         }
 518         i = 0;
 519         while (fgets(buf, sizeof (buf), fp) != NULL) {
 520                 /* LINTED - fstype is big enough to hold buf */
 521                 if (sscanf(buf, "%s", fstype) == 0) {
 522                         zerror(zlogp, B_FALSE, "unable to parse %s", DFSTYPES);
 523                         free_remote_fstypes(types);
 524                         types = NULL;
 525                         goto out;
 526                 }
 527                 types[i] = strdup(fstype);
 528                 if (types[i] == NULL) {
 529                         zerror(zlogp, B_TRUE, "memory allocation failed");
 530                         free_remote_fstypes(types);
 531                         types = NULL;
 532                         goto out;
 533                 }
 534                 i++;
 535         }
 536 out:
 537         (void) fclose(fp);
 538         return (types);
 539 }
 540 
 541 static boolean_t
 542 is_remote_fstype(const char *fstype, char *const *remote_fstypes)
 543 {
 544         uint_t i;
 545 
 546         if (remote_fstypes == NULL)
 547                 return (B_FALSE);
 548         for (i = 0; remote_fstypes[i] != NULL; i++) {
 549                 if (strcmp(remote_fstypes[i], fstype) == 0)
 550                         return (B_TRUE);
 551         }
 552         return (B_FALSE);
 553 }
 554 
 555 /*
 556  * This converts a zone root path (normally of the form .../root) to a Live
 557  * Upgrade scratch zone root (of the form .../lu).
 558  */
 559 static void
 560 root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
 561 {
 562         assert(zone_isnative || zone_iscluster);
 563 
 564         if (!isresolved && zonecfg_in_alt_root())
 565                 resolve_lofs(zlogp, zroot, zrootlen);
 566         (void) strcpy(strrchr(zroot, '/') + 1, "lu");
 567 }
 568 
 569 /*
 570  * The general strategy for unmounting filesystems is as follows:
 571  *
 572  * - Remote filesystems may be dead, and attempting to contact them as
 573  * part of a regular unmount may hang forever; we want to always try to
 574  * forcibly unmount such filesystems and only fall back to regular
 575  * unmounts if the filesystem doesn't support forced unmounts.
 576  *
 577  * - We don't want to unnecessarily corrupt metadata on local
 578  * filesystems (ie UFS), so we want to start off with graceful unmounts,
 579  * and only escalate to doing forced unmounts if we get stuck.
 580  *
 581  * We start off walking backwards through the mount table.  This doesn't
 582  * give us strict ordering but ensures that we try to unmount submounts
 583  * first.  We thus limit the number of failed umount2(2) calls.
 584  *
 585  * The mechanism for determining if we're stuck is to count the number
 586  * of failed unmounts each iteration through the mount table.  This
 587  * gives us an upper bound on the number of filesystems which remain
 588  * mounted (autofs trigger nodes are dealt with separately).  If at the
 589  * end of one unmount+autofs_cleanup cycle we still have the same number
 590  * of mounts that we started out with, we're stuck and try a forced
 591  * unmount.  If that fails (filesystem doesn't support forced unmounts)
 592  * then we bail and are unable to teardown the zone.  If it succeeds,
 593  * we're no longer stuck so we continue with our policy of trying
 594  * graceful mounts first.
 595  *
 596  * Zone must be down (ie, no processes or threads active).
 597  */
 598 static int
 599 unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
 600 {
 601         int error = 0;
 602         FILE *mnttab;
 603         struct mnttab *mnts;
 604         uint_t nmnt;
 605         char zroot[MAXPATHLEN + 1];
 606         size_t zrootlen;
 607         uint_t oldcount = UINT_MAX;
 608         boolean_t stuck = B_FALSE;
 609         char **remote_fstypes = NULL;
 610 
 611         if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
 612                 zerror(zlogp, B_FALSE, "unable to determine zone root");
 613                 return (-1);
 614         }
 615         if (unmount_cmd)
 616                 root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
 617 
 618         (void) strcat(zroot, "/");
 619         zrootlen = strlen(zroot);
 620 
 621         /*
 622          * For Trusted Extensions unmount each higher level zone's mount
 623          * of our zone's /export/home
 624          */
 625         if (!unmount_cmd)
 626                 tsol_unmounts(zlogp, zone_name);
 627 
 628         if ((mnttab = fopen(MNTTAB, "r")) == NULL) {
 629                 zerror(zlogp, B_TRUE, "failed to open %s", MNTTAB);
 630                 return (-1);
 631         }
 632         /*
 633          * Use our hacky mntfs ioctl so we see everything, even mounts with
 634          * MS_NOMNTTAB.
 635          */
 636         if (ioctl(fileno(mnttab), MNTIOC_SHOWHIDDEN, NULL) < 0) {
 637                 zerror(zlogp, B_TRUE, "unable to configure %s", MNTTAB);
 638                 error++;
 639                 goto out;
 640         }
 641 
 642         /*
 643          * Build the list of remote fstypes so we know which ones we
 644          * should forcibly unmount.
 645          */
 646         remote_fstypes = get_remote_fstypes(zlogp);
 647         for (; /* ever */; ) {
 648                 uint_t newcount = 0;
 649                 boolean_t unmounted;
 650                 struct mnttab *mnp;
 651                 char *path;
 652                 uint_t i;
 653 
 654                 mnts = NULL;
 655                 nmnt = 0;
 656                 /*
 657                  * MNTTAB gives us a way to walk through mounted
 658                  * filesystems; we need to be able to walk them in
 659                  * reverse order, so we build a list of all mounted
 660                  * filesystems.
 661                  */
 662                 if (build_mnttable(zlogp, zroot, zrootlen, mnttab, &mnts,
 663                     &nmnt) != 0) {
 664                         error++;
 665                         goto out;
 666                 }
 667                 for (i = 0; i < nmnt; i++) {
 668                         mnp = &mnts[nmnt - i - 1]; /* access in reverse order */
 669                         path = mnp->mnt_mountp;
 670                         unmounted = B_FALSE;
 671                         /*
 672                          * Try forced unmount first for remote filesystems.
 673                          *
 674                          * Not all remote filesystems support forced unmounts,
 675                          * so if this fails (ENOTSUP) we'll continue on
 676                          * and try a regular unmount.
 677                          */
 678                         if (is_remote_fstype(mnp->mnt_fstype, remote_fstypes)) {
 679                                 if (umount2(path, MS_FORCE) == 0)
 680                                         unmounted = B_TRUE;
 681                         }
 682                         /*
 683                          * Try forced unmount if we're stuck.
 684                          */
 685                         if (stuck) {
 686                                 if (umount2(path, MS_FORCE) == 0) {
 687                                         unmounted = B_TRUE;
 688                                         stuck = B_FALSE;
 689                                 } else {
 690                                         /*
 691                                          * The first failure indicates a
 692                                          * mount we won't be able to get
 693                                          * rid of automatically, so we
 694                                          * bail.
 695                                          */
 696                                         error++;
 697                                         zerror(zlogp, B_FALSE,
 698                                             "unable to unmount '%s'", path);
 699                                         free_mnttable(mnts, nmnt);
 700                                         goto out;
 701                                 }
 702                         }
 703                         /*
 704                          * Try regular unmounts for everything else.
 705                          */
 706                         if (!unmounted && umount2(path, 0) != 0)
 707                                 newcount++;
 708                 }
 709                 free_mnttable(mnts, nmnt);
 710 
 711                 if (newcount == 0)
 712                         break;
 713                 if (newcount >= oldcount) {
 714                         /*
 715                          * Last round didn't unmount anything; we're stuck and
 716                          * should start trying forced unmounts.
 717                          */
 718                         stuck = B_TRUE;
 719                 }
 720                 oldcount = newcount;
 721 
 722                 /*
 723                  * Autofs doesn't let you unmount its trigger nodes from
 724                  * userland so we have to tell the kernel to cleanup for us.
 725                  */
 726                 if (autofs_cleanup(zoneid) != 0) {
 727                         zerror(zlogp, B_TRUE, "unable to remove autofs nodes");
 728                         error++;
 729                         goto out;
 730                 }
 731         }
 732 
 733 out:
 734         free_remote_fstypes(remote_fstypes);
 735         (void) fclose(mnttab);
 736         return (error ? -1 : 0);
 737 }
 738 
 739 static int
 740 fs_compare(const void *m1, const void *m2)
 741 {
 742         struct zone_fstab *i = (struct zone_fstab *)m1;
 743         struct zone_fstab *j = (struct zone_fstab *)m2;
 744 
 745         return (strcmp(i->zone_fs_dir, j->zone_fs_dir));
 746 }
 747 
 748 /*
 749  * Fork and exec (and wait for) the mentioned binary with the provided
 750  * arguments.  Returns (-1) if something went wrong with fork(2) or exec(2),
 751  * returns the exit status otherwise.
 752  *
 753  * If we were unable to exec the provided pathname (for whatever
 754  * reason), we return the special token ZEXIT_EXEC.  The current value
 755  * of ZEXIT_EXEC doesn't conflict with legitimate exit codes of the
 756  * consumers of this function; any future consumers must make sure this
 757  * remains the case.
 758  */
 759 static int
 760 forkexec(zlog_t *zlogp, const char *path, char *const argv[])
 761 {
 762         pid_t child_pid;
 763         int child_status = 0;
 764 
 765         /*
 766          * Do not let another thread localize a message while we are forking.
 767          */
 768         (void) mutex_lock(&msglock);
 769         child_pid = fork();
 770         (void) mutex_unlock(&msglock);
 771         if (child_pid == -1) {
 772                 zerror(zlogp, B_TRUE, "could not fork for %s", argv[0]);
 773                 return (-1);
 774         } else if (child_pid == 0) {
 775                 closefrom(0);
 776                 /* redirect stdin, stdout & stderr to /dev/null */
 777                 (void) open("/dev/null", O_RDONLY);     /* stdin */
 778                 (void) open("/dev/null", O_WRONLY);     /* stdout */
 779                 (void) open("/dev/null", O_WRONLY);     /* stderr */
 780                 (void) execv(path, argv);
 781                 /*
 782                  * Since we are in the child, there is no point calling zerror()
 783                  * since there is nobody waiting to consume it.  So exit with a
 784                  * special code that the parent will recognize and call zerror()
 785                  * accordingly.
 786                  */
 787 
 788                 _exit(ZEXIT_EXEC);
 789         } else {
 790                 (void) waitpid(child_pid, &child_status, 0);
 791         }
 792 
 793         if (WIFSIGNALED(child_status)) {
 794                 zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
 795                     "signal %d", path, WTERMSIG(child_status));
 796                 return (-1);
 797         }
 798         assert(WIFEXITED(child_status));
 799         if (WEXITSTATUS(child_status) == ZEXIT_EXEC) {
 800                 zerror(zlogp, B_FALSE, "failed to exec %s", path);
 801                 return (-1);
 802         }
 803         return (WEXITSTATUS(child_status));
 804 }
 805 
 806 static int
 807 isregfile(const char *path)
 808 {
 809         struct stat64 st;
 810 
 811         if (stat64(path, &st) == -1)
 812                 return (-1);
 813 
 814         return (S_ISREG(st.st_mode));
 815 }
 816 
 817 static int
 818 dofsck(zlog_t *zlogp, const char *fstype, const char *rawdev)
 819 {
 820         char cmdbuf[MAXPATHLEN];
 821         char *argv[4];
 822         int status;
 823 
 824         /*
 825          * We could alternatively have called /usr/sbin/fsck -F <fstype>, but
 826          * that would cost us an extra fork/exec without buying us anything.
 827          */
 828         if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/fsck", fstype)
 829             >= sizeof (cmdbuf)) {
 830                 zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
 831                 return (-1);
 832         }
 833 
 834         /*
 835          * If it doesn't exist, that's OK: we verified this previously.
 836          */
 837         if (isregfile(cmdbuf) == -1)
 838                 return (0);
 839 
 840         argv[0] = "fsck";
 841         argv[1] = "-m";
 842         argv[2] = (char *)rawdev;
 843         argv[3] = NULL;
 844 
 845         status = forkexec(zlogp, cmdbuf, argv);
 846         if (status == 0 || status == -1)
 847                 return (status);
 848         zerror(zlogp, B_FALSE, "fsck of '%s' failed with exit status %d; "
 849             "run fsck manually", rawdev, status);
 850         return (-1);
 851 }
 852 
 853 static int
 854 domount(zlog_t *zlogp, const char *fstype, const char *opts,
 855     const char *special, const char *directory)
 856 {
 857         char cmdbuf[MAXPATHLEN];
 858         char *argv[6];
 859         int status;
 860 
 861         /*
 862          * We could alternatively have called /usr/sbin/mount -F <fstype>, but
 863          * that would cost us an extra fork/exec without buying us anything.
 864          */
 865         if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/mount", fstype)
 866             >= sizeof (cmdbuf)) {
 867                 zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
 868                 return (-1);
 869         }
 870         argv[0] = "mount";
 871         if (opts[0] == '\0') {
 872                 argv[1] = (char *)special;
 873                 argv[2] = (char *)directory;
 874                 argv[3] = NULL;
 875         } else {
 876                 argv[1] = "-o";
 877                 argv[2] = (char *)opts;
 878                 argv[3] = (char *)special;
 879                 argv[4] = (char *)directory;
 880                 argv[5] = NULL;
 881         }
 882 
 883         status = forkexec(zlogp, cmdbuf, argv);
 884         if (status == 0 || status == -1)
 885                 return (status);
 886         if (opts[0] == '\0')
 887                 zerror(zlogp, B_FALSE, "\"%s %s %s\" "
 888                     "failed with exit code %d",
 889                     cmdbuf, special, directory, status);
 890         else
 891                 zerror(zlogp, B_FALSE, "\"%s -o %s %s %s\" "
 892                     "failed with exit code %d",
 893                     cmdbuf, opts, special, directory, status);
 894         return (-1);
 895 }
 896 
 897 /*
 898  * Check if a given mount point path exists.
 899  * If it does, make sure it doesn't contain any symlinks.
 900  * Note that if "leaf" is false we're checking an intermediate
 901  * component of the mount point path, so it must be a directory.
 902  * If "leaf" is true, then we're checking the entire mount point
 903  * path, so the mount point itself can be anything aside from a
 904  * symbolic link.
 905  *
 906  * If the path is invalid then a negative value is returned.  If the
 907  * path exists and is a valid mount point path then 0 is returned.
 908  * If the path doesn't exist return a positive value.
 909  */
 910 static int
 911 valid_mount_point(zlog_t *zlogp, const char *path, const boolean_t leaf)
 912 {
 913         struct stat statbuf;
 914         char respath[MAXPATHLEN];
 915         int res;
 916 
 917         if (lstat(path, &statbuf) != 0) {
 918                 if (errno == ENOENT)
 919                         return (1);
 920                 zerror(zlogp, B_TRUE, "can't stat %s", path);
 921                 return (-1);
 922         }
 923         if (S_ISLNK(statbuf.st_mode)) {
 924                 zerror(zlogp, B_FALSE, "%s is a symlink", path);
 925                 return (-1);
 926         }
 927         if (!leaf && !S_ISDIR(statbuf.st_mode)) {
 928                 zerror(zlogp, B_FALSE, "%s is not a directory", path);
 929                 return (-1);
 930         }
 931         if ((res = resolvepath(path, respath, sizeof (respath))) == -1) {
 932                 zerror(zlogp, B_TRUE, "unable to resolve path %s", path);
 933                 return (-1);
 934         }
 935         respath[res] = '\0';
 936         if (strcmp(path, respath) != 0) {
 937                 /*
 938                  * We don't like ".."s, "."s, or "//"s throwing us off
 939                  */
 940                 zerror(zlogp, B_FALSE, "%s is not a canonical path", path);
 941                 return (-1);
 942         }
 943         return (0);
 944 }
 945 
 946 /*
 947  * Validate a mount point path.  A valid mount point path is an
 948  * absolute path that either doesn't exist, or, if it does exists it
 949  * must be an absolute canonical path that doesn't have any symbolic
 950  * links in it.  The target of a mount point path can be any filesystem
 951  * object.  (Different filesystems can support different mount points,
 952  * for example "lofs" and "mntfs" both support files and directories
 953  * while "ufs" just supports directories.)
 954  *
 955  * If the path is invalid then a negative value is returned.  If the
 956  * path exists and is a valid mount point path then 0 is returned.
 957  * If the path doesn't exist return a positive value.
 958  */
 959 int
 960 valid_mount_path(zlog_t *zlogp, const char *rootpath, const char *spec,
 961     const char *dir, const char *fstype)
 962 {
 963         char abspath[MAXPATHLEN], *slashp, *slashp_next;
 964         int rv;
 965 
 966         /*
 967          * Sanity check the target mount point path.
 968          * It must be a non-null string that starts with a '/'.
 969          */
 970         if (dir[0] != '/') {
 971                 if (spec[0] == '\0') {
 972                         /*
 973                          * This must be an invalid ipd entry (see comments
 974                          * in mount_filesystems_ipdent()).
 975                          */
 976                         zerror(zlogp, B_FALSE,
 977                             "invalid inherit-pkg-dir entry: \"%s\"", dir);
 978                 } else {
 979                         /* Something went wrong. */
 980                         zerror(zlogp, B_FALSE, "invalid mount directory, "
 981                             "type: \"%s\", special: \"%s\", dir: \"%s\"",
 982                             fstype, spec, dir);
 983                 }
 984                 return (-1);
 985         }
 986 
 987         /*
 988          * Join rootpath and dir.  Make sure abspath ends with '/', this
 989          * is added to all paths (even non-directory paths) to allow us
 990          * to detect the end of paths below.  If the path already ends
 991          * in a '/', then that's ok too (although we'll fail the
 992          * cannonical path check in valid_mount_point()).
 993          */
 994         if (snprintf(abspath, sizeof (abspath),
 995             "%s%s/", rootpath, dir) >= sizeof (abspath)) {
 996                 zerror(zlogp, B_FALSE, "pathname %s%s is too long",
 997                     rootpath, dir);
 998                 return (-1);
 999         }
1000 
1001         /*
1002          * Starting with rootpath, verify the mount path one component
1003          * at a time.  Continue until we've evaluated all of abspath.
1004          */
1005         slashp = &abspath[strlen(rootpath)];
1006         assert(*slashp == '/');
1007         do {
1008                 slashp_next = strchr(slashp + 1, '/');
1009                 *slashp = '\0';
1010                 if (slashp_next != NULL) {
1011                         /* This is an intermediary mount path component. */
1012                         rv = valid_mount_point(zlogp, abspath, B_FALSE);
1013                 } else {
1014                         /* This is the last component of the mount path. */
1015                         rv = valid_mount_point(zlogp, abspath, B_TRUE);
1016                 }
1017                 if (rv < 0)
1018                         return (rv);
1019                 *slashp = '/';
1020         } while ((slashp = slashp_next) != NULL);
1021         return (rv);
1022 }
1023 
1024 static int
1025 mount_one_dev_device_cb(void *arg, const char *match, const char *name)
1026 {
1027         di_prof_t prof = arg;
1028 
1029         if (name == NULL)
1030                 return (di_prof_add_dev(prof, match));
1031         return (di_prof_add_map(prof, match, name));
1032 }
1033 
1034 static int
1035 mount_one_dev_symlink_cb(void *arg, const char *source, const char *target)
1036 {
1037         di_prof_t prof = arg;
1038 
1039         return (di_prof_add_symlink(prof, source, target));
1040 }
1041 
1042 static int
1043 get_iptype(zlog_t *zlogp, zone_iptype_t *iptypep)
1044 {
1045         zone_dochandle_t handle;
1046 
1047         if ((handle = zonecfg_init_handle()) == NULL) {
1048                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
1049                 return (-1);
1050         }
1051         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
1052                 zerror(zlogp, B_FALSE, "invalid configuration");
1053                 zonecfg_fini_handle(handle);
1054                 return (-1);
1055         }
1056         if (zonecfg_get_iptype(handle, iptypep) != Z_OK) {
1057                 zerror(zlogp, B_FALSE, "invalid ip-type configuration");
1058                 zonecfg_fini_handle(handle);
1059                 return (-1);
1060         }
1061         zonecfg_fini_handle(handle);
1062         return (0);
1063 }
1064 
1065 /*
1066  * Apply the standard lists of devices/symlinks/mappings and the user-specified
1067  * list of devices (via zonecfg) to the /dev filesystem.  The filesystem will
1068  * use these as a profile/filter to determine what exists in /dev.
1069  */
1070 static int
1071 mount_one_dev(zlog_t *zlogp, char *devpath)
1072 {
1073         char                    brand[MAXNAMELEN];
1074         zone_dochandle_t        handle = NULL;
1075         brand_handle_t          bh = NULL;
1076         struct zone_devtab      ztab;
1077         di_prof_t               prof = NULL;
1078         int                     err;
1079         int                     retval = -1;
1080         zone_iptype_t           iptype;
1081         const char              *curr_iptype;
1082 
1083         if (di_prof_init(devpath, &prof)) {
1084                 zerror(zlogp, B_TRUE, "failed to initialize profile");
1085                 goto cleanup;
1086         }
1087 
1088         /* Get a handle to the brand info for this zone */
1089         if ((zone_get_brand(zone_name, brand, sizeof (brand)) != Z_OK) ||
1090             (bh = brand_open(brand)) == NULL) {
1091                 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1092                 goto cleanup;
1093         }
1094 
1095         if (get_iptype(zlogp, &iptype) < 0) {
1096                 zerror(zlogp, B_TRUE, "unable to determine ip-type");
1097                 goto cleanup;
1098         }
1099         switch (iptype) {
1100         case ZS_SHARED:
1101                 curr_iptype = "shared";
1102                 break;
1103         case ZS_EXCLUSIVE:
1104                 curr_iptype = "exclusive";
1105                 break;
1106         }
1107 
1108         if (brand_platform_iter_devices(bh, zone_name,
1109             mount_one_dev_device_cb, prof, curr_iptype) != 0) {
1110                 zerror(zlogp, B_TRUE, "failed to add standard device");
1111                 goto cleanup;
1112         }
1113 
1114         if (brand_platform_iter_link(bh,
1115             mount_one_dev_symlink_cb, prof) != 0) {
1116                 zerror(zlogp, B_TRUE, "failed to add standard symlink");
1117                 goto cleanup;
1118         }
1119 
1120         /* Add user-specified devices and directories */
1121         if ((handle = zonecfg_init_handle()) == NULL) {
1122                 zerror(zlogp, B_FALSE, "can't initialize zone handle");
1123                 goto cleanup;
1124         }
1125         if (err = zonecfg_get_handle(zone_name, handle)) {
1126                 zerror(zlogp, B_FALSE, "can't get handle for zone "
1127                     "%s: %s", zone_name, zonecfg_strerror(err));
1128                 goto cleanup;
1129         }
1130         if (err = zonecfg_setdevent(handle)) {
1131                 zerror(zlogp, B_FALSE, "%s: %s", zone_name,
1132                     zonecfg_strerror(err));
1133                 goto cleanup;
1134         }
1135         while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
1136                 if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
1137                         zerror(zlogp, B_TRUE, "failed to add "
1138                             "user-specified device");
1139                         goto cleanup;
1140                 }
1141         }
1142         (void) zonecfg_enddevent(handle);
1143 
1144         /* Send profile to kernel */
1145         if (di_prof_commit(prof)) {
1146                 zerror(zlogp, B_TRUE, "failed to commit profile");
1147                 goto cleanup;
1148         }
1149 
1150         retval = 0;
1151 
1152 cleanup:
1153         if (bh != NULL)
1154                 brand_close(bh);
1155         if (handle != NULL)
1156                 zonecfg_fini_handle(handle);
1157         if (prof)
1158                 di_prof_fini(prof);
1159         return (retval);
1160 }
1161 
1162 static int
1163 mount_one(zlog_t *zlogp, struct zone_fstab *fsptr, const char *rootpath)
1164 {
1165         char path[MAXPATHLEN];
1166         char specpath[MAXPATHLEN];
1167         char optstr[MAX_MNTOPT_STR];
1168         zone_fsopt_t *optptr;
1169         int rv;
1170 
1171         if ((rv = valid_mount_path(zlogp, rootpath, fsptr->zone_fs_special,
1172             fsptr->zone_fs_dir, fsptr->zone_fs_type)) < 0) {
1173                 zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
1174                     rootpath, fsptr->zone_fs_dir);
1175                 return (-1);
1176         } else if (rv > 0) {
1177                 /* The mount point path doesn't exist, create it now. */
1178                 if (make_one_dir(zlogp, rootpath, fsptr->zone_fs_dir,
1179                     DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
1180                     DEFAULT_DIR_GROUP) != 0) {
1181                         zerror(zlogp, B_FALSE, "failed to create mount point");
1182                         return (-1);
1183                 }
1184 
1185                 /*
1186                  * Now this might seem weird, but we need to invoke
1187                  * valid_mount_path() again.  Why?  Because it checks
1188                  * to make sure that the mount point path is canonical,
1189                  * which it can only do if the path exists, so now that
1190                  * we've created the path we have to verify it again.
1191                  */
1192                 if ((rv = valid_mount_path(zlogp, rootpath,
1193                     fsptr->zone_fs_special, fsptr->zone_fs_dir,
1194                     fsptr->zone_fs_type)) < 0) {
1195                         zerror(zlogp, B_FALSE,
1196                             "%s%s is not a valid mount point",
1197                             rootpath, fsptr->zone_fs_dir);
1198                         return (-1);
1199                 }
1200         }
1201 
1202         (void) snprintf(path, sizeof (path), "%s%s", rootpath,
1203             fsptr->zone_fs_dir);
1204 
1205         if (strlen(fsptr->zone_fs_special) == 0) {
1206                 /*
1207                  * A zero-length special is how we distinguish IPDs from
1208                  * general-purpose FSs.  Make sure it mounts from a place that
1209                  * can be seen via the alternate zone's root.
1210                  */
1211                 if (snprintf(specpath, sizeof (specpath), "%s%s",
1212                     zonecfg_get_root(), fsptr->zone_fs_dir) >=
1213                     sizeof (specpath)) {
1214                         zerror(zlogp, B_FALSE, "cannot mount %s: path too "
1215                             "long in alternate root", fsptr->zone_fs_dir);
1216                         return (-1);
1217                 }
1218                 if (zonecfg_in_alt_root())
1219                         resolve_lofs(zlogp, specpath, sizeof (specpath));
1220                 if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS,
1221                     specpath, path) != 0) {
1222                         zerror(zlogp, B_TRUE, "failed to loopback mount %s",
1223                             specpath);
1224                         return (-1);
1225                 }
1226                 return (0);
1227         }
1228 
1229         /*
1230          * In general the strategy here is to do just as much verification as
1231          * necessary to avoid crashing or otherwise doing something bad; if the
1232          * administrator initiated the operation via zoneadm(1m), he'll get
1233          * auto-verification which will let him know what's wrong.  If he
1234          * modifies the zone configuration of a running zone and doesn't attempt
1235          * to verify that it's OK we won't crash but won't bother trying to be
1236          * too helpful either.  zoneadm verify is only a couple keystrokes away.
1237          */
1238         if (!zonecfg_valid_fs_type(fsptr->zone_fs_type)) {
1239                 zerror(zlogp, B_FALSE, "cannot mount %s on %s: "
1240                     "invalid file-system type %s", fsptr->zone_fs_special,
1241                     fsptr->zone_fs_dir, fsptr->zone_fs_type);
1242                 return (-1);
1243         }
1244 
1245         /*
1246          * If we're looking at an alternate root environment, then construct
1247          * read-only loopback mounts as necessary.  Note that any special
1248          * paths for lofs zone mounts in an alternate root must have
1249          * already been pre-pended with any alternate root path by the
1250          * time we get here.
1251          */
1252         if (zonecfg_in_alt_root()) {
1253                 struct stat64 st;
1254 
1255                 if (stat64(fsptr->zone_fs_special, &st) != -1 &&
1256                     S_ISBLK(st.st_mode)) {
1257                         /*
1258                          * If we're going to mount a block device we need
1259                          * to check if that device is already mounted
1260                          * somewhere else, and if so, do a lofs mount
1261                          * of the device instead of a direct mount
1262                          */
1263                         if (check_lofs_needed(zlogp, fsptr) == -1)
1264                                 return (-1);
1265                 } else if (strcmp(fsptr->zone_fs_type, MNTTYPE_LOFS) == 0) {
1266                         /*
1267                          * For lofs mounts, the special node is inside the
1268                          * alternate root.  We need lofs resolution for
1269                          * this case in order to get at the underlying
1270                          * read-write path.
1271                          */
1272                         resolve_lofs(zlogp, fsptr->zone_fs_special,
1273                             sizeof (fsptr->zone_fs_special));
1274                 }
1275         }
1276 
1277         /*
1278          * Run 'fsck -m' if there's a device to fsck.
1279          */
1280         if (fsptr->zone_fs_raw[0] != '\0' &&
1281             dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_raw) != 0) {
1282                 return (-1);
1283         } else if (isregfile(fsptr->zone_fs_special) == 1 &&
1284             dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_special) != 0) {
1285                 return (-1);
1286         }
1287 
1288         /*
1289          * Build up mount option string.
1290          */
1291         optstr[0] = '\0';
1292         if (fsptr->zone_fs_options != NULL) {
1293                 (void) strlcpy(optstr, fsptr->zone_fs_options->zone_fsopt_opt,
1294                     sizeof (optstr));
1295                 for (optptr = fsptr->zone_fs_options->zone_fsopt_next;
1296                     optptr != NULL; optptr = optptr->zone_fsopt_next) {
1297                         (void) strlcat(optstr, ",", sizeof (optstr));
1298                         (void) strlcat(optstr, optptr->zone_fsopt_opt,
1299                             sizeof (optstr));
1300                 }
1301         }
1302 
1303         if ((rv = domount(zlogp, fsptr->zone_fs_type, optstr,
1304             fsptr->zone_fs_special, path)) != 0)
1305                 return (rv);
1306 
1307         /*
1308          * The mount succeeded.  If this was not a mount of /dev then
1309          * we're done.
1310          */
1311         if (strcmp(fsptr->zone_fs_type, MNTTYPE_DEV) != 0)
1312                 return (0);
1313 
1314         /*
1315          * We just mounted an instance of a /dev filesystem, so now we
1316          * need to configure it.
1317          */
1318         return (mount_one_dev(zlogp, path));
1319 }
1320 
1321 static void
1322 free_fs_data(struct zone_fstab *fsarray, uint_t nelem)
1323 {
1324         uint_t i;
1325 
1326         if (fsarray == NULL)
1327                 return;
1328         for (i = 0; i < nelem; i++)
1329                 zonecfg_free_fs_option_list(fsarray[i].zone_fs_options);
1330         free(fsarray);
1331 }
1332 
1333 /*
1334  * This function initiates the creation of a small Solaris Environment for
1335  * scratch zone. The Environment creation process is split up into two
1336  * functions(build_mounted_pre_var() and build_mounted_post_var()). It
1337  * is done this way because:
1338  *      We need to have both /etc and /var in the root of the scratchzone.
1339  *      We loopback mount zone's own /etc and /var into the root of the
1340  *      scratch zone. Unlike /etc, /var can be a seperate filesystem. So we
1341  *      need to delay the mount of /var till the zone's root gets populated.
1342  *      So mounting of localdirs[](/etc and /var) have been moved to the
1343  *      build_mounted_post_var() which gets called only after the zone
1344  *      specific filesystems are mounted.
1345  *
1346  * Note that the scratch zone we set up for updating the zone (Z_MNT_UPDATE)
1347  * does not loopback mount the zone's own /etc and /var into the root of the
1348  * scratch zone.
1349  */
1350 static boolean_t
1351 build_mounted_pre_var(zlog_t *zlogp, char *rootpath,
1352     size_t rootlen, const char *zonepath, char *luroot, size_t lurootlen)
1353 {
1354         char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1355         const char **cpp;
1356         static const char *mkdirs[] = {
1357                 "/system", "/system/contract", "/system/object", "/proc",
1358                 "/dev", "/tmp", "/a", NULL
1359         };
1360         char *altstr;
1361         FILE *fp;
1362         uuid_t uuid;
1363 
1364         assert(zone_isnative || zone_iscluster);
1365 
1366         resolve_lofs(zlogp, rootpath, rootlen);
1367         (void) snprintf(luroot, lurootlen, "%s/lu", zonepath);
1368         resolve_lofs(zlogp, luroot, lurootlen);
1369         (void) snprintf(tmp, sizeof (tmp), "%s/bin", luroot);
1370         (void) symlink("./usr/bin", tmp);
1371 
1372         /*
1373          * These are mostly special mount points; not handled here.  (See
1374          * zone_mount_early.)
1375          */
1376         for (cpp = mkdirs; *cpp != NULL; cpp++) {
1377                 (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1378                 if (mkdir(tmp, 0755) != 0) {
1379                         zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1380                         return (B_FALSE);
1381                 }
1382         }
1383         /*
1384          * This is here to support lucopy.  If there's an instance of this same
1385          * zone on the current running system, then we mount its root up as
1386          * read-only inside the scratch zone.
1387          */
1388         (void) zonecfg_get_uuid(zone_name, uuid);
1389         altstr = strdup(zonecfg_get_root());
1390         if (altstr == NULL) {
1391                 zerror(zlogp, B_TRUE, "memory allocation failed");
1392                 return (B_FALSE);
1393         }
1394         zonecfg_set_root("");
1395         (void) strlcpy(tmp, zone_name, sizeof (tmp));
1396         (void) zonecfg_get_name_by_uuid(uuid, tmp, sizeof (tmp));
1397         if (zone_get_rootpath(tmp, fromdir, sizeof (fromdir)) == Z_OK &&
1398             strcmp(fromdir, rootpath) != 0) {
1399                 (void) snprintf(tmp, sizeof (tmp), "%s/b", luroot);
1400                 if (mkdir(tmp, 0755) != 0) {
1401                         zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1402                         return (B_FALSE);
1403                 }
1404                 if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, fromdir,
1405                     tmp) != 0) {
1406                         zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1407                             fromdir);
1408                         return (B_FALSE);
1409                 }
1410         }
1411         zonecfg_set_root(altstr);
1412         free(altstr);
1413 
1414         if ((fp = zonecfg_open_scratch(luroot, B_TRUE)) == NULL) {
1415                 zerror(zlogp, B_TRUE, "cannot open zone mapfile");
1416                 return (B_FALSE);
1417         }
1418         (void) ftruncate(fileno(fp), 0);
1419         if (zonecfg_add_scratch(fp, zone_name, kernzone, "/") == -1) {
1420                 zerror(zlogp, B_TRUE, "cannot add zone mapfile entry");
1421         }
1422         zonecfg_close_scratch(fp);
1423         (void) snprintf(tmp, sizeof (tmp), "%s/a", luroot);
1424         if (domount(zlogp, MNTTYPE_LOFS, "", rootpath, tmp) != 0)
1425                 return (B_FALSE);
1426         (void) strlcpy(rootpath, tmp, rootlen);
1427         return (B_TRUE);
1428 }
1429 
1430 
1431 static boolean_t
1432 build_mounted_post_var(zlog_t *zlogp, zone_mnt_t mount_cmd, char *rootpath,
1433     const char *luroot)
1434 {
1435         char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1436         const char **cpp;
1437         const char **loopdirs;
1438         const char **tmpdirs;
1439         static const char *localdirs[] = {
1440                 "/etc", "/var", NULL
1441         };
1442         static const char *scr_loopdirs[] = {
1443                 "/etc/lib", "/etc/fs", "/lib", "/sbin", "/platform",
1444                 "/usr", NULL
1445         };
1446         static const char *upd_loopdirs[] = {
1447                 "/etc", "/kernel", "/lib", "/opt", "/platform", "/sbin",
1448                 "/usr", "/var", NULL
1449         };
1450         static const char *scr_tmpdirs[] = {
1451                 "/tmp", "/var/run", NULL
1452         };
1453         static const char *upd_tmpdirs[] = {
1454                 "/tmp", "/var/run", "/var/tmp", NULL
1455         };
1456         struct stat st;
1457 
1458         if (mount_cmd == Z_MNT_SCRATCH) {
1459                 /*
1460                  * These are mounted read-write from the zone undergoing
1461                  * upgrade.  We must be careful not to 'leak' things from the
1462                  * main system into the zone, and this accomplishes that goal.
1463                  */
1464                 for (cpp = localdirs; *cpp != NULL; cpp++) {
1465                         (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot,
1466                             *cpp);
1467                         (void) snprintf(fromdir, sizeof (fromdir), "%s%s",
1468                             rootpath, *cpp);
1469                         if (mkdir(tmp, 0755) != 0) {
1470                                 zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1471                                 return (B_FALSE);
1472                         }
1473                         if (domount(zlogp, MNTTYPE_LOFS, "", fromdir, tmp)
1474                             != 0) {
1475                                 zerror(zlogp, B_TRUE, "cannot mount %s on %s",
1476                                     tmp, *cpp);
1477                                 return (B_FALSE);
1478                         }
1479                 }
1480         }
1481 
1482         if (mount_cmd == Z_MNT_UPDATE)
1483                 loopdirs = upd_loopdirs;
1484         else
1485                 loopdirs = scr_loopdirs;
1486 
1487         /*
1488          * These are things mounted read-only from the running system because
1489          * they contain binaries that must match system.
1490          */
1491         for (cpp = loopdirs; *cpp != NULL; cpp++) {
1492                 (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1493                 if (mkdir(tmp, 0755) != 0) {
1494                         if (errno != EEXIST) {
1495                                 zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1496                                 return (B_FALSE);
1497                         }
1498                         if (lstat(tmp, &st) != 0) {
1499                                 zerror(zlogp, B_TRUE, "cannot stat %s", tmp);
1500                                 return (B_FALSE);
1501                         }
1502                         /*
1503                          * Ignore any non-directories encountered.  These are
1504                          * things that have been converted into symlinks
1505                          * (/etc/fs and /etc/lib) and no longer need a lofs
1506                          * fixup.
1507                          */
1508                         if (!S_ISDIR(st.st_mode))
1509                                 continue;
1510                 }
1511                 if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, *cpp,
1512                     tmp) != 0) {
1513                         zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1514                             *cpp);
1515                         return (B_FALSE);
1516                 }
1517         }
1518 
1519         if (mount_cmd == Z_MNT_UPDATE)
1520                 tmpdirs = upd_tmpdirs;
1521         else
1522                 tmpdirs = scr_tmpdirs;
1523 
1524         /*
1525          * These are things with tmpfs mounted inside.
1526          */
1527         for (cpp = tmpdirs; *cpp != NULL; cpp++) {
1528                 (void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1529                 if (mount_cmd == Z_MNT_SCRATCH && mkdir(tmp, 0755) != 0 &&
1530                     errno != EEXIST) {
1531                         zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1532                         return (B_FALSE);
1533                 }
1534 
1535                 /*
1536                  * We could set the mode for /tmp when we do the mkdir but
1537                  * since that can be modified by the umask we will just set
1538                  * the correct mode for /tmp now.
1539                  */
1540                 if (strcmp(*cpp, "/tmp") == 0 && chmod(tmp, 01777) != 0) {
1541                         zerror(zlogp, B_TRUE, "cannot chmod %s", tmp);
1542                         return (B_FALSE);
1543                 }
1544 
1545                 if (domount(zlogp, MNTTYPE_TMPFS, "", "swap", tmp) != 0) {
1546                         zerror(zlogp, B_TRUE, "cannot mount swap on %s", *cpp);
1547                         return (B_FALSE);
1548                 }
1549         }
1550         return (B_TRUE);
1551 }
1552 
1553 typedef struct plat_gmount_cb_data {
1554         zlog_t                  *pgcd_zlogp;
1555         struct zone_fstab       **pgcd_fs_tab;
1556         int                     *pgcd_num_fs;
1557 } plat_gmount_cb_data_t;
1558 
1559 /*
1560  * plat_gmount_cb() is a callback function invoked by libbrand to iterate
1561  * through all global brand platform mounts.
1562  */
1563 int
1564 plat_gmount_cb(void *data, const char *spec, const char *dir,
1565     const char *fstype, const char *opt)
1566 {
1567         plat_gmount_cb_data_t   *cp = data;
1568         zlog_t                  *zlogp = cp->pgcd_zlogp;
1569         struct zone_fstab       *fs_ptr = *cp->pgcd_fs_tab;
1570         int                     num_fs = *cp->pgcd_num_fs;
1571         struct zone_fstab       *fsp, *tmp_ptr;
1572 
1573         num_fs++;
1574         if ((tmp_ptr = realloc(fs_ptr, num_fs * sizeof (*tmp_ptr))) == NULL) {
1575                 zerror(zlogp, B_TRUE, "memory allocation failed");
1576                 return (-1);
1577         }
1578 
1579         fs_ptr = tmp_ptr;
1580         fsp = &fs_ptr[num_fs - 1];
1581 
1582         /* update the callback struct passed in */
1583         *cp->pgcd_fs_tab = fs_ptr;
1584         *cp->pgcd_num_fs = num_fs;
1585 
1586         fsp->zone_fs_raw[0] = '\0';
1587         (void) strlcpy(fsp->zone_fs_special, spec,
1588             sizeof (fsp->zone_fs_special));
1589         (void) strlcpy(fsp->zone_fs_dir, dir, sizeof (fsp->zone_fs_dir));
1590         (void) strlcpy(fsp->zone_fs_type, fstype, sizeof (fsp->zone_fs_type));
1591         fsp->zone_fs_options = NULL;
1592         if ((opt != NULL) &&
1593             (zonecfg_add_fs_option(fsp, (char *)opt) != Z_OK)) {
1594                 zerror(zlogp, B_FALSE, "error adding property");
1595                 return (-1);
1596         }
1597 
1598         return (0);
1599 }
1600 
1601 static int
1602 mount_filesystems_ipdent(zone_dochandle_t handle, zlog_t *zlogp,
1603     struct zone_fstab **fs_tabp, int *num_fsp)
1604 {
1605         struct zone_fstab *tmp_ptr, *fs_ptr, *fsp, fstab;
1606         int num_fs;
1607 
1608         num_fs = *num_fsp;
1609         fs_ptr = *fs_tabp;
1610 
1611         if (zonecfg_setipdent(handle) != Z_OK) {
1612                 zerror(zlogp, B_FALSE, "invalid configuration");
1613                 return (-1);
1614         }
1615         while (zonecfg_getipdent(handle, &fstab) == Z_OK) {
1616                 num_fs++;
1617                 if ((tmp_ptr = realloc(fs_ptr,
1618                     num_fs * sizeof (*tmp_ptr))) == NULL) {
1619                         zerror(zlogp, B_TRUE, "memory allocation failed");
1620                         (void) zonecfg_endipdent(handle);
1621                         return (-1);
1622                 }
1623 
1624                 /* update the pointers passed in */
1625                 *fs_tabp = tmp_ptr;
1626                 *num_fsp = num_fs;
1627 
1628                 /*
1629                  * IPDs logically only have a mount point; all other properties
1630                  * are implied.
1631                  */
1632                 fs_ptr = tmp_ptr;
1633                 fsp = &fs_ptr[num_fs - 1];
1634                 (void) strlcpy(fsp->zone_fs_dir,
1635                     fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1636                 fsp->zone_fs_special[0] = '\0';
1637                 fsp->zone_fs_raw[0] = '\0';
1638                 fsp->zone_fs_type[0] = '\0';
1639                 fsp->zone_fs_options = NULL;
1640         }
1641         (void) zonecfg_endipdent(handle);
1642         return (0);
1643 }
1644 
1645 static int
1646 mount_filesystems_fsent(zone_dochandle_t handle, zlog_t *zlogp,
1647     struct zone_fstab **fs_tabp, int *num_fsp, zone_mnt_t mount_cmd)
1648 {
1649         struct zone_fstab *tmp_ptr, *fs_ptr, *fsp, fstab;
1650         int num_fs;
1651 
1652         num_fs = *num_fsp;
1653         fs_ptr = *fs_tabp;
1654 
1655         if (zonecfg_setfsent(handle) != Z_OK) {
1656                 zerror(zlogp, B_FALSE, "invalid configuration");
1657                 return (-1);
1658         }
1659         while (zonecfg_getfsent(handle, &fstab) == Z_OK) {
1660                 /*
1661                  * ZFS filesystems will not be accessible under an alternate
1662                  * root, since the pool will not be known.  Ignore them in this
1663                  * case.
1664                  */
1665                 if (ALT_MOUNT(mount_cmd) &&
1666                     strcmp(fstab.zone_fs_type, MNTTYPE_ZFS) == 0)
1667                         continue;
1668 
1669                 num_fs++;
1670                 if ((tmp_ptr = realloc(fs_ptr,
1671                     num_fs * sizeof (*tmp_ptr))) == NULL) {
1672                         zerror(zlogp, B_TRUE, "memory allocation failed");
1673                         (void) zonecfg_endfsent(handle);
1674                         return (-1);
1675                 }
1676                 /* update the pointers passed in */
1677                 *fs_tabp = tmp_ptr;
1678                 *num_fsp = num_fs;
1679 
1680                 fs_ptr = tmp_ptr;
1681                 fsp = &fs_ptr[num_fs - 1];
1682                 (void) strlcpy(fsp->zone_fs_dir,
1683                     fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1684                 (void) strlcpy(fsp->zone_fs_raw, fstab.zone_fs_raw,
1685                     sizeof (fsp->zone_fs_raw));
1686                 (void) strlcpy(fsp->zone_fs_type, fstab.zone_fs_type,
1687                     sizeof (fsp->zone_fs_type));
1688                 fsp->zone_fs_options = fstab.zone_fs_options;
1689 
1690                 /*
1691                  * For all lofs mounts, make sure that the 'special'
1692                  * entry points inside the alternate root.  The
1693                  * source path for a lofs mount in a given zone needs
1694                  * to be relative to the root of the boot environment
1695                  * that contains the zone.  Note that we don't do this
1696                  * for non-lofs mounts since they will have a device
1697                  * as a backing store and device paths must always be
1698                  * specified relative to the current boot environment.
1699                  */
1700                 fsp->zone_fs_special[0] = '\0';
1701                 if (strcmp(fsp->zone_fs_type, MNTTYPE_LOFS) == 0) {
1702                         (void) strlcat(fsp->zone_fs_special, zonecfg_get_root(),
1703                             sizeof (fsp->zone_fs_special));
1704                 }
1705                 (void) strlcat(fsp->zone_fs_special, fstab.zone_fs_special,
1706                     sizeof (fsp->zone_fs_special));
1707         }
1708         (void) zonecfg_endfsent(handle);
1709         return (0);
1710 }
1711 
1712 static int
1713 mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
1714 {
1715         char rootpath[MAXPATHLEN];
1716         char zonepath[MAXPATHLEN];
1717         char brand[MAXNAMELEN];
1718         char luroot[MAXPATHLEN];
1719         int i, num_fs = 0;
1720         struct zone_fstab *fs_ptr = NULL;
1721         zone_dochandle_t handle = NULL;
1722         zone_state_t zstate;
1723         brand_handle_t bh;
1724         plat_gmount_cb_data_t cb;
1725 
1726         if (zone_get_state(zone_name, &zstate) != Z_OK ||
1727             (zstate != ZONE_STATE_READY && zstate != ZONE_STATE_MOUNTED)) {
1728                 zerror(zlogp, B_FALSE,
1729                     "zone must be in '%s' or '%s' state to mount file-systems",
1730                     zone_state_str(ZONE_STATE_READY),
1731                     zone_state_str(ZONE_STATE_MOUNTED));
1732                 goto bad;
1733         }
1734 
1735         if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
1736                 zerror(zlogp, B_TRUE, "unable to determine zone path");
1737                 goto bad;
1738         }
1739 
1740         if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
1741                 zerror(zlogp, B_TRUE, "unable to determine zone root");
1742                 goto bad;
1743         }
1744 
1745         if ((handle = zonecfg_init_handle()) == NULL) {
1746                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
1747                 goto bad;
1748         }
1749         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
1750             zonecfg_setfsent(handle) != Z_OK) {
1751                 zerror(zlogp, B_FALSE, "invalid configuration");
1752                 goto bad;
1753         }
1754 
1755         /* Get a handle to the brand info for this zone */
1756         if ((zone_get_brand(zone_name, brand, sizeof (brand)) != Z_OK) ||
1757             (bh = brand_open(brand)) == NULL) {
1758                 zerror(zlogp, B_FALSE, "unable to determine zone brand");
1759                 zonecfg_fini_handle(handle);
1760                 return (-1);
1761         }
1762 
1763         /*
1764          * Get the list of global filesystems to mount from the brand
1765          * configuration.
1766          */
1767         cb.pgcd_zlogp = zlogp;
1768         cb.pgcd_fs_tab = &fs_ptr;
1769         cb.pgcd_num_fs = &num_fs;
1770         if (brand_platform_iter_gmounts(bh, zonepath,
1771             plat_gmount_cb, &cb) != 0) {
1772                 zerror(zlogp, B_FALSE, "unable to mount filesystems");
1773                 brand_close(bh);
1774                 zonecfg_fini_handle(handle);
1775                 return (-1);
1776         }
1777         brand_close(bh);
1778 
1779         /*
1780          * Iterate through the rest of the filesystems, first the IPDs, then
1781          * the general FSs.  Sort them all, then mount them in sorted order.
1782          * This is to make sure the higher level directories (e.g., /usr)
1783          * get mounted before any beneath them (e.g., /usr/local).
1784          */
1785         if (mount_filesystems_ipdent(handle, zlogp, &fs_ptr, &num_fs) != 0)
1786                 goto bad;
1787 
1788         if (mount_filesystems_fsent(handle, zlogp, &fs_ptr, &num_fs,
1789             mount_cmd) != 0)
1790                 goto bad;
1791 
1792         zonecfg_fini_handle(handle);
1793         handle = NULL;
1794 
1795         /*
1796          * Normally when we mount a zone all the zone filesystems
1797          * get mounted relative to rootpath, which is usually
1798          * <zonepath>/root.  But when mounting a zone for administration
1799          * purposes via the zone "mount" state, build_mounted_pre_var()
1800          * updates rootpath to be <zonepath>/lu/a so we'll mount all
1801          * the zones filesystems there instead.
1802          *
1803          * build_mounted_pre_var() and build_mounted_post_var() will
1804          * also do some extra work to create directories and lofs mount
1805          * a bunch of global zone file system paths into <zonepath>/lu.
1806          *
1807          * This allows us to be able to enter the zone (now rooted at
1808          * <zonepath>/lu) and run the upgrade/patch tools that are in the
1809          * global zone and have them upgrade the to-be-modified zone's
1810          * files mounted on /a.  (Which mirrors the existing standard
1811          * upgrade environment.)
1812          *
1813          * There is of course one catch.  When doing the upgrade
1814          * we need <zoneroot>/lu/dev to be the /dev filesystem
1815          * for the zone and we don't want to have any /dev filesystem
1816          * mounted at <zoneroot>/lu/a/dev.  Since /dev is specified
1817          * as a normal zone filesystem by default we'll try to mount
1818          * it at <zoneroot>/lu/a/dev, so we have to detect this
1819          * case and instead mount it at <zoneroot>/lu/dev.
1820          *
1821          * All this work is done in three phases:
1822          *   1) Create and populate lu directory (build_mounted_pre_var()).
1823          *   2) Mount the required filesystems as per the zone configuration.
1824          *   3) Set up the rest of the scratch zone environment
1825          *      (build_mounted_post_var()).
1826          */
1827         if (ALT_MOUNT(mount_cmd) && !build_mounted_pre_var(zlogp,
1828             rootpath, sizeof (rootpath), zonepath, luroot, sizeof (luroot)))
1829                 goto bad;
1830 
1831         qsort(fs_ptr, num_fs, sizeof (*fs_ptr), fs_compare);
1832 
1833         for (i = 0; i < num_fs; i++) {
1834                 if (ALT_MOUNT(mount_cmd) &&
1835                     strcmp(fs_ptr[i].zone_fs_dir, "/dev") == 0) {
1836                         size_t slen = strlen(rootpath) - 2;
1837 
1838                         /*
1839                          * By default we'll try to mount /dev as /a/dev
1840                          * but /dev is special and always goes at the top
1841                          * so strip the trailing '/a' from the rootpath.
1842                          */
1843                         assert(zone_isnative || zone_iscluster);
1844                         assert(strcmp(&rootpath[slen], "/a") == 0);
1845                         rootpath[slen] = '\0';
1846                         if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1847                                 goto bad;
1848                         rootpath[slen] = '/';
1849                         continue;
1850                 }
1851                 if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1852                         goto bad;
1853         }
1854         if (ALT_MOUNT(mount_cmd) &&
1855             !build_mounted_post_var(zlogp, mount_cmd, rootpath, luroot))
1856                 goto bad;
1857 
1858         /*
1859          * For Trusted Extensions cross-mount each lower level /export/home
1860          */
1861         if (mount_cmd == Z_MNT_BOOT &&
1862             tsol_mounts(zlogp, zone_name, rootpath) != 0)
1863                 goto bad;
1864 
1865         free_fs_data(fs_ptr, num_fs);
1866 
1867         /*
1868          * Everything looks fine.
1869          */
1870         return (0);
1871 
1872 bad:
1873         if (handle != NULL)
1874                 zonecfg_fini_handle(handle);
1875         free_fs_data(fs_ptr, num_fs);
1876         return (-1);
1877 }
1878 
/*
 * Convert the decimal prefix length in 'prefixstr' into a network mask:
 * the leading 'prefixlen' bits of the buffer at 'maskstr' are set.  Whole
 * bytes are stored as 0xFF; the bits of a final partial byte are OR-ed in,
 * and bytes beyond it are not touched, so the caller must pass in a zeroed
 * buffer of at least maxprefixlen / 8 bytes.
 *
 * Returns 0 on success.  Returns 1 if the string is not a plain decimal
 * number (empty, non-numeric, or followed by trailing garbage) or if the
 * value lies outside 0..maxprefixlen.
 *
 * caller makes sure neither parameter is NULL
 */
static int
addr2netmask(char *prefixstr, int maxprefixlen, uchar_t *maskstr)
{
        char *endp;
        long prefixlen;

        /*
         * strtol() rather than atoi(): atoi() cannot report errors, so
         * input such as "abc" would silently be treated as a /0 prefix.
         * An out-of-range value is clamped by strtol() and is then
         * rejected by the range check below.
         */
        prefixlen = strtol(prefixstr, &endp, 10);
        if (endp == prefixstr || *endp != '\0')
                return (1);
        if (prefixlen < 0 || prefixlen > maxprefixlen)
                return (1);

        /* whole bytes first ... */
        for (; prefixlen >= 8; prefixlen -= 8)
                *maskstr++ = 0xFF;

        /* ... then the high-order bits of the remaining partial byte */
        if (prefixlen > 0)
                *maskstr |= (uchar_t)(0xFF << (8 - prefixlen));

        return (0);
}
1899 
1900 /*
1901  * Tear down all interfaces belonging to the given zone.  This should
1902  * be called with the zone in a state other than "running", so that
1903  * interfaces can't be assigned to the zone after this returns.
1904  *
1905  * If anything goes wrong, log an error message and return an error.
1906  */
1907 static int
1908 unconfigure_shared_network_interfaces(zlog_t *zlogp, zoneid_t zone_id)
1909 {
1910         struct lifnum lifn;
1911         struct lifconf lifc;
1912         struct lifreq *lifrp, lifrl;
1913         int64_t lifc_flags = LIFC_NOXMIT | LIFC_ALLZONES;
1914         int num_ifs, s, i, ret_code = 0;
1915         uint_t bufsize;
1916         char *buf = NULL;
1917 
1918         if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
1919                 zerror(zlogp, B_TRUE, "could not get socket");
1920                 ret_code = -1;
1921                 goto bad;
1922         }
1923         lifn.lifn_family = AF_UNSPEC;
1924         lifn.lifn_flags = (int)lifc_flags;
1925         if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) {
1926                 zerror(zlogp, B_TRUE,
1927                     "could not determine number of network interfaces");
1928                 ret_code = -1;
1929                 goto bad;
1930         }
1931         num_ifs = lifn.lifn_count;
1932         bufsize = num_ifs * sizeof (struct lifreq);
1933         if ((buf = malloc(bufsize)) == NULL) {
1934                 zerror(zlogp, B_TRUE, "memory allocation failed");
1935                 ret_code = -1;
1936                 goto bad;
1937         }
1938         lifc.lifc_family = AF_UNSPEC;
1939         lifc.lifc_flags = (int)lifc_flags;
1940         lifc.lifc_len = bufsize;
1941         lifc.lifc_buf = buf;
1942         if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) {
1943                 zerror(zlogp, B_TRUE, "could not get configured network "
1944                     "interfaces");
1945                 ret_code = -1;
1946                 goto bad;
1947         }
1948         lifrp = lifc.lifc_req;
1949         for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--, lifrp++) {
1950                 (void) close(s);
1951                 if ((s = socket(lifrp->lifr_addr.ss_family, SOCK_DGRAM, 0)) <
1952                     0) {
1953                         zerror(zlogp, B_TRUE, "%s: could not get socket",
1954                             lifrl.lifr_name);
1955                         ret_code = -1;
1956                         continue;
1957                 }
1958                 (void) memset(&lifrl, 0, sizeof (lifrl));
1959                 (void) strncpy(lifrl.lifr_name, lifrp->lifr_name,
1960                     sizeof (lifrl.lifr_name));
1961                 if (ioctl(s, SIOCGLIFZONE, (caddr_t)&lifrl) < 0) {
1962                         if (errno == ENXIO)
1963                                 /*
1964                                  * Interface may have been removed by admin or
1965                                  * another zone halting.
1966                                  */
1967                                 continue;
1968                         zerror(zlogp, B_TRUE,
1969                             "%s: could not determine the zone to which this "
1970                             "network interface is bound", lifrl.lifr_name);
1971                         ret_code = -1;
1972                         continue;
1973                 }
1974                 if (lifrl.lifr_zoneid == zone_id) {
1975                         if (ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifrl) < 0) {
1976                                 zerror(zlogp, B_TRUE,
1977                                     "%s: could not remove network interface",
1978                                     lifrl.lifr_name);
1979                                 ret_code = -1;
1980                                 continue;
1981                         }
1982                 }
1983         }
1984 bad:
1985         if (s > 0)
1986                 (void) close(s);
1987         if (buf)
1988                 free(buf);
1989         return (ret_code);
1990 }
1991 
1992 static union    sockunion {
1993         struct  sockaddr sa;
1994         struct  sockaddr_in sin;
1995         struct  sockaddr_dl sdl;
1996         struct  sockaddr_in6 sin6;
1997 } so_dst, so_ifp;
1998 
1999 static struct {
2000         struct  rt_msghdr hdr;
2001         char    space[512];
2002 } rtmsg;
2003 
2004 static int
2005 salen(struct sockaddr *sa)
2006 {
2007         switch (sa->sa_family) {
2008         case AF_INET:
2009                 return (sizeof (struct sockaddr_in));
2010         case AF_LINK:
2011                 return (sizeof (struct sockaddr_dl));
2012         case AF_INET6:
2013                 return (sizeof (struct sockaddr_in6));
2014         default:
2015                 return (sizeof (struct sockaddr));
2016         }
2017 }
2018 
/*
 * Round 'a' up to the next multiple of sizeof (long); a value that is
 * already a multiple is returned unchanged.  Zero and negative values
 * yield sizeof (long).  Relies on sizeof (long) being a power of two.
 */
#define ROUNDUP_LONG(a) \
        ((a) <= 0 ? sizeof (long) : ((((a) - 1) | (sizeof (long) - 1)) + 1))
2021 
2022 /*
2023  * Look up which zone is using a given IP address.  The address in question
2024  * is expected to have been stuffed into the structure to which lifr points
2025  * via a previous SIOCGLIFADDR ioctl().
2026  *
2027  * This is done using black router socket magic.
2028  *
2029  * Return the name of the zone on success or NULL on failure.
2030  *
2031  * This is a lot of code for a simple task; a new ioctl request to take care
2032  * of this might be a useful RFE.
2033  */
2034 
2035 static char *
2036 who_is_using(zlog_t *zlogp, struct lifreq *lifr)
2037 {
2038         static char answer[ZONENAME_MAX];
2039         pid_t pid;
2040         int s, rlen, l, i;
2041         char *cp = rtmsg.space;
2042         struct sockaddr_dl *ifp = NULL;
2043         struct sockaddr *sa;
2044         char save_if_name[LIFNAMSIZ];
2045 
2046         answer[0] = '\0';
2047 
2048         pid = getpid();
2049         if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) < 0) {
2050                 zerror(zlogp, B_TRUE, "could not get routing socket");
2051                 return (NULL);
2052         }
2053 
2054         if (lifr->lifr_addr.ss_family == AF_INET) {
2055                 struct sockaddr_in *sin4;
2056 
2057                 so_dst.sa.sa_family = AF_INET;
2058                 sin4 = (struct sockaddr_in *)&lifr->lifr_addr;
2059                 so_dst.sin.sin_addr = sin4->sin_addr;
2060         } else {
2061                 struct sockaddr_in6 *sin6;
2062 
2063                 so_dst.sa.sa_family = AF_INET6;
2064                 sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
2065                 so_dst.sin6.sin6_addr = sin6->sin6_addr;
2066         }
2067 
2068         so_ifp.sa.sa_family = AF_LINK;
2069 
2070         (void) memset(&rtmsg, 0, sizeof (rtmsg));
2071         rtmsg.hdr.rtm_type = RTM_GET;
2072         rtmsg.hdr.rtm_flags = RTF_UP | RTF_HOST;
2073         rtmsg.hdr.rtm_version = RTM_VERSION;
2074         rtmsg.hdr.rtm_seq = ++rts_seqno;
2075         rtmsg.hdr.rtm_addrs = RTA_IFP | RTA_DST;
2076 
2077         l = ROUNDUP_LONG(salen(&so_dst.sa));
2078         (void) memmove(cp, &(so_dst), l);
2079         cp += l;
2080         l = ROUNDUP_LONG(salen(&so_ifp.sa));
2081         (void) memmove(cp, &(so_ifp), l);
2082         cp += l;
2083 
2084         rtmsg.hdr.rtm_msglen = l = cp - (char *)&rtmsg;
2085 
2086         if ((rlen = write(s, &rtmsg, l)) < 0) {
2087                 zerror(zlogp, B_TRUE, "writing to routing socket");
2088                 return (NULL);
2089         } else if (rlen < (int)rtmsg.hdr.rtm_msglen) {
2090                 zerror(zlogp, B_TRUE,
2091                     "write to routing socket got only %d for len\n", rlen);
2092                 return (NULL);
2093         }
2094         do {
2095                 l = read(s, &rtmsg, sizeof (rtmsg));
2096         } while (l > 0 && (rtmsg.hdr.rtm_seq != rts_seqno ||
2097             rtmsg.hdr.rtm_pid != pid));
2098         if (l < 0) {
2099                 zerror(zlogp, B_TRUE, "reading from routing socket");
2100                 return (NULL);
2101         }
2102 
2103         if (rtmsg.hdr.rtm_version != RTM_VERSION) {
2104                 zerror(zlogp, B_FALSE,
2105                     "routing message version %d not understood",
2106                     rtmsg.hdr.rtm_version);
2107                 return (NULL);
2108         }
2109         if (rtmsg.hdr.rtm_msglen != (ushort_t)l) {
2110                 zerror(zlogp, B_FALSE, "message length mismatch, "
2111                     "expected %d bytes, returned %d bytes",
2112                     rtmsg.hdr.rtm_msglen, l);
2113                 return (NULL);
2114         }
2115         if (rtmsg.hdr.rtm_errno != 0)  {
2116                 errno = rtmsg.hdr.rtm_errno;
2117                 zerror(zlogp, B_TRUE, "RTM_GET routing socket message");
2118                 return (NULL);
2119         }
2120         if ((rtmsg.hdr.rtm_addrs & RTA_IFP) == 0) {
2121                 zerror(zlogp, B_FALSE, "network interface not found");
2122                 return (NULL);
2123         }
2124         cp = ((char *)(&rtmsg.hdr + 1));
2125         for (i = 1; i != 0; i <<= 1) {
2126                 /* LINTED E_BAD_PTR_CAST_ALIGN */
2127                 sa = (struct sockaddr *)cp;
2128                 if (i != RTA_IFP) {
2129                         if ((i & rtmsg.hdr.rtm_addrs) != 0)
2130                                 cp += ROUNDUP_LONG(salen(sa));
2131                         continue;
2132                 }
2133                 if (sa->sa_family == AF_LINK &&
2134                     ((struct sockaddr_dl *)sa)->sdl_nlen != 0)
2135                         ifp = (struct sockaddr_dl *)sa;
2136                 break;
2137         }
2138         if (ifp == NULL) {
2139                 zerror(zlogp, B_FALSE, "network interface could not be "
2140                     "determined");
2141                 return (NULL);
2142         }
2143 
2144         /*
2145          * We need to set the I/F name to what we got above, then do the
2146          * appropriate ioctl to get its zone name.  But lifr->lifr_name is
2147          * used by the calling function to do a REMOVEIF, so if we leave the
2148          * "good" zone's I/F name in place, *that* I/F will be removed instead
2149          * of the bad one.  So we save the old (bad) I/F name before over-
2150          * writing it and doing the ioctl, then restore it after the ioctl.
2151          */
2152         (void) strlcpy(save_if_name, lifr->lifr_name, sizeof (save_if_name));
2153         (void) strncpy(lifr->lifr_name, ifp->sdl_data, ifp->sdl_nlen);
2154         lifr->lifr_name[ifp->sdl_nlen] = '\0';
2155         i = ioctl(s, SIOCGLIFZONE, lifr);
2156         (void) strlcpy(lifr->lifr_name, save_if_name, sizeof (save_if_name));
2157         if (i < 0) {
2158                 zerror(zlogp, B_TRUE,
2159                     "%s: could not determine the zone network interface "
2160                     "belongs to", lifr->lifr_name);
2161                 return (NULL);
2162         }
2163         if (getzonenamebyid(lifr->lifr_zoneid, answer, sizeof (answer)) < 0)
2164                 (void) snprintf(answer, sizeof (answer), "%d",
2165                     lifr->lifr_zoneid);
2166 
2167         if (strlen(answer) > 0)
2168                 return (answer);
2169         return (NULL);
2170 }
2171 
2172 typedef struct mcast_rtmsg_s {
2173         struct rt_msghdr        m_rtm;
2174         union {
2175                 struct {
2176                         struct sockaddr_in      m_dst;
2177                         struct sockaddr_in      m_gw;
2178                         struct sockaddr_in      m_netmask;
2179                 } m_v4;
2180                 struct {
2181                         struct sockaddr_in6     m_dst;
2182                         struct sockaddr_in6     m_gw;
2183                         struct sockaddr_in6     m_netmask;
2184                 } m_v6;
2185         } m_u;
2186 } mcast_rtmsg_t;
2187 #define m_dst4          m_u.m_v4.m_dst
2188 #define m_dst6          m_u.m_v6.m_dst
2189 #define m_gw4           m_u.m_v4.m_gw
2190 #define m_gw6           m_u.m_v6.m_gw
2191 #define m_netmask4      m_u.m_v4.m_netmask
2192 #define m_netmask6      m_u.m_v6.m_netmask
2193 
2194 /*
2195  * Configures a single interface: a new virtual interface is added, based on
2196  * the physical interface nwiftabptr->zone_nwif_physical, with the address
2197  * specified in nwiftabptr->zone_nwif_address, for zone zone_id.  Note that
2198  * the "address" can be an IPv6 address (with a /prefixlength required), an
2199  * IPv4 address (with a /prefixlength optional), or a name; for the latter,
2200  * an IPv4 name-to-address resolution will be attempted.
2201  *
2202  * A default interface route for multicast is created on the first IPv4 and
2203  * IPv6 interfaces (that have the IFF_MULTICAST flag set), respectively.
2204  * This should really be done in the init scripts if we ever allow zones to
2205  * modify the routing tables.
2206  *
2207  * If anything goes wrong, we log a detailed error message, attempt to tear
2208  * down whatever we set up and return an error.
2209  */
2210 static int
2211 configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
2212     struct zone_nwiftab *nwiftabptr, boolean_t *mcast_rt_v4_setp,
2213     boolean_t *mcast_rt_v6_setp)
2214 {
2215         struct lifreq lifr;
2216         struct sockaddr_in netmask4;
2217         struct sockaddr_in6 netmask6;
2218         struct in_addr in4;
2219         struct in6_addr in6;
2220         sa_family_t af;
2221         char *slashp = strchr(nwiftabptr->zone_nwif_address, '/');
2222         mcast_rtmsg_t mcast_rtmsg;
2223         int s;
2224         int rs;
2225         int rlen;
2226         boolean_t got_netmask = B_FALSE;
2227         char addrstr4[INET_ADDRSTRLEN];
2228         int res;
2229 
2230         res = zonecfg_valid_net_address(nwiftabptr->zone_nwif_address, &lifr);
2231         if (res != Z_OK) {
2232                 zerror(zlogp, B_FALSE, "%s: %s", zonecfg_strerror(res),
2233                     nwiftabptr->zone_nwif_address);
2234                 return (-1);
2235         }
2236         af = lifr.lifr_addr.ss_family;
2237         if (af == AF_INET)
2238                 in4 = ((struct sockaddr_in *)(&lifr.lifr_addr))->sin_addr;
2239         else
2240                 in6 = ((struct sockaddr_in6 *)(&lifr.lifr_addr))->sin6_addr;
2241 
2242         if ((s = socket(af, SOCK_DGRAM, 0)) < 0) {
2243                 zerror(zlogp, B_TRUE, "could not get socket");
2244                 return (-1);
2245         }
2246 
2247         (void) strlcpy(lifr.lifr_name, nwiftabptr->zone_nwif_physical,
2248             sizeof (lifr.lifr_name));
2249         if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
2250                 /*
2251                  * Here, we know that the interface can't be brought up.
2252                  * A similar warning message was already printed out to
2253                  * the console by zoneadm(1M) so instead we log the
2254                  * message to syslog and continue.
2255                  */
2256                 zerror(&logsys, B_TRUE, "WARNING: skipping network interface "
2257                     "'%s' which may not be present/plumbed in the "
2258                     "global zone.", lifr.lifr_name);
2259                 (void) close(s);
2260                 return (Z_OK);
2261         }
2262 
2263         if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
2264                 zerror(zlogp, B_TRUE,
2265                     "%s: could not set IP address to %s",
2266                     lifr.lifr_name, nwiftabptr->zone_nwif_address);
2267                 goto bad;
2268         }
2269 
2270         /* Preserve literal IPv4 address for later potential printing. */
2271         if (af == AF_INET)
2272                 (void) inet_ntop(AF_INET, &in4, addrstr4, INET_ADDRSTRLEN);
2273 
2274         lifr.lifr_zoneid = zone_id;
2275         if (ioctl(s, SIOCSLIFZONE, (caddr_t)&lifr) < 0) {
2276                 zerror(zlogp, B_TRUE, "%s: could not place network interface "
2277                     "into zone", lifr.lifr_name);
2278                 goto bad;
2279         }
2280 
2281         if (strcmp(nwiftabptr->zone_nwif_physical, "lo0") == 0) {
2282                 got_netmask = B_TRUE;   /* default setting will be correct */
2283         } else {
2284                 if (af == AF_INET) {
2285                         /*
2286                          * The IPv4 netmask can be determined either
2287                          * directly if a prefix length was supplied with
2288                          * the address or via the netmasks database.  Not
2289                          * being able to determine it is a common failure,
2290                          * but it often is not fatal to operation of the
2291                          * interface.  In that case, a warning will be
2292                          * printed after the rest of the interface's
2293                          * parameters have been configured.
2294                          */
2295                         (void) memset(&netmask4, 0, sizeof (netmask4));
2296                         if (slashp != NULL) {
2297                                 if (addr2netmask(slashp + 1, V4_ADDR_LEN,
2298                                     (uchar_t *)&netmask4.sin_addr) != 0) {
2299                                         *slashp = '/';
2300                                         zerror(zlogp, B_FALSE,
2301                                             "%s: invalid prefix length in %s",
2302                                             lifr.lifr_name,
2303                                             nwiftabptr->zone_nwif_address);
2304                                         goto bad;
2305                                 }
2306                                 got_netmask = B_TRUE;
2307                         } else if (getnetmaskbyaddr(in4,
2308                             &netmask4.sin_addr) == 0) {
2309                                 got_netmask = B_TRUE;
2310                         }
2311                         if (got_netmask) {
2312                                 netmask4.sin_family = af;
2313                                 (void) memcpy(&lifr.lifr_addr, &netmask4,
2314                                     sizeof (netmask4));
2315                         }
2316                 } else {
2317                         (void) memset(&netmask6, 0, sizeof (netmask6));
2318                         if (addr2netmask(slashp + 1, V6_ADDR_LEN,
2319                             (uchar_t *)&netmask6.sin6_addr) != 0) {
2320                                 *slashp = '/';
2321                                 zerror(zlogp, B_FALSE,
2322                                     "%s: invalid prefix length in %s",
2323                                     lifr.lifr_name,
2324                                     nwiftabptr->zone_nwif_address);
2325                                 goto bad;
2326                         }
2327                         got_netmask = B_TRUE;
2328                         netmask6.sin6_family = af;
2329                         (void) memcpy(&lifr.lifr_addr, &netmask6,
2330                             sizeof (netmask6));
2331                 }
2332                 if (got_netmask &&
2333                     ioctl(s, SIOCSLIFNETMASK, (caddr_t)&lifr) < 0) {
2334                         zerror(zlogp, B_TRUE, "%s: could not set netmask",
2335                             lifr.lifr_name);
2336                         goto bad;
2337                 }
2338 
2339                 /*
2340                  * This doesn't set the broadcast address at all. Rather, it
2341                  * gets, then sets the interface's address, relying on the fact
2342                  * that resetting the address will reset the broadcast address.
2343                  */
2344                 if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
2345                         zerror(zlogp, B_TRUE, "%s: could not get address",
2346                             lifr.lifr_name);
2347                         goto bad;
2348                 }
2349                 if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
2350                         zerror(zlogp, B_TRUE,
2351                             "%s: could not reset broadcast address",
2352                             lifr.lifr_name);
2353                         goto bad;
2354                 }
2355         }
2356 
2357         if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) {
2358                 zerror(zlogp, B_TRUE, "%s: could not get flags",
2359                     lifr.lifr_name);
2360                 goto bad;
2361         }
2362         lifr.lifr_flags |= IFF_UP;
2363         if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
2364                 int save_errno = errno;
2365                 char *zone_using;
2366 
2367                 /*
2368                  * If we failed with something other than EADDRNOTAVAIL,
2369                  * then skip to the end.  Otherwise, look up our address,
2370                  * then call a function to determine which zone is already
2371                  * using that address.
2372                  */
2373                 if (errno != EADDRNOTAVAIL) {
2374                         zerror(zlogp, B_TRUE,
2375                             "%s: could not bring network interface up",
2376                             lifr.lifr_name);
2377                         goto bad;
2378                 }
2379                 if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
2380                         zerror(zlogp, B_TRUE, "%s: could not get address",
2381                             lifr.lifr_name);
2382                         goto bad;
2383                 }
2384                 zone_using = who_is_using(zlogp, &lifr);
2385                 errno = save_errno;
2386                 if (zone_using == NULL)
2387                         zerror(zlogp, B_TRUE,
2388                             "%s: could not bring network interface up",
2389                             lifr.lifr_name);
2390                 else
2391                         zerror(zlogp, B_TRUE, "%s: could not bring network "
2392                             "interface up: address in use by zone '%s'",
2393                             lifr.lifr_name, zone_using);
2394                 goto bad;
2395         }
2396         if ((lifr.lifr_flags & IFF_MULTICAST) && ((af == AF_INET &&
2397             mcast_rt_v4_setp != NULL && *mcast_rt_v4_setp == B_FALSE) ||
2398             (af == AF_INET6 &&
2399             mcast_rt_v6_setp != NULL && *mcast_rt_v6_setp == B_FALSE))) {
2400                 rs = socket(PF_ROUTE, SOCK_RAW, 0);
2401                 if (rs < 0) {
2402                         zerror(zlogp, B_TRUE, "%s: could not create "
2403                             "routing socket", lifr.lifr_name);
2404                         goto bad;
2405                 }
2406                 (void) shutdown(rs, 0);
2407                 (void) memset((void *)&mcast_rtmsg, 0, sizeof (mcast_rtmsg_t));
2408                 mcast_rtmsg.m_rtm.rtm_msglen =  sizeof (struct rt_msghdr) +
2409                     3 * (af == AF_INET ? sizeof (struct sockaddr_in) :
2410                     sizeof (struct sockaddr_in6));
2411                 mcast_rtmsg.m_rtm.rtm_version = RTM_VERSION;
2412                 mcast_rtmsg.m_rtm.rtm_type = RTM_ADD;
2413                 mcast_rtmsg.m_rtm.rtm_flags = RTF_UP;
2414                 mcast_rtmsg.m_rtm.rtm_addrs =
2415                     RTA_DST | RTA_GATEWAY | RTA_NETMASK;
2416                 mcast_rtmsg.m_rtm.rtm_seq = ++rts_seqno;
2417                 if (af == AF_INET) {
2418                         mcast_rtmsg.m_dst4.sin_family = AF_INET;
2419                         mcast_rtmsg.m_dst4.sin_addr.s_addr =
2420                             htonl(INADDR_UNSPEC_GROUP);
2421                         mcast_rtmsg.m_gw4.sin_family = AF_INET;
2422                         mcast_rtmsg.m_gw4.sin_addr = in4;
2423                         mcast_rtmsg.m_netmask4.sin_family = AF_INET;
2424                         mcast_rtmsg.m_netmask4.sin_addr.s_addr =
2425                             htonl(IN_CLASSD_NET);
2426                 } else {
2427                         mcast_rtmsg.m_dst6.sin6_family = AF_INET6;
2428                         mcast_rtmsg.m_dst6.sin6_addr.s6_addr[0] = 0xffU;
2429                         mcast_rtmsg.m_gw6.sin6_family = AF_INET6;
2430                         mcast_rtmsg.m_gw6.sin6_addr = in6;
2431                         mcast_rtmsg.m_netmask6.sin6_family = AF_INET6;
2432                         mcast_rtmsg.m_netmask6.sin6_addr.s6_addr[0] = 0xffU;
2433                 }
2434                 rlen = write(rs, (char *)&mcast_rtmsg,
2435                     mcast_rtmsg.m_rtm.rtm_msglen);
2436                 /*
2437                  * The write to the multicast socket will fail if the
2438                  * interface belongs to a failed IPMP group. This is a
2439                  * non-fatal error and the zone will continue booting.
2440                  * While the zone is running, if any interface in the
2441                  * failed IPMP group recovers, the zone will fallback to
2442                  * using that interface.
2443                  */
2444                 if (rlen < mcast_rtmsg.m_rtm.rtm_msglen) {
2445                         if (rlen < 0) {
2446                                 zerror(zlogp, B_TRUE, "WARNING: network "
2447                                     "interface '%s' not available as default "
2448                                     "for multicast.", lifr.lifr_name);
2449                         } else {
2450                                 zerror(zlogp, B_FALSE, "WARNING: network "
2451                                     "interface '%s' not available as default "
2452                                     "for multicast; routing socket returned "
2453                                     "unexpected %d bytes.",
2454                                     lifr.lifr_name, rlen);
2455                         }
2456                 } else {
2457 
2458                         if (af == AF_INET) {
2459                                 *mcast_rt_v4_setp = B_TRUE;
2460                         } else {
2461                                 *mcast_rt_v6_setp = B_TRUE;
2462                         }
2463                 }
2464                 (void) close(rs);
2465         }
2466 
2467         if (!got_netmask) {
2468                 /*
2469                  * A common, but often non-fatal problem, is that the system
2470                  * cannot find the netmask for an interface address. This is
2471                  * often caused by it being only in /etc/inet/netmasks, but
2472                  * /etc/nsswitch.conf says to use NIS or NIS+ and it's not
2473                  * in that. This doesn't show up at boot because the netmask
2474                  * is obtained from /etc/inet/netmasks when no network
2475                  * interfaces are up, but isn't consulted when NIS/NIS+ is
2476                  * available. We warn the user here that something like this
2477                  * has happened and we're just running with a default and
2478                  * possible incorrect netmask.
2479                  */
2480                 char buffer[INET6_ADDRSTRLEN];
2481                 void  *addr;
2482 
2483                 if (af == AF_INET)
2484                         addr = &((struct sockaddr_in *)
2485                             (&lifr.lifr_addr))->sin_addr;
2486                 else
2487                         addr = &((struct sockaddr_in6 *)
2488                             (&lifr.lifr_addr))->sin6_addr;
2489 
2490                 /* Find out what netmask interface is going to be using */
2491                 if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 ||
2492                     inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL)
2493                         goto bad;
2494                 zerror(zlogp, B_FALSE,
2495                     "WARNING: %s: no matching subnet found in netmasks(4) for "
2496                     "%s; using default of %s.",
2497                     lifr.lifr_name, addrstr4, buffer);
2498         }
2499 
2500         /*
2501          * If a default router was specified for this interface
2502          * set the route now. Ignore if already set.
2503          */
2504         if (strlen(nwiftabptr->zone_nwif_defrouter) > 0) {
2505                 int status;
2506                 char *argv[7];
2507 
2508                 argv[0] = "route";
2509                 argv[1] = "add";
2510                 argv[2] = "-ifp";
2511                 argv[3] = nwiftabptr->zone_nwif_physical;
2512                 argv[4] = "default";
2513                 argv[5] = nwiftabptr->zone_nwif_defrouter;
2514                 argv[6] = NULL;
2515 
2516                 status = forkexec(zlogp, "/usr/sbin/route", argv);
2517                 if (status != 0 && status != EEXIST)
2518                         zerror(zlogp, B_FALSE, "Unable to set route for "
2519                             "interface %s to %s\n",
2520                             nwiftabptr->zone_nwif_physical,
2521                             nwiftabptr->zone_nwif_defrouter);
2522         }
2523 
2524         (void) close(s);
2525         return (Z_OK);
2526 bad:
2527         (void) ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifr);
2528         (void) close(s);
2529         return (-1);
2530 }
2531 
2532 /*
2533  * Sets up network interfaces based on information from the zone configuration.
2534  * An IPv4 loopback interface is set up "for free", modeling the global system.
2535  * If any of the configuration interfaces were IPv6, then an IPv6 loopback
2536  * address is set up as well.
2537  *
2538  * If anything goes wrong, we log a general error message, attempt to tear down
2539  * whatever we set up, and return an error.
2540  */
2541 static int
2542 configure_shared_network_interfaces(zlog_t *zlogp)
2543 {
2544         zone_dochandle_t handle;
2545         struct zone_nwiftab nwiftab, loopback_iftab;
2546         boolean_t saw_v6 = B_FALSE;
2547         boolean_t mcast_rt_v4_set = B_FALSE;
2548         boolean_t mcast_rt_v6_set = B_FALSE;
2549         zoneid_t zoneid;
2550 
2551         if ((zoneid = getzoneidbyname(zone_name)) == ZONE_ID_UNDEFINED) {
2552                 zerror(zlogp, B_TRUE, "unable to get zoneid");
2553                 return (-1);
2554         }
2555 
2556         if ((handle = zonecfg_init_handle()) == NULL) {
2557                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
2558                 return (-1);
2559         }
2560         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2561                 zerror(zlogp, B_FALSE, "invalid configuration");
2562                 zonecfg_fini_handle(handle);
2563                 return (-1);
2564         }
2565         if (zonecfg_setnwifent(handle) == Z_OK) {
2566                 for (;;) {
2567                         struct in6_addr in6;
2568 
2569                         if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2570                                 break;
2571                         if (configure_one_interface(zlogp, zoneid,
2572                             &nwiftab, &mcast_rt_v4_set, &mcast_rt_v6_set) !=
2573                             Z_OK) {
2574                                 (void) zonecfg_endnwifent(handle);
2575                                 zonecfg_fini_handle(handle);
2576                                 return (-1);
2577                         }
2578                         if (inet_pton(AF_INET6, nwiftab.zone_nwif_address,
2579                             &in6) == 1)
2580                                 saw_v6 = B_TRUE;
2581                 }
2582                 (void) zonecfg_endnwifent(handle);
2583         }
2584         zonecfg_fini_handle(handle);
2585         if (is_system_labeled()) {
2586                 /*
2587                  * Labeled zones share the loopback interface
2588                  * so it is not plumbed for shared stack instances.
2589                  */
2590                 return (0);
2591         }
2592         (void) strlcpy(loopback_iftab.zone_nwif_physical, "lo0",
2593             sizeof (loopback_iftab.zone_nwif_physical));
2594         (void) strlcpy(loopback_iftab.zone_nwif_address, "127.0.0.1",
2595             sizeof (loopback_iftab.zone_nwif_address));
2596         loopback_iftab.zone_nwif_defrouter[0] = '\0';
2597         if (configure_one_interface(zlogp, zoneid, &loopback_iftab, NULL, NULL)
2598             != Z_OK) {
2599                 return (-1);
2600         }
2601         if (saw_v6) {
2602                 (void) strlcpy(loopback_iftab.zone_nwif_address, "::1/128",
2603                     sizeof (loopback_iftab.zone_nwif_address));
2604                 if (configure_one_interface(zlogp, zoneid,
2605                     &loopback_iftab, NULL, NULL) != Z_OK) {
2606                         return (-1);
2607                 }
2608         }
2609         return (0);
2610 }
2611 
2612 static void
2613 show_owner(zlog_t *zlogp, char *dlname)
2614 {
2615         zoneid_t dl_owner_zid;
2616         char dl_owner_zname[ZONENAME_MAX];
2617 
2618         dl_owner_zid = ALL_ZONES;
2619         if (zone_check_datalink(&dl_owner_zid, dlname) != 0)
2620                 (void) snprintf(dl_owner_zname, ZONENAME_MAX, "<unknown>");
2621         else if (getzonenamebyid(dl_owner_zid, dl_owner_zname, ZONENAME_MAX)
2622             < 0)
2623                 (void) snprintf(dl_owner_zname, ZONENAME_MAX, "<%d>",
2624                     dl_owner_zid);
2625 
2626         errno = EPERM;
2627         zerror(zlogp, B_TRUE, "WARNING: skipping network interface '%s' "
2628             "which is used by the non-global zone '%s'.\n",
2629             dlname, dl_owner_zname);
2630 }
2631 
2632 static int
2633 add_datalink(zlog_t *zlogp, zoneid_t zoneid, char *dlname)
2634 {
2635         /* First check if it's in use by global zone. */
2636         if (zonecfg_ifname_exists(AF_INET, dlname) ||
2637             zonecfg_ifname_exists(AF_INET6, dlname)) {
2638                 errno = EPERM;
2639                 zerror(zlogp, B_TRUE, "WARNING: skipping network interface "
2640                     "'%s' which is used in the global zone.", dlname);
2641                 return (-1);
2642         }
2643 
2644         /* Add access control information */
2645         if (zone_add_datalink(zoneid, dlname) != 0) {
2646                 /* If someone got this link before us, show its name */
2647                 if (errno == EPERM)
2648                         show_owner(zlogp, dlname);
2649                 else
2650                         zerror(zlogp, B_TRUE, "WARNING: unable to add network "
2651                             "interface '%s'.", dlname);
2652                 return (-1);
2653         }
2654 
2655         /* Set zoneid of this link. */
2656         if (dladm_setzid(dlname, zoneid) != DLADM_STATUS_OK) {
2657                 zerror(zlogp, B_TRUE, "WARNING: unable to add network "
2658                     "interface '%s'.", dlname);
2659                 (void) zone_remove_datalink(zoneid, dlname);
2660                 return (-1);
2661         }
2662 
2663         return (0);
2664 }
2665 
2666 static int
2667 remove_datalink(zlog_t *zlogp, zoneid_t zoneid, char *dlname)
2668 {
2669         /*
2670          * Remove access control information.
2671          * If the errno is ENXIO, the interface is not added yet,
2672          * nothing to report then.
2673          */
2674         if (zone_remove_datalink(zoneid, dlname) != 0) {
2675                 if (errno == ENXIO)
2676                         return (0);
2677                 zerror(zlogp, B_TRUE, "unable to remove network interface '%s'",
2678                     dlname);
2679                 return (-1);
2680         }
2681 
2682         if (dladm_setzid(dlname, GLOBAL_ZONEID) != DLADM_STATUS_OK) {
2683                 zerror(zlogp, B_TRUE, "unable to release network "
2684                     "interface '%s'", dlname);
2685                 return (-1);
2686         }
2687         return (0);
2688 }
2689 
2690 /*
2691  * Add the kernel access control information for the interface names.
2692  * If anything goes wrong, we log a general error message, attempt to tear down
2693  * whatever we set up, and return an error.
2694  */
2695 static int
2696 configure_exclusive_network_interfaces(zlog_t *zlogp)
2697 {
2698         zone_dochandle_t handle;
2699         struct zone_nwiftab nwiftab;
2700         zoneid_t zoneid;
2701         char rootpath[MAXPATHLEN];
2702         char path[MAXPATHLEN];
2703         di_prof_t prof = NULL;
2704         boolean_t added = B_FALSE;
2705 
2706         if ((zoneid = getzoneidbyname(zone_name)) == -1) {
2707                 zerror(zlogp, B_TRUE, "unable to get zoneid");
2708                 return (-1);
2709         }
2710 
2711         if ((handle = zonecfg_init_handle()) == NULL) {
2712                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
2713                 return (-1);
2714         }
2715         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2716                 zerror(zlogp, B_FALSE, "invalid configuration");
2717                 zonecfg_fini_handle(handle);
2718                 return (-1);
2719         }
2720 
2721         if (zonecfg_setnwifent(handle) != Z_OK) {
2722                 zonecfg_fini_handle(handle);
2723                 return (0);
2724         }
2725 
2726         for (;;) {
2727                 if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2728                         break;
2729 
2730                 if (prof == NULL) {
2731                         if (zone_get_devroot(zone_name, rootpath,
2732                             sizeof (rootpath)) != Z_OK) {
2733                                 (void) zonecfg_endnwifent(handle);
2734                                 zonecfg_fini_handle(handle);
2735                                 zerror(zlogp, B_TRUE,
2736                                     "unable to determine dev root");
2737                                 return (-1);
2738                         }
2739                         (void) snprintf(path, sizeof (path), "%s%s", rootpath,
2740                             "/dev");
2741                         if (di_prof_init(path, &prof) != 0) {
2742                                 (void) zonecfg_endnwifent(handle);
2743                                 zonecfg_fini_handle(handle);
2744                                 zerror(zlogp, B_TRUE,
2745                                     "failed to initialize profile");
2746                                 return (-1);
2747                         }
2748                 }
2749 
2750                 /*
2751                  * Create the /dev entry for backward compatibility.
2752                  * Only create the /dev entry if it's not in use.
2753                  * Note that the zone still boots when the assigned
2754                  * interface is inaccessible, used by others, etc.
2755                  * Also, when vanity naming is used, some interface do
2756                  * do not have corresponding /dev node names (for example,
2757                  * vanity named aggregations).  The /dev entry is not
2758                  * created in that case.  The /dev/net entry is always
2759                  * accessible.
2760                  */
2761                 if (add_datalink(zlogp, zoneid, nwiftab.zone_nwif_physical)
2762                     == 0) {
2763                         char            name[DLPI_LINKNAME_MAX];
2764                         datalink_id_t   linkid;
2765 
2766                         if (dladm_name2info(nwiftab.zone_nwif_physical,
2767                             &linkid, NULL, NULL, NULL) == DLADM_STATUS_OK &&
2768                             dladm_linkid2legacyname(linkid, name,
2769                             sizeof (name)) == DLADM_STATUS_OK) {
2770                                 if (di_prof_add_dev(prof, name) != 0) {
2771                                         (void) zonecfg_endnwifent(handle);
2772                                         zonecfg_fini_handle(handle);
2773                                         zerror(zlogp, B_TRUE,
2774                                             "failed to add network device");
2775                                         return (-1);
2776                                 }
2777                                 added = B_TRUE;
2778                         }
2779                 }
2780         }
2781         (void) zonecfg_endnwifent(handle);
2782         zonecfg_fini_handle(handle);
2783 
2784         if (prof != NULL && added) {
2785                 if (di_prof_commit(prof) != 0) {
2786                         zerror(zlogp, B_TRUE, "failed to commit profile");
2787                         return (-1);
2788                 }
2789         }
2790         if (prof != NULL)
2791                 di_prof_fini(prof);
2792 
2793         return (0);
2794 }
2795 
2796 /*
2797  * Get the list of the data-links from kernel, and try to remove it
2798  */
2799 static int
2800 unconfigure_exclusive_network_interfaces_run(zlog_t *zlogp, zoneid_t zoneid)
2801 {
2802         char *dlnames, *ptr;
2803         int dlnum, dlnum_saved, i;
2804 
2805         dlnum = 0;
2806         if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
2807                 zerror(zlogp, B_TRUE, "unable to list network interfaces");
2808                 return (-1);
2809         }
2810 again:
2811         /* this zone doesn't have any data-links */
2812         if (dlnum == 0)
2813                 return (0);
2814 
2815         dlnames = malloc(dlnum * LIFNAMSIZ);
2816         if (dlnames == NULL) {
2817                 zerror(zlogp, B_TRUE, "memory allocation failed");
2818                 return (-1);
2819         }
2820         dlnum_saved = dlnum;
2821 
2822         if (zone_list_datalink(zoneid, &dlnum, dlnames) != 0) {
2823                 zerror(zlogp, B_TRUE, "unable to list network interfaces");
2824                 free(dlnames);
2825                 return (-1);
2826         }
2827         if (dlnum_saved < dlnum) {
2828                 /* list increased, try again */
2829                 free(dlnames);
2830                 goto again;
2831         }
2832         ptr = dlnames;
2833         for (i = 0; i < dlnum; i++) {
2834                 /* Remove access control information */
2835                 if (remove_datalink(zlogp, zoneid, ptr) != 0) {
2836                         free(dlnames);
2837                         return (-1);
2838                 }
2839                 ptr += LIFNAMSIZ;
2840         }
2841         free(dlnames);
2842         return (0);
2843 }
2844 
2845 /*
2846  * Get the list of the data-links from configuration, and try to remove it
2847  */
2848 static int
2849 unconfigure_exclusive_network_interfaces_static(zlog_t *zlogp, zoneid_t zoneid)
2850 {
2851         zone_dochandle_t handle;
2852         struct zone_nwiftab nwiftab;
2853 
2854         if ((handle = zonecfg_init_handle()) == NULL) {
2855                 zerror(zlogp, B_TRUE, "getting zone configuration handle");
2856                 return (-1);
2857         }
2858         if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2859                 zerror(zlogp, B_FALSE, "invalid configuration");
2860                 zonecfg_fini_handle(handle);
2861                 return (-1);
2862         }
2863         if (zonecfg_setnwifent(handle) != Z_OK) {
2864                 zonecfg_fini_handle(handle);
2865                 return (0);
2866         }
2867         for (;;) {
2868                 if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2869                         break;
2870                 /* Remove access control information */
2871                 if (remove_datalink(zlogp, zoneid, nwiftab.zone_nwif_physical)
2872                     != 0) {
2873                         (void) zonecfg_endnwifent(handle);
2874                         zonecfg_fini_handle(handle);
2875                         return (-1);
2876                 }
2877         }
2878         (void) zonecfg_endnwifent(handle);
2879         zonecfg_fini_handle(handle);
2880         return (0);
2881 }
2882 
2883 /*
2884  * Remove the access control information from the kernel for the exclusive
2885  * network interfaces.
2886  */
2887 static int
2888 unconfigure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
2889 {
2890         if (unconfigure_exclusive_network_interfaces_run(zlogp, zoneid) != 0) {
2891                 return (unconfigure_exclusive_network_interfaces_static(zlogp,
2892                     zoneid));
2893         }
2894 
2895         return (0);
2896 }
2897 
2898 static int
2899 tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
2900     const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
2901 {
2902         int fd;
2903         struct strioctl ioc;
2904         tcp_ioc_abort_conn_t conn;
2905         int error;
2906 
2907         conn.ac_local = *local;
2908         conn.ac_remote = *remote;
2909         conn.ac_start = TCPS_SYN_SENT;
2910         conn.ac_end = TCPS_TIME_WAIT;
2911         conn.ac_zoneid = zoneid;
2912 
2913         ioc.ic_cmd = TCP_IOC_ABORT_CONN;
2914         ioc.ic_timout = -1; /* infinite timeout */
2915         ioc.ic_len = sizeof (conn);
2916         ioc.ic_dp = (char *)&conn;
2917 
2918         if ((fd = open("/dev/tcp", O_RDONLY)) < 0) {
2919                 zerror(zlogp, B_TRUE, "unable to open %s", "/dev/tcp");
2920                 return (-1);
2921         }
2922 
2923         error = ioctl(fd, I_STR, &ioc);
2924         (void) close(fd);
2925         if (error == 0 || errno == ENOENT)      /* ENOENT is not an error */
2926                 return (0);
2927         return (-1);
2928 }
2929 
2930 static int
2931 tcp_abort_connections(zlog_t *zlogp, zoneid_t zoneid)
2932 {
2933         struct sockaddr_storage l, r;
2934         struct sockaddr_in *local, *remote;
2935         struct sockaddr_in6 *local6, *remote6;
2936         int error;
2937 
2938         /*
2939          * Abort IPv4 connections.
2940          */
2941         bzero(&l, sizeof (*local));
2942         local = (struct sockaddr_in *)&l;
2943         local->sin_family = AF_INET;
2944         local->sin_addr.s_addr = INADDR_ANY;
2945         local->sin_port = 0;
2946 
2947         bzero(&r, sizeof (*remote));
2948         remote = (struct sockaddr_in *)&r;
2949         remote->sin_family = AF_INET;
2950         remote->sin_addr.s_addr = INADDR_ANY;
2951         remote->sin_port = 0;
2952 
2953         if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2954                 retur