1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <errno.h>
27 #include <fcntl.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <strings.h>
31 #include <unistd.h>
32 #include <sys/auxv.h>
33 #include <sys/bitmap.h>
34 #include <sys/brand.h>
35 #include <sys/inttypes.h>
36 #include <sys/lwp.h>
37 #include <sys/syscall.h>
38 #include <sys/systm.h>
39 #include <sys/utsname.h>
40 #include <sys/systeminfo.h>
41 #include <sys/zone.h>
42 #include <sys/stat.h>
43 #include <sys/mntent.h>
44 #include <sys/ctfs.h>
45 #include <sys/priv.h>
46 #include <sys/acctctl.h>
47 #include <libgen.h>
48
49 #include <s10_brand.h>
50 #include <s10_misc.h>
51
52 /*
53 * Principles of emulation 101.
54 *
55 *
56 * *** Setting errno
57 *
 * Just don't do it.  This emulation library is loaded onto a
 * separate link map from the application whose address space we're
 * running in.  We have our own private copy of libc, and therefore
 * the errno value accessible from here is also private, and changing
 * it will not affect any errno value that the process whose address
 * space we are running in will see.  To return an error condition we
 * should return the negated errno value we'd like the system to return.
 * For more information about this see the comment in s10_handler().
 * Basically, when we return to the caller that initiated the system
 * call it's their responsibility to set errno.
68 *
69 *
70 * *** Recursion Considerations
71 *
72 * When emulating system calls we need to be very careful about what
73 * library calls we invoke. Library calls should be kept to a minimum.
74 * One issue is that library calls can invoke system calls, so if we're
75 * emulating a system call and we invoke a library call that depends on
76 * that system call we will probably enter a recursive loop, which would
77 * be bad.
78 *
79 *
80 * *** Return Values.
81 *
 * When declaring new syscall emulation functions, it is very important
 * to set the proper RV_* flags in the s10_sysent_table.  Upon failure,
 * syscall emulation functions should return an errno value.  Upon success
 * syscall emulation functions should return 0 and set the sysret_t return
 * value parameters accordingly.
87 *
88 *
89 * *** Agent lwp considerations
90 *
 * It is currently impossible to do any emulation for system calls
 * when they are being invoked on behalf of an agent lwp.  To understand why
 * it's impossible you have to understand how agent lwp syscalls work.
94 *
95 * The agent lwp syscall process works as follows:
96 * 1 The controlling process stops the target.
97 * 2 The controlling process injects an agent lwp which is also stopped.
98 * This agent lwp assumes the userland stack and register values
99 * of another stopped lwp in the current process.
100 * 3 The controlling process configures the agent lwp to start
101 * executing the requested system call.
 * 4 The controlling process configures /proc to stop the agent lwp when
 *   it enters the requested system call.
 * 5 The controlling process allows the agent lwp to start executing.
 * 6 The agent lwp traps into the kernel to perform the requested system
 *   call and immediately stops.
 * 7 The controlling process copies all the arguments for the requested
 *   system call onto the agent lwp's stack.
 * 8 The controlling process configures /proc to stop the agent lwp
 *   when it completes the requested system call.
 * 9 The controlling process allows the agent lwp to start executing.
 * 10 The agent lwp executes the system call and then stops before returning
 *   to userland.
 * 11 The controlling process copies the return value and return arguments
 *   back from the agent lwp's stack.
116 * 12 The controlling process destroys the agent lwp and restarts
117 * the target process.
118 *
 * The fundamental problem is that when the agent executes the requested
 * system call in step 5, if we're emulating that system call then the
 * lwp is redirected back to our emulation layer without blocking
 * in the kernel.  But our emulation layer can't access the arguments
 * for the system call because they haven't been copied to the stack
 * yet and they still only exist in the controlling process's address
 * space.  This prevents us from being able to do any emulation of
 * agent lwp system calls.  Hence, currently our brand trap interposition
 * callback (s10_brand_syscall_callback_common) will detect if a system
 * call is being made by an agent lwp, and if this is the case it will
 * never redirect the system call to this emulation library.
130 *
 * In the future, if this proves to be a problem the easiest solution
 * would probably be to replace the branded versions of these applications
 * with their native counterparts.  I.e., truss, plimit, and pfiles could be
 * replaced with wrapper scripts that execute the native versions of these
 * applications.  In the case of plimit and pfiles this should be pretty
 * straightforward.  Truss would probably be more tricky since it can
 * execute applications which would be branded applications, so in that
 * case it might be necessary to create a loadable library which could
 * be LD_PRELOADed into truss and this library would interpose on the
 * exec() system call to allow truss to correctly execute branded
 * processes.  It should be pointed out that this solution could work
 * because "native agent lwps" (i.e., agent lwps created by native
 * processes) can be treated differently from "branded agent lwps" (i.e.,
 * agent lwps created by branded processes), since native agent lwps
 * would presumably be making native system calls and hence not need
 * any interposition.
147 *
148 *
149 * *** s10 brand emulation scope considerations
150 *
 * One of the differences between the lx brand and the s8 and s9
 * brands is that the s8 and s9 brands only interpose on syscalls
 * that need some kind of emulation, whereas the lx brand interposes
 * on _all_ system calls.  Lx branded system calls that don't need
 * any emulation are then redirected back to the kernel from the
 * userland library via the IN_KERNEL_SYSCALL macro.  The lx-syscall
 * dtrace provider depends on this behavior.
158 *
159 */
160
/* Id of the zone we are running in; looked up once by s10_init(). */
static zoneid_t zoneid;
/* B_TRUE when the zone's ZF_NET_EXCL flag is clear; computed by s10_init(). */
static boolean_t ipshared;
/*
 * When B_TRUE, s10_zone() answers ZONE_LOOKUP and part of ZONE_GETATTR as
 * if this process were in the global zone.  s10_init() turns this on for
 * the pkg and patch commands.
 */
static boolean_t emul_global_zone = B_FALSE;
/* Emulation version number fetched from the zone by s10_init(). */
static int emul_vers;
/* Pid of the zone's init process; filled in by get_initpid_info(). */
pid_t zone_init_pid;
166
/* Build one s10_sysent_table initializer: { callback, arg count | RV_* }. */
#define EMULATE(cb, args) { (sysent_cb_t)(cb), (args) }
/* Table entry for a system call that is routed to s10_unimpl(). */
#define NOSYS EMULATE(s10_unimpl, (0 | RV_DEFAULT))

/*
 * Emulation callback.  The parameter list is deliberately left
 * unspecified: callbacks are invoked with a sysret_t pointer followed by
 * a varying number of arguments (see s10_indir()).
 */
typedef long (*sysent_cb_t)();
typedef struct s10_sysent_table {
	sysent_cb_t st_callc;	/* emulation callback */
	uintptr_t st_args;	/* NARGS_MASK bits plus RV_MASK flag bits */
} s10_sysent_table_t;
/* Forward declaration; the table itself is defined later in this file. */
s10_sysent_table_t s10_sysent_table[];
176
/* Release and version strings reported by s10_uname() and s10_sysinfo(). */
#define S10_UTS_RELEASE "5.10"
#define S10_UTS_VERSION "Generic_Virtual"

/*
 * Arguments of the most recent _s10_abort() call, stored by _s10_abort()
 * just before it calls abort().
 * NOTE(review): presumably kept so they can be inspected from a core file
 * or debugger -- confirm.
 */
/*LINTED: static unused*/
static volatile int s10_abort_err;
/*LINTED: static unused*/
static volatile const char *s10_abort_msg;
/*LINTED: static unused*/
static volatile const char *s10_abort_file;
/*LINTED: static unused*/
static volatile int s10_abort_line;

extern int errno;
190
191 /*ARGSUSED*/
192 void
193 _s10_abort(int err, const char *msg, const char *file, int line)
194 {
195 sysret_t rval;
196
197 /* Save the error message into convenient globals */
198 s10_abort_err = err;
199 s10_abort_msg = msg;
200 s10_abort_file = file;
201 s10_abort_line = line;
202
203 /* kill ourselves */
204 abort();
205
206 /* If abort() didn't work, try something stronger. */
207 (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGKILL);
208 }
209
210 static int
211 s10_uucopy(const void *from, void *to, size_t size)
212 {
213 sysret_t rval;
214 int err;
215
216 err = __systemcall(&rval, SYS_uucopy + 1024, from, to, size);
217 if (err == 0)
218 return (0);
219 return (EFAULT);
220 }
221
222 /*
223 * ATTENTION: uucopystr() does NOT ensure that string are null terminated!
224 */
225 static int
226 s10_uucopystr(const void *from, void *to, size_t size)
227 {
228 sysret_t rval;
229 int err;
230
231 err = __systemcall(&rval, SYS_uucopystr + 1024, from, to, size);
232 if (err == 0)
233 return (0);
234 return (EFAULT);
235 }
236
237 /*
238 * Figures out the PID of init for the zone. Also returns a boolean
239 * indicating whether this process currently has that pid: if so,
240 * then at this moment, we are init.
241 */
242 static boolean_t
243 get_initpid_info(void)
244 {
245 pid_t pid;
246 sysret_t rval;
247 int err;
248
249 /*
250 * Determine the current process PID and the PID of the zone's init.
251 * We use care not to call getpid() here, because we're not supposed
252 * to call getpid() until after the program is fully linked-- the
253 * first call to getpid() is a signal from the linker to debuggers
254 * that linking has been completed.
255 */
256 if ((err = __systemcall(&rval, SYS_brand,
257 B_S10_PIDINFO, &pid, &zone_init_pid)) != 0) {
258 s10_abort(err, "Failed to get init's pid");
259 }
260
261 /*
262 * Note that we need to be cautious with the pid we get back--
263 * it should not be stashed and used in place of getpid(), since
264 * we might fork(2). So we keep zone_init_pid and toss the pid
265 * we otherwise got.
266 */
267 if (pid == zone_init_pid)
268 return (B_TRUE);
269
270 return (B_FALSE);
271 }
272
273 /*
274 * This function is defined to be NOSYS but it won't be called from the
275 * the kernel since the NOSYS system calls are not enabled in the kernel.
276 * Thus, the only time this function is called is directly from within the
277 * indirect system call path.
278 */
279 /*ARGSUSED*/
280 static long
281 s10_unimpl(sysret_t *rv, uintptr_t p1)
282 {
283 sysret_t rval;
284
285 /*
286 * We'd like to print out some kind of error message here like
287 * "unsupported syscall", but we can't because it's not safe to
288 * assume that stderr or STDERR_FILENO actually points to something
289 * that is a terminal, and if we wrote to those files we could
290 * inadvertantly write to some applications open files, which would
291 * be bad.
292 *
293 * Normally, if an application calls an invalid system call
294 * it get a SIGSYS sent to it. So we'll just go ahead and send
295 * ourselves a signal here. Note that this is far from ideal since
296 * if the application has registered a signal handler, that signal
297 * handler may recieve a ucontext_t as the third parameter to
298 * indicate the context of the process when the signal was
299 * generated, and in this case that context will not be what the
300 * application is expecting. Hence, we should probably create a
301 * brandsys() kernel function that can deliver the signal to us
302 * with the correct ucontext_t.
303 */
304 (void) __systemcall(&rval, SYS_lwp_kill + 1024, _lwp_self(), SIGSYS);
305 return (ENOSYS);
306 }
307
#if defined(__sparc) && !defined(__sparcv9)
/*
 * Handle indirect system calls made by 32-bit sparc applications.
 *
 * "code" is the number of the system call actually being requested and
 * a0..a7 are its arguments.  The interface is declared with the maximum
 * number of system call arguments; when the target takes fewer, the
 * surplus values are garbage and are simply not passed on.
 *
 * The callback registered for "code" in s10_sysent_table is invoked with
 * "rv" followed by exactly as many arguments as the NARGS_MASK bits of
 * its table entry ask for, and its return value is returned unchanged.
 * An entry whose argument count is outside 0-8 is reported through
 * s10_abort(); EINVAL is returned should that call come back.
 */
static long
s10_indir(sysret_t *rv, int code,
    uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4,
    uintptr_t a5, uintptr_t a6, uintptr_t a7)
{
	s10_sysent_table_t *sst;

	/* Check the index before forming a pointer into the table. */
	s10_assert(code < NSYSCALL);
	sst = &(s10_sysent_table[code]);

	switch (sst->st_args & NARGS_MASK) {
	case 0:
		return ((sst->st_callc)(rv));
	case 1:
		return ((sst->st_callc)(rv, a0));
	case 2:
		return ((sst->st_callc)(rv, a0, a1));
	case 3:
		return ((sst->st_callc)(rv, a0, a1, a2));
	case 4:
		return ((sst->st_callc)(rv, a0, a1, a2, a3));
	case 5:
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4));
	case 6:
		/* rv is passed exactly once, followed by the six arguments. */
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5));
	case 7:
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6));
	case 8:
		return ((sst->st_callc)(rv, a0, a1, a2, a3, a4, a5, a6, a7));
	default:
		break;
	}
	s10_abort(0, "invalid entry in s10_sysent_table");
	return (EINVAL);
}
#endif	/* __sparc && !__sparcv9 */
348
/*
 * The process contract CT_TGET and CT_TSET parameter structure ct_param_t
 * changed between S10 and Nevada, so we have to emulate the old S10
 * ct_param_t structure when interposing on the ioctl syscall.
 *
 * s10_ioctl() copies this structure in from the caller, forwards ctpm_id
 * and the address of ctpm_value to the native ioctl, and for CT_TGET
 * copies the structure back out.
 */
typedef struct s10_ct_param {
	uint32_t ctpm_id;	/* copied into the native ct_param_t's ctpm_id */
	uint32_t ctpm_pad;	/* not referenced by s10_ioctl() */
	uint64_t ctpm_value;	/* value buffer handed to the native ioctl */
} s10_ct_param_t;
359
360 /*
361 * New first arg "legacy" should be set to 1.
362 */
363 static int
364 s10_getpagesizes(sysret_t *rval, size_t *buf, int nelem)
365 {
366 int err;
367
368 if ((err = __systemcall(rval, SYS_getpagesizes + 1024, 1, buf, nelem))
369 != 0)
370 return (err);
371 return (0);
372 }
373
374 int
375 s10_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
376 {
377 int err;
378 s10_ct_param_t s10param;
379 ct_param_t param;
380 struct stat statbuf;
381
382 /*
383 * We have to emulate process contract ioctls for init(1M) because the
384 * ioctl parameter structure changed between S10 and Nevada. This is
385 * a relatively simple process of filling Nevada structure fields,
386 * shuffling values, and initiating a native system call.
387 *
388 * For now, we'll assume that all consumers of CT_TGET and CT_TSET will
389 * need emulation. We'll issue a stat to make sure that the ioctl
390 * is meant for the contract file system.
391 *
392 */
393 switch (cmd) {
394 case CT_TGET:
395 if ((err = __systemcall(rval, SYS_fstat + 1024, fdes,
396 &statbuf)) != 0)
397 return (err);
398 if (strcmp(statbuf.st_fstype, MNTTYPE_CTFS) != 0)
399 goto nonemuioctl;
400 if (s10_uucopy((const void *)arg, &s10param,
401 sizeof (s10param)) != 0)
402 return (EFAULT);
403 param.ctpm_id = s10param.ctpm_id;
404 param.ctpm_size = sizeof (uint64_t);
405 param.ctpm_value = &s10param.ctpm_value;
406 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes,
407 cmd, ¶m)) != 0)
408 return (err);
409 if (s10_uucopy(&s10param, (void *)arg,
410 sizeof (s10param)) != 0)
411 return (EFAULT);
412 return (0);
413 case CT_TSET:
414 if ((err = __systemcall(rval, SYS_fstat + 1024, fdes,
415 &statbuf)) != 0)
416 return (err);
417 if (strcmp(statbuf.st_fstype, MNTTYPE_CTFS) != 0)
418 goto nonemuioctl;
419 if (s10_uucopy((const void *)arg, &s10param,
420 sizeof (s10param)) != 0)
421 return (EFAULT);
422 param.ctpm_id = s10param.ctpm_id;
423 param.ctpm_size = sizeof (uint64_t);
424 param.ctpm_value = &s10param.ctpm_value;
425 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes,
426 cmd, ¶m)) != 0)
427 return (err);
428 return (0);
429 }
430
431 nonemuioctl:
432 if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg)) != 0)
433 return (err);
434 return (0);
435 }
436
437 /*
438 * Unfortunately, pwrite()'s behavior differs between S10 and Nevada when
439 * applied to files opened with O_APPEND. The offset argument is ignored and
440 * the buffer is appended to the target file in S10, whereas the current file
441 * position is ignored in Nevada (i.e., pwrite() acts as though the target file
442 * wasn't opened with O_APPEND). This is a result of the fix for CR 6655660
443 * (pwrite() must ignore the O_APPEND/FAPPEND flag).
444 *
445 * We emulate the old S10 pwrite() behavior by checking whether the target file
446 * was opened with O_APPEND. If it was, then invoke the write() system call
447 * instead of pwrite(); otherwise, invoke the pwrite() system call as usual.
448 */
449 static int
450 s10_pwrite(sysret_t *rval, int fd, const void *bufferp, size_t num_bytes,
451 off_t offset)
452 {
453 int err;
454
455 if ((err = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL)) != 0)
456 return (err);
457 if (rval->sys_rval1 & O_APPEND)
458 return (__systemcall(rval, SYS_write + 1024, fd, bufferp,
459 num_bytes));
460 return (__systemcall(rval, SYS_pwrite + 1024, fd, bufferp, num_bytes,
461 offset));
462 }
463
#ifndef	_LP64
/*
 * Large file flavor of pwrite() for 32-bit processes; the 64-bit offset
 * arrives as the two 32-bit halves offset_1 and offset_2.  It exists for
 * the same reason as s10_pwrite(): a descriptor opened with O_APPEND gets
 * S10 semantics by being written with write(), anything else is passed to
 * the native pwrite64().  Returns 0 on success or the errno value of
 * whichever call failed.
 */
static int
s10_pwrite64(sysret_t *rval, int fd, const void *bufferp, size32_t num_bytes,
    uint32_t offset_1, uint32_t offset_2)
{
	int rc;

	rc = __systemcall(rval, SYS_fcntl + 1024, fd, F_GETFL);
	if (rc != 0)
		return (rc);

	if ((rval->sys_rval1 & O_APPEND) == 0) {
		/* Not an append-mode descriptor: ordinary pwrite64(). */
		return (__systemcall(rval, SYS_pwrite64 + 1024, fd, bufferp,
		    num_bytes, offset_1, offset_2));
	}

	/* Append mode: S10 semantics are those of write(). */
	return (__systemcall(rval, SYS_write + 1024, fd, bufferp, num_bytes));
}
#endif	/* !_LP64 */
485
/*
 * S10 encoding of the acctctl(2) command word, as decoded by
 * s10_acctctl(): the mode is in the top four bits and the option in the
 * remaining 28 bits.
 */
#define S10_AC_PROC (0x1 << 28)
#define S10_AC_TASK (0x2 << 28)
#define S10_AC_FLOW (0x4 << 28)
/* Extract the mode bits (top four bits) of an S10 acctctl command. */
#define S10_AC_MODE(x) ((x) & 0xf0000000)
/* Extract the option bits (low 28 bits) of an S10 acctctl command. */
#define S10_AC_OPTION(x) ((x) & 0x0fffffff)
491
492 /*
493 * The mode shift, mode mask and option mask for acctctl have changed. The
494 * mode is currently the top full byte and the option is the lower 3 full bytes.
495 */
496 int
497 s10_acctctl(sysret_t *rval, int cmd, void *buf, size_t bufsz)
498 {
499 int mode = S10_AC_MODE(cmd);
500 int option = S10_AC_OPTION(cmd);
501
502 switch (mode) {
503 case S10_AC_PROC:
504 mode = AC_PROC;
505 break;
506 case S10_AC_TASK:
507 mode = AC_TASK;
508 break;
509 case S10_AC_FLOW:
510 mode = AC_FLOW;
511 break;
512 default:
513 return (S10_TRUSS_POINT_3(rval, SYS_acctctl, EINVAL, cmd, buf,
514 bufsz));
515 }
516
517 return (__systemcall(rval, SYS_acctctl + 1024, mode | option, buf,
518 bufsz));
519 }
520
521 /*
522 * Determine whether the executable passed to SYS_exec or SYS_execve is a
523 * wrapper around a native executable. If so, then fudge the executable's
524 * name and parameters to eliminate any trace of the wrapper. This will make
525 * pgrep and other commands that examine process' executable names and
526 * command-line parameters work properly.
527 */
528 static int
529 s10_exec_native(sysret_t *rval, const char *fname, const char **argp,
530 const char **envp)
531 {
532 const char *filename = fname;
533 char path[64];
534 int err;
535
536 /* Get a copy of the executable we're trying to run */
537 path[0] = '\0';
538 (void) s10_uucopystr(filename, path, sizeof (path));
539
540 /* Check if we're trying to run a native binary */
541 if (strncmp(path, "/.SUNWnative/usr/lib/brand/solaris10/s10_native",
542 sizeof (path)) != 0)
543 return (0);
544
545 /* Skip the first element in the argv array */
546 argp++;
547
548 /*
549 * The name of the new program to execute was the second parameter
550 * passed to s10_exec_native().
551 */
552 if (s10_uucopy(argp, &filename, sizeof (char *)) != 0)
553 return (EFAULT);
554
555 /* If an exec call succeeds, it never returns */
556 err = __systemcall(rval, SYS_brand + 1024, B_EXEC_NATIVE, filename,
557 argp, envp, NULL, NULL, NULL);
558 s10_assert(err != 0);
559 return (err);
560 }
561
562 /*
563 * Interpose on the SYS_exec syscall to detect native wrappers.
564 */
565 int
566 s10_exec(sysret_t *rval, const char *fname, const char **argp)
567 {
568 int err;
569
570 if ((err = s10_exec_native(rval, fname, argp, NULL)) != 0)
571 return (err);
572
573 /* If an exec call succeeds, it never returns */
574 err = __systemcall(rval, SYS_exec + 1024, fname, argp);
575 s10_assert(err != 0);
576 return (err);
577 }
578
579 /*
580 * Interpose on the SYS_execve syscall to detect native wrappers.
581 */
582 int
583 s10_execve(sysret_t *rval, const char *fname, const char **argp,
584 const char **envp)
585 {
586 int err;
587
588 if ((err = s10_exec_native(rval, fname, argp, envp)) != 0)
589 return (err);
590
591 /* If an exec call succeeds, it never returns */
592 err = __systemcall(rval, SYS_execve + 1024, fname, argp, envp);
593 s10_assert(err != 0);
594 return (err);
595 }
596
597 /*
598 * S10's issetugid() syscall is now a subcode to privsys().
599 */
600 static int
601 s10_issetugid(sysret_t *rval)
602 {
603 int err;
604
605 if ((err = __systemcall(rval, SYS_privsys + 1024, PRIVSYS_ISSETUGID,
606 0, 0, 0, 0, 0)) != 0)
607 return (err);
608 return (0);
609 }
610
611 /*
612 * New last arg "block" flag should be zero. The block flag is used by
613 * the Opensolaris AIO implementation, which is now part of libc.
614 */
615 static int
616 s10_sigqueue(sysret_t *rval, pid_t pid, int signo, void *value, int si_code)
617 {
618 int err;
619
620 if ((err = __systemcall(rval, SYS_sigqueue + 1024, pid, signo, value,
621 si_code, 0)) != 0)
622 return (err);
623 return (0);
624 }
625
626 static long
627 s10_uname(sysret_t *rv, uintptr_t p1)
628 {
629 struct utsname un, *unp = (struct utsname *)p1;
630 int rev, err;
631
632 if ((err = __systemcall(rv, SYS_uname + 1024, &un)) != 0)
633 return (err);
634
635 rev = atoi(&un.release[2]);
636 s10_assert(rev >= 11);
637 bzero(un.release, _SYS_NMLN);
638 (void) strlcpy(un.release, S10_UTS_RELEASE, _SYS_NMLN);
639 bzero(un.version, _SYS_NMLN);
640 (void) strlcpy(un.version, S10_UTS_VERSION, _SYS_NMLN);
641
642 /* copy out the modified uname info */
643 if (s10_uucopy(&un, unp, sizeof (un)) != 0)
644 return (EFAULT);
645
646 return (0);
647 }
648
649 int
650 s10_sysinfo(sysret_t *rv, int command, char *buf, long count)
651 {
652 char *value;
653 int err, len;
654
655 /*
656 * We must interpose on the sysinfo(2) commands SI_RELEASE and
657 * SI_VERSION; all others get passed to the native sysinfo(2)
658 * command.
659 */
660 switch (command) {
661 case SI_RELEASE:
662 value = S10_UTS_RELEASE;
663 break;
664
665 case SI_VERSION:
666 value = S10_UTS_VERSION;
667 break;
668
669 default:
670 /*
671 * The default action is to pass the command to the
672 * native sysinfo(2) syscall.
673 */
674 if ((err = __systemcall(rv, SYS_systeminfo + 1024,
675 command, buf, count)) != 0)
676 return (err);
677
678 return (0);
679 }
680
681 len = strlen(value) + 1;
682 if (count > 0) {
683 if (s10_uucopystr(value, buf, count) != 0)
684 return (EFAULT);
685
686 /* Assure NULL termination of buf as s10_uucopystr() doesn't. */
687 if (len > count && s10_uucopy("\0", buf + (count - 1), 1) != 0)
688 return (EFAULT);
689 }
690
691 /*
692 * On success, sysinfo(2) returns the size of buffer required to hold
693 * the complete value plus its terminating NULL byte.
694 */
695 rv->sys_rval1 = len;
696 rv->sys_rval2 = 0;
697 S10_TRUSS_POINT_3(rv, SYS_systeminfo, 0, command, buf, count);
698 return (0);
699 }
700
701 /*
702 * If the emul_global_zone flag is set then emulate some aspects of the
703 * zone system call. In particular, emulate the global zone ID on the
704 * ZONE_LOOKUP subcommand and emulate some of the global zone attributes
705 * on the ZONE_GETATTR subcommand. If the flag is not set or we're performing
706 * some other operation, simply pass the calls through.
707 */
708 int
709 s10_zone(sysret_t *rval, int cmd, void *arg1, void *arg2, void *arg3,
710 void *arg4)
711 {
712 char *aval;
713 int len;
714 zoneid_t zid;
715 int attr;
716 char *buf;
717 size_t bufsize;
718
719 /*
720 * We only emulate the zone syscall for a subset of specific commands,
721 * otherwise we just pass the call through.
722 */
723 if (!emul_global_zone)
724 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2,
725 arg3, arg4));
726
727 switch (cmd) {
728 case ZONE_LOOKUP:
729 (void) S10_TRUSS_POINT_1(rval, SYS_zone, 0, cmd);
730 rval->sys_rval1 = GLOBAL_ZONEID;
731 rval->sys_rval2 = 0;
732 return (0);
733
734 case ZONE_GETATTR:
735 zid = (zoneid_t)(uintptr_t)arg1;
736 attr = (int)(uintptr_t)arg2;
737 buf = (char *)arg3;
738 bufsize = (size_t)arg4;
739
740 /*
741 * If the request is for the global zone then we're emulating
742 * that, otherwise pass this thru.
743 */
744 if (zid != GLOBAL_ZONEID)
745 goto passthru;
746
747 (void) S10_TRUSS_POINT_3(rval, SYS_zone, 0, cmd, zid, attr);
748
749 switch (attr) {
750 case ZONE_ATTR_NAME:
751 aval = GLOBAL_ZONENAME;
752 break;
753
754 case ZONE_ATTR_BRAND:
755 aval = NATIVE_BRAND_NAME;
756 break;
757 default:
758 /*
759 * We only emulate a subset of the attrs, use the
760 * real zone id to pass thru the rest.
761 */
762 arg1 = (void *)(uintptr_t)zoneid;
763 goto passthru;
764 }
765
766 len = strlen(aval) + 1;
767 if (len > bufsize)
768 return (ENAMETOOLONG);
769
770 if (buf != NULL) {
771 if (len == 1) {
772 if (s10_uucopy("\0", buf, 1) != 0)
773 return (EFAULT);
774 } else {
775 if (s10_uucopystr(aval, buf, len) != 0)
776 return (EFAULT);
777
778 /*
779 * Assure NULL termination of "buf" as
780 * s10_uucopystr() does NOT.
781 */
782 if (s10_uucopy("\0", buf + (len - 1), 1) != 0)
783 return (EFAULT);
784 }
785 }
786
787 rval->sys_rval1 = len;
788 rval->sys_rval2 = 0;
789 return (0);
790
791 default:
792 break;
793 }
794
795 passthru:
796 return (__systemcall(rval, SYS_zone + 1024, cmd, arg1, arg2, arg3,
797 arg4));
798 }
799
/*
 * This routine is run only when the init daemon starts up, in order
 * to do any pre-initialization needed before the environment boots.
 * It takes no arguments, returns nothing and currently does nothing.
 */
static void
s10_init1m_handler(void)
{
	/*
	 * Take special actions in advance of starting init(1m).
	 *
	 * XXX Nothing to do (yet).
	 */
}
813
/*
 * Shut down a libc stdio stream while leaving the file descriptor it
 * was using open.  The descriptor is duplicated first, the stream is
 * closed (which closes the original descriptor), and the duplicate is
 * then moved back onto the original descriptor number.  Nothing is done
 * for a NULL stream, a stream without a valid descriptor, or when the
 * descriptor cannot be duplicated.
 */
static void
s10_close_fh(FILE *file)
{
	int orig_fd;
	int saved_fd;

	if (file == NULL)
		return;

	orig_fd = fileno(file);
	if (orig_fd < 0)
		return;

	/* Keep the underlying file alive across the fclose(). */
	if ((saved_fd = dup(orig_fd)) == -1)
		return;

	(void) fclose(file);

	/* Reinstate the descriptor at its old number, drop the spare. */
	(void) dup2(saved_fd, orig_fd);
	(void) close(saved_fd);
}
837
838 /*ARGSUSED*/
839 int
840 s10_init(int argc, char *argv[], char *envp[])
841 {
842 sysret_t rval;
843 s10_brand_reg_t reg;
844 s10_elf_data_t sed;
845 auxv_t *ap;
846 uintptr_t *p;
847 int i, err;
848 ushort_t flags;
849 char *bname;
850
851 /* Sanity check our translation table return value codes */
852 for (i = 0; i < NSYSCALL; i++) {
853 s10_sysent_table_t *est = &(s10_sysent_table[i]);
854 s10_assert(BIT_ONLYONESET(est->st_args & RV_MASK));
855 }
856
857 /*
858 * We need to shutdown all libc stdio. libc stdio normally goes to
859 * file descriptors, but since we're actually part of a another
860 * process we don't own these file descriptors and we can't make
861 * any assumptions about their state.
862 */
863 s10_close_fh(stdin);
864 s10_close_fh(stdout);
865 s10_close_fh(stderr);
866
867 /*
868 * Cache the pid of the zone's init process and determine if
869 * we're init(1m) for the zone. Remember: we might be init
870 * now, but as soon as we fork(2) we won't be.
871 */
872 if (get_initpid_info()) {
873 s10_init1m_handler();
874 }
875
876 /* get the current zoneid */
877 err = __systemcall(&rval, SYS_zone, ZONE_LOOKUP, NULL);
878 s10_assert(err == 0);
879 zoneid = (zoneid_t)rval.sys_rval1;
880
881 /* Get the emulation version number. */
882 if ((err = __systemcall(&rval, SYS_zone, ZONE_GETATTR, zoneid,
883 S10_EMUL_VERSION_NUM, &emul_vers, sizeof (emul_vers))) != 0 ||
884 emul_vers != 0) {
885 s10_abort(err, "The zone's patch level is unsupported");
886 /*NOTREACHED*/
887 }
888
889 /* Figure out if this zone has a shared ip */
890 err = __systemcall(&rval, SYS_zone, ZONE_GETATTR, zoneid,
891 ZONE_ATTR_FLAGS, &flags, sizeof (flags));
892 s10_assert(err == 0);
893 ipshared = ((flags & ZF_NET_EXCL) == 0);
894
895 bname = basename(argv[0]);
896
897 /*
898 * In general we want the S10 commands that are zone-aware to continue
899 * to behave as they normally do within a zone. Since these commands
900 * are zone-aware, they should continue to "do the right thing".
901 * However, some zone-aware commands aren't going to work the way
902 * we expect them to inside the branded zone. In particular, the pkg
903 * and patch commands will not properly manage all pkgs/patches
904 * unless the commands think they are running in the global zone. For
905 * these commands we want to emulate the global zone.
906 *
907 * XXX One issue is the handling of hollow pkgs. This is not normally
908 * a problem since the p2v/v2v process handles those. However, if
909 * the user attempts to install a hollow pkg after the zone is running,
910 * the pkg code will do the wrong thing. Luckily, most of the hollow
911 * pkgs are core pkgs which will already be installed in the image
912 * before we p2v/v2v it into the zone and there should be little need
913 * to pkgadd these later.
914 */
915 if (strcmp("pkgadd", bname) == 0 || strcmp("pkgrm", bname) == 0 ||
916 strcmp("pkgcond", bname) == 0 ||
917 strcmp("patchadd", bname) == 0 || strcmp("patchrm", bname) == 0)
918 emul_global_zone = B_TRUE;
919
920 /*
921 * Register our syscall emulation table with the kernel.
922 * Note that we don't have to do invoke (syscall_number + 1024)
923 * until we've actually establised a syscall emulation callback
924 * handler address, which is what we're doing with this brand
925 * syscall.
926 */
927 reg.sbr_version = S10_VERSION;
928 reg.sbr_handler = (caddr_t)s10_handler;
929 if ((err = __systemcall(&rval, SYS_brand, B_REGISTER, ®)) != 0) {
930 s10_abort(err, "Failed to brand current process");
931 /*NOTREACHED*/
932 }
933
934 /* Get data about the executable we're running from the kernel. */
935 if ((err = __systemcall(&rval, SYS_brand + 1024,
936 B_ELFDATA, (void *)&sed)) != 0) {
937 s10_abort(err,
938 "Failed to get required brand ELF data from the kernel");
939 /*NOTREACHED*/
940 }
941
942 /*
943 * Find the aux vector on the stack.
944 */
945 p = (uintptr_t *)envp;
946 while (*p != NULL)
947 p++;
948
949 /*
950 * p is now pointing at the 0 word after the environ pointers.
951 * After that is the aux vectors.
952 *
953 * The aux vectors are currently pointing to the brand emulation
954 * library and associated linker. We're going to change them to
955 * point to the brand executable and associated linker (or to no
956 * linker for static binaries). This matches the process data
957 * stored within the kernel and visible from /proc, which was
958 * all setup in s10_elfexec(). We do this so that when a debugger
959 * attaches to the process it sees the process as a normal solaris
960 * process, this brand emulation library and everything on it's
961 * link map will not be visible, unless our librtld_db plugin
962 * is used. Note that this is very different from how Linux
963 * branded processes are implemented within lx branded zones.
964 * In that situation, the primary linkmap of the process is the
965 * brand emulation libraries linkmap, not the Linux applications
966 * linkmap.
967 *
968 * We also need to clear the AF_SUN_NOPLM flag from the AT_SUN_AUXFLAGS
969 * aux vector. This flag told our linker that we don't have a
970 * primary link map. Now that our linker is done initializing, we
971 * want to clear this flag before we transfer control to the
972 * applications copy of the linker, since we want that linker to have
973 * a primary link map which will be the link map for the application
974 * we're running.
975 */
976 p++;
977 for (ap = (auxv_t *)p; ap->a_type != AT_NULL; ap++) {
978 switch (ap->a_type) {
979 case AT_BASE:
980 /* Hide AT_BASE if static binary */
981 if (sed.sed_base == NULL) {
982 ap->a_type = AT_IGNORE;
983 ap->a_un.a_val = NULL;
984 } else {
985 ap->a_un.a_val = sed.sed_base;
986 }
987 break;
988 case AT_ENTRY:
989 ap->a_un.a_val = sed.sed_entry;
990 break;
991 case AT_PHDR:
992 ap->a_un.a_val = sed.sed_phdr;
993 break;
994 case AT_PHENT:
995 ap->a_un.a_val = sed.sed_phent;
996 break;
997 case AT_PHNUM:
998 ap->a_un.a_val = sed.sed_phnum;
999 break;
1000 case AT_SUN_AUXFLAGS:
1001 ap->a_un.a_val &= ~AF_SUN_NOPLM;
1002 break;
1003 case AT_SUN_EMULATOR:
1004 /*
1005 * ld.so.1 inspects AT_SUN_EMULATOR to see
1006 * if it is the linker for the brand emulation
1007 * library. Hide AT_SUN_EMULATOR, as the
1008 * linker we are about to jump to is the linker
1009 * for the binary.
1010 */
1011 ap->a_type = AT_IGNORE;
1012 ap->a_un.a_val = NULL;
1013 break;
1014 case AT_SUN_LDDATA:
1015 /* Hide AT_SUN_LDDATA if static binary */
1016 if (sed.sed_lddata == NULL) {
1017 ap->a_type = AT_IGNORE;
1018 ap->a_un.a_val = NULL;
1019 } else {
1020 ap->a_un.a_val = sed.sed_lddata;
1021 }
1022 break;
1023 default:
1024 break;
1025 }
1026 }
1027
1028 s10_runexe(argv, sed.sed_ldentry);
1029 /*NOTREACHED*/
1030 s10_abort(0, "s10_runexe() returned");
1031 return (-1);
1032 }
1033
1034 /*
1035 * Per-syscall emulation table; the trailing comment on each entry is
1036 * its index. This table must have at least NSYSCALL entries in it.
1037 *
1038 * EMULATE(func, info) names the emulation routine for a slot; info is
1039 * the number of parameters OR'ed with the flags that describe the
1040 * syscall return value encoding (see the block comments at the top of
1041 * this file). NOSYS fills every slot that lists no emulation routine.
1042 */
1043 s10_sysent_table_t s10_sysent_table[] = {
1044 #if defined(__sparc) && !defined(__sparcv9)
1045 EMULATE(s10_indir, 9 | RV_64RVAL), /* 0 */
1046 #else /* !__sparc || __sparcv9 */
1047 NOSYS, /* 0 */
1048 #endif /* !__sparc || __sparcv9 */
1049 NOSYS, /* 1 */
1050 NOSYS, /* 2 */
1051 NOSYS, /* 3 */
1052 NOSYS, /* 4 */
1053 NOSYS, /* 5 */
1054 NOSYS, /* 6 */
1055 NOSYS, /* 7 */
1056 NOSYS, /* 8 */
1057 NOSYS, /* 9 */
1058 NOSYS, /* 10 */
1059 EMULATE(s10_exec, 2 | RV_DEFAULT), /* 11 */
1060 NOSYS, /* 12 */
1061 NOSYS, /* 13 */
1062 NOSYS, /* 14 */
1063 NOSYS, /* 15 */
1064 NOSYS, /* 16 */
1065 NOSYS, /* 17 */
1066 NOSYS, /* 18 */
1067 NOSYS, /* 19 */
1068 NOSYS, /* 20 */
1069 NOSYS, /* 21 */
1070 NOSYS, /* 22 */
1071 NOSYS, /* 23 */
1072 NOSYS, /* 24 */
1073 NOSYS, /* 25 */
1074 NOSYS, /* 26 */
1075 NOSYS, /* 27 */
1076 NOSYS, /* 28 */
1077 NOSYS, /* 29 */
1078 NOSYS, /* 30 */
1079 NOSYS, /* 31 */
1080 NOSYS, /* 32 */
1081 NOSYS, /* 33 */
1082 NOSYS, /* 34 */
1083 NOSYS, /* 35 */
1084 NOSYS, /* 36 */
1085 NOSYS, /* 37 */
1086 NOSYS, /* 38 */
1087 NOSYS, /* 39 */
1088 NOSYS, /* 40 */
1089 NOSYS, /* 41 */
1090 NOSYS, /* 42 */
1091 NOSYS, /* 43 */
1092 NOSYS, /* 44 */
1093 NOSYS, /* 45 */
1094 NOSYS, /* 46 */
1095 NOSYS, /* 47 */
1096 NOSYS, /* 48 */
1097 NOSYS, /* 49 */
1098 NOSYS, /* 50 */
1099 NOSYS, /* 51 */
1100 NOSYS, /* 52 */
1101 NOSYS, /* 53 */
1102 EMULATE(s10_ioctl, 3 | RV_DEFAULT), /* 54 */
1103 NOSYS, /* 55 */
1104 NOSYS, /* 56 */
1105 NOSYS, /* 57 */
1106 NOSYS, /* 58 */
1107 EMULATE(s10_execve, 3 | RV_DEFAULT), /* 59 */
1108 NOSYS, /* 60 */
1109 NOSYS, /* 61 */
1110 NOSYS, /* 62 */
1111 NOSYS, /* 63 */
1112 NOSYS, /* 64 */
1113 NOSYS, /* 65 */
1114 NOSYS, /* 66 */
1115 NOSYS, /* 67 */
1116 NOSYS, /* 68 */
1117 NOSYS, /* 69 */
1118 NOSYS, /* 70 */
1119 EMULATE(s10_acctctl, 3 | RV_DEFAULT), /* 71 */
1120 NOSYS, /* 72 */
1121 EMULATE(s10_getpagesizes, 2 | RV_DEFAULT), /* 73 */
1122 NOSYS, /* 74 */
1123 EMULATE(s10_issetugid, 0 | RV_DEFAULT), /* 75 */
1124 NOSYS, /* 76 */
1125 NOSYS, /* 77 */
1126 NOSYS, /* 78 */
1127 NOSYS, /* 79 */
1128 NOSYS, /* 80 */
1129 NOSYS, /* 81 */
1130 NOSYS, /* 82 */
1131 NOSYS, /* 83 */
1132 NOSYS, /* 84 */
1133 NOSYS, /* 85 */
1134 NOSYS, /* 86 */
1135 NOSYS, /* 87 */
1136 NOSYS, /* 88 */
1137 NOSYS, /* 89 */
1138 NOSYS, /* 90 */
1139 NOSYS, /* 91 */
1140 NOSYS, /* 92 */
1141 NOSYS, /* 93 */
1142 NOSYS, /* 94 */
1143 NOSYS, /* 95 */
1144 NOSYS, /* 96 */
1145 NOSYS, /* 97 */
1146 NOSYS, /* 98 */
1147 NOSYS, /* 99 */
1148 NOSYS, /* 100 */
1149 NOSYS, /* 101 */
1150 NOSYS, /* 102 */
1151 NOSYS, /* 103 */
1152 NOSYS, /* 104 */
1153 NOSYS, /* 105 */
1154 NOSYS, /* 106 */
1155 NOSYS, /* 107 */
1156 NOSYS, /* 108 */
1157 NOSYS, /* 109 */
1158 NOSYS, /* 110 */
1159 NOSYS, /* 111 */
1160 NOSYS, /* 112 */
1161 NOSYS, /* 113 */
1162 NOSYS, /* 114 */
1163 NOSYS, /* 115 */
1164 NOSYS, /* 116 */
1165 NOSYS, /* 117 */
1166 NOSYS, /* 118 */
1167 NOSYS, /* 119 */
1168 NOSYS, /* 120 */
1169 NOSYS, /* 121 */
1170 NOSYS, /* 122 */
1171 NOSYS, /* 123 */
1172 NOSYS, /* 124 */
1173 NOSYS, /* 125 */
1174 NOSYS, /* 126 */
1175 NOSYS, /* 127 */
1176 NOSYS, /* 128 */
1177 NOSYS, /* 129 */
1178 NOSYS, /* 130 */
1179 NOSYS, /* 131 */
1180 NOSYS, /* 132 */
1181 NOSYS, /* 133 */
1182 NOSYS, /* 134 */
1183 EMULATE(s10_uname, 1 | RV_DEFAULT), /* 135 */
1184 NOSYS, /* 136 */
1185 NOSYS, /* 137 */
1186 NOSYS, /* 138 */
1187 EMULATE(s10_sysinfo, 3 | RV_DEFAULT), /* 139 */
1188 NOSYS, /* 140 */
1189 NOSYS, /* 141 */
1190 NOSYS, /* 142 */
1191 NOSYS, /* 143 */
1192 NOSYS, /* 144 */
1193 NOSYS, /* 145 */
1194 NOSYS, /* 146 */
1195 NOSYS, /* 147 */
1196 NOSYS, /* 148 */
1197 NOSYS, /* 149 */
1198 NOSYS, /* 150 */
1199 NOSYS, /* 151 */
1200 NOSYS, /* 152 */
1201 NOSYS, /* 153 */
1202 NOSYS, /* 154 */
1203 NOSYS, /* 155 */
1204 NOSYS, /* 156 */
1205 NOSYS, /* 157 */
1206 NOSYS, /* 158 */
1207 NOSYS, /* 159 */
1208 NOSYS, /* 160 */
1209 NOSYS, /* 161 */
1210 NOSYS, /* 162 */
1211 NOSYS, /* 163 */
1212 NOSYS, /* 164 */
1213 NOSYS, /* 165 */
1214 NOSYS, /* 166 */
1215 NOSYS, /* 167 */
1216 NOSYS, /* 168 */
1217 NOSYS, /* 169 */
1218 NOSYS, /* 170 */
1219 NOSYS, /* 171 */
1220 NOSYS, /* 172 */
1221 NOSYS, /* 173 */
1222 EMULATE(s10_pwrite, 4 | RV_DEFAULT), /* 174 */
1223 NOSYS, /* 175 */
1224 NOSYS, /* 176 */
1225 NOSYS, /* 177 */
1226 NOSYS, /* 178 */
1227 NOSYS, /* 179 */
1228 NOSYS, /* 180 */
1229 NOSYS, /* 181 */
1230 NOSYS, /* 182 */
1231 NOSYS, /* 183 */
1232 NOSYS, /* 184 */
1233 NOSYS, /* 185 */
1234 NOSYS, /* 186 */
1235 NOSYS, /* 187 */
1236 NOSYS, /* 188 */
1237 NOSYS, /* 189 */
1238 EMULATE(s10_sigqueue, 4 | RV_DEFAULT), /* 190 */
1239 NOSYS, /* 191 */
1240 NOSYS, /* 192 */
1241 NOSYS, /* 193 */
1242 NOSYS, /* 194 */
1243 NOSYS, /* 195 */
1244 NOSYS, /* 196 */
1245 NOSYS, /* 197 */
1246 NOSYS, /* 198 */
1247 NOSYS, /* 199 */
1248 NOSYS, /* 200 */
1249 NOSYS, /* 201 */
1250 NOSYS, /* 202 */
1251 NOSYS, /* 203 */
1252 NOSYS, /* 204 */
1253 NOSYS, /* 205 */
1254 NOSYS, /* 206 */
1255 NOSYS, /* 207 */
1256 NOSYS, /* 208 */
1257 NOSYS, /* 209 */
1258 NOSYS, /* 210 */
1259 NOSYS, /* 211 */
1260 NOSYS, /* 212 */
1261 NOSYS, /* 213 */
1262 NOSYS, /* 214 */
1263 NOSYS, /* 215 */
1264 NOSYS, /* 216 */
1265 NOSYS, /* 217 */
1266 NOSYS, /* 218 */
1267 NOSYS, /* 219 */
1268 NOSYS, /* 220 */
1269 NOSYS, /* 221 */
1270 NOSYS, /* 222 */
1271 #ifdef _LP64
1272 NOSYS, /* 223 */
1273 #else /* !_LP64 */
1274 EMULATE(s10_pwrite64, 5 | RV_DEFAULT), /* 223 */
1275 #endif /* !_LP64 */
1276 NOSYS, /* 224 */
1277 NOSYS, /* 225 */
1278 NOSYS, /* 226 */
1279 EMULATE(s10_zone, 5 | RV_DEFAULT), /* 227 */
1280 NOSYS, /* 228 */
1281 NOSYS, /* 229 */
1282 NOSYS, /* 230 */
1283 NOSYS, /* 231 */
1284 NOSYS, /* 232 */
1285 NOSYS, /* 233 */
1286 NOSYS, /* 234 */
1287 NOSYS, /* 235 */
1288 NOSYS, /* 236 */
1289 NOSYS, /* 237 */
1290 NOSYS, /* 238 */
1291 NOSYS, /* 239 */
1292 NOSYS, /* 240 */
1293 NOSYS, /* 241 */
1294 NOSYS, /* 242 */
1295 NOSYS, /* 243 */
1296 NOSYS, /* 244 */
1297 NOSYS, /* 245 */
1298 NOSYS, /* 246 */
1299 NOSYS, /* 247 */
1300 NOSYS, /* 248 */
1301 NOSYS, /* 249 */
1302 NOSYS, /* 250 */
1303 NOSYS, /* 251 */
1304 NOSYS, /* 252 */
1305 NOSYS, /* 253 */
1306 NOSYS, /* 254 */
1307 NOSYS /* 255 */
1308 };