--- /dev/null
+/* userns_child_exec.c
+
+ Copyright 2013, Michael Kerrisk
+ Licensed under GNU General Public License v2 or later
+
+ Create a child process that executes a shell command in new
+ namespace(s); allow UID and GID mappings to be specified when
+ creating a user namespace.
+*/
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <sched.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+
+/* errExit(msg): a simple error-handling macro. It prints 'msg' followed
+   by the error text for the current value of 'errno' (via perror()) and
+   then terminates the calling process with status EXIT_FAILURE. The body
+   is wrapped in do { ... } while (0) so that the macro can be used like
+   a single statement. */
+
+#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \
+ } while (0)
+
+/* Arguments handed from main() to childFunc() through clone() */
+
+struct child_args {
+ char **argv; /* Command to be executed by child, with arguments;
+                 main() points this at the first non-option argument
+                 and the child passes it to execvp() */
+ int pipe_fd[2]; /* Pipe used to synchronize parent and child: the child
+                    blocks reading pipe_fd[0] until the parent closes
+                    pipe_fd[1] */
+};
+
+/* Flags set by main() while parsing the command line:
+   'verbose' (-v) makes the parent print progress messages;
+   'setid' (-s) makes the child call setgid(0) and setuid(0) before
+   executing the command. */
+
+static int verbose, setid;
+
+/* Print the usage message for this program on stderr, using 'pname' as
+   the program name, and terminate with status EXIT_FAILURE. This
+   function does not return. */
+
+static void
+usage(char *pname)
+{
+    /* One entry per help line; each is printed with a leading space */
+    static const char *const help_lines[] = {
+        "-i New IPC namespace\n",
+        "-m New mount namespace\n",
+        "-n New network namespace\n",
+        "-p New PID namespace\n",
+        "-u New UTS namespace\n",
+        "-U New user namespace\n",
+        "-M uid_map Specify UID map for user namespace\n",
+        "-G gid_map Specify GID map for user namespace\n",
+        " If -M or -G is specified, -U is required\n",
+        "-s Set uid/gid to 0 in the new user namespace\n",
+        "-v Display verbose messages\n",
+        "\n",
+        "Map strings for -M and -G consist of records of the form:\n",
+        "\n",
+        " ID-inside-ns ID-outside-ns len\n",
+        "\n",
+        "A map string can contain multiple records, separated by commas;\n",
+        "the commas are replaced by newlines before writing to map files.\n",
+    };
+    const size_t line_count = sizeof(help_lines) / sizeof(help_lines[0]);
+    size_t k;
+
+    fprintf(stderr, "Usage: %s [options] cmd [arg...]\n\n", pname);
+    fputs("Create a child process that executes a shell command "
+          "in a new user namespace,\n"
+          "and possibly also other new namespace(s).\n\n", stderr);
+    fputs("Options can be:\n\n", stderr);
+
+    for (k = 0; k < line_count; k++)
+        fprintf(stderr, " %s", help_lines[k]);
+
+    exit(EXIT_FAILURE);
+}
+
+/* Update the mapping file 'map_file', with the value provided in
+   'mapping', a string that defines a UID or GID mapping. A UID or
+   GID mapping consists of one or more newline-delimited records
+   of the form:
+
+       ID-inside-ns ID-outside-ns length
+
+   Requiring the user to supply a string that contains newlines is
+   of course inconvenient for command-line use. Thus, we permit the
+   use of commas to delimit records in this string, and replace them
+   with newlines before writing the string to the file.
+
+   Note that 'mapping' is modified in place. The whole string is written
+   with a single write() call. If the file cannot be opened, or the
+   write fails or is incomplete, a message is printed on stderr and the
+   process terminates with status EXIT_FAILURE. */
+
+static void
+update_map(char *mapping, char *map_file)
+{
+    int fd;
+    size_t j;
+    size_t map_len;     /* Length of 'mapping' */
+    ssize_t written;    /* Result of write() */
+
+    /* Replace commas in mapping string with newlines */
+
+    map_len = strlen(mapping);
+    for (j = 0; j < map_len; j++)
+        if (mapping[j] == ',')
+            mapping[j] = '\n';
+
+    fd = open(map_file, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "open %s: %s\n", map_file, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    written = write(fd, mapping, map_len);
+    if (written == -1) {
+        fprintf(stderr, "write %s: %s\n", map_file, strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    /* A short write does not set 'errno', so report it separately
+       rather than printing an unrelated error string */
+
+    if ((size_t) written != map_len) {
+        fprintf(stderr, "write %s: short write (%ld of %lu bytes)\n",
+                map_file, (long) written, (unsigned long) map_len);
+        exit(EXIT_FAILURE);
+    }
+
+    close(fd);
+}
+
+/* Start function for the cloned child.
+
+   'arg' points to a 'struct child_args' holding the command to execute
+   and the synchronization pipe. The child waits for end-of-file on the
+   pipe, optionally switches its GID and UID to 0 (when the global
+   'setid' flag is set), and then executes the command with execvp().
+
+   This function returns only by terminating the process: either
+   execvp() replaces the process image, or the process exits with
+   status EXIT_FAILURE. */
+
+static int
+childFunc(void *arg)
+{
+    struct child_args *args = (struct child_args *) arg;
+    char ch;
+
+    /* Wait until the parent has updated the UID and GID mappings. See
+       the comment in main(). We wait for end of file on a pipe that will
+       be closed by the parent process once it has updated the mappings. */
+
+    close(args->pipe_fd[1]);    /* Close our descriptor for the write end
+                                   of the pipe so that we see EOF when
+                                   parent closes its descriptor */
+    if (read(args->pipe_fd[0], &ch, 1) != 0) {
+        fprintf(stderr, "Failure in child: read from pipe returned != 0\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* The pipe has served its purpose; close the read end so that the
+       descriptor is not inherited by the command we are about to exec */
+
+    close(args->pipe_fd[0]);
+
+    /* A failure to change IDs is reported but is not fatal: we still
+       go on to execute the command */
+
+    if (setid) {
+        if (setgid(0) < 0)
+            fprintf(stderr, "Failure in child to setgid 0: %s\n", strerror(errno));
+        if (setuid(0) < 0)
+            fprintf(stderr, "Failure in child to setuid 0: %s\n", strerror(errno));
+    }
+
+    /* Execute a shell command; execvp() returns only on failure */
+
+    execvp(args->argv[0], args->argv);
+    errExit("execvp");
+}
+
+/* Size in bytes (1024 * 1024) of the stack area given to the child */
+#define STACK_SIZE (1024 * 1024)
+
+static char child_stack[STACK_SIZE]; /* Space for child's stack; main()
+                                        passes the address just past the
+                                        end of this array to clone() */
+
+/* Program entry point.
+
+   Parses the command-line options, creates a child with clone() in the
+   requested new namespace(s), writes the UID and GID maps (if any were
+   given) to the child's /proc/PID/uid_map and /proc/PID/gid_map files,
+   releases the child by closing the write end of the synchronization
+   pipe, and then waits for the child to terminate.
+
+   Exits with status EXIT_SUCCESS once the child has been waited for
+   (the child's own exit status is not examined). Invalid usage or a
+   failed system call terminates the program with EXIT_FAILURE. */
+
+int
+main(int argc, char *argv[])
+{
+    int flags, opt;
+    pid_t child_pid;
+    struct child_args args;
+    char *uid_map, *gid_map;
+    char map_path[PATH_MAX];
+
+    /* Parse command-line options. The initial '+' character in
+       the final getopt() argument prevents GNU-style permutation
+       of command-line options. That's useful, since sometimes
+       the 'command' to be executed by this program itself
+       has command-line options. We don't want getopt() to treat
+       those as options to this program. */
+
+    flags = 0;
+    verbose = 0;
+    setid = 0;
+    gid_map = NULL;
+    uid_map = NULL;
+    while ((opt = getopt(argc, argv, "+imnpuUM:G:vs")) != -1) {
+        switch (opt) {
+        case 'i': flags |= CLONE_NEWIPC;        break;
+        case 'm': flags |= CLONE_NEWNS;         break;
+        case 'n': flags |= CLONE_NEWNET;        break;
+        case 'p': flags |= CLONE_NEWPID;        break;
+        case 'u': flags |= CLONE_NEWUTS;        break;
+        case 'v': verbose = 1;                  break;
+        case 'M': uid_map = optarg;             break;
+        case 'G': gid_map = optarg;             break;
+        case 'U': flags |= CLONE_NEWUSER;       break;
+        case 's': setid = 1;                    break;
+        default:  usage(argv[0]);
+        }
+    }
+
+    /* -M or -G without -U is nonsensical */
+
+    if ((uid_map != NULL || gid_map != NULL) &&
+            !(flags & CLONE_NEWUSER))
+        usage(argv[0]);
+
+    /* A command to execute is mandatory. Without this check, the child
+       would call execvp() with a NULL file name. */
+
+    if (optind >= argc)
+        usage(argv[0]);
+
+    args.argv = &argv[optind];
+
+    /* We use a pipe to synchronize the parent and child, in order to
+       ensure that the parent sets the UID and GID maps before the child
+       calls execve(). This ensures that the child maintains its
+       capabilities during the execve() in the common case where we
+       want to map the child's effective user ID to 0 in the new user
+       namespace. Without this synchronization, the child would lose
+       its capabilities if it performed an execve() with nonzero
+       user IDs (see the capabilities(7) man page for details of the
+       transformation of a process's capabilities during execve()). */
+
+    if (pipe(args.pipe_fd) == -1)
+        errExit("pipe");
+
+    /* Create the child in new namespace(s). The stack argument is the
+       address just past the end of 'child_stack'. */
+
+    child_pid = clone(childFunc, child_stack + STACK_SIZE,
+                      flags | SIGCHLD, &args);
+    if (child_pid == -1)
+        errExit("clone");
+
+    /* Parent falls through to here */
+
+    if (verbose)
+        printf("%s: PID of child created by clone() is %ld\n",
+               argv[0], (long) child_pid);
+
+    /* Update the UID and GID maps in the child */
+
+    if (uid_map != NULL) {
+        snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map",
+                 (long) child_pid);
+        update_map(uid_map, map_path);
+    }
+    if (gid_map != NULL) {
+        snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map",
+                 (long) child_pid);
+        update_map(gid_map, map_path);
+    }
+
+    /* Close the write end of the pipe, to signal to the child that we
+       have updated the UID and GID maps */
+
+    close(args.pipe_fd[1]);
+
+    if (waitpid(child_pid, NULL, 0) == -1)      /* Wait for child */
+        errExit("waitpid");
+
+    if (verbose)
+        printf("%s: terminating\n", argv[0]);
+
+    exit(EXIT_SUCCESS);
+}