passt/pasta.c

// SPDX-License-Identifier: GPL-2.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * pasta.c - pasta (namespace) specific implementations
 *
 * Copyright (c) 2020-2021 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 *
 * #syscalls:pasta clone waitid exit exit_group rt_sigprocmask
 * #syscalls:pasta rt_sigreturn|sigreturn
 * #syscalls:pasta arm:sigreturn ppc64:sigreturn s390x:sigreturn
 */

#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <libgen.h>
#include <limits.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <syslog.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
#include <sys/mount.h>
#include <sys/timerfd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <fcntl.h>
#include <sys/wait.h>
#include <signal.h>
#include <dirent.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <net/ethernet.h>
#include <sys/syscall.h>
#include <linux/magic.h>

#include "util.h"
#include "passt.h"
#include "isolation.h"
#include "netlink.h"
#include "log.h"

/* PID of the child created by pasta_start_ns() in case we created a namespace,
 * zero otherwise. Read by pasta_child_handler() and pasta_wait_for_ns().
 */
int pasta_child_pid;

/**
 * pasta_child_handler() - Exit once shell exits (if we started it), reap clones
 * @signal:	Signal number, anything other than SIGCHLD is ignored
 *
 * If the child tracked by pasta_child_pid terminated, exit with its exit
 * status, or with 128 plus the signal number if it was killed by a signal.
 * Otherwise, reap up to two other terminated children, without blocking.
 *
 * The value of errno seen by the interrupted code is preserved.
 */
void pasta_child_handler(int signal)
{
	int errno_save = errno;
	siginfo_t infop;

	if (signal != SIGCHLD)
		return;

	/* With WNOHANG, waitid() also returns 0 if the child didn't change
	 * state, and might leave @infop untouched: clear si_pid so that this
	 * case can't be mistaken for a terminated child.
	 */
	infop.si_pid = 0;

	if (pasta_child_pid &&
	    !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG) &&
	    infop.si_pid == pasta_child_pid) {
		/* NOTE(review): exit() is not async-signal-safe, _exit() would
		 * be, but it skips atexit() handlers and stdio flushing:
		 * confirm nothing depends on those before switching.
		 */
		if (infop.si_code == CLD_EXITED)
			exit(infop.si_status);

		/* If killed by a signal, si_status is the number.
		 * Follow common shell convention of returning it + 128.
		 */
		exit(infop.si_status + 128);
	}

	/* Not our child, or no child tracked: reap whatever else terminated */
	waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
	waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);

	errno = errno_save;
}

/**
 * pasta_wait_for_ns() - Busy loop until we can enter the target namespace
 * @arg:	Execution context
 *
 * Keep opening /proc/<pasta_child_pid>/ns/net and trying to join it, storing
 * the descriptor in the context. On an open() error other than ENOENT, give
 * up, leaving a negative value in the context for the caller to check.
 *
 * Return: 0
 */
static int pasta_wait_for_ns(void *arg)
{
	struct ctx *c = (struct ctx *)arg;
	char path[PATH_MAX];

	snprintf(path, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);

	for (;;) {
		c->pasta_netns_fd = open(path, O_RDONLY | O_CLOEXEC);
		if (c->pasta_netns_fd < 0) {
			/* Entry not there yet: try again right away */
			if (errno == ENOENT)
				continue;

			return 0;
		}

		if (!setns(c->pasta_netns_fd, CLONE_NEWNET))
			break;

		/* Couldn't join yet: drop this descriptor and start over,
		 * unless even closing it fails
		 */
		if (close(c->pasta_netns_fd))
			break;
	}

	return 0;
}

/**
 * ns_check() - Check if we can enter configured namespaces
 * @arg:	Execution context
 *
 * On failure to join, the namespace descriptor in the context is set to -1,
 * which is what the caller checks.
 *
 * Return: 0
 */
static int ns_check(void *arg)
{
	struct ctx *c = (struct ctx *)arg;
	int rc;

	rc = setns(c->pasta_netns_fd, CLONE_NEWNET);
	if (rc)
		c->pasta_netns_fd = -1;

	return 0;
}

/**
 * pasta_open_ns() - Open network namespace descriptors
 * @c:		Execution context
 * @netns:	network namespace path
 *
 * No value is returned: failure to open or to join the namespace is reported
 * via die(). Unless @c->no_netns_quit is set, the base name and directory
 * parts of @netns are also stored in the context.
 */
void pasta_open_ns(struct ctx *c, const char *netns)
{
	char path[PATH_MAX] = { 0 };
	int fd;

	fd = open(netns, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		die("Couldn't open network namespace %s: %s",
		    netns, strerror(errno));

	c->pasta_netns_fd = fd;

	/* ns_check() resets the descriptor to -1 if we can't join */
	NS_CALL(ns_check, c);

	if (c->pasta_netns_fd < 0)
		die("Couldn't switch to pasta namespaces: %s", strerror(errno));

	if (c->no_netns_quit)
		return;

	/* basename() and dirname() get a scratch copy each, not @netns */
	strncpy(path, netns, PATH_MAX - 1);
	strncpy(c->netns_base, basename(path), PATH_MAX - 1);

	strncpy(path, netns, PATH_MAX - 1);
	strncpy(c->netns_dir, dirname(path), PATH_MAX - 1);
}

/**
 * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd()
 * @exe:	Executable to run, passed as file to execvp()
 * @argv:	Command and arguments to run, passed as they are to execvp()
 */
struct pasta_spawn_cmd_arg {
	const char *exe;
	char *const *argv;
};

/**
 * pasta_spawn_cmd() - Prepare new netns, start command or shell
 * @arg:	See @pasta_spawn_cmd_arg
 *
 * Mount /proc, set ping_group_range, wait for SIGUSR1, then execute the
 * requested command. Failures of the first two steps are only warnings.
 *
 * Return: this function never returns
 */
static int pasta_spawn_cmd(void *arg)
{
	const struct pasta_spawn_cmd_arg *cmd = arg;
	sigset_t ready;

	/* We run in a detached PID and mount namespace: mount /proc over */
	if (mount("", "/proc", "proc", 0, NULL))
		warn("Couldn't mount /proc: %s", strerror(errno));

	if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
		warn("Cannot set ping_group_range, ICMP requests might fail");

	/* Wait for the parent to be ready: see main() */
	sigemptyset(&ready);
	sigaddset(&ready, SIGUSR1);
	sigwaitinfo(&ready, NULL);

	execvp(cmd->exe, cmd->argv);

	/* Only reached if execvp() failed */
	perror("execvp");
	exit(EXIT_FAILURE);
}

/**
 * pasta_start_ns() - Fork command in new namespace if target ns is not given
 * @c:		Execution context
 * @uid:	UID we're running as in the init namespace
 * @gid:	GID we're running as in the init namespace
 * @argc:	Number of arguments for spawned command
 * @argv:	Command to spawn and arguments
 *
 * If @argc is zero, a login shell is spawned instead: $SHELL if set, /bin/sh
 * otherwise. The PID of the spawned process is stored in pasta_child_pid, and
 * the descriptor of its network namespace in the context.
 */
void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
		    int argc, char *argv[])
{
	char ns_fn_stack[NS_FN_STACK_SIZE]
	__attribute__ ((aligned(__alignof__(max_align_t))));
	const int clone_flags = CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET |
				CLONE_NEWUTS | CLONE_NEWNS  | SIGCHLD;
	struct pasta_spawn_cmd_arg arg = {
		.exe = argv[0],
		.argv = argv,
	};
	char uidmap[BUFSIZ], gidmap[BUFSIZ];
	char *sh_argv[] = { NULL, NULL };
	char sh_arg0[PATH_MAX + 1];
	sigset_t set;

	c->foreground = 1;
	if (!c->debug)
		c->quiet = 1;

	/* Configure user and group mappings */
	snprintf(uidmap, sizeof(uidmap), "0 %u 1", uid);
	snprintf(gidmap, sizeof(gidmap), "0 %u 1", gid);

	if (write_file("/proc/self/uid_map", uidmap) ||
	    write_file("/proc/self/setgroups", "deny") ||
	    write_file("/proc/self/gid_map", gidmap)) {
		warn("Couldn't configure user mappings");
	}

	if (argc == 0) {
		const char *shell = getenv("SHELL");
		int len;

		arg.exe = shell ? shell : "/bin/sh";

		/* Leading dash in argv[0] as name for the spawned shell */
		len = snprintf(sh_arg0, sizeof(sh_arg0), "-%s", arg.exe);
		if ((size_t)len >= sizeof(sh_arg0))
			die("$SHELL is too long (%zu bytes)", strlen(arg.exe));

		sh_argv[0] = sh_arg0;
		arg.argv = sh_argv;
	}

	/* Block SIGUSR1 in child, we queue it in main() when we're ready */
	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);

	pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack,
				   sizeof(ns_fn_stack), clone_flags,
				   (void *)&arg);
	if (pasta_child_pid == -1) {
		perror("clone");
		exit(EXIT_FAILURE);
	}

	/* pasta_wait_for_ns() leaves a negative descriptor on failure */
	NS_CALL(pasta_wait_for_ns, c);
	if (c->pasta_netns_fd < 0)
		die("Failed to join network namespace: %s", strerror(errno));
}

/**
 * pasta_ns_conf() - Set up loopback and tap interfaces in namespace as needed
 * @c:		Execution context
 *
 * Bring up the loopback interface, then fetch the MAC address of the tap
 * interface if none was configured, or set the configured one. If
 * @c->pasta_conf_ns is set, also bring up the tap interface, and set or copy
 * addresses and routes for each IP version with a configured interface index.
 * Failures other than bringing up the tap interface and fetching its MAC
 * address are reported via die().
 */
void pasta_ns_conf(struct ctx *c)
{
	int rc;

	rc = nl_link_up(nl_sock_ns, 1 /* lo */, 0);
	if (rc < 0)
		die("Couldn't bring up loopback interface in namespace: %s",
		    strerror(-rc));

	/* Get or set MAC in target namespace */
	if (MAC_IS_ZERO(c->mac_guest))
		nl_link_get_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
	else
		rc = nl_link_set_mac(nl_sock_ns, c->pasta_ifi, c->mac_guest);
	if (rc < 0)
		die("Couldn't set MAC address in namespace: %s",
		    strerror(-rc));

	if (c->pasta_conf_ns) {
		nl_link_up(nl_sock_ns, c->pasta_ifi, c->mtu);

		if (c->ifi4) {
			/* Set the configured address, or copy from host */
			rc = c->no_copy_addrs ?
			     nl_addr_set(nl_sock_ns, c->pasta_ifi, AF_INET,
					 &c->ip4.addr, c->ip4.prefix_len) :
			     nl_addr_dup(nl_sock, c->ifi4,
					 nl_sock_ns, c->pasta_ifi, AF_INET);
			if (rc < 0)
				die("Couldn't set IPv4 address(es) in namespace: %s",
				    strerror(-rc));

			/* Set a default route, or copy routes from host */
			rc = c->no_copy_routes ?
			     nl_route_set_def(nl_sock_ns, c->pasta_ifi,
					      AF_INET, &c->ip4.gw) :
			     nl_route_dup(nl_sock, c->ifi4,
					  nl_sock_ns, c->pasta_ifi, AF_INET);
			if (rc < 0)
				die("Couldn't set IPv4 route(s) in guest: %s",
				    strerror(-rc));
		}

		if (c->ifi6) {
			/* Same as above, with a fixed prefix length of 64 */
			rc = c->no_copy_addrs ?
			     nl_addr_set(nl_sock_ns, c->pasta_ifi, AF_INET6,
					 &c->ip6.addr, 64) :
			     nl_addr_dup(nl_sock, c->ifi6,
					 nl_sock_ns, c->pasta_ifi, AF_INET6);
			if (rc < 0)
				die("Couldn't set IPv6 address(es) in namespace: %s",
				    strerror(-rc));

			rc = c->no_copy_routes ?
			     nl_route_set_def(nl_sock_ns, c->pasta_ifi,
					      AF_INET6, &c->ip6.gw) :
			     nl_route_dup(nl_sock, c->ifi6,
					  nl_sock_ns, c->pasta_ifi, AF_INET6);
			if (rc < 0)
				die("Couldn't set IPv6 route(s) in guest: %s",
				    strerror(-rc));
		}
	}

	proto_update_l2_buf(c->mac_guest, NULL);
}

/**
 * pasta_netns_quit_timer() - Set up fallback timer to monitor namespace
 *
 * The timer first expires after one second, then every second.
 *
 * Return: timerfd file descriptor, negative error code on failure
 */
static int pasta_netns_quit_timer(void)
{
	struct itimerspec it = { { 1, 0 }, { 1, 0 } }; /* one-second interval */
	int fd, rc;

	fd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
	if (fd == -1) {
		/* Save errno right away: logging might overwrite it, and a
		 * return value of zero would look like a valid descriptor
		 */
		rc = -errno;
		err("timerfd_create(): %s", strerror(-rc));
		return rc;
	}

	if (timerfd_settime(fd, 0, &it, NULL) < 0) {
		/* Same as above, close() might overwrite errno as well */
		rc = -errno;
		err("timerfd_settime(): %s", strerror(-rc));
		close(fd);
		return rc;
	}

	return fd;
}

/**
 * pasta_netns_quit_init() - Watch network namespace to quit once it's gone
 * @c:		Execution context
 *
 * Nothing is done unless we're in pasta mode, with a namespace base name set
 * and without @c->no_netns_quit. Watch the namespace directory for deletions
 * using inotify if possible, fall back to a periodic timer otherwise, and
 * register the resulting descriptor with epoll.
 */
void pasta_netns_quit_init(const struct ctx *c)
{
	union epoll_ref ref = { .type = EPOLL_TYPE_NSQUIT_INOTIFY };
	struct epoll_event ev = { .events = EPOLLIN };
	int flags = O_NONBLOCK | O_CLOEXEC;
	struct statfs s = { 0 };
	bool try_inotify = true;
	int fd = -1, dir_fd;

	if (c->mode != MODE_PASTA || c->no_netns_quit || !*c->netns_base)
		return;

	if ((dir_fd = open(c->netns_dir, O_CLOEXEC | O_RDONLY)) < 0)
		die("netns dir open: %s, exiting", strerror(errno));

	/* Skip inotify if the directory is on devpts, procfs or sysfs, or if
	 * we can't find out the filesystem type at all
	 */
	if (fstatfs(dir_fd, &s)          || s.f_type == DEVPTS_SUPER_MAGIC ||
	    s.f_type == PROC_SUPER_MAGIC || s.f_type == SYSFS_MAGIC)
		try_inotify = false;

	if (try_inotify && (fd = inotify_init1(flags)) < 0)
		warn("inotify_init1(): %s, use a timer", strerror(errno));

	if (fd >= 0 && inotify_add_watch(fd, c->netns_dir, IN_DELETE) < 0) {
		warn("inotify_add_watch(): %s, use a timer",
		     strerror(errno));
		close(fd);
		fd = -1;
	}

	if (fd < 0) {
		if ((fd = pasta_netns_quit_timer()) < 0)
			die("Failed to set up fallback netns timer, exiting");

		/* The timer handler needs the directory to look up the entry */
		ref.nsdir_fd = dir_fd;

		ref.type = EPOLL_TYPE_NSQUIT_TIMER;
	} else {
		/* The inotify watch doesn't need the directory descriptor */
		close(dir_fd);
		dir_fd = -1;
	}

	if (fd > FD_REF_MAX)
		die("netns monitor file number %i too big, exiting", fd);

	ref.fd = fd;
	ev.data.u64 = ref.u64;
	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
		/* Keep going as before, but don't fail silently, and don't
		 * leave behind descriptors nobody is going to use
		 */
		warn("Couldn't monitor network namespace %s: %s",
		     c->netns_base, strerror(errno));
		close(fd);
		if (dir_fd >= 0)
			close(dir_fd);
	}
}

/**
 * pasta_netns_quit_inotify_handler() - Handle inotify watch, exit if ns is gone
 * @c:		Execution context
 * @inotify_fd:	inotify file descriptor with watch on namespace directory
 *
 * Read pending events, and exit with success status if any of them carries
 * the base name of the namespace. A single read() might return more than one
 * event, so all the events that were read are checked.
 */
void pasta_netns_quit_inotify_handler(struct ctx *c, int inotify_fd)
{
	char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
	__attribute__ ((aligned(__alignof__(struct inotify_event))));
	const struct inotify_event *in_ev;
	size_t avail, off = 0;
	ssize_t n;

	n = read(inotify_fd, buf, sizeof(buf));
	if (n < (ssize_t)sizeof(*in_ev))
		return;

	avail = (size_t)n;

	while (avail - off >= sizeof(*in_ev)) {
		in_ev = (const struct inotify_event *)(buf + off);
		off += sizeof(*in_ev);

		/* Name doesn't fit in what we read: don't look any further */
		if (in_ev->len > avail - off)
			break;

		off += in_ev->len;

		/* No name attached to this event, nothing to compare */
		if (!in_ev->len)
			continue;

		if (strncmp(in_ev->name, c->netns_base, sizeof(c->netns_base)))
			continue;

		info("Namespace %s is gone, exiting", c->netns_base);
		exit(EXIT_SUCCESS);
	}
}

/**
 * pasta_netns_quit_timer_handler() - Handle timer, exit if ns is gone
 * @c:		Execution context
 * @ref:	epoll reference for timer descriptor
 *
 * Consume the timer expirations, then try to open the namespace entry in the
 * directory referenced by @ref: exit with success status if that fails for
 * any reason other than EACCES.
 */
void pasta_netns_quit_timer_handler(struct ctx *c, union epoll_ref ref)
{
	uint64_t expirations;
	ssize_t len;
	int ns_fd;

	len = read(ref.fd, &expirations, sizeof(expirations));
	if (len < 0)
		die("Namespace watch timer read() error: %s", strerror(errno));
	if ((size_t)len < sizeof(expirations))
		warn("Namespace watch timer: short read(): %zi", len);

	ns_fd = openat(ref.nsdir_fd, c->netns_base, O_PATH | O_CLOEXEC);
	if (ns_fd >= 0) {
		/* Still there, we only wanted to know if it exists */
		close(ns_fd);
		return;
	}

	if (errno == EACCES)	/* Expected for existing procfs entry */
		return;

	info("Namespace %s is gone, exiting", c->netns_base);
	exit(EXIT_SUCCESS);
}