1
0
mirror of https://passt.top/passt synced 2024-06-23 11:37:04 +00:00
passt/pasta.c
Stefano Brivio 33b1bdd079 seccomp: Add a number of alternate and per-arch syscalls
Depending on the C library, but not necessarily in all the
functions we use, statx() might be used instead of stat(),
getdents() instead of getdents64(), readlinkat() instead of
readlink(), openat() instead of open().

On aarch64, it's clone() and not fork(), and dup3() instead of
dup2() -- just allow the existing alternative instead of dealing
with per-arch selections.

Since glibc commit 9a7565403758 ("posix: Consolidate fork
implementation"), we need to allow set_robust_list() for
fork()/clone(), even in a single-threaded context.

On some architectures, epoll_pwait() is provided instead of
epoll_wait(), but never both. Same with newfstat() and
fstat(), sigreturn() and rt_sigreturn(), getdents64() and
getdents(), readlink() and readlinkat(), unlink() and
unlinkat(), whereas pipe() might not be available, but
pipe2() always is, exclusively or not.

Seen on Fedora 34: newfstatat() is used on top of fstat().

syslog() is an actual system call on some glibc/arch combinations,
instead of a connect()/send() implementation.

On ppc64 and ppc64le, _llseek(), recv(), send() and getuid()
are used. For ppc64 only: ugetrlimit() for the getrlimit()
implementation, plus sigreturn() and fcntl64().

On s390x, additionally, we need to allow socketcall() (on top
of socket()), and sigreturn() also for passt (not just for
pasta).

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2022-01-26 16:30:59 +01:00

269 lines
5.9 KiB
C

// SPDX-License-Identifier: AGPL-3.0-or-later
/* PASST - Plug A Simple Socket Transport
* for qemu/UNIX domain socket mode
*
* PASTA - Pack A Subtle Tap Abstraction
* for network namespace/tap device mode
*
* pasta.c - pasta (namespace) specific implementations
*
* Copyright (c) 2020-2021 Red Hat GmbH
* Author: Stefano Brivio <sbrivio@redhat.com>
*
* #syscalls:pasta clone unshare waitid kill execve exit_group rt_sigprocmask
* #syscalls:pasta geteuid getdents64|getdents readlink|readlinkat setsid
* #syscalls:pasta nanosleep clock_nanosleep
*/
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <syslog.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/wait.h>
#include <signal.h>
#include <dirent.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <net/ethernet.h>
#include <sys/syscall.h>
#include "util.h"
#include "passt.h"
#include "netlink.h"
/* PID of child, in case we created a namespace, and its procfs link */

/* Set from fork() in pasta_start_ns(), stays zero until then: checked by
 * pasta_child_handler() before waiting on this specific child
 */
static int pasta_child_pid;
/* Target of the /proc/<pid>/ns/net link of the child, read in pasta_start_ns():
 * pasta_ns_cleanup() compares it against the links of other processes, and
 * does nothing if it's empty (that is, if the link couldn't be read)
 */
static char pasta_child_ns[PATH_MAX];
/**
 * pasta_ns_cleanup() - Look for processes in namespace, terminate them
 *
 * Scan the numeric entries of /proc, and signal every process whose ns/net
 * link matches the one stored in pasta_child_ns. The first pass sends SIGQUIT,
 * then, after sleeping for one second, a second pass sends SIGKILL. A third
 * pass (sending SIGKILL again) only serves to report whether anything is left.
 *
 * Nothing is done if pasta_child_ns is empty, or if /proc can't be opened.
 */
static void pasta_ns_cleanup(void)
{
	char proc_path[PATH_MAX], ns_link[PATH_MAX], buf[BUFSIZ];
	int recheck = 0, found = 0, waited = 0;
	int dir_fd, n;

	if (!*pasta_child_ns)
		return;

loop:
	if ((dir_fd = open("/proc", O_RDONLY | O_DIRECTORY)) < 0)
		return;

	/* NOTE(review): records are read with the raw getdents64 system call
	 * but accessed as struct dirent, this assumes the two layouts match --
	 * confirm for 32-bit builds without large file support
	 */
	while ((n = syscall(SYS_getdents64, dir_fd, buf, BUFSIZ)) > 0) {
		struct dirent *dp = (struct dirent *)buf;
		int pos = 0;

		/* Check the offset first: once pos reaches n, dp points past
		 * the data returned by the kernel and must not be dereferenced
		 */
		while (pos < n && dp->d_reclen) {
			ssize_t len;
			pid_t pid;

			/* Entries that aren't numbers aren't processes */
			errno = 0;
			pid = strtol(dp->d_name, NULL, 0);
			if (!pid || errno)
				goto next;

			snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net", pid);

			/* readlink() doesn't terminate the string: leave room
			 * for the terminator and add it before comparing
			 */
			len = readlink(proc_path, ns_link, PATH_MAX - 1);
			if (len < 0)
				goto next;
			ns_link[len] = '\0';

			if (!strncmp(ns_link, pasta_child_ns, PATH_MAX)) {
				found = 1;
				if (waited)
					kill(pid, SIGKILL);
				else
					kill(pid, SIGQUIT);
			}
next:
			dp = (struct dirent *)(buf + (pos += dp->d_reclen));
		}
	}

	close(dir_fd);

	if (!found)
		return;

	if (waited) {
		if (recheck) {
			info("Some processes in namespace didn't quit");
		} else {
			/* SIGKILL was sent: scan once more to see what's left */
			found = 0;
			recheck = 1;
			goto loop;
		}
		return;
	}

	info("Waiting for all processes in namespace to terminate");
	sleep(1);
	waited = 1;
	goto loop;
}
/**
 * pasta_child_handler() - Exit once shell exits (if we started it), reap clones
 * @signal:	Unused, handler deals with SIGCHLD only
 *
 * If a child was started (pasta_child_pid is set) and it has exited, clean up
 * processes left in its namespace and terminate with EXIT_SUCCESS. Otherwise,
 * reap up to two other terminated children without blocking.
 */
void pasta_child_handler(int signal)
{
	/* waitid() sets errno if there's nothing to wait for: don't let that
	 * leak into whatever code this handler interrupted
	 */
	int errno_save = errno;
	siginfo_t infop;

	(void)signal;

	/* With WNOHANG, waitid() returns 0 also if the child hasn't changed
	 * state: clear si_pid so that the check below is reliable
	 */
	infop.si_pid = 0;

	if (pasta_child_pid &&
	    !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) {
		if (infop.si_pid == pasta_child_pid) {
			pasta_ns_cleanup();
			exit(EXIT_SUCCESS);
		}
	}

	/* Each call reaps at most one terminated child */
	waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);
	waitid(P_ALL, 0, NULL, WEXITED | WNOHANG);

	errno = errno_save;
}
/**
 * pasta_wait_for_ns() - Busy loop until we can enter the target namespace
 * @arg:	Execution context
 *
 * Unless only the network namespace is requested (netns_only), first open and
 * join the user namespace of the child, then do the same for its network
 * namespace. The descriptors are stored in the context. Each step spins on
 * open() until it succeeds, and is repeated from the start if setns() fails
 * and the descriptor could be closed.
 *
 * Return: 0
 */
static int pasta_wait_for_ns(void *arg)
{
	struct ctx *c = (struct ctx *)arg;
	char path[PATH_MAX];

	if (!c->netns_only) {
		snprintf(path, PATH_MAX, "/proc/%i/ns/user", pasta_child_pid);

		for (;;) {
			/* Spin until the procfs entry can be opened */
			while ((c->pasta_userns_fd = open(path, O_RDONLY)) < 0)
				;

			if (!setns(c->pasta_userns_fd, CLONE_NEWUSER))
				break;

			/* Couldn't join: retry, unless close() fails too */
			if (close(c->pasta_userns_fd))
				break;
		}
	}

	snprintf(path, PATH_MAX, "/proc/%i/ns/net", pasta_child_pid);

	for (;;) {
		/* Spin until the procfs entry can be opened */
		while ((c->pasta_netns_fd = open(path, O_RDONLY)) < 0)
			;

		if (!setns(c->pasta_netns_fd, CLONE_NEWNET))
			break;

		/* Couldn't join: retry, unless close() fails too */
		if (close(c->pasta_netns_fd))
			break;
	}

	return 0;
}
/**
 * pasta_write_file() - Write a buffer to a file, warn if that's not possible
 * @path:	Path of file to open, write-only
 * @buf:	Data to write
 * @len:	Number of bytes to write from @buf
 * @msg:	Warning to log if the file can't be opened or written to
 */
static void pasta_write_file(const char *path, const char *buf, size_t len,
			     const char *msg)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		warn("%s", msg);
		return;
	}

	if (write(fd, buf, len) < 0)
		warn("%s", msg);

	close(fd);
}

/**
 * pasta_start_ns() - Fork shell in new namespace if target ns is not given
 * @c:		Execution context
 *
 * The parent forces foreground mode (and quiet mode, unless debugging), waits
 * until it can join the namespaces of the child, and stores the target of the
 * child's ns/net link for later cleanup. It's the only one returning from this
 * function.
 *
 * The child detaches a new network namespace (and user namespace, unless
 * netns_only is set), maps UID and GID 0 inside the user namespace, enables
 * unprivileged ICMP sockets for group 0, then executes $SHELL (or /bin/sh), as
 * a login shell if it's bash. It exits with EXIT_FAILURE if unshare() or
 * execve() fail.
 */
void pasta_start_ns(struct ctx *c)
{
	int euid = geteuid();
	char *shell;

	c->foreground = 1;
	if (!c->debug)
		c->quiet = 1;

	if ((pasta_child_pid = fork()) == -1) {
		perror("fork");
		exit(EXIT_FAILURE);
	}

	if (pasta_child_pid) {
		char proc_path[PATH_MAX];

		NS_CALL(pasta_wait_for_ns, c);

		snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net",
			 pasta_child_pid);

		/* readlink() doesn't terminate the string: the buffer is
		 * static, hence zeroed, so leaving out the last byte is enough
		 */
		if (readlink(proc_path, pasta_child_ns, PATH_MAX - 1) < 0)
			warn("Cannot read link to ns, won't clean up on exit");

		return;
	}

	if (unshare(CLONE_NEWNET | (c->netns_only ? 0 : CLONE_NEWUSER))) {
		perror("unshare");
		exit(EXIT_FAILURE);
	}

	if (!c->netns_only) {
		char buf[BUFSIZ];

		/* NOTE(review): the same "0 <euid> 1" mapping is also written
		 * to gid_map below, that is, the effective UID is used as
		 * outer GID. This presumably only works if the effective GID
		 * has the same value -- confirm. Using getegid() instead would
		 * need that system call to be added to the #syscalls list in
		 * the header of this file.
		 */
		snprintf(buf, BUFSIZ, "%i %i %i", 0, euid, 1);

		pasta_write_file("/proc/self/uid_map", buf, strlen(buf),
				 "Cannot set uid_map in namespace");

		/* setgroups is written before gid_map, keep this order. The
		 * length passed here includes the string terminator.
		 */
		pasta_write_file("/proc/self/setgroups", "deny", sizeof("deny"),
				 "Cannot write to setgroups in namespace");

		pasta_write_file("/proc/self/gid_map", buf, strlen(buf),
				 "Cannot set gid_map in namespace");
	}

	pasta_write_file("/proc/sys/net/ipv4/ping_group_range",
			 "0 0", strlen("0 0"),
			 "Cannot set ping_group_range, ICMP requests might fail");

	shell = getenv("SHELL");
	if (!shell)
		shell = "/bin/sh";

	if (strstr(shell, "/bash"))
		execve(shell, ((char *[]) { shell, "-l", NULL }), environ);
	else
		execve(shell, ((char *[]) { shell, NULL }), environ);

	/* Only reached if execve() failed */
	perror("execve");
	exit(EXIT_FAILURE);
}
/**
 * pasta_ns_conf() - Set up loopback and tap interfaces in namespace as needed
 * @c:		Execution context
 *
 * The loopback link is always set up. If namespace configuration is requested
 * (pasta_conf_ns), the tap link is set up with the configured MTU, and
 * addresses and routes are added for each enabled IP version, otherwise only
 * the MAC address is passed for the tap link. Finally, the layer-2 buffers are
 * updated with the guest MAC address.
 */
void pasta_ns_conf(struct ctx *c)
{
	const int configure = !!c->pasta_conf_ns;
	int prefix_len;

	nl_link(1, 1 /* lo */, MAC_ZERO, 1, 0);

	/* Without configuration, the last two arguments are both zero */
	nl_link(1, c->pasta_ifi, c->mac_guest, configure,
		configure ? c->mtu : 0);

	if (configure && c->v4) {
		/* Prefix length is the number of bits set in the mask */
		prefix_len = __builtin_popcount(c->mask4);
		nl_addr(1, c->pasta_ifi, AF_INET, &c->addr4,
			&prefix_len, NULL);
		nl_route(1, c->pasta_ifi, AF_INET, &c->gw4);
	}

	if (configure && c->v6) {
		/* Fixed prefix length for IPv6 */
		prefix_len = 64;
		nl_addr(1, c->pasta_ifi, AF_INET6, &c->addr6,
			&prefix_len, NULL);
		nl_route(1, c->pasta_ifi, AF_INET6, &c->gw6);
	}

	proto_update_l2_buf(c->mac_guest, NULL, NULL);
}