mirror of
https://passt.top/passt
synced 2025-01-12 07:32:58 +00:00
09603cab28
If a parent accidentally or due to implementation reasons leaks any open files, we don't want to have access to them, except for the file passed via --fd, if any. This is the case for Podman when Podman's parent leaks files into Podman: it's not practical for Podman to close unrelated files before starting pasta, as reported by Paul. Use close_range(2) to close all open files except for standard streams and the one from --fd. Given that parts of conf() depend on other files to be already opened, such as the epoll file descriptor, we can't easily defer this to a more convenient point, where --fd was already parsed. Introduce a minimal, duplicate version of --fd parsing to keep this simple. As we need to check that the passed --fd option doesn't exceed INT_MAX, because we'll parse it with strtol() but file descriptor indices are signed ints (regardless of the arguments close_range() takes), extend the existing check in the actual --fd parsing in conf(), also rejecting file descriptor numbers that match standard streams, while at it. Suggested-by: Paul Holzinger <pholzing@redhat.com> Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Paul Holzinger <pholzing@redhat.com>
394 lines
11 KiB
C
394 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
/* PASST - Plug A Simple Socket Transport
|
|
* for qemu/UNIX domain socket mode
|
|
*
|
|
* PASTA - Pack A Subtle Tap Abstraction
|
|
* for network namespace/tap device mode
|
|
*
|
|
* isolation.c - Self isolation helpers
|
|
*
|
|
* Copyright Red Hat
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
|
*/
|
|
/**
|
|
* DOC: Theory of Operation
|
|
*
|
|
* For security the passt/pasta process performs a number of
|
|
* self-isolation steps, dropping capabilities, setting namespaces
|
|
* and otherwise minimising the impact we can have on the system at
|
|
* large if we were compromised.
|
|
*
|
|
* Obviously we can't isolate ourselves from resources before we've
|
|
* done anything we need to do with those resources, so we have
|
|
* multiple stages of self-isolation. In order these are:
|
|
*
|
|
* 1. isolate_initial()
|
|
* ====================
|
|
*
|
|
* Executed immediately after startup, drops capabilities we don't
|
|
* need at any point during execution (or which we gain back when we
|
|
* need by joining other namespaces), and closes any leaked file we
|
|
* might have inherited from the parent process.
|
|
*
|
|
* 2. isolate_user()
|
|
* =================
|
|
*
|
|
* Executed once we know what user and user namespace we want to
|
|
* operate in. Sets our final UID & GID, and enters the correct user
|
|
* namespace.
|
|
*
|
|
* 3. isolate_prefork()
|
|
* ====================
|
|
*
|
|
* Executed after all setup, but before daemonising (fork()ing into
|
|
* the background). Uses mount namespace and pivot_root() to remove
|
|
* our access to the filesystem.
|
|
*
|
|
* 4. isolate_postfork()
|
|
* =====================
|
|
*
|
|
* Executed immediately after daemonizing, but before entering the
|
|
* actual packet forwarding phase of operation. Or, if not
|
|
* daemonizing, immediately after isolate_prefork(). Uses seccomp()
|
|
* to restrict ourselves to the handful of syscalls we need during
|
|
* runtime operation.
|
|
*/
|
|
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <grp.h>
|
|
#include <inttypes.h>
|
|
#include <limits.h>
|
|
#include <pwd.h>
|
|
#include <sched.h>
|
|
#include <stdbool.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <time.h>
|
|
#include <unistd.h>
|
|
#include <sys/mount.h>
|
|
#include <sys/prctl.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/syscall.h>
|
|
#include <sys/types.h>
|
|
#include <netinet/in.h>
|
|
#include <netinet/if_ether.h>
|
|
|
|
#include <linux/audit.h>
|
|
#include <linux/capability.h>
|
|
#include <linux/filter.h>
|
|
#include <linux/seccomp.h>
|
|
|
|
#include "util.h"
|
|
#include "seccomp.h"
|
|
#include "passt.h"
|
|
#include "log.h"
|
|
#include "isolation.h"
|
|
|
|
/* Capability ABI version set in the header handed to capget()/capset() */
#define CAP_VERSION _LINUX_CAPABILITY_VERSION_3
|
|
/* Number of elements in the capability data arrays used with that header */
#define CAP_WORDS _LINUX_CAPABILITY_U32S_3
|
|
|
|
/**
|
|
* drop_caps_ep_except() - Drop capabilities from effective & permitted sets
|
|
* @keep: Capabilities to keep
|
|
*/
|
|
static void drop_caps_ep_except(uint64_t keep)
|
|
{
|
|
struct __user_cap_header_struct hdr = {
|
|
.version = CAP_VERSION,
|
|
.pid = 0,
|
|
};
|
|
struct __user_cap_data_struct data[CAP_WORDS];
|
|
int i;
|
|
|
|
if (syscall(SYS_capget, &hdr, data))
|
|
die_perror("Couldn't get current capabilities");
|
|
|
|
for (i = 0; i < CAP_WORDS; i++) {
|
|
uint32_t mask = keep >> (32 * i);
|
|
|
|
data[i].effective &= mask;
|
|
data[i].permitted &= mask;
|
|
}
|
|
|
|
if (syscall(SYS_capset, &hdr, data))
|
|
die_perror("Couldn't drop capabilities");
|
|
}
|
|
|
|
/**
|
|
* clamp_caps() - Prevent any children from gaining caps
|
|
*
|
|
* This drops all capabilities from both the inheritable and the
|
|
* bounding set. This means that any exec()ed processes can't gain
|
|
* capabilities, even if they have file capabilities which would grant
|
|
* them. We shouldn't ever exec() in any case, but this provides an
|
|
* additional layer of protection. Executing this requires
|
|
* CAP_SETPCAP, which we will have within our userns.
|
|
*
|
|
* Note that dropping capabilites from the bounding set limits
|
|
* exec()ed processes, but does not remove them from the effective or
|
|
* permitted sets, so it doesn't reduce our own capabilities.
|
|
*/
|
|
static void clamp_caps(void)
|
|
{
|
|
struct __user_cap_data_struct data[CAP_WORDS];
|
|
struct __user_cap_header_struct hdr = {
|
|
.version = CAP_VERSION,
|
|
.pid = 0,
|
|
};
|
|
int i;
|
|
|
|
for (i = 0; i < 64; i++) {
|
|
/* Some errors can be ignored:
|
|
* - EINVAL, we'll get this for all values in 0..63
|
|
* that are not actually allocated caps
|
|
* - EPERM, we'll get this if we don't have
|
|
* CAP_SETPCAP, which can happen if using
|
|
* --netns-only. We don't need CAP_SETPCAP for
|
|
* normal operation, so carry on without it.
|
|
*/
|
|
if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) &&
|
|
errno != EINVAL && errno != EPERM)
|
|
die_perror("Couldn't drop cap %i from bounding set", i);
|
|
}
|
|
|
|
if (syscall(SYS_capget, &hdr, data))
|
|
die_perror("Couldn't get current capabilities");
|
|
|
|
for (i = 0; i < CAP_WORDS; i++)
|
|
data[i].inheritable = 0;
|
|
|
|
if (syscall(SYS_capset, &hdr, data))
|
|
die_perror("Couldn't drop inheritable capabilities");
|
|
}
|
|
|
|
/**
 * isolate_initial() - Early, mostly config independent self isolation
 * @argc:	Argument count
 * @argv:	Command line options: only --fd (if present) is relevant here
 *
 * Should:
 *  - drop unneeded capabilities
 *  - close all open files except for standard streams and the one from --fd
 * Mustn't:
 *  - remove filesystem access (we need to access files during setup)
 */
void isolate_initial(int argc, char **argv)
{
	/* What we hold on to at this stage, and why:
	 * - CAP_NET_BIND_SERVICE, in the initial namespace, if we have it:
	 *   needed to forward low ports into the guest/namespace
	 * - CAP_SETUID and CAP_SETGID: needed to switch user away from root
	 * - for the --netns-only case, CAP_SYS_ADMIN, to setns() to the
	 *   netns, and CAP_NET_ADMIN, to configure interfaces
	 *
	 * Whether dropping anything is useful while SETUID and SYS_ADMIN
	 * are retained is debatable, but it doesn't hurt. More capabilities
	 * are dropped in isolate_user() and isolate_prefork().
	 */
	uint64_t keep = BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN) |
			BIT(CAP_SETUID) | BIT(CAP_SETGID) |
			BIT(CAP_NET_BIND_SERVICE);

	/* Running as UID 0 outside the init namespace (pasta spawning a child
	 * from a non-init user namespace, as pasta can't run as root):
	 * - since Linux 5.12, writing a mapping from UID 0 to
	 *   /proc/self/uid_map needs CAP_SETFCAP
	 * - joining an existing netns by its /proc/$pid/ns/net path, created
	 *   in the same userns, needs CAP_SYS_PTRACE
	 */
	if (!(ns_is_init() || geteuid()))
		keep |= BIT(CAP_SYS_PTRACE) | BIT(CAP_SETFCAP);

	drop_caps_ep_except(keep);

	close_open_files(argc, argv);
}
|
|
|
|
/**
|
|
* isolate_user() - Switch to final UID/GID and move into userns
|
|
* @uid: User ID to run as (in original userns)
|
|
* @gid: Group ID to run as (in original userns)
|
|
* @use_userns: Whether to join or create a userns
|
|
* @userns: userns path to enter, may be empty
|
|
* @mode: Mode (passt or pasta)
|
|
*
|
|
* Should:
|
|
* - set our final UID and GID
|
|
* - enter our final user namespace
|
|
* Mustn't:
|
|
* - remove filesystem access (we need that for further setup)
|
|
*/
|
|
void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns,
|
|
enum passt_modes mode)
|
|
{
|
|
uint64_t ns_caps = 0;
|
|
|
|
/* First set our UID & GID in the original namespace */
|
|
if (setgroups(0, NULL)) {
|
|
/* If we don't have CAP_SETGID, this will EPERM */
|
|
if (errno != EPERM)
|
|
die_perror("Can't drop supplementary groups");
|
|
}
|
|
|
|
if (setgid(gid) != 0)
|
|
die_perror("Can't set GID to %u", gid);
|
|
|
|
if (setuid(uid) != 0)
|
|
die_perror("Can't set UID to %u", uid);
|
|
|
|
if (*userns) { /* If given a userns, join it */
|
|
int ufd;
|
|
|
|
ufd = open(userns, O_RDONLY | O_CLOEXEC);
|
|
if (ufd < 0)
|
|
die_perror("Couldn't open user namespace %s", userns);
|
|
|
|
if (setns(ufd, CLONE_NEWUSER) != 0)
|
|
die_perror("Couldn't enter user namespace %s", userns);
|
|
|
|
close(ufd);
|
|
|
|
} else if (use_userns) { /* Create and join a new userns */
|
|
if (unshare(CLONE_NEWUSER) != 0)
|
|
die_perror("Couldn't create user namespace");
|
|
}
|
|
|
|
/* Joining a new userns gives us full capabilities; drop the
|
|
* ones we don't need. With --netns-only we haven't changed
|
|
* userns but we can drop more capabilities now than at
|
|
* isolate_initial()
|
|
*/
|
|
/* Keep CAP_SYS_ADMIN, so we can unshare() further in
|
|
* isolate_prefork(), pasta also needs it to setns() into the
|
|
* netns
|
|
*/
|
|
ns_caps |= BIT(CAP_SYS_ADMIN);
|
|
if (mode == MODE_PASTA) {
|
|
/* Keep CAP_NET_ADMIN, so we can configure the if */
|
|
ns_caps |= BIT(CAP_NET_ADMIN);
|
|
/* Keep CAP_NET_BIND_SERVICE, so we can splice
|
|
* outbound connections to low port numbers
|
|
*/
|
|
ns_caps |= BIT(CAP_NET_BIND_SERVICE);
|
|
/* Keep CAP_SYS_PTRACE to join the netns of an
|
|
* existing process */
|
|
if (*userns || !use_userns)
|
|
ns_caps |= BIT(CAP_SYS_PTRACE);
|
|
}
|
|
|
|
drop_caps_ep_except(ns_caps);
|
|
}
|
|
|
|
/**
|
|
* isolate_prefork() - Self isolation before daemonizing
|
|
* @c: Execution context
|
|
*
|
|
* Return: negative error code on failure, zero on success
|
|
*
|
|
* Should:
|
|
* - Move us to our own IPC and UTS namespaces
|
|
* - Move us to a mount namespace with only an empty directory
|
|
* - Drop unneeded capabilities (in the new user namespace)
|
|
* Mustn't:
|
|
* - Remove syscalls we need to daemonise
|
|
*/
|
|
int isolate_prefork(const struct ctx *c)
|
|
{
|
|
int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS;
|
|
uint64_t ns_caps = 0;
|
|
|
|
/* If we run in foreground, we have no chance to actually move to a new
|
|
* PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
|
|
* ever gets around seccomp profiles -- there's no harm in passing it.
|
|
*/
|
|
if (!c->foreground || c->mode != MODE_PASTA)
|
|
flags |= CLONE_NEWPID;
|
|
|
|
if (unshare(flags)) {
|
|
err_perror("Failed to detach isolating namespaces");
|
|
return -errno;
|
|
}
|
|
|
|
if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) {
|
|
err_perror("Failed to remount /");
|
|
return -errno;
|
|
}
|
|
|
|
if (mount("", TMPDIR, "tmpfs",
|
|
MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
|
|
"nr_inodes=2,nr_blocks=0")) {
|
|
err_perror("Failed to mount empty tmpfs for pivot_root()");
|
|
return -errno;
|
|
}
|
|
|
|
if (chdir(TMPDIR)) {
|
|
err_perror("Failed to change directory into empty tmpfs");
|
|
return -errno;
|
|
}
|
|
|
|
if (syscall(SYS_pivot_root, ".", ".")) {
|
|
err_perror("Failed to pivot_root() into empty tmpfs");
|
|
return -errno;
|
|
}
|
|
|
|
if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) {
|
|
err_perror("Failed to unmount original root filesystem");
|
|
return -errno;
|
|
}
|
|
|
|
/* Now that initialization is more-or-less complete, we can
|
|
* drop further capabilities
|
|
*/
|
|
if (c->mode == MODE_PASTA) {
|
|
/* Keep CAP_SYS_ADMIN, so we can enter the netns */
|
|
ns_caps |= BIT(CAP_SYS_ADMIN);
|
|
/* Keep CAP_NET_BIND_SERVICE, so we can splice
|
|
* outbound connections to low port numbers
|
|
*/
|
|
ns_caps |= BIT(CAP_NET_BIND_SERVICE);
|
|
}
|
|
|
|
clamp_caps();
|
|
drop_caps_ep_except(ns_caps);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* isolate_postfork() - Self isolation after daemonizing
|
|
* @c: Execution context
|
|
*
|
|
* Should:
|
|
* - disable core dumps
|
|
* - limit to a minimal set of syscalls
|
|
*/
|
|
void isolate_postfork(const struct ctx *c)
|
|
{
|
|
struct sock_fprog prog;
|
|
|
|
prctl(PR_SET_DUMPABLE, 0);
|
|
|
|
if (c->mode == MODE_PASTA) {
|
|
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
|
|
prog.filter = filter_pasta;
|
|
} else {
|
|
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
|
|
prog.filter = filter_passt;
|
|
}
|
|
|
|
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
|
|
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
|
|
die_perror("Failed to apply seccomp filter");
|
|
}
|