passt: Relicense to GPL 2.0, or any later version
In practical terms, passt doesn't benefit from the additional
protection offered by the AGPL over the GPL, because it's not
suitable to be executed over a computer network.
Further, restricting the distribution under the version 3 of the GPL
wouldn't provide any practical advantage either, as long as the passt
codebase is concerned, and might cause unnecessary compatibility
dilemmas.
Change licensing terms to the GNU General Public License Version 2,
or any later version, with written permission from all current and
past contributors, namely: myself, David Gibson, Laine Stump, Andrea
Bolognani, Paul Holzinger, Richard W.M. Jones, Chris Kuhn, Florian
Weimer, Giuseppe Scrivano, Stefan Hajnoczi, and Vasiliy Ulyanov.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2023-04-05 20:11:44 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2022-09-12 22:24:03 +10:00
|
|
|
|
|
|
|
/* PASST - Plug A Simple Socket Transport
|
|
|
|
* for qemu/UNIX domain socket mode
|
|
|
|
*
|
|
|
|
* PASTA - Pack A Subtle Tap Abstraction
|
|
|
|
* for network namespace/tap device mode
|
|
|
|
*
|
|
|
|
* isolation.c - Self isolation helpers
|
|
|
|
*
|
|
|
|
* Copyright Red Hat
|
|
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
|
|
* Author: David Gibson <david@gibson.dropbear.id.au>
|
|
|
|
*/
|
2022-10-14 15:25:31 +11:00
|
|
|
/**
|
|
|
|
* DOC: Theory of Operation
|
|
|
|
*
|
|
|
|
* For security the passt/pasta process performs a number of
|
|
|
|
* self-isolation steps, dropping capabilities, setting namespaces
|
|
|
|
* and otherwise minimising the impact we can have on the system at
|
|
|
|
* large if we were compromised.
|
|
|
|
*
|
|
|
|
* Obviously we can't isolate ourselves from resources before we've
|
|
|
|
* done anything we need to do with those resources, so we have
|
|
|
|
* multiple stages of self-isolation. In order these are:
|
|
|
|
*
|
|
|
|
* 1. isolate_initial()
|
|
|
|
* ====================
|
|
|
|
*
|
|
|
|
* Executed immediately after startup, drops capabilities we don't
|
|
|
|
* need at any point during execution (or which we gain back when we
|
passt, util: Close any open file that the parent might have leaked
If a parent accidentally or due to implementation reasons leaks any
open file, we don't want to have access to them, except for the file
passed via --fd, if any.
This is the case for Podman when Podman's parent leaks files into
Podman: it's not practical for Podman to close unrelated files before
starting pasta, as reported by Paul.
Use close_range(2) to close all open files except for standard streams
and the one from --fd.
Given that parts of conf() depend on other files to be already opened,
such as the epoll file descriptor, we can't easily defer this to a
more convenient point, where --fd was already parsed. Introduce a
minimal, duplicate version of --fd parsing to keep this simple.
As we need to check that the passed --fd option doesn't exceed
INT_MAX, because we'll parse it with strtol() but file descriptor
indices are signed ints (regardless of the arguments close_range()
take), extend the existing check in the actual --fd parsing in conf(),
also rejecting file descriptors numbers that match standard streams,
while at it.
Suggested-by: Paul Holzinger <pholzing@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Paul Holzinger <pholzing@redhat.com>
2024-08-06 20:32:11 +02:00
|
|
|
* need by joining other namespaces), and closes any leaked file we
|
|
|
|
* might have inherited from the parent process.
|
2022-10-14 15:25:31 +11:00
|
|
|
*
|
|
|
|
* 2. isolate_user()
|
|
|
|
* =================
|
|
|
|
*
|
|
|
|
* Executed once we know what user and user namespace we want to
|
|
|
|
* operate in. Sets our final UID & GID, and enters the correct user
|
|
|
|
* namespace.
|
|
|
|
*
|
|
|
|
* 3. isolate_prefork()
|
|
|
|
* ====================
|
|
|
|
*
|
|
|
|
* Executed after all setup, but before daemonising (fork()ing into
|
|
|
|
* the background). Uses mount namespace and pivot_root() to remove
|
|
|
|
* our access to the filesystem.
|
|
|
|
*
|
|
|
|
* 4. isolate_postfork()
|
|
|
|
* =====================
|
|
|
|
*
|
|
|
|
* Executed immediately after daemonizing, but before entering the
|
|
|
|
* actual packet forwarding phase of operation. Or, if not
|
|
|
|
* daemonizing, immediately after isolate_prefork(). Uses seccomp()
|
|
|
|
* to restrict ourselves to the handful of syscalls we need during
|
|
|
|
* runtime operation.
|
|
|
|
*/
|
2022-09-12 22:24:03 +10:00
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include <grp.h>
|
|
|
|
#include <inttypes.h>
|
|
|
|
#include <limits.h>
|
|
|
|
#include <pwd.h>
|
|
|
|
#include <sched.h>
|
2022-09-12 22:24:07 +10:00
|
|
|
#include <stdbool.h>
|
2022-09-12 22:24:03 +10:00
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdlib.h>
|
2023-03-08 04:00:22 +01:00
|
|
|
#include <stdio.h>
|
2022-09-12 22:24:03 +10:00
|
|
|
#include <string.h>
|
|
|
|
#include <time.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <sys/mount.h>
|
|
|
|
#include <sys/prctl.h>
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/syscall.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <netinet/in.h>
|
|
|
|
#include <netinet/if_ether.h>
|
|
|
|
|
|
|
|
#include <linux/audit.h>
|
|
|
|
#include <linux/capability.h>
|
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/seccomp.h>
|
|
|
|
|
|
|
|
#include "util.h"
|
|
|
|
#include "seccomp.h"
|
|
|
|
#include "passt.h"
|
2022-09-24 09:53:15 +02:00
|
|
|
#include "log.h"
|
2022-09-12 22:24:03 +10:00
|
|
|
#include "isolation.h"
|
|
|
|
|
2022-10-14 15:25:34 +11:00
|
|
|
#define CAP_VERSION _LINUX_CAPABILITY_VERSION_3
|
|
|
|
#define CAP_WORDS _LINUX_CAPABILITY_U32S_3
|
|
|
|
|
2022-09-12 22:24:03 +10:00
|
|
|
/**
|
2022-10-14 15:25:34 +11:00
|
|
|
* drop_caps_ep_except() - Drop capabilities from effective & permitted sets
|
|
|
|
* @keep: Capabilities to keep
|
2022-09-12 22:24:03 +10:00
|
|
|
*/
|
2022-10-14 15:25:34 +11:00
|
|
|
static void drop_caps_ep_except(uint64_t keep)
|
2022-09-12 22:24:03 +10:00
|
|
|
{
|
2022-10-14 15:25:34 +11:00
|
|
|
struct __user_cap_header_struct hdr = {
|
|
|
|
.version = CAP_VERSION,
|
|
|
|
.pid = 0,
|
|
|
|
};
|
|
|
|
struct __user_cap_data_struct data[CAP_WORDS];
|
2022-09-12 22:24:03 +10:00
|
|
|
int i;
|
|
|
|
|
2023-02-15 03:24:37 -05:00
|
|
|
if (syscall(SYS_capget, &hdr, data))
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't get current capabilities");
|
2022-09-12 22:24:03 +10:00
|
|
|
|
2022-10-14 15:25:34 +11:00
|
|
|
for (i = 0; i < CAP_WORDS; i++) {
|
|
|
|
uint32_t mask = keep >> (32 * i);
|
|
|
|
|
|
|
|
data[i].effective &= mask;
|
|
|
|
data[i].permitted &= mask;
|
|
|
|
}
|
|
|
|
|
2023-02-15 03:24:37 -05:00
|
|
|
if (syscall(SYS_capset, &hdr, data))
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't drop capabilities");
|
2022-09-12 22:24:03 +10:00
|
|
|
}
|
|
|
|
|
2022-10-14 15:25:35 +11:00
|
|
|
/**
|
|
|
|
* clamp_caps() - Prevent any children from gaining caps
|
|
|
|
*
|
|
|
|
* This drops all capabilities from both the inheritable and the
|
|
|
|
* bounding set. This means that any exec()ed processes can't gain
|
|
|
|
* capabilities, even if they have file capabilities which would grant
|
|
|
|
* them. We shouldn't ever exec() in any case, but this provides an
|
|
|
|
* additional layer of protection. Executing this requires
|
|
|
|
* CAP_SETPCAP, which we will have within our userns.
|
|
|
|
*
|
|
|
|
* Note that dropping capabilites from the bounding set limits
|
|
|
|
* exec()ed processes, but does not remove them from the effective or
|
|
|
|
* permitted sets, so it doesn't reduce our own capabilities.
|
|
|
|
*/
|
|
|
|
static void clamp_caps(void)
|
|
|
|
{
|
|
|
|
struct __user_cap_data_struct data[CAP_WORDS];
|
|
|
|
struct __user_cap_header_struct hdr = {
|
|
|
|
.version = CAP_VERSION,
|
|
|
|
.pid = 0,
|
|
|
|
};
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < 64; i++) {
|
|
|
|
/* Some errors can be ignored:
|
|
|
|
* - EINVAL, we'll get this for all values in 0..63
|
|
|
|
* that are not actually allocated caps
|
|
|
|
* - EPERM, we'll get this if we don't have
|
|
|
|
* CAP_SETPCAP, which can happen if using
|
|
|
|
* --netns-only. We don't need CAP_SETPCAP for
|
|
|
|
* normal operation, so carry on without it.
|
|
|
|
*/
|
|
|
|
if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) &&
|
2023-02-15 03:24:37 -05:00
|
|
|
errno != EINVAL && errno != EPERM)
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't drop cap %i from bounding set", i);
|
2022-10-14 15:25:35 +11:00
|
|
|
}
|
|
|
|
|
2023-02-15 03:24:37 -05:00
|
|
|
if (syscall(SYS_capget, &hdr, data))
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't get current capabilities");
|
2022-10-14 15:25:35 +11:00
|
|
|
|
|
|
|
for (i = 0; i < CAP_WORDS; i++)
|
|
|
|
data[i].inheritable = 0;
|
|
|
|
|
2023-02-15 03:24:37 -05:00
|
|
|
if (syscall(SYS_capset, &hdr, data))
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't drop inheritable capabilities");
|
2022-10-14 15:25:35 +11:00
|
|
|
}
|
|
|
|
|
2022-10-14 15:25:31 +11:00
|
|
|
/**
 * isolate_initial() - Early, mostly config independent self isolation
 * @argc:	Argument count
 * @argv:	Command line options: only --fd (if present) is relevant here
 *
 * Should:
 *  - drop unneeded capabilities
 *  - close all open files except for standard streams and the one from --fd
 * Mustn't:
 *  - remove filesystem access (we need to access files during setup)
 */
void isolate_initial(int argc, char **argv)
{
	bool root_in_child_userns;
	uint64_t keep = 0;

	/* CAP_NET_BIND_SERVICE is retained in the initial namespace, if we
	 * have it, so that low ports can be forwarded into the
	 * guest/namespace.
	 */
	keep |= BIT(CAP_NET_BIND_SERVICE);

	/* CAP_SETUID and CAP_SETGID are still needed at this stage, to be
	 * able to switch user away from root.
	 */
	keep |= BIT(CAP_SETUID) | BIT(CAP_SETGID);

	/* The --netns-only case needs:
	 * - CAP_SYS_ADMIN, so that we can setns() to the netns
	 * - CAP_NET_ADMIN, so that we can configure interfaces
	 *
	 * It's debatable whether dropping caps is useful at all while we
	 * retain SETUID and SYS_ADMIN, but we might as well. Further
	 * capabilities are dropped in isolate_user() and isolate_prefork().
	 */
	keep |= BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN);

	/* Since Linux 5.12, updating /proc/self/uid_map to create a mapping
	 * from UID 0 requires CAP_SETFCAP. This only happens with pasta
	 * spawning a child from a non-init user namespace (pasta can't run as
	 * root).
	 *
	 * CAP_SYS_PTRACE is needed as well, to join an existing netns path
	 * under /proc/$pid/ns/net which was created in the same userns.
	 */
	root_in_child_userns = !ns_is_init() && geteuid() == 0;
	if (root_in_child_userns)
		keep |= BIT(CAP_SETFCAP) | BIT(CAP_SYS_PTRACE);

	drop_caps_ep_except(keep);

	close_open_files(argc, argv);
}
|
|
|
|
|
2022-09-12 22:24:03 +10:00
|
|
|
/**
|
2022-09-12 22:24:07 +10:00
|
|
|
* isolate_user() - Switch to final UID/GID and move into userns
|
|
|
|
* @uid: User ID to run as (in original userns)
|
|
|
|
* @gid: Group ID to run as (in original userns)
|
|
|
|
* @use_userns: Whether to join or create a userns
|
|
|
|
* @userns: userns path to enter, may be empty
|
2022-10-14 15:25:34 +11:00
|
|
|
* @mode: Mode (passt or pasta)
|
2022-10-14 15:25:31 +11:00
|
|
|
*
|
|
|
|
* Should:
|
|
|
|
* - set our final UID and GID
|
|
|
|
* - enter our final user namespace
|
|
|
|
* Mustn't:
|
|
|
|
* - remove filesystem access (we need that for further setup)
|
2022-09-12 22:24:03 +10:00
|
|
|
*/
|
2022-10-14 15:25:34 +11:00
|
|
|
void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns,
|
|
|
|
enum passt_modes mode)
|
2022-09-12 22:24:03 +10:00
|
|
|
{
|
2022-10-14 15:25:34 +11:00
|
|
|
uint64_t ns_caps = 0;
|
|
|
|
|
2022-09-12 22:24:07 +10:00
|
|
|
/* First set our UID & GID in the original namespace */
|
2022-09-12 22:24:03 +10:00
|
|
|
if (setgroups(0, NULL)) {
|
2022-09-12 22:24:07 +10:00
|
|
|
/* If we don't have CAP_SETGID, this will EPERM */
|
2023-02-15 03:24:37 -05:00
|
|
|
if (errno != EPERM)
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Can't drop supplementary groups");
|
2022-09-12 22:24:03 +10:00
|
|
|
}
|
|
|
|
|
2023-02-15 03:24:37 -05:00
|
|
|
if (setgid(gid) != 0)
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Can't set GID to %u", gid);
|
2022-09-12 22:24:07 +10:00
|
|
|
|
2023-02-15 03:24:37 -05:00
|
|
|
if (setuid(uid) != 0)
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Can't set UID to %u", uid);
|
2022-09-12 22:24:07 +10:00
|
|
|
|
2022-10-14 15:25:33 +11:00
|
|
|
if (*userns) { /* If given a userns, join it */
|
2022-09-12 22:24:07 +10:00
|
|
|
int ufd;
|
|
|
|
|
|
|
|
ufd = open(userns, O_RDONLY | O_CLOEXEC);
|
2023-02-15 03:24:37 -05:00
|
|
|
if (ufd < 0)
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't open user namespace %s", userns);
|
2022-09-12 22:24:07 +10:00
|
|
|
|
2023-02-15 03:24:37 -05:00
|
|
|
if (setns(ufd, CLONE_NEWUSER) != 0)
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't enter user namespace %s", userns);
|
2022-09-12 22:24:07 +10:00
|
|
|
|
|
|
|
close(ufd);
|
2022-10-14 15:25:34 +11:00
|
|
|
|
2022-10-14 15:25:33 +11:00
|
|
|
} else if (use_userns) { /* Create and join a new userns */
|
2023-02-15 03:24:37 -05:00
|
|
|
if (unshare(CLONE_NEWUSER) != 0)
|
2024-06-17 11:55:04 +02:00
|
|
|
die_perror("Couldn't create user namespace");
|
2022-10-14 15:25:32 +11:00
|
|
|
}
|
2022-10-14 15:25:34 +11:00
|
|
|
|
|
|
|
/* Joining a new userns gives us full capabilities; drop the
|
|
|
|
* ones we don't need. With --netns-only we haven't changed
|
|
|
|
* userns but we can drop more capabilities now than at
|
|
|
|
* isolate_initial()
|
|
|
|
*/
|
|
|
|
/* Keep CAP_SYS_ADMIN, so we can unshare() further in
|
|
|
|
* isolate_prefork(), pasta also needs it to setns() into the
|
|
|
|
* netns
|
|
|
|
*/
|
|
|
|
ns_caps |= BIT(CAP_SYS_ADMIN);
|
|
|
|
if (mode == MODE_PASTA) {
|
|
|
|
/* Keep CAP_NET_ADMIN, so we can configure the if */
|
|
|
|
ns_caps |= BIT(CAP_NET_ADMIN);
|
|
|
|
/* Keep CAP_NET_BIND_SERVICE, so we can splice
|
|
|
|
* outbound connections to low port numbers
|
|
|
|
*/
|
|
|
|
ns_caps |= BIT(CAP_NET_BIND_SERVICE);
|
|
|
|
/* Keep CAP_SYS_PTRACE to join the netns of an
|
|
|
|
* existing process */
|
|
|
|
if (*userns || !use_userns)
|
|
|
|
ns_caps |= BIT(CAP_SYS_PTRACE);
|
|
|
|
}
|
|
|
|
|
|
|
|
drop_caps_ep_except(ns_caps);
|
2022-09-12 22:24:03 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2022-10-14 15:25:31 +11:00
|
|
|
* isolate_prefork() - Self isolation before daemonizing
|
|
|
|
* @c: Execution context
|
2022-09-12 22:24:03 +10:00
|
|
|
*
|
|
|
|
* Return: negative error code on failure, zero on success
|
2022-10-14 15:25:31 +11:00
|
|
|
*
|
|
|
|
* Should:
|
|
|
|
* - Move us to our own IPC and UTS namespaces
|
|
|
|
* - Move us to a mount namespace with only an empty directory
|
|
|
|
* - Drop unneeded capabilities (in the new user namespace)
|
|
|
|
* Mustn't:
|
|
|
|
* - Remove syscalls we need to daemonise
|
2022-09-12 22:24:03 +10:00
|
|
|
*/
|
2023-09-29 15:50:19 +10:00
|
|
|
int isolate_prefork(const struct ctx *c)
|
2022-09-12 22:24:03 +10:00
|
|
|
{
|
|
|
|
int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS;
|
2022-10-14 15:25:34 +11:00
|
|
|
uint64_t ns_caps = 0;
|
2022-09-12 22:24:03 +10:00
|
|
|
|
|
|
|
/* If we run in foreground, we have no chance to actually move to a new
|
|
|
|
* PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
|
|
|
|
* ever gets around seccomp profiles -- there's no harm in passing it.
|
|
|
|
*/
|
2024-06-13 14:36:53 +02:00
|
|
|
if (!c->foreground || c->mode != MODE_PASTA)
|
2022-09-12 22:24:03 +10:00
|
|
|
flags |= CLONE_NEWPID;
|
|
|
|
|
|
|
|
if (unshare(flags)) {
|
2024-06-15 00:37:11 +02:00
|
|
|
err_perror("Failed to detach isolating namespaces");
|
2022-09-12 22:24:03 +10:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL)) {
|
2024-06-15 00:37:11 +02:00
|
|
|
err_perror("Failed to remount /");
|
2022-09-12 22:24:03 +10:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mount("", TMPDIR, "tmpfs",
|
|
|
|
MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY,
|
|
|
|
"nr_inodes=2,nr_blocks=0")) {
|
2024-06-15 00:37:11 +02:00
|
|
|
err_perror("Failed to mount empty tmpfs for pivot_root()");
|
2022-09-12 22:24:03 +10:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (chdir(TMPDIR)) {
|
2024-06-15 00:37:11 +02:00
|
|
|
err_perror("Failed to change directory into empty tmpfs");
|
2022-09-12 22:24:03 +10:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (syscall(SYS_pivot_root, ".", ".")) {
|
2024-06-15 00:37:11 +02:00
|
|
|
err_perror("Failed to pivot_root() into empty tmpfs");
|
2022-09-12 22:24:03 +10:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW)) {
|
2024-06-15 00:37:11 +02:00
|
|
|
err_perror("Failed to unmount original root filesystem");
|
2022-09-12 22:24:03 +10:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
2022-10-14 15:25:34 +11:00
|
|
|
/* Now that initialization is more-or-less complete, we can
|
|
|
|
* drop further capabilities
|
|
|
|
*/
|
|
|
|
if (c->mode == MODE_PASTA) {
|
|
|
|
/* Keep CAP_SYS_ADMIN, so we can enter the netns */
|
|
|
|
ns_caps |= BIT(CAP_SYS_ADMIN);
|
|
|
|
/* Keep CAP_NET_BIND_SERVICE, so we can splice
|
|
|
|
* outbound connections to low port numbers
|
|
|
|
*/
|
|
|
|
ns_caps |= BIT(CAP_NET_BIND_SERVICE);
|
|
|
|
}
|
|
|
|
|
2022-10-14 15:25:35 +11:00
|
|
|
clamp_caps();
|
2022-10-14 15:25:34 +11:00
|
|
|
drop_caps_ep_except(ns_caps);
|
2022-09-12 22:24:03 +10:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2022-10-14 15:25:31 +11:00
|
|
|
* isolate_postfork() - Self isolation after daemonizing
|
2022-09-12 22:24:03 +10:00
|
|
|
* @c: Execution context
|
2022-10-14 15:25:31 +11:00
|
|
|
*
|
|
|
|
* Should:
|
|
|
|
* - disable core dumps
|
|
|
|
* - limit to a minimal set of syscalls
|
2022-09-12 22:24:03 +10:00
|
|
|
*/
|
2022-10-14 15:25:31 +11:00
|
|
|
void isolate_postfork(const struct ctx *c)
|
2022-09-12 22:24:03 +10:00
|
|
|
{
|
|
|
|
struct sock_fprog prog;
|
|
|
|
|
2022-10-14 15:25:31 +11:00
|
|
|
prctl(PR_SET_DUMPABLE, 0);
|
|
|
|
|
2024-06-13 14:36:53 +02:00
|
|
|
if (c->mode == MODE_PASTA) {
|
2022-09-12 22:24:03 +10:00
|
|
|
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
|
|
|
|
prog.filter = filter_pasta;
|
2024-06-13 14:36:53 +02:00
|
|
|
} else {
|
|
|
|
prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
|
|
|
|
prog.filter = filter_passt;
|
2022-09-12 22:24:03 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
|
2024-06-15 00:37:11 +02:00
|
|
|
prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
|
|
|
|
die_perror("Failed to apply seccomp filter");
|
2022-09-12 22:24:03 +10:00
|
|
|
}
|