diff --git a/README.md b/README.md index d16b705..1c8baf3 100644 --- a/README.md +++ b/README.md @@ -232,9 +232,10 @@ speeding up local connections, and usually requiring NAT. _pasta_: `seccomp`](/passt/tree/seccomp.sh)) * ✅ root operation not allowed outside user namespaces * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) +* ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached * ✅ no external dependencies (other than a standard C library) -* ✅ restrictive seccomp profiles (50 syscalls allowed for _passt_, 62 for - _pasta_) +* ✅ restrictive seccomp profiles (22 syscalls allowed for _passt_, 34 for + _pasta_ on x86_64) * ✅ static checkers in continuous integration (clang-tidy, cppcheck) * 🛠️ rework of TCP state machine (flags instead of states), TCP timers, and code de-duplication diff --git a/conf.c b/conf.c index abe63a1..732d918 100644 --- a/conf.c +++ b/conf.c @@ -10,8 +10,6 @@ * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio - * - * #syscalls stat|statx */ #include @@ -46,31 +44,31 @@ */ void get_bound_ports(struct ctx *c, int ns, uint8_t proto) { - uint8_t *udp_map, *udp_exclude, *tcp_map, *tcp_exclude; + uint8_t *udp_map, *udp_excl, *tcp_map, *tcp_excl; if (ns) { udp_map = c->udp.port_to_tap; - udp_exclude = c->udp.port_to_init; + udp_excl = c->udp.port_to_init; tcp_map = c->tcp.port_to_tap; - tcp_exclude = c->tcp.port_to_init; + tcp_excl = c->tcp.port_to_init; } else { udp_map = c->udp.port_to_init; - udp_exclude = c->udp.port_to_tap; + udp_excl = c->udp.port_to_tap; tcp_map = c->tcp.port_to_init; - tcp_exclude = c->tcp.port_to_tap; + tcp_excl = c->tcp.port_to_tap; } if (proto == IPPROTO_UDP) { memset(udp_map, 0, USHRT_MAX / 8); - procfs_scan_listen("udp", udp_map, udp_exclude); - procfs_scan_listen("udp6", udp_map, udp_exclude); + procfs_scan_listen(c, IPPROTO_UDP, V4, ns, udp_map, udp_excl); + procfs_scan_listen(c, IPPROTO_UDP, V6, ns, udp_map, udp_excl); - procfs_scan_listen("tcp", 
udp_map, udp_exclude); - procfs_scan_listen("tcp6", udp_map, udp_exclude); + procfs_scan_listen(c, IPPROTO_TCP, V4, ns, udp_map, udp_excl); + procfs_scan_listen(c, IPPROTO_TCP, V6, ns, udp_map, udp_excl); } else if (proto == IPPROTO_TCP) { memset(tcp_map, 0, USHRT_MAX / 8); - procfs_scan_listen("tcp", tcp_map, tcp_exclude); - procfs_scan_listen("tcp6", tcp_map, tcp_exclude); + procfs_scan_listen(c, IPPROTO_TCP, V4, ns, tcp_map, tcp_excl); + procfs_scan_listen(c, IPPROTO_TCP, V6, ns, tcp_map, tcp_excl); } } @@ -367,7 +365,7 @@ static int conf_ns_check(void *arg) static int conf_ns_opt(struct ctx *c, char *nsdir, char *conf_userns, const char *optarg) { - int ufd = 0, nfd = 0, try, ret, netns_only_reset = c->netns_only; + int ufd = -1, nfd = -1, try, ret, netns_only_reset = c->netns_only; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX]; char *endptr; pid_t pid; @@ -416,7 +414,7 @@ static int conf_ns_opt(struct ctx *c, nfd = open(netns, O_RDONLY); - if (nfd >= 0 && ufd >= 0) { + if (nfd >= 0 && (ufd >= 0 || c->netns_only)) { c->pasta_netns_fd = nfd; c->pasta_userns_fd = ufd; @@ -425,10 +423,10 @@ static int conf_ns_opt(struct ctx *c, return 0; } - if (nfd > 0) + if (nfd >= 0) close(nfd); - if (ufd > 0) + if (ufd >= 0) close(ufd); } @@ -565,9 +563,9 @@ static void usage(const char *name) info( " if FILE is not given, log to:"); if (strstr(name, "pasta") || strstr(name, "passt4netns")) - info(" /tmp/pasta_ISO8601-TIMESTAMP_INSTANCE-NUMBER.pcap"); + info(" /tmp/pasta_ISO8601-TIMESTAMP_PID.pcap"); else - info(" /tmp/passt_ISO8601-TIMESTAMP_INSTANCE-NUMBER.pcap"); + info(" /tmp/passt_ISO8601-TIMESTAMP_PID.pcap"); info( " -P, --pid FILE Write own PID to the given file"); info( " -m, --mtu MTU Assign MTU via DHCP/NDP"); @@ -664,7 +662,7 @@ pasta_opts: info( " SPEC is as described above"); info( " default: auto"); info( " --userns NSPATH Target user namespace to join"); - info( " --netns-only Don't join or create user namespace"); + info( " --netns-only Don't join existing 
user namespace"); info( " implied if PATH or NAME are given without --userns"); info( " --nsrun-dir Directory for nsfs mountpoints"); info( " default: " NETNS_RUN_DIR); @@ -1170,7 +1168,7 @@ void conf(struct ctx *c, int argc, char **argv) usage(argv[0]); } - if (c->mode == MODE_PASTA && c->pasta_netns_fd <= 0) + if (c->mode == MODE_PASTA && c->pasta_netns_fd == -1) pasta_start_ns(c); if (nl_sock_init(c)) { @@ -1216,6 +1214,11 @@ void conf(struct ctx *c, int argc, char **argv) c->tcp.init_detect_ports = c->udp.init_detect_ports = 0; if (c->mode == MODE_PASTA) { + c->proc_net_tcp[V4][0] = c->proc_net_tcp[V4][1] = -1; + c->proc_net_tcp[V6][0] = c->proc_net_tcp[V6][1] = -1; + c->proc_net_udp[V4][0] = c->proc_net_udp[V4][1] = -1; + c->proc_net_udp[V6][0] = c->proc_net_udp[V6][1] = -1; + if (!tcp_tap || tcp_tap == PORT_AUTO) { c->tcp.ns_detect_ports = 1; ns_ports_arg.proto = IPPROTO_TCP; diff --git a/passt.1 b/passt.1 index b0d7d87..92681f6 100644 --- a/passt.1 +++ b/passt.1 @@ -80,7 +80,8 @@ Don't print informational messages. .TP .BR \-f ", " \-\-foreground -Don't run in background. +Don't run in background. This implies that the process is not moved to a +detached PID namespace after starting, because the PID itself cannot change. Default is to fork into background. .TP @@ -100,14 +101,13 @@ Capture tap-facing (that is, guest-side or namespace-side) network packets to If \fIfile\fR is not given, capture packets to - \fB/tmp/passt_\fIISO8601-timestamp\fR_\fIinstance-number\fB.pcap\fR + \fB/tmp/passt_\fIISO8601-timestamp\fR_\fIPID\fB.pcap\fR in \fBpasst\fR mode and to - \fB/tmp/pasta_\fIISO8601-timestamp\fR_\fIinstance-number\fB.pcap\fR + \fB/tmp/pasta_\fIISO8601-timestamp\fR_\fIPID\fB.pcap\fR -in \fBpasta\fR mode, where \fIinstance-number\fR is a progressive count of -other detected instances running on the same host. +in \fBpasta\fR mode, where \fIPID\fR is the ID of the running process. 
.TP .BR \-P ", " \-\-pid " " \fIfile @@ -379,8 +379,9 @@ This option requires PID, PATH or NAME to be specified. .TP .BR \-\-netns-only -Join or create only the network namespace, not a user namespace. This is implied -if PATH or NAME are given without \-\-userns. +Join only a target network namespace, not a user namespace, and don't create one +for sandboxing purposes either. This is implied if PATH or NAME are given +without \-\-userns. .TP .BR \-\-nsrun-dir " " \fIpath diff --git a/passt.c b/passt.c index a8bb88e..508d525 100644 --- a/passt.c +++ b/passt.c @@ -30,7 +30,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -53,7 +55,6 @@ #include #include #include -#include #include #include "util.h" @@ -228,42 +229,61 @@ static void check_root(void) } /** - * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE + * sandbox() - Unshare IPC, mount, PID, UTS, and user namespaces, "unmount" root + * + * Return: negative error code on failure, zero on success */ -static void drop_caps(void) +static int sandbox(struct ctx *c) { - int i; + int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; - for (i = 0; i < 64; i++) { - if (i == CAP_NET_BIND_SERVICE) - continue; + errno = 0; - prctl(PR_CAPBSET_DROP, i, 0, 0, 0); + if (!c->netns_only) { + if (c->pasta_userns_fd == -1) + flags |= CLONE_NEWUSER; + else + setns(c->pasta_userns_fd, CLONE_NEWUSER); } + + c->pasta_userns_fd = -1; + + /* If we run in foreground, we have no chance to actually move to a new + * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody + * ever gets around seccomp profiles -- there's no harm in passing it. 
+ */ + if (!c->foreground || c->mode == MODE_PASST) + flags |= CLONE_NEWPID; + + unshare(flags); + + mount("", "/", "", MS_UNBINDABLE | MS_REC, NULL); + mount("", TMPDIR, "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, + "nr_inodes=2,nr_blocks=0"); + chdir(TMPDIR); + syscall(SYS_pivot_root, ".", "."); + umount2(".", MNT_DETACH | UMOUNT_NOFOLLOW); + + if (errno) + return -errno; + + drop_caps(); /* Relative to the new user namespace this time. */ + + return 0; } /** - * pid_file() - Write own PID to file, if configured - * @c: Execution context + * exit_handler() - Signal handler for SIGQUIT and SIGTERM + * @signal: Unused, handler deals with SIGQUIT and SIGTERM only + * + * TODO: After unsharing the PID namespace and forking, SIG_DFL for SIGTERM and + * SIGQUIT unexpectedly doesn't cause the process to terminate, figure out why. */ -static void pid_file(struct ctx *c) { - char pid_buf[12]; - int pid_fd, n; +void exit_handler(int signal) +{ + (void)signal; - if (!*c->pid_file) - return; - - pid_fd = open(c->pid_file, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR); - if (pid_fd < 0) - return; - - n = snprintf(pid_buf, sizeof(pid_buf), "%i\n", getpid()); - - if (write(pid_fd, pid_buf, n) < 0) { - perror("PID file write"); - exit(EXIT_FAILURE); - } - close(pid_fd); + exit(EXIT_SUCCESS); } /** @@ -273,36 +293,36 @@ static void pid_file(struct ctx *c) { * * Return: non-zero on failure * - * #syscalls read write open|openat close fork|clone dup2|dup3 ioctl writev - * #syscalls socket bind connect getsockopt setsockopt recvfrom sendto shutdown - * #syscalls accept4 accept listen set_robust_list getrlimit setrlimit - * #syscalls openat fcntl lseek clone setsid exit exit_group getpid chdir - * #syscalls epoll_ctl epoll_create1 epoll_wait|epoll_pwait epoll_pwait - * #syscalls prlimit64 clock_gettime fstat|newfstat newfstatat syslog - * #syscalls ppc64le:_llseek ppc64le:recv ppc64le:send ppc64le:getuid - * #syscalls ppc64:_llseek ppc64:recv ppc64:send ppc64:getuid 
ppc64:ugetrlimit - * #syscalls s390x:socketcall s390x:sigreturn - * #syscalls:pasta rt_sigreturn|sigreturn ppc64:sigreturn ppc64:fcntl64 + * #syscalls read write writev + * #syscalls socket bind connect getsockopt setsockopt s390x:socketcall close + * #syscalls recvfrom sendto shutdown ppc64le:recv ppc64le:send + * #syscalls accept4|accept listen + * #syscalls epoll_ctl epoll_wait|epoll_pwait epoll_pwait clock_gettime */ int main(int argc, char **argv) { + int nfds, i, devnull_fd = -1, pidfile_fd = -1; struct epoll_event events[EPOLL_EVENTS]; struct ctx c = { 0 }; struct rlimit limit; struct timespec now; + struct sigaction sa; char *log_name; - int nfds, i; #ifndef PASST_LEGACY_NO_OPTIONS check_root(); #endif drop_caps(); - if (strstr(argv[0], "pasta") || strstr(argv[0], "passt4netns")) { - struct sigaction sa; + c.pasta_userns_fd = c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1; - sigemptyset(&sa.sa_mask); - sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sa.sa_handler = exit_handler; + sigaction(SIGTERM, &sa, NULL); + sigaction(SIGQUIT, &sa, NULL); + + if (strstr(argv[0], "pasta") || strstr(argv[0], "passt4netns")) { sa.sa_handler = pasta_child_handler; sigaction(SIGCHLD, &sa, NULL); signal(SIGPIPE, SIG_IGN); @@ -323,8 +343,6 @@ int main(int argc, char **argv) conf(&c, argc, argv); - seccomp(&c); - if (!c.debug && (c.stderr || isatty(fileno(stdout)))) __openlog(log_name, LOG_PERROR, LOG_DAEMON); @@ -369,12 +387,26 @@ int main(int argc, char **argv) else __setlogmask(LOG_UPTO(LOG_INFO)); - if (!c.foreground && daemon(0, 0)) { - perror("daemon"); + pcap_init(&c); + + if (!c.foreground) + devnull_fd = open("/dev/null", O_RDWR); + + if (*c.pid_file) + pidfile_fd = open(c.pid_file, + O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR); + + if (sandbox(&c)) { + err("Failed to sandbox process, exiting\n"); exit(EXIT_FAILURE); } - pid_file(&c); + if (!c.foreground) + __daemon(pidfile_fd, devnull_fd); + else + write_pidfile(pidfile_fd, getpid()); + + 
seccomp(&c); timer_init(&c, &now); loop: diff --git a/passt.h b/passt.h index 0ef1897..d7011da 100644 --- a/passt.h +++ b/passt.h @@ -99,8 +99,10 @@ enum passt_modes { * @pcap: Path for packet capture file * @pid_file: Path to PID file, empty string if not configured * @pasta_netns_fd: File descriptor for network namespace in pasta mode - * @pasta_userns_fd: File descriptor for user namespace in pasta mode + * @pasta_userns_fd: Descriptor for user namespace to join, -1 once joined * @netns_only: In pasta mode, don't join or create a user namespace + * @proc_net_tcp: Stored handles for /proc/net/tcp{,6} in init and ns + * @proc_net_udp: Stored handles for /proc/net/udp{,6} in init and ns * @epollfd: File descriptor for epoll instance * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any * @fd_tap: File descriptor for AF_UNIX socket or tuntap device @@ -155,6 +157,9 @@ struct ctx { int pasta_userns_fd; int netns_only; + int proc_net_tcp[IP_VERSIONS][2]; + int proc_net_udp[IP_VERSIONS][2]; + int epollfd; int fd_tap_listen; int fd_tap; diff --git a/pasta.c b/pasta.c index bce30d4..972cbcf 100644 --- a/pasta.c +++ b/pasta.c @@ -11,9 +11,8 @@ * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio * - * #syscalls:pasta clone unshare waitid kill execve exit_group rt_sigprocmask - * #syscalls:pasta geteuid getdents64|getdents readlink|readlinkat setsid - * #syscalls:pasta nanosleep clock_nanosleep + * #syscalls:pasta clone waitid exit exit_group rt_sigprocmask + * #syscalls:pasta rt_sigreturn|sigreturn ppc64:sigreturn s390x:sigreturn */ #include @@ -40,75 +39,8 @@ #include "passt.h" #include "netlink.h" -/* PID of child, in case we created a namespace, and its procfs link */ +/* PID of child, in case we created a namespace */ static int pasta_child_pid; -static char pasta_child_ns[PATH_MAX]; - -/** - * pasta_ns_cleanup() - Look for processes in namespace, terminate them - */ -static void pasta_ns_cleanup(void) -{ - char proc_path[PATH_MAX], 
ns_link[PATH_MAX], buf[BUFSIZ]; - int recheck = 0, found = 0, waited = 0; - int dir_fd, n; - - if (!*pasta_child_ns) - return; - -loop: - if ((dir_fd = open("/proc", O_RDONLY | O_DIRECTORY)) < 0) - return; - - while ((n = syscall(SYS_getdents64, dir_fd, buf, BUFSIZ)) > 0) { - struct dirent *dp = (struct dirent *)buf; - int pos = 0; - - while (dp->d_reclen && pos < n) { - pid_t pid; - - errno = 0; - pid = strtol(dp->d_name, NULL, 0); - if (!pid || errno) - goto next; - - snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net", pid); - if (readlink(proc_path, ns_link, PATH_MAX) < 0) - goto next; - - if (!strncmp(ns_link, pasta_child_ns, PATH_MAX)) { - found = 1; - if (waited) - kill(pid, SIGKILL); - else - kill(pid, SIGQUIT); - } -next: - dp = (struct dirent *)(buf + (pos += dp->d_reclen)); - } - } - - close(dir_fd); - - if (!found) - return; - - if (waited) { - if (recheck) { - info("Some processes in namespace didn't quit"); - } else { - found = 0; - recheck = 1; - goto loop; - } - return; - } - - info("Waiting for all processes in namespace to terminate"); - sleep(1); - waited = 1; - goto loop; -} /** * pasta_child_handler() - Exit once shell exits (if we started it), reap clones @@ -120,12 +52,14 @@ void pasta_child_handler(int signal) (void)signal; + if (signal != SIGCHLD) + return; + if (pasta_child_pid && !waitid(P_PID, pasta_child_pid, &infop, WEXITED | WNOHANG)) { - if (infop.si_pid == pasta_child_pid) { - pasta_ns_cleanup(); + if (infop.si_pid == pasta_child_pid) exit(EXIT_SUCCESS); - } + /* Nothing to do, detached PID namespace going away */ } waitid(P_ALL, 0, NULL, WEXITED | WNOHANG); @@ -163,45 +97,31 @@ netns: } /** - * pasta_start_ns() - Fork shell in new namespace if target ns is not given + * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() * @c: Execution context + * @euid: Effective UID of caller */ -void pasta_start_ns(struct ctx *c) +struct pasta_setup_ns_arg { + struct ctx *c; + int euid; +}; + +/** + * pasta_setup_ns() - Map credentials, 
enable access to ping sockets, run shell + * @arg: See @pasta_setup_ns_arg + * + * Return: this function never returns + */ +static int pasta_setup_ns(void *arg) { - int euid = geteuid(), fd; + struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg; char *shell; + int fd; - c->foreground = 1; - if (!c->debug) - c->quiet = 1; - - if ((pasta_child_pid = fork()) == -1) { - perror("fork"); - exit(EXIT_FAILURE); - } - - if (pasta_child_pid) { - char proc_path[PATH_MAX]; - - NS_CALL(pasta_wait_for_ns, c); - - snprintf(proc_path, PATH_MAX, "/proc/%i/ns/net", - pasta_child_pid); - if (readlink(proc_path, pasta_child_ns, PATH_MAX) < 0) - warn("Cannot read link to ns, won't clean up on exit"); - - return; - } - - if (unshare(CLONE_NEWNET | (c->netns_only ? 0 : CLONE_NEWUSER))) { - perror("unshare"); - exit(EXIT_FAILURE); - } - - if (!c->netns_only) { + if (!a->c->netns_only) { char buf[BUFSIZ]; - snprintf(buf, BUFSIZ, "%i %i %i", 0, euid, 1); + snprintf(buf, BUFSIZ, "%i %i %i", 0, a->euid, 1); fd = open("/proc/self/uid_map", O_WRONLY); if (write(fd, buf, strlen(buf)) < 0) @@ -234,6 +154,39 @@ void pasta_start_ns(struct ctx *c) exit(EXIT_FAILURE); } +/** + * pasta_start_ns() - Fork shell in new namespace if target ns is not given + * @c: Execution context + */ +void pasta_start_ns(struct ctx *c) +{ + struct pasta_setup_ns_arg arg = { .c = c, .euid = geteuid() }; + char ns_fn_stack[NS_FN_STACK_SIZE]; + + c->foreground = 1; + if (!c->debug) + c->quiet = 1; + + pasta_child_pid = clone(pasta_setup_ns, + ns_fn_stack + sizeof(ns_fn_stack) / 2, + (c->netns_only ? 
0 : CLONE_NEWNET) | + CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWUSER | + CLONE_NEWUTS, + (void *)&arg); + + if (pasta_child_pid == -1) { + perror("clone"); + exit(EXIT_FAILURE); + } + + drop_caps(); + + if (pasta_child_pid) { + NS_CALL(pasta_wait_for_ns, c); + return; + } +} + /** * pasta_ns_conf() - Set up loopback and tap interfaces in namespace as needed * @c: Execution context diff --git a/pcap.c b/pcap.c index e00fc45..9c617ce 100644 --- a/pcap.c +++ b/pcap.c @@ -167,9 +167,8 @@ fail: /** * pcap_init() - Initialise pcap file * @c: Execution context - * @index: pcap name index: passt instance number or pasta netns socket */ -void pcap_init(struct ctx *c, int index) +void pcap_init(struct ctx *c) { struct timeval tv; @@ -196,7 +195,7 @@ void pcap_init(struct ctx *c, int index) snprintf(name + strlen(PCAP_PREFIX) + strlen(PCAP_ISO8601_STR), sizeof(name) - strlen(PCAP_PREFIX) - strlen(PCAP_ISO8601_STR), - "_%i.pcap", index); + "_%i.pcap", getpid()); strncpy(c->pcap, name, PATH_MAX); } diff --git a/pcap.h b/pcap.h index 26f4f35..73b5ed8 100644 --- a/pcap.h +++ b/pcap.h @@ -6,4 +6,4 @@ void pcap(char *pkt, size_t len); void pcapm(struct msghdr *mh); void pcapmm(struct mmsghdr *mmh, unsigned int vlen); -void pcap_init(struct ctx *c, int sock_index); +void pcap_init(struct ctx *c); diff --git a/slirp4netns.sh b/slirp4netns.sh index 518f581..7c2188d 100755 --- a/slirp4netns.sh +++ b/slirp4netns.sh @@ -10,7 +10,7 @@ # # slirp4netns.sh - Compatibility wrapper for pasta, behaving like slirp4netns(1) # -# WARNING: Draft quality, not really tested, --enable-sandbox not supported yet +# WARNING: Draft quality, not really tested # # Copyright (c) 2021 Red Hat GmbH # Author: Stefano Brivio diff --git a/tap.c b/tap.c index 22db9c5..38004a5 100644 --- a/tap.c +++ b/tap.c @@ -11,7 +11,6 @@ * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio * - * #syscalls recvfrom sendto */ #include @@ -769,12 +768,10 @@ restart: } /** - * tap_sock_init_unix() - Create and bind AF_UNIX 
socket, listen for connection + * tap_sock_unix_init() - Create and bind AF_UNIX socket, listen for connection * @c: Execution context - * - * #syscalls:passt unlink|unlinkat */ -static void tap_sock_init_unix(struct ctx *c) +static void tap_sock_unix_init(struct ctx *c) { int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex; struct epoll_event ev = { 0 }; @@ -783,11 +780,6 @@ static void tap_sock_init_unix(struct ctx *c) }; int i, ret; - if (c->fd_tap_listen != -1) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap_listen, &ev); - close(c->fd_tap_listen); - } - if (fd < 0) { perror("UNIX socket"); exit(EXIT_FAILURE); @@ -834,8 +826,6 @@ static void tap_sock_init_unix(struct ctx *c) S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); #endif - pcap_init(c, i); - listen(fd, 0); ev.data.fd = c->fd_tap_listen = fd; @@ -852,19 +842,26 @@ static void tap_sock_init_unix(struct ctx *c) } /** - * tap_sock_accept_unix() - Accept connection on listening socket + * tap_sock_unix_new() - Handle new connection on listening socket * @c: Execution context */ -static void tap_sock_accept_unix(struct ctx *c) +static void tap_sock_unix_new(struct ctx *c) { struct epoll_event ev = { 0 }; int v = INT_MAX / 2; - c->fd_tap = accept(c->fd_tap_listen, NULL, NULL); + /* Another client is already connected: accept and close right away. 
*/ + if (c->fd_tap != -1) { + int discard = accept4(c->fd_tap_listen, NULL, NULL, + SOCK_NONBLOCK); - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap_listen, &ev); - close(c->fd_tap_listen); - c->fd_tap_listen = -1; + if (discard != -1) + close(discard); + + return; + } + + c->fd_tap = accept4(c->fd_tap_listen, NULL, NULL, 0); if (!c->low_rmem) setsockopt(c->fd_tap, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)); @@ -884,8 +881,6 @@ static int tun_ns_fd = -1; * @c: Execution context * * Return: 0 - * - * #syscalls:pasta ioctl */ static int tap_ns_tun(void *arg) { @@ -907,7 +902,7 @@ static int tap_ns_tun(void *arg) * tap_sock_init_tun() - Set up tuntap file descriptor * @c: Execution context */ -static void tap_sock_init_tun(struct ctx *c) +static void tap_sock_tun_init(struct ctx *c) { struct epoll_event ev = { 0 }; @@ -919,8 +914,6 @@ static void tap_sock_init_tun(struct ctx *c) pasta_ns_conf(c); - pcap_init(c, c->pasta_netns_fd); - c->fd_tap = tun_ns_fd; ev.data.fd = c->fd_tap; @@ -937,12 +930,15 @@ void tap_sock_init(struct ctx *c) if (c->fd_tap != -1) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); close(c->fd_tap); + c->fd_tap = -1; } - if (c->mode == MODE_PASST) - tap_sock_init_unix(c); - else - tap_sock_init_tun(c); + if (c->mode == MODE_PASST) { + if (c->fd_tap_listen == -1) + tap_sock_unix_init(c); + } else { + tap_sock_tun_init(c); + } } /** @@ -955,18 +951,18 @@ void tap_sock_init(struct ctx *c) void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now) { if (fd == c->fd_tap_listen && events == EPOLLIN) { - tap_sock_accept_unix(c); + tap_sock_unix_new(c); return; } if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) - goto fail; + goto reinit; if ((c->mode == MODE_PASST && tap_handler_passt(c, now)) || (c->mode == MODE_PASTA && tap_handler_pasta(c, now))) - goto fail; + goto reinit; return; -fail: +reinit: tap_sock_init(c); } diff --git a/tcp.c b/tcp.c index 723b18e..e4fac22 100644 --- a/tcp.c +++ b/tcp.c @@ -304,7 +304,7 @@ * - 
SPLICE_FIN_TO: FIN (EPOLLRDHUP) seen from connected socket * - SPLICE_FIN_BOTH: FIN (EPOLLRDHUP) seen from both sides * - * #syscalls pipe|pipe2 pipe2 + * #syscalls:pasta pipe2|pipe fcntl ppc64:fcntl64 */ #include @@ -3028,7 +3028,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, * @ref: epoll reference * @events: epoll events bitmap * - * #syscalls splice + * #syscalls:pasta splice */ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, uint32_t events) @@ -3374,7 +3374,7 @@ static void tcp_set_pipe_size(struct ctx *c) smaller: for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE * 2; i++) { - if (pipe(probe_pipe[i])) { + if (pipe2(probe_pipe[i], 0)) { i++; break; } @@ -3493,7 +3493,7 @@ static void tcp_sock_init_one(struct ctx *c, int ns, in_port_t port) * tcp_sock_init_ns() - Bind sockets in namespace for inbound connections * @arg: Execution context * - * Return: 0 on success, -1 on failure + * Return: 0 */ static int tcp_sock_init_ns(void *arg) { @@ -3560,8 +3560,7 @@ static int tcp_sock_refill(void *arg) int i, *p4, *p6; if (a->ns) { - if (ns_enter(a->c)) - return 0; + ns_enter(a->c); p4 = ns_sock_pool4; p6 = ns_sock_pool6; } else { @@ -3594,8 +3593,6 @@ static int tcp_sock_refill(void *arg) * @c: Execution context * * Return: 0 on success, -1 on failure - * - * #syscalls getrandom */ int tcp_sock_init(struct ctx *c, struct timespec *now) { diff --git a/test/demo/passt b/test/demo/passt index b5762aa..76aac86 100644 --- a/test/demo/passt +++ b/test/demo/passt @@ -84,7 +84,8 @@ say Now let's run 'passt' in the new namespace, and nl say enter this namespace from the guest terminal too. 
sleep 3 -pout TARGET_PID echo $$ +guest pstree -p | grep pasta +gout TARGET_PID pstree -p | grep pasta | sed -n 's/.*(\([0-9].*\))$/\1/p' sleep 1 passtb ./passt -f -t 5201,5203 diff --git a/test/demo/pasta b/test/demo/pasta index f8f0cd0..c136965 100644 --- a/test/demo/pasta +++ b/test/demo/pasta @@ -58,7 +58,8 @@ say For convenience, let's enter this namespace nl say from another terminal. sleep 3 -pout TARGET_PID echo $$ +ns pstree -p | grep pasta +nsout TARGET_PID pstree -p | grep pasta | sed -n 's/.*(\([0-9].*\))$/\1/p' sleep 1 ns nsenter -t __TARGET_PID__ -U -n --preserve-credentials @@ -172,7 +173,7 @@ sleep 2 passtb perf record -g ./pasta sleep 2 -pout TARGET_PID echo $$ +nsout TARGET_PID pstree -p | grep pasta | sed -n 's/.*(\([0-9].*\))$/\1/p' sleep 1 ns nsenter -t __TARGET_PID__ -U -n --preserve-credentials sleep 5 diff --git a/test/lib/setup b/test/lib/setup index ab51787..df21655 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -115,13 +115,14 @@ setup_passt_in_ns() { [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_with_passt.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" - pane_run PASST "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013" + __pid_file="$(mktemp)" + pane_run PASST "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${__pid_file}" sleep 1 pane_run PASST '' pane_wait PASST - pane_run PASST 'echo $$' - pane_wait PASST - __ns_pid="$(pane_parse PASST)" + __pasta_pid="$(cat "${__pid_file}")" + __ns_pid="$(cat /proc/${__pasta_pid}/task/${__pasta_pid}/children | cut -f1 -d' ')" + rm "${__pid_file}" pane_run GUEST "nsenter -t ${__ns_pid} -U -n --preserve-credentials" pane_run NS "nsenter -t ${__ns_pid} -U -n --preserve-credentials" @@ -172,15 +173,18 @@ setup_two_guests() { # 10004 | as server | to init | to guest | to ns #2 # 10005 | | | as server | to ns #2 + __pid1_file="$(mktemp)" + __pid2_file="$(mktemp)" + __opts= [ ${PCAP} 
-eq 1 ] && __opts="${__opts} -p /tmp/pasta_1.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" - pane_run PASST_1 "./pasta ${__opts} -t 10001,10002 -T 10003,10004 -u 10001,10002 -U 10003,10004" + pane_run PASST_1 "./pasta ${__opts} -P ${__pid1_file} -t 10001,10002 -T 10003,10004 -u 10001,10002 -U 10003,10004" __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_2.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" - pane_run PASST_2 "./pasta ${__opts} -t 10004,10005 -T 10003,10001 -u 10004,10005 -U 10003,10001" + pane_run PASST_2 "./pasta ${__opts} -P ${__pid2_file} -t 10004,10005 -T 10003,10001 -u 10004,10005 -U 10003,10001" sleep 1 pane_run PASST_1 '' @@ -188,12 +192,12 @@ setup_two_guests() { pane_wait PASST_1 pane_wait PASST_2 - pane_run PASST_1 'echo $$' - pane_run PASST_2 'echo $$' - pane_wait PASST_1 - pane_wait PASST_2 - __ns1_pid="$(pane_parse PASST_1)" - __ns2_pid="$(pane_parse PASST_2)" + __pasta1_pid="$(cat "${__pid1_file}")" + __ns1_pid="$(cat /proc/${__pasta1_pid}/task/${__pasta1_pid}/children | cut -f1 -d' ')" + rm "${__pid1_file}" + __pasta2_pid="$(cat "${__pid2_file}")" + __ns2_pid="$(cat /proc/${__pasta2_pid}/task/${__pasta2_pid}/children | cut -f1 -d' ')" + rm "${__pid2_file}" pane_run GUEST_1 "nsenter -t ${__ns1_pid} -U -n --preserve-credentials" pane_run GUEST_2 "nsenter -t ${__ns2_pid} -U -n --preserve-credentials" diff --git a/udp.c b/udp.c index e1a9ecb..348f695 100644 --- a/udp.c +++ b/udp.c @@ -529,7 +529,9 @@ static int udp_splice_connect_ns(void *arg) a = (struct udp_splice_connect_ns_arg *)arg; - ns_enter(a->c); + if (ns_enter(a->c)) + return 0; + a->s = udp_splice_connect(a->c, a->v6, a->bound_sock, a->src, a->dst, UDP_BACK_TO_INIT); @@ -1029,7 +1031,8 @@ int udp_sock_init_ns(void *arg) struct ctx *c = (struct ctx *)arg; int dst; - ns_enter(c); + if (ns_enter(c)) + return 0; for (dst = 0; dst < USHRT_MAX; dst++) { if (!bitmap_isset(c->udp.port_to_init, dst)) diff --git a/util.c b/util.c index 94d49a6..e9fca3b 100644 --- a/util.c 
+++ b/util.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +34,8 @@ #include #include +#include + #include "util.h" #include "passt.h" @@ -431,31 +435,51 @@ char *line_read(char *buf, size_t len, int fd) /** * procfs_scan_listen() - Set bits for listening TCP or UDP sockets from procfs - * @name: Corresponding name of file under /proc/net/ + * @proto: IPPROTO_TCP or IPPROTO_UDP + * @ip_version: IP version, V4 or V6 + * @ns: Use saved file descriptors for namespace if set * @map: Bitmap where numbers of ports in listening state will be set * @exclude: Bitmap of ports to exclude from setting (and clear) + * + * #syscalls:pasta lseek ppc64le:_llseek ppc64:_llseek */ -void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude) +void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns, + uint8_t *map, uint8_t *exclude) { - char line[BUFSIZ], path[PATH_MAX]; + char line[BUFSIZ], *path; unsigned long port; unsigned int state; - int fd; + int *fd; - snprintf(path, PATH_MAX, "/proc/net/%s", name); - if ((fd = open(path, O_RDONLY)) < 0) + if (proto == IPPROTO_TCP) { + fd = &c->proc_net_tcp[ip_version][ns]; + if (ip_version == V4) + path = "/proc/net/tcp"; + else + path = "/proc/net/tcp6"; + } else { + fd = &c->proc_net_udp[ip_version][ns]; + if (ip_version == V4) + path = "/proc/net/udp"; + else + path = "/proc/net/udp6"; + } + + if (*fd != -1) + lseek(*fd, 0, SEEK_SET); + else if ((*fd = open(path, O_RDONLY)) < 0) return; *line = 0; - line_read(line, sizeof(line), fd); - while (line_read(line, sizeof(line), fd)) { + line_read(line, sizeof(line), *fd); + while (line_read(line, sizeof(line), *fd)) { /* NOLINTNEXTLINE(cert-err34-c): != 2 if conversion fails */ if (sscanf(line, "%*u: %*x:%lx %*x:%*x %x", &port, &state) != 2) continue; /* See enum in kernel's include/net/tcp_states.h */ - if ((strstr(name, "tcp") && state != 
0x0a) || - (strstr(name, "udp") && state != 0x07)) + if ((proto == IPPROTO_TCP && state != 0x0a) || + (proto == IPPROTO_UDP && state != 0x07)) continue; if (bitmap_isset(exclude, port)) @@ -463,25 +487,98 @@ void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude) else bitmap_set(map, port); } - - close(fd); } /** - * ns_enter() - Enter configured network and user namespaces + * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE + */ +void drop_caps(void) +{ + int i; + + for (i = 0; i < 64; i++) { + if (i == CAP_NET_BIND_SERVICE) + continue; + + prctl(PR_CAPBSET_DROP, i, 0, 0, 0); + } +} + +/** + * ns_enter() - Enter configured user (unless already joined) and network ns * @c: Execution context * - * Return: 0 on success, -1 on failure + * Return: 0, won't return on failure * * #syscalls:pasta setns */ int ns_enter(struct ctx *c) { - if (!c->netns_only && setns(c->pasta_userns_fd, CLONE_NEWUSER)) - return -errno; + if (!c->netns_only && + c->pasta_userns_fd != -1 && + setns(c->pasta_userns_fd, CLONE_NEWUSER)) + exit(EXIT_FAILURE); if (setns(c->pasta_netns_fd, CLONE_NEWNET)) - return -errno; + exit(EXIT_FAILURE); + + return 0; +} + +/** + * write_pidfile() - Write PID to file, if requested to do so, and close it + * @fd: Open PID file descriptor, closed on exit, -1 to skip writing it + * @pid: PID value to write + */ +void write_pidfile(int fd, pid_t pid) { + char pid_buf[12]; + int n; + + if (fd == -1) + return; + + n = snprintf(pid_buf, sizeof(pid_buf), "%i\n", pid); + + if (write(fd, pid_buf, n) < 0) { + perror("PID file write"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +/** + * __daemon() - daemon()-like function writing PID file before parent exits + * @pidfile_fd: Open PID file descriptor + * @devnull_fd: Open file descriptor for /dev/null + * + * Return: 0 in the child process, parent exits, won't return on failure + */ +int __daemon(int pidfile_fd, int devnull_fd) +{ + pid_t pid = fork(); + + if (pid == -1) { + perror("fork"); + 
exit(EXIT_FAILURE); + } + + if (pid) { + write_pidfile(pidfile_fd, pid); + exit(EXIT_SUCCESS); + } + + errno = 0; + + setsid(); + + dup2(devnull_fd, STDIN_FILENO); + dup2(devnull_fd, STDOUT_FILENO); + dup2(devnull_fd, STDERR_FILENO); + close(devnull_fd); + + if (errno) + exit(EXIT_FAILURE); return 0; } diff --git a/util.h b/util.h index add4c1e..b7852e9 100644 --- a/util.h +++ b/util.h @@ -54,6 +54,12 @@ void debug(const char *format, ...); #define STRINGIFY(x) #x #define STR(x) STRINGIFY(x) +#ifdef P_tmpdir +#define TMPDIR P_tmpdir +#else +#define TMPDIR "/tmp" +#endif + #define V4 0 #define V6 1 #define IP_VERSIONS 2 @@ -202,5 +208,9 @@ void bitmap_set(uint8_t *map, int bit); void bitmap_clear(uint8_t *map, int bit); int bitmap_isset(const uint8_t *map, int bit); char *line_read(char *buf, size_t len, int fd); -void procfs_scan_listen(char *name, uint8_t *map, uint8_t *exclude); +void procfs_scan_listen(struct ctx *c, uint8_t proto, int ip_version, int ns, + uint8_t *map, uint8_t *exclude); +void drop_caps(void); int ns_enter(struct ctx *c); +void write_pidfile(int fd, pid_t pid); +int __daemon(int pidfile_fd, int devnull_fd);