/* * vircgroupv2devices.c: methods for cgroups v2 BPF devices * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see * . */ #include #if WITH_DECL_BPF_CGROUP_DEVICE # include # include # include # include # include #endif /* !WITH_DECL_BPF_CGROUP_DEVICE */ #include "internal.h" #define LIBVIRT_VIRCGROUPPRIV_H_ALLOW #include "vircgrouppriv.h" #include "viralloc.h" #include "virbpf.h" #include "vircgroup.h" #include "vircgroupv2devices.h" #include "virerror.h" #include "virfile.h" #include "virlog.h" VIR_LOG_INIT("util.cgroup"); #define VIR_FROM_THIS VIR_FROM_CGROUP #if WITH_DECL_BPF_CGROUP_DEVICE bool virCgroupV2DevicesAvailable(virCgroup *group) { VIR_AUTOCLOSE cgroupfd = -1; unsigned int progCnt = 0; cgroupfd = open(group->unified.mountPoint, O_RDONLY); if (cgroupfd < 0) { VIR_DEBUG("failed to open cgroup '%s'", group->unified.mountPoint); return false; } if (virBPFQueryProg(cgroupfd, 0, BPF_CGROUP_DEVICE, &progCnt, NULL) < 0) { VIR_DEBUG("failed to query cgroup progs: %s", g_strerror(errno)); return false; } return true; } /* Steps to get assembly version of devices BPF program: * * Save the following program into bpfprog.c, compile it using clang: * * clang -O2 -Wall -target bpf -c bpfprog.c -o bpfprog.o * * Now you can use llvm-objdump to get the list if instructions: * * llvm-objdump -S -no-show-raw-insn bpfprog.o * * which can be converted into program using VIR_BPF_* macros. * * ---------------------------------------------------------------------------- * #include * #include * * #define SEC(NAME) __attribute__((section(NAME), used)) * * struct bpf_map_def { * unsigned int type; * unsigned int key_size; * unsigned int value_size; * unsigned int max_entries; * unsigned int map_flags; * unsigned int inner_map_idx; * unsigned int numa_node; * }; * * static void *(*bpf_map_lookup_elem)(void *map, void *key) = * (void *) BPF_FUNC_map_lookup_elem; * * struct bpf_map_def SEC("maps") devices = { * .type = BPF_MAP_TYPE_HASH, * .key_size = sizeof(__u64), * .value_size = sizeof(__u32), * .max_entries = 65, * }; * * SEC("cgroup/dev") int * bpf_libvirt_cgroup_device(struct bpf_cgroup_dev_ctx *ctx) * { * __u64 key = ((__u64)ctx->major << 32) | ctx->minor; * __u32 *val = 0; * * val = bpf_map_lookup_elem(&devices, &key); * if (val && (ctx->access_type & *val) == ctx->access_type) * return 1; * * key = ((__u64)ctx->major << 32) | 0xffffffff; * val = bpf_map_lookup_elem(&devices, &key); * if (val && (ctx->access_type & *val) == ctx->access_type) * return 1; * * key = 0xffffffff00000000 | ctx->minor; * val = bpf_map_lookup_elem(&devices, &key); * if (val && (ctx->access_type & *val) == ctx->access_type) * return 1; * * key = 0xffffffffffffffff; * val = bpf_map_lookup_elem(&devices, &key); * if (val && (ctx->access_type & *val) == ctx->access_type) * return 1; * * return 0; * } * * char _license[] SEC("license") = "GPL"; * __u32 _version SEC("version") = LINUX_VERSION_CODE; * ---------------------------------------------------------------------------- * */ static int virCgroupV2DevicesLoadProg(int mapfd) { struct bpf_insn prog[] = { /* 0: r6 = r1 */ VIR_BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* 1: r1 = *(u32 *)(r6 + 8) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8), /* 2: r2 = *(u32 *)(r6 + 4) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 4), /* 3: r2 <<= 32 */ VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), /* 4: r2 |= r1 */ VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), /* 5: *(u64 *)(r10 - 8) = r2 */ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* 6: r2 = r10 */ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 7: r2 += -8 */ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* 8: r1 = 0 ll */ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), /* 10: call 1 */ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), /* 11: r1 = r0 */ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* 12: if r1 == 0 goto +5 */ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), /* 13: r0 = 1 */ VIR_BPF_MOV64_IMM(BPF_REG_0, 1), /* 14: r2 = *(u32 *)(r6 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), /* 15: r1 = *(u32 *)(r1 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), /* 16: r1 &= r2 */ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), /* 17: if r1 == r2 goto +50 */ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 50), /* LBB0_2: */ /* 18: r1 = *(u32 *)(r6 + 4) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 4), /* 19: r1 <<= 32 */ VIR_BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), /* 20: r2 = 4294967295 ll */ VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff), /* 22: r1 |= r2 */ VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2), /* 23: *(u64 *)(r10 - 8) = r1 */ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* 24: r2 = r10 */ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 25: r2 += -8 */ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* 26: r1 = 0 ll */ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), /* 28: call 1 */ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), /* 29: r1 = r0 */ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* 30: if r1 == 0 goto +5 */ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), /* 31: r0 = 1 */ VIR_BPF_MOV64_IMM(BPF_REG_0, 1), /* 32: r2 = *(u32 *)(r6 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), /* 33: r1 = *(u32 *)(r1 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), /* 34: r1 &= r2 */ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), /* 35: if r1 == r2 goto +32 */ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 32), /* LBB0_4: */ /* 36: r1 = *(u32 *)(r6 + 8) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, 8), /* 37: r2 = -4294967296 ll */ VIR_BPF_LD_IMM64(BPF_REG_2, 0xffffffff00000000), /* 39: r1 |= r2 */ VIR_BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2), /* 40: *(u64 *)(r10 - 8) = r1 */ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* 41: r2 = r10 */ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 42: r2 += -8 */ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* 43: r1 = 0 ll */ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), /* 45: call 1 */ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), /* 46: r1 = r0 */ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* 47: if r1 == 0 goto +5 */ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), /* 48: r0 = 1 */ VIR_BPF_MOV64_IMM(BPF_REG_0, 1), /* 49: r2 = *(u32 *)(r6 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), /* 50: r1 = *(u32 *)(r1 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), /* 51: r1 &= r2 */ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), /* 52: if r1 == r2 goto +15 */ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 15), /* LBB0_6: */ /* 53: r1 = -1 */ VIR_BPF_MOV64_IMM(BPF_REG_1, -1), /* 54: *(u64 *)(r10 - 8) = r1 */ VIR_BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* 55: r2 = r10 */ VIR_BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 56: r2 += -8 */ VIR_BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* 57: r1 = 0 ll */ VIR_BPF_LD_MAP_FD(BPF_REG_1, mapfd), /* 59: call 1 */ VIR_BPF_CALL_INSN(BPF_FUNC_map_lookup_elem), /* 60: r1 = r0 */ VIR_BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* 61: if r1 == 0 goto +5 */ VIR_BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), /* 62: r0 = 1 */ VIR_BPF_MOV64_IMM(BPF_REG_0, 1), /* 63: r2 = *(u32 *)(r6 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6, 0), /* 64: r1 = *(u32 *)(r1 + 0) */ VIR_BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), /* 65: r1 &= r2 */ VIR_BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), /* 66: if r1 == r2 goto +1 */ VIR_BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), /* LBB0_8: */ /* 67: r0 = 0 */ VIR_BPF_MOV64_IMM(BPF_REG_0, 0), /* LBB0_9: */ /* 68: exit */ VIR_BPF_EXIT_INSN(), }; return virBPFLoadProg(prog, BPF_PROG_TYPE_CGROUP_DEVICE, G_N_ELEMENTS(prog)); } static int virCgroupV2DevicesAttachProg(virCgroup *group, int mapfd, size_t max) { int ret = -1; VIR_AUTOCLOSE progfd = -1; VIR_AUTOCLOSE cgroupfd = -1; g_autofree char *path = NULL; if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, NULL, &path) < 0) { goto cleanup; } progfd = virCgroupV2DevicesLoadProg(mapfd); if (progfd < 0) { virReportSystemError(errno, "%s", _("failed to load cgroup BPF prog")); goto cleanup; } cgroupfd = open(path, O_RDONLY); if (cgroupfd < 0) { virReportSystemError(errno, _("unable to open '%s'"), path); goto cleanup; } if (virBPFAttachProg(progfd, cgroupfd, BPF_CGROUP_DEVICE) < 0) { virReportSystemError(errno, "%s", _("failed to attach cgroup BPF prog")); goto cleanup; } if (group->unified.devices.progfd > 0) { VIR_DEBUG("Closing existing program that was replaced by new one."); VIR_FORCE_CLOSE(group->unified.devices.progfd); } if (group->unified.devices.mapfd > 0) { VIR_DEBUG("Closing existing map that was replaced by new one."); VIR_FORCE_CLOSE(group->unified.devices.mapfd); } group->unified.devices.progfd = progfd; group->unified.devices.mapfd = mapfd; group->unified.devices.max = max; progfd = -1; mapfd = -1; ret = 0; cleanup: VIR_FORCE_CLOSE(mapfd); return ret; } static int virCgroupV2DevicesCountMapEntries(int mapfd) { int ret = 0; int rc; uint64_t key = 0; uint64_t prevKey = 0; while ((rc = virBPFGetNextElem(mapfd, &prevKey, &key)) == 0) { ret++; prevKey = key; } if (rc < 0 && errno != ENOENT) return -1; return ret; } # define MAX_PROG_IDS 10 int virCgroupV2DevicesDetectProg(virCgroup *group) { g_autofree char *path = NULL; VIR_AUTOCLOSE cgroupfd = -1; unsigned int progcnt = 0; unsigned int progids[MAX_PROG_IDS] = { 0 }; int progfd = -1; int mapfd = -1; int nitems = -1; struct bpf_prog_info progInfo = { 0 }; struct bpf_map_info mapInfo = { 0 }; g_autofree unsigned int *mapIDs = NULL; if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) return 0; if (virCgroupPathOfController(group, VIR_CGROUP_CONTROLLER_DEVICES, NULL, &path) < 0) { return -1; } cgroupfd = open(path, O_RDONLY); if (cgroupfd < 0) { virReportSystemError(errno, _("unable to open '%s'"), path); return -1; } if (virBPFQueryProg(cgroupfd, MAX_PROG_IDS, BPF_CGROUP_DEVICE, &progcnt, progids) < 0) { virReportSystemError(errno, "%s", _("unable to query cgroup BPF progs")); return -1; } if (progcnt == 0) return 0; /* No need to have alternate code, this function will not be called * if compiled with old kernel. */ progfd = virBPFGetProg(progids[0]); if (progfd < 0) { virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog FD")); return -1; } if (virBPFGetProgInfo(progfd, &progInfo, &mapIDs) < 0) { virReportSystemError(errno, "%s", _("failed to get cgroup BPF prog info")); return -1; } if (progInfo.nr_map_ids == 0) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("no map for cgroup BPF prog")); return -1; } mapfd = virBPFGetMap(mapIDs[0]); if (mapfd < 0) { virReportSystemError(errno, "%s", _("failed to get cgroup BPF map FD")); return -1; } if (virBPFGetMapInfo(mapfd, &mapInfo) < 0) { virReportSystemError(errno, "%s", _("failed to get cgroup BPF map info")); return -1; } nitems = virCgroupV2DevicesCountMapEntries(mapfd); if (nitems < 0) { virReportSystemError(errno, "%s", _("failed to count cgroup BPF map items")); return -1; } group->unified.devices.progfd = progfd; group->unified.devices.mapfd = mapfd; group->unified.devices.max = mapInfo.max_entries; group->unified.devices.count = nitems; return 0; } # define VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE 64 static int virCgroupV2DevicesCreateMap(size_t size) { int mapfd = virBPFCreateMap(BPF_MAP_TYPE_HASH, sizeof(uint64_t), sizeof(uint32_t), size); if (mapfd < 0) { if (errno == EPERM) { virReportSystemError(errno, "%s", _("failed to initialize device BPF map; " "locked memory limit for libvirtd probably " "needs to be raised")); return -1; } else { virReportSystemError(errno, "%s", _("failed to initialize device BPF map")); return -1; } } return mapfd; } static int virCgroupV2DevicesReallocMap(int mapfd, size_t size) { uint64_t key = 0; uint64_t prevKey = 0; int rc; int ret = -1; VIR_AUTOCLOSE newmapfd = virCgroupV2DevicesCreateMap(size); VIR_DEBUG("realloc devices map mapfd:%d, size:%zu", mapfd, size); if (newmapfd < 0) return -1; while ((rc = virBPFGetNextElem(mapfd, &prevKey, &key)) == 0) { uint32_t val = 0; if (virBPFLookupElem(mapfd, &key, &val) < 0) { virReportSystemError(errno, "%s", _("failed to lookup device in old map")); return -1; } if (virBPFUpdateElem(newmapfd, &key, &val) < 0) { virReportSystemError(errno, "%s", _("failed to add device into new map")); return -1; } prevKey = key; } if (rc < 0 && errno != ENOENT) { virReportSystemError(errno, "%s", _("failed to copy all device rules")); return -1; } ret = newmapfd; newmapfd = -1; return ret; } int virCgroupV2DevicesCreateProg(virCgroup *group) { int mapfd = -1; if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) return 0; mapfd = virCgroupV2DevicesCreateMap(VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE); if (mapfd < 0) return -1; return virCgroupV2DevicesAttachProg(group, mapfd, VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE); } int virCgroupV2DevicesPrepareProg(virCgroup *group) { if (virCgroupV2DevicesDetectProg(group) < 0) return -1; if (virCgroupV2DevicesCreateProg(group) < 0) return -1; if (group->unified.devices.count >= group->unified.devices.max) { size_t max = group->unified.devices.max * 2; int newmapfd = virCgroupV2DevicesReallocMap(group->unified.devices.mapfd, max); if (newmapfd < 0) return -1; if (virCgroupV2DevicesAttachProg(group, newmapfd, max) < 0) return -1; } return 0; } int virCgroupV2DevicesCloseProg(virCgroup *group) { if (group->unified.devices.mapfd > 0) VIR_FORCE_CLOSE(group->unified.devices.mapfd); if (group->unified.devices.progfd > 0) VIR_FORCE_CLOSE(group->unified.devices.progfd); return 0; } uint32_t virCgroupV2DevicesGetPerms(int perms, char type) { uint32_t ret = 0; if (perms & VIR_CGROUP_DEVICE_MKNOD) ret |= BPF_DEVCG_ACC_MKNOD << 16; if (perms & VIR_CGROUP_DEVICE_READ) ret |= BPF_DEVCG_ACC_READ << 16; if (perms & VIR_CGROUP_DEVICE_WRITE) ret |= BPF_DEVCG_ACC_WRITE << 16; if (type == 'b') ret |= BPF_DEVCG_DEV_BLOCK; else if (type == 'c') ret |= BPF_DEVCG_DEV_CHAR; else ret |= BPF_DEVCG_DEV_BLOCK | BPF_DEVCG_DEV_CHAR; return ret; } #else /* !WITH_DECL_BPF_CGROUP_DEVICE */ bool virCgroupV2DevicesAvailable(virCgroup *group G_GNUC_UNUSED) { return false; } int virCgroupV2DevicesDetectProg(virCgroup *group G_GNUC_UNUSED) { virReportSystemError(ENOSYS, "%s", _("cgroups v2 BPF devices not supported " "with this kernel")); return -1; } int virCgroupV2DevicesCreateProg(virCgroup *group G_GNUC_UNUSED) { virReportSystemError(ENOSYS, "%s", _("cgroups v2 BPF devices not supported " "with this kernel")); return -1; } int virCgroupV2DevicesPrepareProg(virCgroup *group G_GNUC_UNUSED) { virReportSystemError(ENOSYS, "%s", _("cgroups v2 BPF devices not supported " "with this kernel")); return -1; } int virCgroupV2DevicesCloseProg(virCgroup *group G_GNUC_UNUSED) { return 0; } uint32_t virCgroupV2DevicesGetPerms(int perms G_GNUC_UNUSED, char type G_GNUC_UNUSED) { return 0; } #endif /* !WITH_DECL_BPF_CGROUP_DEVICE */ uint64_t virCgroupV2DevicesGetKey(int major, int minor) { return (uint64_t)major << 32 | ((uint64_t)minor & 0x00000000ffffffff); }