diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index ddcceee16f..38fa16bc5f 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -10358,7 +10358,7 @@ qemuDomainUpdateCurrentMemorySize(virDomainObjPtr vm)
  * such as '0004:04:00.0', and tells if the device is a NVLink2
  * bridge.
  */
-static ATTRIBUTE_UNUSED bool
+static bool
 ppc64VFIODeviceIsNV2Bridge(const char *device)
 {
     const char *nvlink2Files[] = {"ibm,gpu", "ibm,nvlink",
@@ -10396,7 +10396,9 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
     unsigned long long maxMemory = 0;
     unsigned long long passthroughLimit = 0;
     size_t i, nPCIHostBridges = 0;
+    virPCIDeviceAddressPtr pciAddr;
     bool usesVFIO = false;
+    bool nvlink2Capable = false;
 
     for (i = 0; i < def->ncontrollers; i++) {
         virDomainControllerDefPtr cont = def->controllers[i];
@@ -10414,7 +10416,17 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
             dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
             dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
             usesVFIO = true;
-            break;
+
+            pciAddr = &dev->source.subsys.u.pci.addr;
+            if (virPCIDeviceAddressIsValid(pciAddr, false)) {
+                VIR_AUTOFREE(char *) pciAddrStr = NULL;
+
+                pciAddrStr = virPCIDeviceAddressAsString(pciAddr);
+                if (ppc64VFIODeviceIsNV2Bridge(pciAddrStr)) {
+                    nvlink2Capable = true;
+                    break;
+                }
+            }
         }
     }
 
@@ -10441,29 +10453,59 @@ getPPC64MemLockLimitBytes(virDomainDefPtr def)
                 4096 * nPCIHostBridges +
                 8192;
 
-    /* passthroughLimit := max( 2 GiB * #PHBs,                       (c)
-     *                          memory                               (d)
-     *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
+    /* NVLink2 support in QEMU is a special case of the passthrough
+     * mechanics explained in the usesVFIO case below. The GPU RAM
+     * is placed with a gap after maxMemory. The current QEMU
+     * implementation puts the NVIDIA RAM above the PCI MMIO, which
+     * starts at 32TiB and is the MMIO reserved for the guest main RAM.
      *
-     * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2 GiB
-     * rather than 1 GiB
+     * This window ends at 64TiB, and this is where the GPUs are being
+     * placed. The next available window size is at 128TiB, and
+     * 64TiB..128TiB will fit all possible NVIDIA GPUs.
      *
-     * (d) is the with-DDW (and memory pre-registration and related
-     * features) DMA window accounting - assuming that we only account RAM
-     * once, even if mapped to multiple PHBs
+     * The same assumption as the most common case applies here:
+     * the guest will request a 64-bit DMA window, per PHB, that is
+     * big enough to map all its RAM, which is now at 128TiB due
+     * to the GPUs.
      *
-     * (e) is the with-DDW userspace view and overhead for the 64-bit DMA
-     * window. This is based a bit on expected guest behaviour, but there
-     * really isn't a way to completely avoid that. We assume the guest
-     * requests a 64-bit DMA window (per PHB) just big enough to map all
-     * its RAM. 4 kiB page size gives the 1/512; it will be less with 64
-     * kiB pages, less still if the guest is mapped with hugepages (unlike
-     * the default 32-bit DMA window, DDW windows can use large IOMMU
-     * pages). 8 MiB is for second and further level overheads, like (b) */
-    if (usesVFIO)
+     * Note that the NVIDIA RAM window must be accounted for the TCE
+     * table size, but *not* for the main RAM (maxMemory). This gives
+     * us the following passthroughLimit for the NVLink2 case:
+     *
+     * passthroughLimit = maxMemory +
+     *                    128TiB/512KiB * #PHBs + 8 MiB */
+    if (nvlink2Capable) {
+        passthroughLimit = maxMemory +
+                           128 * (1ULL<<30) / 512 * nPCIHostBridges +
+                           8192;
+    } else if (usesVFIO) {
+        /* For regular (non-NVLink2 present) VFIO passthrough, the value
+         * of passthroughLimit is:
+         *
+         * passthroughLimit := max( 2 GiB * #PHBs,                       (c)
+         *                          memory                               (d)
+         *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
+         *
+         * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2
+         * GiB rather than 1 GiB
+         *
+         * (d) is the with-DDW (and memory pre-registration and related
+         * features) DMA window accounting - assuming that we only account
+         * RAM once, even if mapped to multiple PHBs
+         *
+         * (e) is the with-DDW userspace view and overhead for the 64-bit
+         * DMA window. This is based a bit on expected guest behaviour, but
+         * there really isn't a way to completely avoid that. We assume the
+         * guest requests a 64-bit DMA window (per PHB) just big enough to
+         * map all its RAM. 4 kiB page size gives the 1/512; it will be
+         * less with 64 kiB pages, less still if the guest is mapped with
+         * hugepages (unlike the default 32-bit DMA window, DDW windows
+         * can use large IOMMU pages). 8 MiB is for second and further level
+         * overheads, like (b) */
         passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
                                memory +
                                memory / 512 * nPCIHostBridges +
                                8192);
+    }
 
     memKB = baseLimit + passthroughLimit;