mirror of
https://gitlab.com/libvirt/libvirt.git
synced 2025-02-22 19:32:19 +00:00
qemu: Fix race between device rebind and kvm cleanup
Certain hypervisors (like qemu/kvm) map the PCI bar(s) on the host when doing device passthrough. This can lead to a race condition where the hypervisor is still cleaning up the device while libvirt is trying to re-attach it to the host device driver. To avoid this situation, we look through /proc/iomem, and if the hypervisor is still holding onto the bar (denoted by the string in the matcher variable), then we can wait around a bit for that to clear up. v2: Thanks to review by DV, make sure we wait the full timeout per-device Signed-off-by: Chris Lalancette <clalance@redhat.com>
This commit is contained in:
parent
654dd2902d
commit
be34c3c7ef
@ -439,6 +439,7 @@ pciGetDevice;
|
||||
pciFreeDevice;
|
||||
pciDettachDevice;
|
||||
pciReAttachDevice;
|
||||
pciWaitForDeviceCleanup;
|
||||
pciResetDevice;
|
||||
pciDeviceSetManaged;
|
||||
pciDeviceGetManaged;
|
||||
|
@ -2277,12 +2277,19 @@ qemuDomainReAttachHostDevices(virConnectPtr conn,
|
||||
|
||||
for (i = 0; i < pciDeviceListCount(pcidevs); i++) {
|
||||
pciDevice *dev = pciDeviceListGet(pcidevs, i);
|
||||
if (pciDeviceGetManaged(dev) &&
|
||||
pciReAttachDevice(NULL, dev) < 0) {
|
||||
virErrorPtr err = virGetLastError();
|
||||
VIR_ERROR(_("Failed to re-attach PCI device: %s"),
|
||||
err ? err->message : "");
|
||||
virResetError(err);
|
||||
int retries = 100;
|
||||
if (pciDeviceGetManaged(dev)) {
|
||||
while (pciWaitForDeviceCleanup(dev, "kvm_assigned_device")
|
||||
&& retries) {
|
||||
usleep(100*1000);
|
||||
retries--;
|
||||
}
|
||||
if (pciReAttachDevice(NULL, dev) < 0) {
|
||||
virErrorPtr err = virGetLastError();
|
||||
VIR_ERROR(_("Failed to re-attach PCI device: %s"),
|
||||
err ? err->message : "");
|
||||
virResetError(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -883,6 +883,103 @@ pciReAttachDevice(virConnectPtr conn, pciDevice *dev)
|
||||
return pciUnBindDeviceFromStub(conn, dev, driver);
|
||||
}
|
||||
|
||||
/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
|
||||
* the host when doing device passthrough. This can lead to a race
|
||||
* condition where the hypervisor is still cleaning up the device while
|
||||
* libvirt is trying to re-attach it to the host device driver. To avoid
|
||||
* this situation, we look through /proc/iomem, and if the hypervisor is
|
||||
* still holding onto the bar (denoted by the string in the matcher variable),
|
||||
* then we can wait around a bit for that to clear up.
|
||||
*
|
||||
* A typical /proc/iomem looks like this (snipped for brevity):
|
||||
* 00010000-0008efff : System RAM
|
||||
* 0008f000-0008ffff : reserved
|
||||
* ...
|
||||
* 00100000-cc9fcfff : System RAM
|
||||
* 00200000-00483d3b : Kernel code
|
||||
* 00483d3c-005c88df : Kernel data
|
||||
* cc9fd000-ccc71fff : ACPI Non-volatile Storage
|
||||
* ...
|
||||
* d0200000-d02fffff : PCI Bus #05
|
||||
* d0200000-d021ffff : 0000:05:00.0
|
||||
* d0200000-d021ffff : e1000e
|
||||
* d0220000-d023ffff : 0000:05:00.0
|
||||
* d0220000-d023ffff : e1000e
|
||||
* ...
|
||||
* f0000000-f0003fff : 0000:00:1b.0
|
||||
* f0000000-f0003fff : kvm_assigned_device
|
||||
*
|
||||
* Returns 0 if we are clear to continue, and 1 if the hypervisor is still
|
||||
* holding onto the resource.
|
||||
*/
|
||||
int
|
||||
pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher)
|
||||
{
|
||||
FILE *fp;
|
||||
char line[160];
|
||||
unsigned long long start, end;
|
||||
int consumed;
|
||||
char *rest;
|
||||
unsigned long long domain;
|
||||
int bus, slot, function;
|
||||
int in_matching_device;
|
||||
int ret;
|
||||
size_t match_depth;
|
||||
|
||||
fp = fopen("/proc/iomem", "r");
|
||||
if (!fp) {
|
||||
/* If we failed to open iomem, we just basically ignore the error. The
|
||||
* unbind might succeed anyway, and besides, it's very likely we have
|
||||
* no way to report the error
|
||||
*/
|
||||
VIR_DEBUG0("Failed to open /proc/iomem, trying to continue anyway");
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
in_matching_device = 0;
|
||||
match_depth = 0;
|
||||
while (fgets(line, sizeof(line), fp) != 0) {
|
||||
/* the logic here is a bit confusing. For each line, we look to
|
||||
* see if it matches the domain:bus:slot.function we were given.
|
||||
* If this line matches the DBSF, then any subsequent lines indented
|
||||
* by 2 spaces are the PCI regions for this device. It's also
|
||||
* possible that none of the PCI regions are currently mapped, in
|
||||
* which case we have no indented regions. This code handles all
|
||||
* of these situations
|
||||
*/
|
||||
if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
|
||||
if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
|
||||
continue;
|
||||
|
||||
rest = line + consumed;
|
||||
if (STRPREFIX(rest, matcher)) {
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
in_matching_device = 0;
|
||||
if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
|
||||
continue;
|
||||
|
||||
rest = line + consumed;
|
||||
if (sscanf(rest, "%Lx:%x:%x.%x", &domain, &bus, &slot, &function) != 4)
|
||||
continue;
|
||||
|
||||
if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
|
||||
function != dev->function)
|
||||
continue;
|
||||
in_matching_device = 1;
|
||||
match_depth = strspn(line, " ");
|
||||
}
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static char *
|
||||
pciReadDeviceID(pciDevice *dev, const char *id_name)
|
||||
{
|
||||
|
@ -81,5 +81,6 @@ int pciDeviceFileIterate(virConnectPtr conn,
|
||||
int pciDeviceIsAssignable(virConnectPtr conn,
|
||||
pciDevice *dev,
|
||||
int strict_acs_check);
|
||||
int pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher);
|
||||
|
||||
#endif /* __VIR_PCI_H__ */
|
||||
|
Loading…
x
Reference in New Issue
Block a user