libvirt/src/util/virpci.c

/*
 * virpci.c: helper APIs for managing host PCI devices
 *
 * Copyright (C) 2009-2015 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#include "virpci.h"
#include "virnetdev.h"

#include <dirent.h>
#include <fcntl.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include "virlog.h"
#include "vircommand.h"
#include "virerror.h"
#include "virfile.h"
#include "virkmod.h"
#include "virstring.h"
#include "viralloc.h"
#include "virpcivpd.h"

VIR_LOG_INIT("util.pci");

#define PCI_SYSFS "/sys/bus/pci/"
#define PCI_ID_LEN 10   /* "XXXX XXXX" */

/* String forms of the virPCIELinkSpeed enum values; the enum itself is
 * declared outside this file.
 * NOTE(review): presumably these are link speeds in GT/s, listed in enum
 * declaration order with "" standing for the first (unset) value -- confirm
 * against the enum declaration.
 */
VIR_ENUM_IMPL(virPCIELinkSpeed,
              VIR_PCIE_LINK_SPEED_LAST,
              "", "2.5", "5", "8", "16",
);

/* String forms of the virPCIStubDriver enum values (the enum is declared
 * outside this file; presumably the strings follow its declaration order --
 * confirm). The non-"none" entries look like host kernel driver names.
 */
VIR_ENUM_IMPL(virPCIStubDriver,
              VIR_PCI_STUB_DRIVER_LAST,
              "none",
              "pciback", /* XEN */
              "vfio-pci", /* VFIO */
);

/* String forms of the virPCIHeader enum values (the enum is declared
 * outside this file; presumably the strings follow its declaration order --
 * confirm).
 */
VIR_ENUM_IMPL(virPCIHeader,
              VIR_PCI_HEADER_LAST,
              "endpoint",
              "pci-bridge",
              "cardbus-bridge",
);

/* In-memory representation of one host PCI device. */
struct _virPCIDevice {
    virPCIDeviceAddress address;      /* domain/bus/slot/function of the device */

    char          *name;              /* domain:bus:slot.function */
    char          id[PCI_ID_LEN];     /* product vendor */
    char          *path;              /* file opened to access the device's
                                       * config space (see
                                       * virPCIDeviceConfigOpenInternal()) */

    /* The driver:domain which uses the device */
    char          *used_by_drvname;
    char          *used_by_domname;

    /* The following 5 items are only valid after virPCIDeviceInit()
     * has been called for the virPCIDevice object. This is *not* done
     * in most cases (because it creates extra overhead, and parts of
     * it can fail if libvirtd is running unprivileged)
     */
    unsigned int  pcie_cap_pos;       /* config space offset of the PCI Express
                                       * capability, 0 if it was not found */
    unsigned int  pci_pm_cap_pos;     /* config space offset of the Power
                                       * Management capability, 0 if not found */
    bool          has_flr;            /* result of virPCIDeviceDetectFunctionLevelReset() */
    bool          has_pm_reset;       /* result of virPCIDeviceDetectPowerManagementReset() */
    bool          is_pcie;            /* device is (or is guessed to be) PCIe */
    /* end of the items filled in by virPCIDeviceInit() */

    bool          managed;

    virPCIStubDriver stubDriver;

    /* used by reattach function */
    bool          unbind_from_stub;
    bool          remove_slot;
    bool          reprobe;
};

/* Lockable object holding an array of virPCIDevice pointers. */
struct _virPCIDeviceList {
    virObjectLockable parent;   /* base object; must stay the first member
                                 * (presumably required by the object system) */

    size_t count;               /* presumably the number of entries in @devs */
    virPCIDevice **devs;        /* array of device pointers */
};


#define VIR_FROM_THIS VIR_FROM_NONE

/* Specifications referenced in comments:
 *  PCI30  - PCI Local Bus Specification 3.0
 *  PCIe20 - PCI Express Base Specification 2.0
 *  BR12   - PCI-to-PCI Bridge Architecture Specification 1.2
 *  PM12   - PCI Bus Power Management Interface Specification 1.2
 *  ECN_AF - Advanced Capabilities for Conventional PCI ECN
 */

/* Type 0 config space header length; PCI30 Section 6.1 Configuration Space Organization */
#define PCI_CONF_LEN            0x100
#define PCI_CONF_HEADER_LEN     0x40

/* PCI30 6.2.1 */
#define PCI_HEADER_TYPE         0x0e    /* Header type */
#define PCI_HEADER_TYPE_BRIDGE 0x1
#define PCI_HEADER_TYPE_MASK   0x7f
#define PCI_HEADER_TYPE_MULTI  0x80

/* PCI30 6.2.1  Device Identification */
#define PCI_CLASS_DEVICE        0x0a    /* Device class */

/* Class Code for bridge; PCI30 D.7  Base Class 06h */
#define PCI_CLASS_BRIDGE_PCI    0x0604

/* PCI30 6.2.3  Device Status */
#define PCI_STATUS              0x06    /* 16 bits */
#define PCI_STATUS_CAP_LIST    0x10    /* Support Capability List */

/* PCI30 6.7  Capabilities List */
#define PCI_CAPABILITY_LIST     0x34    /* Offset of first capability list entry */
#define PCI_CAP_FLAGS           2       /* Capability defined flags (16 bits) */

/* PM12 3.2.1  Capability Identifier */
#define PCI_CAP_ID_PM           0x01    /* Power Management */
/* PCI30 H Capability IDs */
#define PCI_CAP_ID_EXP          0x10    /* PCI Express */
/* ECN_AF 6.x.1.1  Capability ID for AF */
#define PCI_CAP_ID_AF           0x13    /* Advanced Features */

/* PCIe20 7.8.3  Device Capabilities Register (Offset 04h) */
#define PCI_EXP_DEVCAP          0x4     /* Device capabilities */
#define PCI_EXP_DEVCAP_FLR     (1<<28)  /* Function Level Reset */
#define PCI_EXP_LNKCAP          0xc     /* Link Capabilities */
#define PCI_EXP_LNKCAP_SPEED    0x0000f /* Maximum Link Speed */
#define PCI_EXP_LNKCAP_WIDTH    0x003f0 /* Maximum Link Width */
#define PCI_EXP_LNKSTA          0x12    /* Link Status */
#define PCI_EXP_LNKSTA_SPEED    0x000f  /* Negotiated Link Speed */
#define PCI_EXP_LNKSTA_WIDTH    0x03f0  /* Negotiated Link Width */

/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */
#define PCI_PRIMARY_BUS         0x18    /* BR12 3.2.5.2 Primary bus number */
#define PCI_SECONDARY_BUS       0x19    /* BR12 3.2.5.3 Secondary bus number */
#define PCI_SUBORDINATE_BUS     0x1a    /* BR12 3.2.5.4 Highest bus number behind the bridge */
#define PCI_BRIDGE_CONTROL      0x3e
/* BR12 3.2.5.18  Bridge Control Register */
#define PCI_BRIDGE_CTL_RESET   0x40    /* Secondary bus reset */

/* PM12 3.2.4  Power Management Control/Status (Offset = 4) */
#define PCI_PM_CTRL                4    /* PM control and status register */
#define PCI_PM_CTRL_STATE_MASK    0x3  /* Current power state (D0 to D3) */
#define PCI_PM_CTRL_STATE_D0      0x0  /* D0 state */
#define PCI_PM_CTRL_STATE_D3hot   0x3  /* D3 state */
#define PCI_PM_CTRL_NO_SOFT_RESET 0x8  /* No reset for D3hot->D0 */

/* ECN_AF 6.x.1  Advanced Features Capability Structure */
#define PCI_AF_CAP              0x3     /* Advanced features capabilities */
#define PCI_AF_CAP_FLR         0x2     /* Function Level Reset */

#define PCI_EXP_FLAGS           0x2
#define PCI_EXP_FLAGS_TYPE      0x00f0
#define PCI_EXP_TYPE_DOWNSTREAM 0x6

#define PCI_EXT_CAP_BASE          0x100
#define PCI_EXT_CAP_LIMIT         0x1000
#define PCI_EXT_CAP_ID_MASK       0x0000ffff
#define PCI_EXT_CAP_OFFSET_SHIFT  20
#define PCI_EXT_CAP_OFFSET_MASK   0x00000ffc

#define PCI_EXT_CAP_ID_ACS      0x000d
#define PCI_EXT_ACS_CTRL        0x06

#define PCI_EXT_CAP_ACS_SV      0x01
#define PCI_EXT_CAP_ACS_RR      0x04
#define PCI_EXT_CAP_ACS_CR      0x08
#define PCI_EXT_CAP_ACS_UF      0x10
#define PCI_EXT_CAP_ACS_ENABLED (PCI_EXT_CAP_ACS_SV | \
                                 PCI_EXT_CAP_ACS_RR | \
                                 PCI_EXT_CAP_ACS_CR | \
                                 PCI_EXT_CAP_ACS_UF)

#define PCI_EXP_TYPE_ROOT_INT_EP 0x9    /* Root Complex Integrated Endpoint */
#define PCI_EXP_TYPE_ROOT_EC 0xa        /* Root Complex Event Collector */

static virClass *virPCIDeviceListClass;

static void virPCIDeviceListDispose(void *obj);

/* One-time module initializer (hooked up through VIR_ONCE_GLOBAL_INIT):
 * creates the virPCIDeviceList class with the lockable object class as
 * its parent.
 *
 * Returns 0 on success, -1 if the class could not be created.
 */
static int virPCIOnceInit(void)
{
    if (VIR_CLASS_NEW(virPCIDeviceList, virClassForObjectLockable()))
        return 0;

    return -1;
}

VIR_ONCE_GLOBAL_INIT(virPCI);


/* Build the sysfs directory path of the PCI driver named @driver
 * ("/sys/bus/pci/drivers/<driver>").
 *
 * Returns a newly allocated string that the caller must free.
 */
static char *
virPCIDriverDir(const char *driver)
{
    char *drvdir = g_strdup_printf(PCI_SYSFS "drivers/%s", driver);

    return drvdir;
}


/* Build the sysfs path of attribute @file belonging to PCI device @device
 * ("/sys/bus/pci/devices/<device>/<file>").
 *
 * Returns a newly allocated string that the caller must free.
 */
static char *
virPCIFile(const char *device, const char *file)
{
    char *attrpath = g_strdup_printf(PCI_SYSFS "devices/%s/%s", device, file);

    return attrpath;
}


/* virPCIDeviceGetDriverPathAndName:
 * @dev: device whose driver binding is looked up
 * @path: set to the resolved target of the device's sysfs "driver" symlink
 * @name: set to the last path component of @path (the driver name)
 *
 * Both @path and @name are first reset to NULL, and they stay NULL
 * (with a return value of 0) when the device has no "driver" entry,
 * i.e. it is not bound to any driver. On success the caller owns
 * both strings.
 *
 * Returns 0 on success, -1 (with an error reported and both outputs
 * NULL) if the "driver" entry is not a symlink or cannot be resolved.
 */
int
virPCIDeviceGetDriverPathAndName(virPCIDevice *dev, char **path, char **name)
{
    /* drvlink = "/sys/bus/pci/devices/dddd:bb:ss.ff/driver" */
    g_autofree char *drvlink = virPCIFile(dev->name, "driver");

    *path = NULL;
    *name = NULL;

    /* no driver entry at all: the device is simply unbound */
    if (!virFileExists(drvlink))
        return 0;

    if (virFileIsLink(drvlink) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s driver file %s is not a symlink"),
                       dev->name, drvlink);
        return -1;
    }

    if (virFileResolveLink(drvlink, path) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s driver symlink %s"),
                       dev->name, drvlink);
        /* drop whatever the failed resolution may have left behind */
        VIR_FREE(*path);
        return -1;
    }

    /* *path = "/sys/bus/pci/drivers/${drivername}",
     * *name = "${drivername}" */
    *name = g_path_get_basename(*path);

    return 0;
}


/* Open the config space file of @dev (dev->path).
 *
 * @readonly: open with O_RDONLY when true, O_RDWR otherwise
 * @fatal: when true an open failure is reported as a libvirt system
 *         error; when false it is only logged as a warning
 *
 * Returns the open file descriptor, or -1 on failure.
 */
static int
virPCIDeviceConfigOpenInternal(virPCIDevice *dev, bool readonly, bool fatal)
{
    int oflags = readonly ? O_RDONLY : O_RDWR;
    int cfgfd = open(dev->path, oflags);

    if (cfgfd >= 0) {
        VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path);
        return cfgfd;
    }

    if (!fatal) {
        VIR_WARN("Failed to open config space file '%s': %s",
                 dev->path, g_strerror(errno));
        return -1;
    }

    virReportSystemError(errno,
                         _("Failed to open config space file '%s'"),
                         dev->path);
    return -1;
}

/* Open the config space file of @dev read-only; a failure is reported
 * as an error. Returns the file descriptor, or -1 on failure.
 */
static int
virPCIDeviceConfigOpen(virPCIDevice *dev)
{
    const bool readonly = true;
    const bool fatal = true;

    return virPCIDeviceConfigOpenInternal(dev, readonly, fatal);
}

/* Open the config space file of @dev read-only; a failure is only
 * logged as a warning, no error is reported. Returns the file
 * descriptor, or -1 on failure.
 */
static int
virPCIDeviceConfigOpenTry(virPCIDevice *dev)
{
    const bool readonly = true;
    const bool fatal = false;

    return virPCIDeviceConfigOpenInternal(dev, readonly, fatal);
}

/* Open the config space file of @dev for reading and writing; a
 * failure is reported as an error. Returns the file descriptor, or -1
 * on failure.
 */
static int
virPCIDeviceConfigOpenWrite(virPCIDevice *dev)
{
    const bool readonly = false;
    const bool fatal = true;

    return virPCIDeviceConfigOpenInternal(dev, readonly, fatal);
}

/* Close config space descriptor @cfgfd of @dev. A close failure is
 * logged as a warning and otherwise ignored.
 */
static void
virPCIDeviceConfigClose(virPCIDevice *dev, int cfgfd)
{
    if (VIR_CLOSE(cfgfd) >= 0)
        return;

    VIR_WARN("Failed to close config space file '%s': %s",
             dev->path, g_strerror(errno));
}


/* Read @buflen bytes at offset @pos of the already opened config
 * space file @cfgfd into @buf. @dev is used only for the log message.
 *
 * @buf is zeroed and errno is reset to 0 before the seek/read, so
 * after a failure the buffer holds zeros (or a partial read) and errno
 * holds whatever the failing call left there. Failures are logged at
 * debug level only.
 *
 * Returns 0 if the full @buflen bytes were read, -1 otherwise.
 */
static int
virPCIDeviceRead(virPCIDevice *dev,
                 int cfgfd,
                 unsigned int pos,
                 uint8_t *buf,
                 unsigned int buflen)
{
    memset(buf, 0, buflen);
    errno = 0;

    if (lseek(cfgfd, pos, SEEK_SET) == pos &&
        saferead(cfgfd, buf, buflen) == buflen)
        return 0;

    VIR_DEBUG("Failed to read %u bytes at %u from '%s' : %s",
             buflen, pos, dev->path, g_strerror(errno));
    return -1;
}


/**
 * virPCIDeviceReadN:
 * @dev: virPCIDevice object (used only to log name of config file)
 * @cfgfd: open file descriptor for device config file in sysfs
 * @pos: byte offset in the file to read from
 *
 * Read an "N"-bit value (where "N" is "8", "16", or "32", and appears
 * at the end of the function name) from a PCI device's already-opened
 * sysfs config file and return it. Multi-byte values are assembled
 * from the file's bytes in little-endian order.
 *
 * Returns the value at @pos in the file, or 0 if there was an
 * error. NB: since 0 could be a valid value, occurrence of an error
 * must be determined by examining errno. errno is always reset to 0
 * before the seek/read is attempted (see virPCIDeviceRead()), so if
 * errno != 0 on return from one of these functions, then either the
 * seek or the read operation failed for some reason. If errno == 0
 * and the return value is 0, then the config file really does contain
 * the value 0 at @pos.
 */
static uint8_t
virPCIDeviceRead8(virPCIDevice *dev, int cfgfd, unsigned int pos)
{
    uint8_t value;

    /* virPCIDeviceRead() zeroes @value before reading, so it is
     * well-defined even when the read fails */
    virPCIDeviceRead(dev, cfgfd, pos, &value, sizeof(value));

    return value;
}

/* Read the 16-bit little-endian value at byte offset @pos of config
 * file @cfgfd. Returns 0 on failure; callers have to inspect errno to
 * tell a failed read from a genuine 0 (virPCIDeviceRead() resets errno
 * before reading).
 */
static uint16_t
virPCIDeviceRead16(virPCIDevice *dev, int cfgfd, unsigned int pos)
{
    uint8_t raw[2];

    virPCIDeviceRead(dev, cfgfd, pos, raw, sizeof(raw));

    /* raw[0] is the low byte, raw[1] the high byte */
    return raw[0] | (raw[1] << 8);
}

/* Read the 32-bit little-endian value at byte offset @pos of config
 * file @cfgfd. Returns 0 on failure; callers have to inspect errno to
 * tell a failed read from a genuine 0 (virPCIDeviceRead() resets errno
 * before reading).
 */
static uint32_t
virPCIDeviceRead32(virPCIDevice *dev, int cfgfd, unsigned int pos)
{
    uint8_t buf[4];
    virPCIDeviceRead(dev, cfgfd, pos, &buf[0], sizeof(buf));

    /* Widen each byte to uint32_t before shifting: a uint8_t promotes
     * to (signed) int, and shifting a byte >= 0x80 left by 24 would
     * overflow int, which is undefined behavior. */
    return ((uint32_t)buf[0] << 0) |
           ((uint32_t)buf[1] << 8) |
           ((uint32_t)buf[2] << 16) |
           ((uint32_t)buf[3] << 24);
}

/* Read the sysfs "class" attribute of @dev and store the upper 16 bits
 * of the 24-bit class code (i.e. the value with its lowest byte
 * dropped) in @device_class.
 *
 * Returns 0 on success, -1 if the file cannot be read or does not
 * parse as a hexadecimal number (an error is reported in the latter
 * case by this function).
 */
static int
virPCIDeviceReadClass(virPCIDevice *dev, uint16_t *device_class)
{
    g_autofree char *classfile = virPCIFile(dev->name, "class");
    g_autofree char *contents = NULL;
    unsigned int classcode;

    /* class string is '0xNNNNNN\n' ... i.e. 9 bytes */
    if (virFileReadAll(classfile, 9, &contents) < 0)
        return -1;

    /* cut the string off right after the 8 characters of '0xNNNNNN' */
    contents[8] = '\0';

    if (virStrToLong_ui(contents, NULL, 16, &classcode) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unusual value in %s/devices/%s/class: %s"),
                       PCI_SYSFS, dev->name, contents);
        return -1;
    }

    /* discard the lowest byte, keep the next 16 bits */
    *device_class = (classcode >> 8) & 0xFFFF;

    return 0;
}

/* Write @buflen bytes from @buf at offset @pos of the already opened
 * config space file @cfgfd. @dev is used only for the log message.
 *
 * Returns 0 if everything was written, -1 (after logging a warning)
 * if the seek or the write failed.
 */
static int
virPCIDeviceWrite(virPCIDevice *dev,
                  int cfgfd,
                  unsigned int pos,
                  uint8_t *buf,
                  unsigned int buflen)
{
    if (lseek(cfgfd, pos, SEEK_SET) == pos &&
        safewrite(cfgfd, buf, buflen) == buflen)
        return 0;

    VIR_WARN("Failed to write to '%s' : %s", dev->path,
             g_strerror(errno));
    return -1;
}

/* Write the 16-bit value @val at byte offset @pos of config file
 * @cfgfd, low byte first. A write failure is not propagated to the
 * caller (virPCIDeviceWrite() logs a warning).
 */
static void
virPCIDeviceWrite16(virPCIDevice *dev, int cfgfd, unsigned int pos, uint16_t val)
{
    uint8_t raw[2];

    raw[0] = val & 0xff;
    raw[1] = (val >> 8) & 0xff;

    virPCIDeviceWrite(dev, cfgfd, pos, raw, sizeof(raw));
}

/* Write the 32-bit value @val at byte offset @pos of config file
 * @cfgfd, least significant byte first. A write failure is not
 * propagated to the caller (virPCIDeviceWrite() logs a warning).
 */
static void
virPCIDeviceWrite32(virPCIDevice *dev, int cfgfd, unsigned int pos, uint32_t val)
{
    uint8_t raw[4];

    raw[0] = val & 0xff;
    raw[1] = (val >> 8) & 0xff;
    raw[2] = (val >> 16) & 0xff;
    raw[3] = (val >> 24) & 0xff;

    virPCIDeviceWrite(dev, cfgfd, pos, raw, sizeof(raw));
}

/* Callback type used by virPCIDeviceIterDevices(). It is invoked as
 * predicate(dev, check, data), where @dev is the reference device,
 * @check the device currently being visited and @data the opaque
 * pointer handed to the iterator.
 *
 * Return value as interpreted by the iterator: < 0 aborts the
 * iteration with an error, 1 means @check matched (iteration stops),
 * any other value continues with the next device.
 */
typedef int (*virPCIDeviceIterPredicate)(virPCIDevice *, virPCIDevice *,
                                         void *);

/* Iterate over available PCI devices calling @predicate
 * to compare each one to @dev.
 *
 * @predicate: callback invoked as predicate(dev, check, data) for
 *             every entry of /sys/bus/pci/devices that parses as a
 *             PCI address; entries that do not parse are skipped
 *             with a warning
 * @dev: the reference device handed to @predicate
 * @matched: set to NULL on entry; on a match it receives the matching
 *           device object, which the caller then owns
 * @data: opaque pointer passed through to @predicate
 *
 * Returns 1 if @predicate matched a device, 0 if the whole directory
 * was walked without a match, and -1 on error (directory cannot be
 * opened/read, a device object cannot be created, or @predicate
 * failed). -1 is returned on error since we don't want to assume it
 * is safe to reset if there is an error.
 */
static int
virPCIDeviceIterDevices(virPCIDeviceIterPredicate predicate,
                        virPCIDevice *dev,
                        virPCIDevice **matched,
                        void *data)
{
    g_autoptr(DIR) dir = NULL;
    struct dirent *entry;
    int ret = 0;
    int rc;

    *matched = NULL;

    VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name);

    if (virDirOpen(&dir, PCI_SYSFS "devices") < 0)
        return -1;

    while ((ret = virDirRead(dir, &entry, PCI_SYSFS "devices")) > 0) {
        g_autoptr(virPCIDevice) check = NULL;
        /* Zero-initialize: only domain/bus/slot/function are parsed
         * below, and the address is handed to virPCIDeviceNew() as a
         * whole, so no other member may be left indeterminate. */
        virPCIDeviceAddress devAddr = { 0 };
        char *tmp;

        /* expected format: <domain>:<bus>:<slot>.<function> */
        if (/* domain */
            virStrToLong_ui(entry->d_name, &tmp, 16, &devAddr.domain) < 0 || *tmp != ':' ||
            /* bus */
            virStrToLong_ui(tmp + 1, &tmp, 16, &devAddr.bus) < 0 || *tmp != ':' ||
            /* slot */
            virStrToLong_ui(tmp + 1, &tmp, 16, &devAddr.slot) < 0 || *tmp != '.' ||
            /* function */
            virStrToLong_ui(tmp + 1, NULL, 16, &devAddr.function) < 0) {
            VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name);
            continue;
        }

        check = virPCIDeviceNew(&devAddr);
        if (!check) {
            ret = -1;
            break;
        }

        rc = predicate(dev, check, data);
        if (rc < 0) {
            /* the predicate returned an error, bail */
            ret = -1;
            break;
        } else if (rc == 1) {
            VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, check->name);
            /* hand ownership to the caller so the auto-cleanup of
             * @check does not free the matched device */
            *matched = g_steal_pointer(&check);
            ret = 1;
            break;
        }
    }
    return ret;
}


/**
 * virPCIDeviceFindCapabilityOffset:
 * @dev: virPCIDevice object (used only to log name of config file)
 * @cfgfd: open file descriptor for device config file in sysfs
 * @capability: PCI_CAP_ID_* being requested
 * @offset: used to return the offset of @capability in the file
 *
 * Find the offset of @capability within the PCI config file @cfgfd of
 * the device @dev by walking the (non-extended) capability list. If
 * found, the offset is returned in @offset, otherwise @offset is set
 * to 0.
 *
 * Returns 0 if the capability was found. Returns -1 in every other
 * case: the device advertises no capability list, a read failed, or
 * the list was walked to its end without finding @capability. errno
 * is reset to 0 before returning -1.
 */
static int
virPCIDeviceFindCapabilityOffset(virPCIDevice *dev,
                                 int cfgfd,
                                 unsigned int capability,
                                 unsigned int *offset)
{
    /* Capability structures are DWORD aligned and located after the
     * config space header, so a well-formed list cannot have more
     * entries than this. Bounding the walk keeps a malformed
     * (circular) list from looping forever. */
    int ttl = (PCI_CONF_LEN - PCI_CONF_HEADER_LEN) / 4;
    uint16_t status;
    uint8_t pos;

    *offset = 0; /* assume failure (*nothing* can be at offset 0) */

    status = virPCIDeviceRead16(dev, cfgfd, PCI_STATUS);
    if (errno != 0 || !(status & PCI_STATUS_CAP_LIST))
        goto error;

    pos = virPCIDeviceRead8(dev, cfgfd, PCI_CAPABILITY_LIST);
    if (errno != 0)
        goto error;

    /* Zero indicates last capability, capabilities can't
     * be in the config space header and 0xff is returned
     * by the kernel if we don't have access to this region
     *
     * Note: extended capabilities are not handled here.
     */
    while (ttl-- > 0 && pos >= PCI_CONF_HEADER_LEN && pos != 0xff) {
        /* first byte of a capability structure is its ID ... */
        uint8_t capid = virPCIDeviceRead8(dev, cfgfd, pos);
        if (errno != 0)
            goto error;

        if (capid == capability) {
            VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x",
                      dev->id, dev->name, capability, pos);
            *offset = pos;
            return 0;
        }

        /* ... and the second byte points to the next structure */
        pos = virPCIDeviceRead8(dev, cfgfd, pos + 1);
        if (errno != 0)
            goto error;
    }

 error:
    VIR_DEBUG("%s %s: failed to find cap 0x%.2x (%s)",
              dev->id, dev->name, capability, g_strerror(errno));

    /* reset errno in case the failure was due to insufficient
     * privileges to read the entire PCI config file
     */
    errno = 0;

    return -1;
}

/* Walk the extended capability list of @dev (starting at
 * PCI_EXT_CAP_BASE in config file @cfgfd) looking for @capability.
 *
 * The walk is bounded by the number of minimum-sized (8 byte)
 * capabilities that fit between PCI_EXT_CAP_BASE and
 * PCI_EXT_CAP_LIMIT, and stops as soon as the next-pointer drops
 * below PCI_EXT_CAP_BASE. Read errors are not checked here.
 *
 * Returns the offset of the capability, or 0 if it was not found.
 */
static unsigned int
virPCIDeviceFindExtendedCapabilityOffset(virPCIDevice *dev,
                                         int cfgfd,
                                         unsigned int capability)
{
    unsigned int pos = PCI_EXT_CAP_BASE;
    int remaining;

    /* minimum 8 bytes per capability */
    for (remaining = (PCI_EXT_CAP_LIMIT - PCI_EXT_CAP_BASE) / 8;
         remaining > 0 && pos >= PCI_EXT_CAP_BASE;
         remaining--) {
        uint32_t hdr = virPCIDeviceRead32(dev, cfgfd, pos);

        if ((hdr & PCI_EXT_CAP_ID_MASK) == capability)
            return pos;

        /* the upper bits of the header hold the next offset */
        pos = (hdr >> PCI_EXT_CAP_OFFSET_SHIFT) & PCI_EXT_CAP_OFFSET_MASK;
    }

    return 0;
}

/* Detects whether this device supports Function Level Reset (FLR).
 * Relies on dev->pcie_cap_pos having been filled in already.
 *
 * Returns true if FLR is advertised through the PCIe or the PCI
 * Advanced Features capability, or if it is assumed because the
 * device is an SR-IOV VF; false otherwise (including when the AF
 * capability lookup fails).
 */
static bool
virPCIDeviceDetectFunctionLevelReset(virPCIDevice *dev, int cfgfd)
{
    g_autofree char *physfn = NULL;
    unsigned int af_pos;

    /* The PCIe Function Level Reset capability allows
     * individual device functions to be reset without
     * affecting any other functions on the device or
     * any other devices on the bus. This is only common
     * on SR-IOV NICs at the moment.
     */
    if (dev->pcie_cap_pos) {
        uint32_t devcap = virPCIDeviceRead32(dev, cfgfd,
                                             dev->pcie_cap_pos + PCI_EXP_DEVCAP);

        if (devcap & PCI_EXP_DEVCAP_FLR) {
            VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name);
            return true;
        }
    }

    /* The PCI AF Function Level Reset capability is
     * the same thing, except for conventional PCI
     * devices. This is not common yet.
     *
     * NOTE(review): everything inside this branch, including the VF
     * fallback, is reached only when the AF capability lookup
     * succeeds; a failed lookup goes straight to "no FLR".
     */
    if (virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_AF, &af_pos) >= 0) {
        if (af_pos) {
            uint32_t afcap = virPCIDeviceRead16(dev, cfgfd, af_pos + PCI_AF_CAP);

            if (afcap & PCI_AF_CAP_FLR) {
                VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name);
                return true;
            }
        }

        /* there are some buggy devices that do support FLR, but forget to
         * advertise that fact in their capabilities.  However, FLR is *required*
         * to be present for virtual functions (VFs), so if we see that this
         * device is a VF (it has a "physfn" entry in sysfs), we just assume
         * FLR works
         */
        physfn = g_strdup_printf(PCI_SYSFS "devices/%s/physfn", dev->name);

        if (virFileExists(physfn)) {
            VIR_DEBUG("%s %s: buggy device didn't advertise FLR, but is a VF; forcing flr on",
                      dev->id, dev->name);
            return true;
        }
    }

    VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name);
    return false;
}

/* Detects whether the device can be reset through power management:
 * it must have the PCI Power Management capability
 * (dev->pci_pm_cap_pos != 0) and its control register must have the
 * NO_SOFT_RESET bit clear, i.e. a D3hot->D0 transition results in a
 * full internal reset, not just a soft reset.
 *
 * Returns true if both conditions hold, false otherwise.
 */
static bool
virPCIDeviceDetectPowerManagementReset(virPCIDevice *dev, int cfgfd)
{
    bool resettable = false;

    if (dev->pci_pm_cap_pos != 0) {
        uint32_t pmcsr = virPCIDeviceRead32(dev, cfgfd,
                                            dev->pci_pm_cap_pos + PCI_PM_CTRL);

        /* require the NO_SOFT_RESET bit is clear */
        resettable = (pmcsr & PCI_PM_CTRL_NO_SOFT_RESET) == 0;
    }

    if (resettable) {
        VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name);
    } else {
        VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name);
    }

    return resettable;
}

/* Iteration predicate: is @check an active device sharing the
 * domain/bus of @dev?
 *
 * @data is an optional virPCIDeviceList of inactive devices; a device
 * found in that list does not count as active.
 *
 * Returns 1 if @check is a different device on the same domain and
 * bus that is not in the inactive list, 0 otherwise.
 */
static int
virPCIDeviceSharesBusWithActive(virPCIDevice *dev, virPCIDevice *check, void *data)
{
    virPCIDeviceList *inactiveDevs = data;
    bool sameBus = dev->address.domain == check->address.domain &&
                   dev->address.bus == check->address.bus;
    bool sameFunction = dev->address.slot == check->address.slot &&
                        dev->address.function == check->address.function;

    /* Different domain, different bus, or simply identical device */
    if (!sameBus || sameFunction)
        return 0;

    /* same bus, but inactive, i.e. about to be assigned to guest */
    if (inactiveDevs != NULL &&
        virPCIDeviceListFind(inactiveDevs, &check->address) != NULL)
        return 0;

    return 1;
}

/* Look for an active device on the same domain/bus as @dev, ignoring
 * devices contained in @inactiveDevs (which may be NULL).
 *
 * Returns the first such device found (owned by the caller), or NULL
 * if there is none. NULL is also returned when the device iteration
 * itself fails.
 */
static virPCIDevice *
virPCIDeviceBusContainsActiveDevices(virPCIDevice *dev,
                                     virPCIDeviceList *inactiveDevs)
{
    virPCIDevice *conflicting = NULL;
    int rc = virPCIDeviceIterDevices(virPCIDeviceSharesBusWithActive,
                                     dev, &conflicting, inactiveDevs);

    return rc < 0 ? NULL : conflicting;
}

/* Is @check the parent of @dev ?
 *
 * Iteration predicate for virPCIDeviceIterDevices(). @data points to a
 * virPCIDevice pointer ("best") in which the closest indirect parent
 * seen so far is recorded.
 *
 * Returns 1 if @check is a bridge whose secondary bus is exactly the
 * bus of @dev (direct parent), 0 if it is not (possibly after
 * recording @check as the new best indirect match in *@data), and -1
 * on error (reading the class of @check failed, or a new device
 * object could not be created).
 */
static int
virPCIDeviceIsParent(virPCIDevice *dev, virPCIDevice *check, void *data)
{
    uint16_t device_class;
    uint8_t header_type, secondary, subordinate;
    virPCIDevice **best = data;
    int ret = 0;
    int fd;

    /* a parent must live in the same PCI domain */
    if (dev->address.domain != check->address.domain)
        return 0;

    /* a config file that can't be opened just means "not a match" */
    if ((fd = virPCIDeviceConfigOpenTry(check)) < 0)
        return 0;

    /* Is it a bridge? (a failed class read leaves ret at -1, which
     * makes the iterator bail out) */
    ret = virPCIDeviceReadClass(check, &device_class);
    if (ret < 0 || device_class != PCI_CLASS_BRIDGE_PCI)
        goto cleanup;

    /* Does its header type also identify it as a bridge? */
    header_type = virPCIDeviceRead8(check, fd, PCI_HEADER_TYPE);
    if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE)
        goto cleanup;

    secondary   = virPCIDeviceRead8(check, fd, PCI_SECONDARY_BUS);
    subordinate = virPCIDeviceRead8(check, fd, PCI_SUBORDINATE_BUS);

    /* NB: logged for every bridge in the domain, before the bus
     * numbers are actually compared */
    VIR_DEBUG("%s %s: found parent device %s", dev->id, dev->name, check->name);

    /* if the secondary bus exactly equals the device's bus, then we found
     * the direct parent.  No further work is necessary
     */
    if (dev->address.bus == secondary) {
        ret = 1;
        goto cleanup;
    }

    /* otherwise, SRIOV allows VFs to be on different buses than their PFs.
     * In this case, what we need to do is look for the "best" match; i.e.
     * the most restrictive match that still satisfies all of the conditions.
     */
    if (dev->address.bus > secondary && dev->address.bus <= subordinate) {
        if (*best == NULL) {
            /* first candidate: record a fresh object for @check, since
             * @check itself is owned by the iterator */
            *best = virPCIDeviceNew(&check->address);
            if (*best == NULL) {
                ret = -1;
                goto cleanup;
            }
        } else {
            /* OK, we had already recorded a previous "best" match for the
             * parent.  See if the current device is more restrictive than the
             * best, and if so, make it the new best
             */
            int bestfd;
            uint8_t best_secondary;

            /* if the previous best can't be opened it is simply kept */
            if ((bestfd = virPCIDeviceConfigOpenTry(*best)) < 0)
                goto cleanup;
            best_secondary = virPCIDeviceRead8(*best, bestfd, PCI_SECONDARY_BUS);
            virPCIDeviceConfigClose(*best, bestfd);

            /* a higher secondary bus number is closer to @dev's bus */
            if (secondary > best_secondary) {
                virPCIDeviceFree(*best);
                *best = virPCIDeviceNew(&check->address);
                if (*best == NULL) {
                    ret = -1;
                    goto cleanup;
                }
            }
        }
    }

 cleanup:
    virPCIDeviceConfigClose(check, fd);
    return ret;
}

/* Find the parent bridge of @dev.
 *
 * @parent is set to the direct parent if one is found, otherwise to
 * the best indirect match recorded by virPCIDeviceIsParent(), or to
 * NULL if there is neither. The caller owns the returned device.
 *
 * Returns 1 if a direct parent was found, 0 if the iteration finished
 * without one (@parent then holds the best indirect match or NULL),
 * and -1 on error (@parent is NULL).
 */
static int
virPCIDeviceGetParent(virPCIDevice *dev, virPCIDevice **parent)
{
    virPCIDevice *best = NULL;
    int ret;

    *parent = NULL;
    ret = virPCIDeviceIterDevices(virPCIDeviceIsParent, dev, parent, &best);

    if (ret == 0) {
        /* no direct parent: hand the best indirect match to the caller */
        *parent = best;
    } else {
        /* direct parent found (ret == 1) or error (ret < 0): a
         * candidate recorded along the way is not handed out, so it
         * must be released here or it would leak */
        virPCIDeviceFree(best);
    }

    return ret;
}

/* Secondary Bus Reset is our sledgehammer - it resets all
 * devices behind a bus.
 *
 * @dev: device to reset
 * @cfgfd: config space file of @dev, opened for reading and writing
 * @inactiveDevs: devices that do not count as "active" when checking
 *                for other users of the bus (may be NULL)
 *
 * The reset is refused if another active device shares the bus with
 * @dev. The config space of @dev is saved before and written back
 * after toggling the reset bit in the parent's bridge control
 * register.
 *
 * Returns 0 on success, -1 on failure with an error reported.
 */
static int
virPCIDeviceTrySecondaryBusReset(virPCIDevice *dev,
                                 int cfgfd,
                                 virPCIDeviceList *inactiveDevs)
{
    g_autoptr(virPCIDevice) parent = NULL;
    g_autoptr(virPCIDevice) conflict = NULL;
    uint8_t config_space[PCI_CONF_LEN];
    uint8_t ctlbuf[2];
    uint16_t ctl;
    int ret = -1;
    int parentfd;

    /* Refuse to do a secondary bus reset if there are other
     * devices/functions behind the bus are used by the host
     * or other guests.
     */
    if ((conflict = virPCIDeviceBusContainsActiveDevices(dev, inactiveDevs))) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Active %s devices on bus with %s, not doing bus reset"),
                       conflict->name, dev->name);
        return -1;
    }

    /* Find the parent bus */
    if (virPCIDeviceGetParent(dev, &parent) < 0)
        return -1;
    if (!parent) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to find parent device for %s"),
                       dev->name);
        return -1;
    }

    /* nothing to close yet if the open itself failed */
    if ((parentfd = virPCIDeviceConfigOpenWrite(parent)) < 0)
        return -1;

    VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name);

    /* Save and restore the device's config space; we only do this
     * for the supplied device since we refuse to do a reset if there
     * are multiple devices/functions
     */
    if (virPCIDeviceRead(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to read PCI config space for %s"),
                       dev->name);
        goto out;
    }

    /* Read the control register, set the reset flag, wait 200ms,
     * unset the reset flag and wait 200ms.
     *
     * The register belongs to the parent bridge, so the parent is the
     * device to name in any log message. The read is checked
     * explicitly: carrying on with a zeroed value after a failed read
     * would write 0 back to the bridge control register.
     */
    if (virPCIDeviceRead(parent, parentfd, PCI_BRIDGE_CONTROL,
                         ctlbuf, sizeof(ctlbuf)) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to read PCI config space for %s"),
                       parent->name);
        goto out;
    }
    ctl = ctlbuf[0] | (ctlbuf[1] << 8);

    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL,
                        ctl | PCI_BRIDGE_CTL_RESET);

    g_usleep(200 * 1000); /* sleep 200ms */

    virPCIDeviceWrite16(parent, parentfd, PCI_BRIDGE_CONTROL, ctl);

    g_usleep(200 * 1000); /* sleep 200ms */

    if (virPCIDeviceWrite(dev, cfgfd, 0, config_space, PCI_CONF_LEN) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        goto out;
    }
    ret = 0;

 out:
    virPCIDeviceConfigClose(parent, parentfd);
    return ret;
}

/* Power management reset attempts to reset a device using a
 * D-state transition from D3hot to D0. Note that
 * virPCIDeviceDetectPowerManagementReset() requires the device to
 * perform a full internal reset on that transition.
 *
 * The config space of @dev is saved before the transition and written
 * back afterwards.
 *
 * Returns 0 on success, -1 if the device has no Power Management
 * capability recorded or if saving/restoring the config space fails
 * (an error is reported in the latter cases).
 */
static int
virPCIDeviceTryPowerManagementReset(virPCIDevice *dev, int cfgfd)
{
    uint8_t saved[PCI_CONF_LEN];
    unsigned int pmcsr_pos;
    uint32_t pmcsr;

    if (!dev->pci_pm_cap_pos)
        return -1;

    /* offset of the PM control/status register */
    pmcsr_pos = dev->pci_pm_cap_pos + PCI_PM_CTRL;

    /* Save and restore the device's config space. */
    if (virPCIDeviceRead(dev, cfgfd, 0, saved, PCI_CONF_LEN) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to read PCI config space for %s"),
                       dev->name);
        return -1;
    }

    VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name);

    /* clear the current power state bits, keep everything else */
    pmcsr = virPCIDeviceRead32(dev, cfgfd, pmcsr_pos);
    pmcsr &= ~PCI_PM_CTRL_STATE_MASK;

    /* go to D3hot ... */
    virPCIDeviceWrite32(dev, cfgfd, pmcsr_pos,
                        pmcsr | PCI_PM_CTRL_STATE_D3hot);

    g_usleep(10 * 1000); /* sleep 10ms */

    /* ... and back to D0 */
    virPCIDeviceWrite32(dev, cfgfd, pmcsr_pos,
                        pmcsr | PCI_PM_CTRL_STATE_D0);

    g_usleep(10 * 1000); /* sleep 10ms */

    if (virPCIDeviceWrite(dev, cfgfd, 0, saved, PCI_CONF_LEN) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to restore PCI config space for %s"),
                       dev->name);
        return -1;
    }

    return 0;
}

/**
 * virPCIDeviceInit:
 * @dev: virPCIDevice object needing its PCI capabilities info initialized
 * @cfgfd: open file descriptor for device config file in sysfs
 *
 * Fill in the PCI capability attributes of @dev (pcie_cap_pos,
 * pci_pm_cap_pos, has_flr, has_pm_reset, and is_pcie) by walking the
 * (already-opened) device PCI config file in sysfs. This function can
 * be called regardless of whether a process has sufficient privilege
 * to read the entire file (unprivileged processes can only read the
 * 1st 64 bytes, while the Express Capabilities are all located beyond
 * that boundary).
 *
 * When a capability cannot be read directly, its value is inferred by
 * other means: a device is taken to be (almost surely) PCIe if the
 * length of its config file is != 256 (since all conventional PCI
 * config files are 256 bytes), and any SR-IOV VF is assumed to have
 * FLR available (since that is required by the SR-IOV spec.)
 *
 * Always returns success (0) (for now)
 */
static int
virPCIDeviceInit(virPCIDevice *dev, int cfgfd)
{
    dev->is_pcie = false;

    if (virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_EXP,
                                         &dev->pcie_cap_pos) >= 0) {
        dev->is_pcie = (dev->pcie_cap_pos != 0);
    } else {
        /* The Express capability could not be located. An
         * unprivileged process is unable to read *all* of a
         * device's PCI config (only the first 64 bytes, which
         * isn't enough to see the Express Capabilities data), so
         * make an educated guess based on the length of the
         * device's config file - if it is 256 bytes, then it is
         * definitely a legacy PCI device. If it's larger than
         * that, then it is *probably* PCIe (although it could be
         * PCI-x, but those are extremely rare). If the config
         * file can't be found (in which case the "length" will be
         * -1), then we blindly assume the most likely outcome -
         * PCIe.
         */
        off_t cfgLen = virFileLength(virPCIDeviceGetConfigPath(dev), -1);

        if (cfgLen != 256)
            dev->is_pcie = true;
    }

    /* a failed lookup leaves pci_pm_cap_pos at 0 */
    virPCIDeviceFindCapabilityOffset(dev, cfgfd, PCI_CAP_ID_PM,
                                     &dev->pci_pm_cap_pos);

    /* both detectors rely on the capability offsets set above */
    dev->has_flr = virPCIDeviceDetectFunctionLevelReset(dev, cfgfd);
    dev->has_pm_reset = virPCIDeviceDetectPowerManagementReset(dev, cfgfd);

    return 0;
}

/**
 * virPCIDeviceReset:
 * @dev: the PCI device to reset
 * @activeDevs: optional list of devices; the reset is refused when
 *              @dev is found on it
 * @inactiveDevs: optional list handed through to the secondary bus
 *                reset attempt
 *
 * Reset @dev. Only devices with an endpoint header type are accepted.
 * Devices currently bound to vfio-pci, and devices that report FLR
 * support, are left untouched and the call succeeds. Otherwise a
 * power management reset is attempted when the device advertises it,
 * and if the function has not succeeded by then a secondary bus reset
 * is attempted for devices that are not on bus 0.
 *
 * Returns 0 on success, -1 on failure (with an error reported).
 */
int
virPCIDeviceReset(virPCIDevice *dev,
                  virPCIDeviceList *activeDevs,
                  virPCIDeviceList *inactiveDevs)
{
    g_autofree char *drvPath = NULL;
    g_autofree char *drvName = NULL;
    int ret = -1;
    int fd = -1;
    int hdrType = -1;

    if (virPCIGetHeaderType(dev, &hdrType) < 0)
        return -1;

    if (hdrType != VIR_PCI_HEADER_ENDPOINT) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid attempt to reset PCI device %s. "
                         "Only PCI endpoint devices can be reset"),
                       dev->name);
        return -1;
    }

    if (activeDevs && virPCIDeviceListFind(activeDevs, &dev->address)) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Not resetting active device %s"), dev->name);
        return -1;
    }

    /* If the device is currently bound to vfio-pci, ignore all
     * requests to reset it, since the vfio-pci driver will always
     * reset it whenever appropriate, so doing it ourselves would just
     * be redundant.
     */
    if (virPCIDeviceGetDriverPathAndName(dev, &drvPath, &drvName) < 0)
        goto cleanup;

    if (virPCIStubDriverTypeFromString(drvName) == VIR_PCI_STUB_DRIVER_VFIO) {
        VIR_DEBUG("Device %s is bound to vfio-pci - skip reset",
                  dev->name);
        ret = 0;
        goto cleanup;
    }
    VIR_DEBUG("Resetting device %s", dev->name);

    if ((fd = virPCIDeviceConfigOpenWrite(dev)) < 0)
        goto cleanup;

    /* Refresh has_flr / has_pm_reset before deciding how to reset */
    if (virPCIDeviceInit(dev, fd) < 0)
        goto cleanup;

    /* KVM will perform FLR when starting and stopping
     * a guest, so there is no need for us to do it here.
     */
    if (dev->has_flr) {
        ret = 0;
        goto cleanup;
    }

    /* If the device supports PCI power management reset,
     * that's the next best thing because it only resets
     * the function, not the whole device.
     */
    if (dev->has_pm_reset)
        ret = virPCIDeviceTryPowerManagementReset(dev, fd);

    /* Bus reset is not an option with the root bus */
    /* ret is still -1 here when no PM reset was attempted or it failed */
    if (ret < 0 && dev->address.bus != 0)
        ret = virPCIDeviceTrySecondaryBusReset(dev, fd, inactiveDevs);

    if (ret < 0) {
        /* Wrap whatever error is currently recorded (if any) into the
         * final message; otherwise explain that no method was usable. */
        virErrorPtr err = virGetLastError();
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to reset PCI device %s: %s"),
                       dev->name,
                       err ? err->message :
                       _("no FLR, PM reset or bus reset available"));
    }

 cleanup:
    /* NOTE(review): fd may still be -1 on the early goto paths;
     * presumably virPCIDeviceConfigClose() tolerates that - confirm. */
    virPCIDeviceConfigClose(dev, fd);
    return ret;
}


/* virPCIProbeStubDriver:
 * @driver: stub driver type that must be available
 *
 * Make sure the kernel module backing @driver is loaded. If the
 * driver's sysfs directory does not exist yet, an attempt to load the
 * module is made and the directory is checked again.
 *
 * Returns 0 if the driver directory exists (before or after the load
 * attempt), -1 with an error reported otherwise.
 */
static int
virPCIProbeStubDriver(virPCIStubDriver driver)
{
    const char *modName = NULL;
    g_autofree char *sysfsDir = NULL;
    g_autofree char *loadErr = NULL;

    if (driver != VIR_PCI_STUB_DRIVER_NONE)
        modName = virPCIStubDriverTypeToString(driver);

    if (!modName) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       "%s",
                       _("Attempting to use unknown stub driver"));
        return -1;
    }

    sysfsDir = virPCIDriverDir(modName);

    /* Nothing to do when the driver was loaded previously */
    if (virFileExists(sysfsDir))
        return 0;

    loadErr = virKModLoad(modName);
    if (loadErr) {
        VIR_WARN("failed to load driver %s: %s", modName, loadErr);
    } else if (virFileExists(sysfsDir)) {
        /* The load attempt made the driver appear */
        return 0;
    }

    /* Either the load failed or the driver still is not there. If we
     * know the failure was caused by admin config, say so; otherwise
     * fall back to a more generic message.
     */
    if (virKModIsProhibited(modName)) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s: "
                         "administratively prohibited"),
                       modName);
    } else {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to load PCI stub module %s"),
                       modName);
    }

    return -1;
}

/**
 * virPCIDeviceUnbind:
 * @dev: virPCIDevice object describing the device to unbind
 *
 * Detach @dev from whatever driver it is currently bound to by
 * writing its name into the driver's "unbind" file. A device that has
 * no driver, or whose "unbind" file does not exist, is left alone.
 *
 * Returns 0 on success (including the nothing-to-do cases), -1 on failure
 */
int
virPCIDeviceUnbind(virPCIDevice *dev)
{
    g_autofree char *unbindFile = NULL;
    g_autofree char *curDrvPath = NULL;
    g_autofree char *curDrvName = NULL;

    if (virPCIDeviceGetDriverPathAndName(dev, &curDrvPath, &curDrvName) < 0)
        return -1;

    /* No driver name means the device is not bound to any driver */
    if (!curDrvName)
        return 0;

    unbindFile = virPCIFile(dev->name, "driver/unbind");

    if (!virFileExists(unbindFile))
        return 0;

    if (virFileWriteStr(unbindFile, dev->name, 0) < 0) {
        virReportSystemError(errno,
                             _("Failed to unbind PCI device '%s' from %s"),
                             dev->name, curDrvName);
        return -1;
    }

    return 0;
}


/**
 * virPCIDeviceRebind:
 * @dev: virPCIDevice object describing the device to rebind
 *
 * Unbind @dev from its current driver and then immediately ask the
 * kernel to probe for a driver again by writing the device name to
 * the PCI bus "drivers_probe" file.
 *
 * Returns 0 on success, -1 on failure
 */
int
virPCIDeviceRebind(virPCIDevice *dev)
{
    const char *probeFile = PCI_SYSFS "drivers_probe";

    if (virPCIDeviceUnbind(dev) < 0)
        return -1;

    if (virFileWriteStr(probeFile, dev->name, 0) >= 0)
        return 0;

    virReportSystemError(errno,
                         _("Failed to trigger a probe for PCI device '%s'"),
                         dev->name);
    return -1;
}


/*
 * virPCIDeviceBindWithDriverOverride:
 * @dev: device to (re)bind
 * @driverName: text to write into the device's driver_override file
 *
 * Bind a PCI device to a driver using driver_override sysfs interface.
 * E.g.
 *
 *  echo driver-name > /sys/bus/pci/devices/0000:03:00.0/driver_override
 *  echo 0000:03:00.0 > /sys/bus/pci/devices/0000:03:00.0/driver/unbind
 *  echo 0000:03:00.0 > /sys/bus/pci/drivers_probe
 *
 * An empty driverName will cause the device to be bound to its
 * preferred driver.
 *
 * Returns 0 on success, -1 on failure (with an error reported).
 */
static int
virPCIDeviceBindWithDriverOverride(virPCIDevice *dev,
                                   const char *driverName)
{
    g_autofree char *path = NULL;

    path = virPCIFile(dev->name, "driver_override");

    if (virFileWriteStr(path, driverName, 0) < 0) {
        /* Keep exactly one space at the literal join: the previous
         * text produced "driver_override  interface". */
        virReportSystemError(errno,
                             _("Failed to add driver '%s' to driver_override "
                               "interface of PCI device '%s'"),
                             driverName, dev->name);
        return -1;
    }

    /* The override only takes effect on the next probe, so unbind and
     * re-probe right away. */
    if (virPCIDeviceRebind(dev) < 0)
        return -1;

    return 0;
}

/* virPCIDeviceUnbindFromStub:
 * @dev: device to release from its stub driver
 *
 * If @dev is flagged as needing an unbind from the stub driver, write
 * a bare newline to its driver_override and rebind it; otherwise do
 * nothing.
 *
 * Returns 0 on success (or when skipped), -1 on failure.
 */
static int
virPCIDeviceUnbindFromStub(virPCIDevice *dev)
{
    if (dev->unbind_from_stub)
        return virPCIDeviceBindWithDriverOverride(dev, "\n");

    VIR_DEBUG("Unbind from stub skipped for PCI device %s", dev->name);
    return 0;
}

/* virPCIDeviceBindToStub:
 * @dev: device to bind to its configured stub driver
 *
 * Bind @dev to the stub driver recorded in dev->stubDriver, unless its
 * "driver" link already points at that driver. After a bind performed
 * here the device is flagged so that it gets unbound from the stub
 * again later.
 *
 * Returns 0 on success, -1 on failure (with an error reported).
 */
static int
virPCIDeviceBindToStub(virPCIDevice *dev)
{
    const char *stubName = NULL;
    g_autofree char *stubDir = NULL;
    g_autofree char *curDriverLink = NULL;

    /* The device must be configured to use one of the known stub drivers */
    if (dev->stubDriver == VIR_PCI_STUB_DRIVER_NONE) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("No stub driver configured for PCI device %s"),
                       dev->name);
        return -1;
    }

    stubName = virPCIStubDriverTypeToString(dev->stubDriver);
    if (!stubName) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unknown stub driver configured for PCI device %s"),
                       dev->name);
        return -1;
    }

    stubDir = virPCIDriverDir(stubName);
    curDriverLink = virPCIFile(dev->name, "driver");

    if (virFileExists(curDriverLink) &&
        virFileLinkPointsTo(curDriverLink, stubDir)) {
        /* Already bound to the right driver, nothing to change */
        VIR_DEBUG("Device %s is already bound to %s",
                  dev->name, stubName);
        return 0;
    }

    if (virPCIDeviceBindWithDriverOverride(dev, stubName) < 0)
        return -1;

    /* We did the binding, so we are responsible for undoing it */
    dev->unbind_from_stub = true;
    return 0;
}

/* virPCIDeviceDetach:
 * @dev: device to detach from its host driver
 * @activeDevs: optional list of devices currently in use by a domain
 * @inactiveDevs: optional list of detached-but-unassigned devices
 *
 * Detach this device from the host driver, attach it to the stub
 * driver (previously set with virPCIDeviceSetStubDriver()), and add *a
 * copy* of the object to the inactiveDevs list (if provided and the
 * device is not already on it). This function will *never* consume
 * dev, so the caller should free it.
 *
 * Returns 0 on success, -1 on failure (will fail if the device is
 * already in the activeDevs list, but will be a NOP if the device is
 * already bound to the stub).
 *
 * GENERAL NOTE: activeDevs should be a list of all PCI devices
 * currently in use by a domain. inactiveDevs is a list of all PCI
 * devices that libvirt has detached from the host driver + attached
 * to the stub driver, but hasn't yet assigned to a domain. Any device
 * that is still attached to its host driver should not be on either
 * list.
 */
int
virPCIDeviceDetach(virPCIDevice *dev,
                   virPCIDeviceList *activeDevs,
                   virPCIDeviceList *inactiveDevs)
{
    if (virPCIProbeStubDriver(dev->stubDriver) < 0)
        return -1;

    if (activeDevs && virPCIDeviceListFind(activeDevs, &dev->address)) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Not detaching active device %s"), dev->name);
        return -1;
    }

    if (virPCIDeviceBindToStub(dev) < 0)
        return -1;

    /* Without a list, or with the device already on it, we are done */
    if (!inactiveDevs || virPCIDeviceListFind(inactiveDevs, &dev->address))
        return 0;

    /* Record *a copy of* the dev on the inactive list */
    VIR_DEBUG("Adding PCI device %s to inactive list", dev->name);
    return virPCIDeviceListAddCopy(inactiveDevs, dev) < 0 ? -1 : 0;
}

/*
 * virPCIDeviceReattach:
 * @dev: device to hand back to the host
 * @activeDevs: optional list; reattaching is refused for devices on it
 * @inactiveDevs: optional list from which the device's entry is removed
 *
 * Unbind @dev from the stub driver (when it is flagged for that) and
 * drop its entry from @inactiveDevs.
 *
 * Pre-condition: inactivePCIHostdevs & activePCIHostdevs
 * are locked
 *
 * Returns 0 on success, -1 on failure.
 */
int
virPCIDeviceReattach(virPCIDevice *dev,
                     virPCIDeviceList *activeDevs,
                     virPCIDeviceList *inactiveDevs)
{
    bool inUse = activeDevs && virPCIDeviceListFind(activeDevs, &dev->address);

    if (inUse) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Not reattaching active device %s"), dev->name);
        return -1;
    }

    if (virPCIDeviceUnbindFromStub(dev) < 0)
        return -1;

    if (!inactiveDevs)
        return 0;

    /* Delete (and free) the matching entry of the inactive list */
    VIR_DEBUG("Removing PCI device %s from inactive list", dev->name);
    virPCIDeviceListDel(inactiveDevs, &dev->address);

    return 0;
}

/* virPCIDeviceReadID:
 * @dev: device whose sysfs attribute should be read
 * @id_name: name of the attribute file in the device's sysfs
 *           directory (e.g. "vendor" or "device")
 *
 * Read a '0xNNNN' style ID from the given sysfs file.
 *
 * Returns a newly allocated 6 character string ("0xNNNN", including
 * the "0x" prefix) which the caller must free, or NULL if the file
 * could not be read or does not hold a well-formed ID.
 */
static char *
virPCIDeviceReadID(virPCIDevice *dev, const char *id_name)
{
    g_autofree char *path = NULL;
    g_autofree char *id_str = NULL;
    int len;

    path = virPCIFile(dev->name, id_name);

    /* ID string is '0xNNNN\n' ... i.e. 7 bytes */
    if ((len = virFileReadAll(path, 7, &id_str)) < 0)
        return NULL;

    /* Anything shorter than "0xNNNN" is not a valid ID, and writing
     * the terminator at index 6 below would land outside the buffer. */
    if (len < 6)
        return NULL;

    /* Check for 0x prefix */
    if (id_str[0] != '0' || id_str[1] != 'x')
        return NULL;

    /* Chop off the newline (if any); index 6 is known to be in bounds */
    id_str[6] = '\0';

    return g_steal_pointer(&id_str);
}

/**
 * virPCIDeviceAddressIsValid:
 * @addr: PCI address to validate
 * @report: whether to report an error describing the first problem found
 *
 * Check that bus, slot and function of @addr are within range
 * (bus <= 0xFF, slot <= 0x1F, function <= 7) and that the address is
 * not the empty address.
 *
 * Returns true if @addr is valid, false otherwise.
 */
bool
virPCIDeviceAddressIsValid(virPCIDeviceAddress *addr,
                           bool report)
{
    if (addr->bus > 0xFF) {
        if (report) {
            virReportError(VIR_ERR_XML_ERROR,
                           _("Invalid PCI address bus='0x%x', "
                             "must be <= 0xFF"),
                           addr->bus);
        }
        return false;
    }

    if (addr->slot > 0x1F) {
        if (report) {
            virReportError(VIR_ERR_XML_ERROR,
                           _("Invalid PCI address slot='0x%x', "
                             "must be <= 0x1F"),
                           addr->slot);
        }
        return false;
    }

    if (addr->function > 7) {
        if (report) {
            virReportError(VIR_ERR_XML_ERROR,
                           _("Invalid PCI address function=0x%x, "
                             "must be <= 7"),
                           addr->function);
        }
        return false;
    }

    /* All fields are in range; the only remaining invalid case is the
     * all-zero domain/bus/slot address. */
    if (!virPCIDeviceAddressIsEmpty(addr))
        return true;

    if (report) {
        virReportError(VIR_ERR_XML_ERROR, "%s",
                       _("Invalid PCI address 0000:00:00, at least "
                         "one of domain, bus, or slot must be > 0"));
    }
    return false;
}

/**
 * virPCIDeviceAddressIsEmpty:
 * @addr: PCI address to inspect
 *
 * Returns true when domain, bus and slot of @addr are all zero (the
 * function number is not considered), false otherwise.
 */
bool
virPCIDeviceAddressIsEmpty(const virPCIDeviceAddress *addr)
{
    return addr->domain == 0 &&
           addr->bus == 0 &&
           addr->slot == 0;
}

/**
 * virPCIDeviceAddressEqual:
 * @addr1: first address
 * @addr2: second address
 *
 * Returns true when domain, bus, slot and function of both addresses
 * match, false otherwise.
 */
bool
virPCIDeviceAddressEqual(const virPCIDeviceAddress *addr1,
                         const virPCIDeviceAddress *addr2)
{
    return addr1->domain == addr2->domain &&
           addr1->bus == addr2->bus &&
           addr1->slot == addr2->slot &&
           addr1->function == addr2->function;
}

/**
 * virPCIDeviceAddressCopy:
 * @dst: where to store address
 * @src: source address to copy
 *
 * Copies the whole of @src into @dst, which has to be pre-allocated
 * by the caller.
 */
void
virPCIDeviceAddressCopy(virPCIDeviceAddress *dst,
                        const virPCIDeviceAddress *src)
{
    *dst = *src;
}

/**
 * virPCIDeviceAddressAsString:
 * @addr: PCI address to format
 *
 * Returns a newly allocated string holding @addr formatted with
 * VIR_PCI_DEVICE_ADDRESS_FMT; the caller is responsible for freeing it.
 */
char *
virPCIDeviceAddressAsString(const virPCIDeviceAddress *addr)
{
    char *str = g_strdup_printf(VIR_PCI_DEVICE_ADDRESS_FMT,
                                addr->domain,
                                addr->bus,
                                addr->slot,
                                addr->function);

    return str;
}

bool
virPCIDeviceExists(const virPCIDeviceAddress *addr)
{
    g_autofree char *devName = virPCIDeviceAddressAsString(addr);
    g_autofree char *devPath = g_strdup_printf(PCI_SYSFS "devices/%s/config",
                                               devName);

    return virFileExists(devPath);
}

virPCIDevice *
virPCIDeviceNew(const virPCIDeviceAddress *address)
{
    g_autoptr(virPCIDevice) dev = NULL;
    g_autofree char *vendor = NULL;
    g_autofree char *product = NULL;

    dev = g_new0(virPCIDevice, 1);

    virPCIDeviceAddressCopy(&dev->address, address);

    dev->name = virPCIDeviceAddressAsString(&dev->address);

    dev->path = g_strdup_printf(PCI_SYSFS "devices/%s/config", dev->name);

    if (!virFileExists(dev->path)) {
        virReportSystemError(errno,
                             _("Device %s not found: could not access %s"),
                             dev->name, dev->path);
        return NULL;
    }

    vendor  = virPCIDeviceReadID(dev, "vendor");
    product = virPCIDeviceReadID(dev, "device");

    if (!vendor || !product) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to read product/vendor ID for %s"),
                       dev->name);
        return NULL;
    }

    /* strings contain '0x' prefix */
    if (g_snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2],
                   &product[2]) >= sizeof(dev->id)) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("dev->id buffer overflow: %s %s"),
                       &vendor[2], &product[2]);
        return NULL;
    }

    VIR_DEBUG("%s %s: initialized", dev->id, dev->name);

    return g_steal_pointer(&dev);
}


/**
 * virPCIDeviceCopy:
 * @dev: device to duplicate
 *
 * Returns a newly allocated copy of @dev. All plain members are copied
 * as-is, while the name, path, used_by_drvname and used_by_domname
 * strings are duplicated so that the copy owns its own strings.
 */
virPCIDevice *
virPCIDeviceCopy(virPCIDevice *dev)
{
    virPCIDevice *dup = g_new0(virPCIDevice, 1);

    /* Start from a shallow copy to pick up every scalar attribute ... */
    *dup = *dev;

    /* ... then replace each borrowed string pointer with a private copy */
    dup->name = g_strdup(dev->name);
    dup->path = g_strdup(dev->path);
    dup->used_by_drvname = g_strdup(dev->used_by_drvname);
    dup->used_by_domname = g_strdup(dev->used_by_domname);

    return dup;
}


/**
 * virPCIDeviceFree:
 * @dev: device to release, may be NULL
 *
 * Free @dev together with the strings it owns. Passing NULL is a no-op.
 */
void
virPCIDeviceFree(virPCIDevice *dev)
{
    if (dev == NULL)
        return;

    VIR_DEBUG("%s %s: freeing", dev->id, dev->name);

    g_free(dev->used_by_domname);
    g_free(dev->used_by_drvname);
    g_free(dev->path);
    g_free(dev->name);
    g_free(dev);
}

/**
 * virPCIDeviceGetAddress:
 * @dev: device to get address from
 *
 * Hand out the PCI address embedded in @dev. The address is part of
 * the device object itself, so it is owned by the device and must not
 * be freed by the caller.
 *
 * Returns: a pointer to the address, which can never be NULL.
 */
virPCIDeviceAddress *
virPCIDeviceGetAddress(virPCIDevice *dev)
{
    virPCIDeviceAddress *addr = &dev->address;

    return addr;
}

/**
 * virPCIDeviceGetName:
 * @dev: device to query
 *
 * Returns the name string stored in @dev (domain:bus:slot.function).
 * The string belongs to @dev and must not be freed by the caller.
 */
const char *
virPCIDeviceGetName(virPCIDevice *dev)
{
    return dev->name;
}

/**
 * virPCIDeviceGetConfigPath:
 * @dev: device to query
 *
 * Returns a pointer to a string containing the path of @dev's PCI
 * config file. The string is the one stored in @dev, so it must not
 * be freed by the caller.
 */
const char *
virPCIDeviceGetConfigPath(virPCIDevice *dev)
{
    return dev->path;
}

/**
 * virPCIDeviceSetManaged:
 * @dev: device to update
 * @managed: new value of the "managed" flag
 *
 * Store @managed in @dev.
 */
void virPCIDeviceSetManaged(virPCIDevice *dev, bool managed)
{
    dev->managed = managed;
}

/**
 * virPCIDeviceGetManaged:
 * @dev: device to query
 *
 * Returns the "managed" flag currently stored in @dev.
 */
bool
virPCIDeviceGetManaged(virPCIDevice *dev)
{
    return dev->managed;
}

/**
 * virPCIDeviceSetStubDriver:
 * @dev: device to update
 * @driver: stub driver type to record
 *
 * Record @driver as the stub driver of @dev. This is the driver the
 * device gets bound to when it is detached from the host.
 */
void
virPCIDeviceSetStubDriver(virPCIDevice *dev, virPCIStubDriver driver)
{
    dev->stubDriver = driver;
}

/**
 * virPCIDeviceGetStubDriver:
 * @dev: device to query
 *
 * Returns the stub driver type currently recorded in @dev.
 */
virPCIStubDriver
virPCIDeviceGetStubDriver(virPCIDevice *dev)
{
    return dev->stubDriver;
}

/**
 * virPCIDeviceGetUnbindFromStub:
 * @dev: device to query
 *
 * Returns the "unbind_from_stub" flag of @dev, i.e. whether the
 * device is marked as needing to be unbound from its stub driver.
 */
bool
virPCIDeviceGetUnbindFromStub(virPCIDevice *dev)
{
    return dev->unbind_from_stub;
}

/**
 * virPCIDeviceSetUnbindFromStub:
 * @dev: device to update
 * @unbind: new value of the "unbind_from_stub" flag
 *
 * Store @unbind in @dev. When the flag is false, unbinding the device
 * from the stub driver is skipped.
 */
void
virPCIDeviceSetUnbindFromStub(virPCIDevice *dev, bool unbind)
{
    dev->unbind_from_stub = unbind;
}

/**
 * virPCIDeviceGetRemoveSlot:
 * @dev: device to query
 *
 * Returns the "remove_slot" flag currently stored in @dev.
 */
bool
virPCIDeviceGetRemoveSlot(virPCIDevice *dev)
{
    return dev->remove_slot;
}

/**
 * virPCIDeviceSetRemoveSlot:
 * @dev: device to update
 * @remove_slot: new value of the "remove_slot" flag
 *
 * Store @remove_slot in @dev.
 */
void
virPCIDeviceSetRemoveSlot(virPCIDevice *dev, bool remove_slot)
{
    dev->remove_slot = remove_slot;
}

/**
 * virPCIDeviceGetReprobe:
 * @dev: device to query
 *
 * Returns the "reprobe" flag currently stored in @dev.
 */
bool
virPCIDeviceGetReprobe(virPCIDevice *dev)
{
    return dev->reprobe;
}

/**
 * virPCIDeviceSetReprobe:
 * @dev: device to update
 * @reprobe: new value of the "reprobe" flag
 *
 * Store @reprobe in @dev.
 */
void
virPCIDeviceSetReprobe(virPCIDevice *dev, bool reprobe)
{
    dev->reprobe = reprobe;
}

/**
 * virPCIDeviceSetUsedBy:
 * @dev: device to update
 * @drv_name: name of the driver using the device, may be NULL
 * @dom_name: name of the domain using the device, may be NULL
 *
 * Replace the driver/domain names recorded in @dev with copies of
 * @drv_name and @dom_name. The previously stored strings are freed.
 * It is safe to pass strings obtained from virPCIDeviceGetUsedBy() on
 * the very same device.
 *
 * Always returns 0.
 */
int
virPCIDeviceSetUsedBy(virPCIDevice *dev,
                      const char *drv_name,
                      const char *dom_name)
{
    /* Duplicate before freeing: the arguments may alias the strings
     * currently stored in @dev, in which case freeing first would make
     * g_strdup() read released memory. */
    char *new_drvname = g_strdup(drv_name);
    char *new_domname = g_strdup(dom_name);

    g_free(dev->used_by_drvname);
    g_free(dev->used_by_domname);
    dev->used_by_drvname = new_drvname;
    dev->used_by_domname = new_domname;

    return 0;
}

/**
 * virPCIDeviceGetUsedBy:
 * @dev: device to query
 * @drv_name: filled with the driver name stored in @dev
 * @dom_name: filled with the domain name stored in @dev
 *
 * Hand out the "used by" driver and domain names recorded in @dev.
 * The returned pointers refer to the strings stored in @dev itself
 * (no copies are made), so they must not be freed by the caller.
 */
void
virPCIDeviceGetUsedBy(virPCIDevice *dev,
                      const char **drv_name,
                      const char **dom_name)
{
    *drv_name = dev->used_by_drvname;
    *dom_name = dev->used_by_domname;
}

/**
 * virPCIDeviceListNew:
 *
 * Create a new, empty PCI device list object.
 *
 * Returns the new list, or NULL if initialization or the object
 * allocation failed.
 */
virPCIDeviceList *
virPCIDeviceListNew(void)
{
    if (virPCIInitialize() < 0)
        return NULL;

    /* A NULL result of the allocation is passed on to the caller as-is */
    return virObjectLockableNew(virPCIDeviceListClass);
}

/* virPCIDeviceListDispose:
 * @obj: the virPCIDeviceList being disposed
 *
 * Free every device held by the list and then the array itself.
 */
static void
virPCIDeviceListDispose(void *obj)
{
    virPCIDeviceList *list = obj;
    size_t idx;

    for (idx = 0; idx < list->count; idx++) {
        virPCIDeviceFree(list->devs[idx]);
        list->devs[idx] = NULL;
    }

    list->count = 0;
    g_free(list->devs);
}

/**
 * virPCIDeviceListAdd:
 * @list: list to extend
 * @dev: device to append
 *
 * Append @dev to @list unless a device with the same address is
 * already present.
 *
 * Returns 0 on success, -1 (with an error reported) if the address is
 * already on the list.
 */
int
virPCIDeviceListAdd(virPCIDeviceList *list,
                    virPCIDevice *dev)
{
    if (!virPCIDeviceListFind(list, &dev->address)) {
        VIR_APPEND_ELEMENT(list->devs, list->count, dev);
        return 0;
    }

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("Device %s is already in use"), dev->name);
    return -1;
}


/* virPCIDeviceListAddCopy - add a *copy* of the device to this list
 *
 * @dev itself is left untouched and stays owned by the caller.
 *
 * Returns 0 on success, -1 on failure (the copy is freed again).
 */
int
virPCIDeviceListAddCopy(virPCIDeviceList *list, virPCIDevice *dev)
{
    g_autoptr(virPCIDevice) dup = virPCIDeviceCopy(dev);

    if (!dup || virPCIDeviceListAdd(list, dup) < 0)
        return -1;

    /* The list holds the copy now; keep auto-cleanup from freeing it */
    dup = NULL;
    return 0;
}


/**
 * virPCIDeviceListGet:
 * @list: list to read from
 * @idx: position of the wanted device
 *
 * Returns the device stored at position @idx of @list (still owned by
 * the list), or NULL if @idx is negative or past the end.
 */
virPCIDevice *
virPCIDeviceListGet(virPCIDeviceList *list,
                    int idx)
{
    if (idx < 0 || idx >= list->count)
        return NULL;

    return list->devs[idx];
}

/**
 * virPCIDeviceListCount:
 * @list: list to query
 *
 * Returns the number of devices currently stored in @list.
 */
size_t
virPCIDeviceListCount(virPCIDeviceList *list)
{
    return list->count;
}

/**
 * virPCIDeviceListStealIndex:
 * @list: list to remove from
 * @idx: position of the device to take out
 *
 * Remove the device at position @idx from @list without freeing it.
 *
 * Returns the removed device (now owned by the caller), or NULL if
 * @idx is out of range.
 */
virPCIDevice *
virPCIDeviceListStealIndex(virPCIDeviceList *list,
                           int idx)
{
    virPCIDevice *stolen = NULL;

    if (idx >= 0 && idx < list->count) {
        stolen = list->devs[idx];
        VIR_DELETE_ELEMENT(list->devs, idx, list->count);
    }

    return stolen;
}

/**
 * virPCIDeviceListSteal:
 * @list: list to remove from
 * @devAddr: address of the device to take out
 *
 * Remove the device with address @devAddr from @list without freeing it.
 *
 * Returns the removed device (now owned by the caller), or NULL if no
 * device with that address is on the list.
 */
virPCIDevice *
virPCIDeviceListSteal(virPCIDeviceList *list,
                      virPCIDeviceAddress *devAddr)
{
    int idx = virPCIDeviceListFindIndex(list, devAddr);

    return virPCIDeviceListStealIndex(list, idx);
}

/**
 * virPCIDeviceListDel:
 * @list: list to remove from
 * @devAddr: address of the device to delete
 *
 * Remove the device with address @devAddr from @list and free it.
 * Nothing happens if no such device is on the list.
 */
void
virPCIDeviceListDel(virPCIDeviceList *list,
                    virPCIDeviceAddress *devAddr)
{
    virPCIDevice *removed = virPCIDeviceListSteal(list, devAddr);

    virPCIDeviceFree(removed);
}

/**
 * virPCIDeviceListFindIndex:
 * @list: list to search
 * @devAddr: address to look for
 *
 * Returns the position of the first device in @list whose domain,
 * bus, slot and function match @devAddr, or -1 if there is none.
 */
int
virPCIDeviceListFindIndex(virPCIDeviceList *list,
                          virPCIDeviceAddress *devAddr)
{
    size_t pos;

    for (pos = 0; pos < list->count; pos++) {
        if (virPCIDeviceAddressEqual(&list->devs[pos]->address, devAddr))
            return pos;
    }

    return -1;
}


/**
 * virPCIDeviceListFindByIDs:
 * @list: list to search
 * @domain: PCI domain to match
 * @bus: PCI bus to match
 * @slot: PCI slot to match
 * @function: PCI function to match
 *
 * Returns the first device in @list whose address matches all four
 * components (still owned by the list), or NULL if there is none.
 */
virPCIDevice *
virPCIDeviceListFindByIDs(virPCIDeviceList *list,
                          unsigned int domain,
                          unsigned int bus,
                          unsigned int slot,
                          unsigned int function)
{
    size_t pos;

    for (pos = 0; pos < list->count; pos++) {
        virPCIDevice *candidate = list->devs[pos];

        if (candidate->address.domain != domain)
            continue;
        if (candidate->address.bus != bus)
            continue;
        if (candidate->address.slot != slot)
            continue;
        if (candidate->address.function != function)
            continue;

        return candidate;
    }

    return NULL;
}


/**
 * virPCIDeviceListFind:
 * @list: list to search
 * @devAddr: address to look for
 *
 * Returns the device in @list with address @devAddr (still owned by
 * the list), or NULL if there is none.
 */
virPCIDevice *
virPCIDeviceListFind(virPCIDeviceList *list, virPCIDeviceAddress *devAddr)
{
    int pos = virPCIDeviceListFindIndex(list, devAddr);

    if (pos < 0)
        return NULL;

    return list->devs[pos];
}


/**
 * virPCIDeviceFileIterate:
 * @dev: device whose sysfs files should be visited
 * @actor: callback invoked for each matching file
 * @opaque: caller data passed through to @actor
 *
 * Walk the sysfs directory of @dev and call @actor with the full path
 * of every file needed for device assignment. Iteration stops at the
 * first callback that returns a negative value.
 *
 * Returns 0 on success, -1 if the directory cannot be opened or read,
 * or if @actor failed.
 */
int
virPCIDeviceFileIterate(virPCIDevice *dev,
                        virPCIDeviceFileActor actor,
                        void *opaque)
{
    g_autofree char *pcidir = NULL;
    g_autoptr(DIR) dir = NULL;
    struct dirent *ent;
    int direrr;

    pcidir = g_strdup_printf("/sys/bus/pci/devices/" VIR_PCI_DEVICE_ADDRESS_FMT,
                             dev->address.domain, dev->address.bus, dev->address.slot,
                             dev->address.function);

    if (virDirOpen(&dir, pcidir) < 0)
        return -1;

    while ((direrr = virDirRead(dir, &ent, pcidir)) > 0) {
        g_autofree char *file = NULL;
        const char *entry = ent->d_name;
        /* Device assignment requires:
         *   $PCIDIR/config, $PCIDIR/resource, $PCIDIR/resourceNNN,
         *   $PCIDIR/rom, $PCIDIR/reset, $PCIDIR/vendor, $PCIDIR/device
         */
        bool wanted = STREQ(entry, "config") ||
                      STRPREFIX(entry, "resource") ||
                      STREQ(entry, "rom") ||
                      STREQ(entry, "vendor") ||
                      STREQ(entry, "device") ||
                      STREQ(entry, "reset");

        if (!wanted)
            continue;

        file = g_strdup_printf("%s/%s", pcidir, entry);
        if ((actor)(dev, file, opaque) < 0)
            return -1;
    }

    return direrr < 0 ? -1 : 0;
}


/* virPCIDeviceAddressIOMMUGroupIterate:
 *   @orig: address of the device whose iommu_group should be walked
 *   @actor: callback invoked for each device address in the group
 *   @opaque: caller data passed through to @actor
 *
 *   Call @actor for all devices in the same iommu_group as orig
 *   (including orig itself). Even if there is no iommu_group for the
 *   device, call @actor once for orig.
 *
 *   Returns 0 on success, -1 on failure; when the group directory
 *   cannot be opened, the result of the single @actor call is returned.
 */
int
virPCIDeviceAddressIOMMUGroupIterate(virPCIDeviceAddress *orig,
                                     virPCIDeviceAddressActor actor,
                                     void *opaque)
{
    g_autofree char *groupPath = NULL;
    g_autoptr(DIR) groupDir = NULL;
    struct dirent *ent;
    int direrr;

    groupPath = g_strdup_printf(PCI_SYSFS "devices/" VIR_PCI_DEVICE_ADDRESS_FMT "/iommu_group/devices",
                                orig->domain, orig->bus, orig->slot, orig->function);

    /* No group directory: only the original device gets processed */
    if (virDirOpenQuiet(&groupDir, groupPath) < 0)
        return (actor)(orig, opaque);

    while ((direrr = virDirRead(groupDir, &ent, groupPath)) > 0) {
        virPCIDeviceAddress member = { 0 };

        /* Each entry is expected to be named after a PCI address */
        if (virPCIDeviceAddressParse(ent->d_name, &member) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Found invalid device link '%s' in '%s'"),
                           ent->d_name, groupPath);
            return -1;
        }

        if ((actor)(&member, opaque) < 0)
            return -1;
    }

    return direrr < 0 ? -1 : 0;
}


/* virPCIDeviceGetIOMMUGroupAddOne:
 * @newDevAddr: address of a device in the group
 * @opaque: the virPCIDeviceList being built
 *
 * Iteration callback: create a device object for @newDevAddr and
 * append it to the list passed in @opaque.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
virPCIDeviceGetIOMMUGroupAddOne(virPCIDeviceAddress *newDevAddr, void *opaque)
{
    virPCIDeviceList *groupList = opaque;
    g_autoptr(virPCIDevice) member = virPCIDeviceNew(newDevAddr);

    if (!member)
        return -1;

    if (virPCIDeviceListAdd(groupList, member) < 0)
        return -1;

    /* Ownership moved to the list, so do not auto-free it */
    member = NULL;
    return 0;
}


/*
 * virPCIDeviceGetIOMMUGroupList - return a virPCIDeviceList containing
 * all of the devices in the same iommu_group as @dev.
 *
 * Return the new list, or NULL on failure (a partially built list is
 * released again).
 */
virPCIDeviceList *
virPCIDeviceGetIOMMUGroupList(virPCIDevice *dev)
{
    virPCIDeviceList *groupList = virPCIDeviceListNew();

    if (groupList &&
        virPCIDeviceAddressIOMMUGroupIterate(&(dev->address),
                                             virPCIDeviceGetIOMMUGroupAddOne,
                                             groupList) >= 0)
        return groupList;

    virObjectUnref(groupList);
    return NULL;
}


/* Bundle of the caller-owned output locations handed to
 * virPCIGetIOMMUGroupAddressesAddOne() through its opaque pointer. */
typedef struct {
    /* points at the caller's array of address pointers, which grows
     * as addresses are appended */
    virPCIDeviceAddress ***iommuGroupDevices;
    /* points at the caller's element count for that array */
    size_t *nIommuGroupDevices;
} virPCIDeviceAddressList;

/* virPCIGetIOMMUGroupAddressesAddOne:
 * @newDevAddr: address of a device in the group
 * @opaque: virPCIDeviceAddressList describing the output array
 *
 * Iteration callback: append a heap-allocated copy of @newDevAddr to
 * the array referenced by @opaque.
 *
 * Always returns 0.
 */
static int
virPCIGetIOMMUGroupAddressesAddOne(virPCIDeviceAddress *newDevAddr, void *opaque)
{
    virPCIDeviceAddressList *out = opaque;
    /* the list stores its own copy, not the caller's address */
    g_autofree virPCIDeviceAddress *entry = g_new0(virPCIDeviceAddress, 1);

    *entry = *newDevAddr;

    VIR_APPEND_ELEMENT(*out->iommuGroupDevices,
                       *out->nIommuGroupDevices, entry);

    return 0;
}


/*
 * virPCIDeviceAddressGetIOMMUGroupAddresses:
 * @devAddr: address of the device whose iommu_group is wanted
 * @iommuGroupDevices: array that receives newly allocated copies of
 *                     the addresses of all devices in the group
 * @nIommuGroupDevices: element count of @iommuGroupDevices, updated
 *                      as addresses are appended
 *
 * Collect the addresses of all devices in the same iommu_group as
 * @devAddr (only @devAddr itself when the device has no iommu_group).
 *
 * Returns 0 on success, -1 on failure.
 */
int
virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddress *devAddr,
                                          virPCIDeviceAddress ***iommuGroupDevices,
                                          size_t *nIommuGroupDevices)
{
    virPCIDeviceAddressList out = {
        iommuGroupDevices,
        nIommuGroupDevices,
    };
    int rc;

    rc = virPCIDeviceAddressIOMMUGroupIterate(devAddr,
                                              virPCIGetIOMMUGroupAddressesAddOne,
                                              &out);

    return rc < 0 ? -1 : 0;
}


/* virPCIDeviceAddressGetIOMMUGroupNum - return the group number of
 * this PCI device's iommu_group, or -2 if there is no iommu_group for
 * the device (or -1 if there was any other error)
 *
 * The number is taken from the last path component of the target of
 * the device's "iommu_group" symlink in sysfs.
 */
int
virPCIDeviceAddressGetIOMMUGroupNum(virPCIDeviceAddress *addr)
{
    g_autofree char *addrStr = virPCIDeviceAddressAsString(addr);
    g_autofree char *linkPath = virPCIFile(addrStr, "iommu_group");
    g_autofree char *linkTarget = NULL;
    g_autofree char *numStr = NULL;
    unsigned int num;

    /* Anything other than a symlink means "no iommu_group" */
    if (virFileIsLink(linkPath) != 1)
        return -2;

    if (virFileResolveLink(linkPath, &linkTarget) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       addrStr, linkPath);
        return -1;
    }

    numStr = g_path_get_basename(linkTarget);
    if (virStrToLong_ui(numStr, NULL, 10, &num) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("device %s iommu_group symlink %s has "
                         "invalid group number %s"),
                       addrStr, linkTarget, numStr);
        return -1;
    }

    return num;
}


char *
virPCIDeviceAddressGetIOMMUGroupDev(const virPCIDeviceAddress *devAddr)
{
    g_autoptr(virPCIDevice) pci = NULL;

    if (!(pci = virPCIDeviceNew(devAddr)))
        return NULL;

    return virPCIDeviceGetIOMMUGroupDev(pci);
}


/* virPCIDeviceGetIOMMUGroupDev - return the name of the device used
 * to control this PCI device's group (e.g. "/dev/vfio/15")
 *
 * The group is taken from the last path component of the target of
 * the device's "iommu_group" symlink in sysfs.
 *
 * Returns a newly allocated string to be freed by the caller, or NULL
 * (with an error reported) if the symlink is missing or cannot be
 * resolved.
 */
char *
virPCIDeviceGetIOMMUGroupDev(virPCIDevice *dev)
{
    g_autofree char *linkPath = virPCIFile(dev->name, "iommu_group");
    g_autofree char *linkTarget = NULL;
    g_autofree char *groupName = NULL;

    if (virFileIsLink(linkPath) != 1) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Invalid device %s iommu_group file %s is not a symlink"),
                       dev->name, linkPath);
        return NULL;
    }

    if (virFileResolveLink(linkPath, &linkTarget) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Unable to resolve device %s iommu_group symlink %s"),
                       dev->name, linkPath);
        return NULL;
    }

    groupName = g_path_get_basename(linkTarget);

    return g_strdup_printf("/dev/vfio/%s", groupName);
}

/* virPCIDeviceDownstreamLacksACS:
 * @dev: device to inspect
 *
 * Check whether @dev is a PCIe downstream port (a PCI bridge whose
 * Express capability reports the downstream port type) that either
 * has no ACS extended capability or does not have all of the
 * PCI_EXT_CAP_ACS_ENABLED control bits set.
 *
 * Returns 1 if @dev is such a port, 0 if it is not a downstream port
 * or has ACS enabled, and -1 on error (config file cannot be opened,
 * capabilities cannot be initialized, or the device class cannot be
 * read).
 */
static int
virPCIDeviceDownstreamLacksACS(virPCIDevice *dev)
{
    uint16_t flags;
    uint16_t ctrl;
    unsigned int pos;
    int fd;
    int ret = 0;
    uint16_t device_class;

    if ((fd = virPCIDeviceConfigOpen(dev)) < 0)
        return -1;

    if (virPCIDeviceInit(dev, fd) < 0) {
        ret = -1;
        goto cleanup;
    }

    /* Not knowing the device class is an error: it must not be turned
     * into a "this port does not lack ACS" answer. */
    if (virPCIDeviceReadClass(dev, &device_class) < 0) {
        ret = -1;
        goto cleanup;
    }

    /* Only PCIe devices that are PCI bridges can be downstream ports */
    pos = dev->pcie_cap_pos;
    if (!pos || device_class != PCI_CLASS_BRIDGE_PCI)
        goto cleanup;

    flags = virPCIDeviceRead16(dev, fd, pos + PCI_EXP_FLAGS);
    if (((flags & PCI_EXP_FLAGS_TYPE) >> 4) != PCI_EXP_TYPE_DOWNSTREAM)
        goto cleanup;

    pos = virPCIDeviceFindExtendedCapabilityOffset(dev, fd, PCI_EXT_CAP_ID_ACS);
    if (!pos) {
        VIR_DEBUG("%s %s: downstream port lacks ACS", dev->id, dev->name);
        ret = 1;
        goto cleanup;
    }

    /* ACS only counts as enabled when every required bit is set */
    ctrl = virPCIDeviceRead16(dev, fd, pos + PCI_EXT_ACS_CTRL);
    if ((ctrl & PCI_EXT_CAP_ACS_ENABLED) != PCI_EXT_CAP_ACS_ENABLED) {
        VIR_DEBUG("%s %s: downstream port has ACS disabled",
                  dev->id, dev->name);
        ret = 1;
        goto cleanup;
    }

 cleanup:
    virPCIDeviceConfigClose(dev, fd);
    return ret;
}

/*
 * virPCIDeviceIsBehindSwitchLackingACS:
 * @dev: PCI device whose chain of parents is walked
 *
 * Walks from @dev's parent upwards and asks
 * virPCIDeviceDownstreamLacksACS() about every device on the way.
 *
 * Returns 1 as soon as one parent reports that it lacks ACS, 0 if none
 * does (or if @dev has no parent and sits on bus 0), -1 on error.
 */
static int
virPCIDeviceIsBehindSwitchLackingACS(virPCIDevice *dev)
{
    g_autoptr(virPCIDevice) upstream = NULL;

    if (virPCIDeviceGetParent(dev, &upstream) < 0)
        return -1;

    if (!upstream) {
        /* A device without a parent is only acceptable on the root bus:
         * there ACS doesn't come into play since devices on the root bus
         * can't P2P without going through the root IOMMU.
         */
        if (dev->address.bus != 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Failed to find parent device for %s"),
                           dev->name);
            return -1;
        }

        return 0;
    }

    /* XXX we should rather fail when we can't find device's parent and
     * stop the loop when we get to root instead of just stopping when no
     * parent can be found
     */
    while (upstream) {
        g_autoptr(virPCIDevice) current = NULL;
        int acs = virPCIDeviceDownstreamLacksACS(upstream);

        if (acs < 0)
            return -1;
        if (acs > 0)
            return 1;

        /* step one level up; 'current' is released at the end of this
         * iteration */
        current = g_steal_pointer(&upstream);
        if (virPCIDeviceGetParent(current, &upstream) < 0)
            return -1;
    }

    return 0;
}

/*
 * virPCIDeviceIsAssignable:
 * @dev: PCI device to check
 * @strict_acs_check: if non-zero, refuse devices behind a switch lacking ACS
 *
 * Returns 1 if the device may be assigned, 0 if it may not or if the ACS
 * check itself failed.
 */
int virPCIDeviceIsAssignable(virPCIDevice *dev,
                             int strict_acs_check)
{
    int lacksACS;

    /* XXX This could be a great place to actually check that a non-managed
     * device isn't in use, e.g. by checking that device is either un-bound
     * or bound to a stub driver.
     */

    lacksACS = virPCIDeviceIsBehindSwitchLackingACS(dev);

    if (lacksACS < 0)
        return 0;

    if (lacksACS == 0)
        return 1;

    if (strict_acs_check) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Device %s is behind a switch lacking ACS and "
                         "cannot be assigned"),
                       dev->name);
        return 0;
    }

    VIR_DEBUG("%s %s: strict ACS check disabled; device assignment allowed",
              dev->id, dev->name);
    return 1;
}

/*
 * logStrToLong_ui:
 *
 * Calls virStrToLong_ui() with the same arguments and logs an error
 * naming @s whenever that call returns non-zero.
 *
 * Returns the value returned by virStrToLong_ui().
 */
static int
logStrToLong_ui(char const *s,
                char **end_ptr,
                int base,
                unsigned int *result)
{
    int rc = virStrToLong_ui(s, end_ptr, base, result);

    if (rc == 0)
        return 0;

    VIR_ERROR(_("Failed to convert '%s' to unsigned int"), s);
    return rc;
}

/*
 * virPCIDeviceAddressParse:
 * @address: string holding four hexadecimal fields (domain, bus, slot,
 *           function), each separated from the next by one character
 * @bdf: filled with the parsed fields
 *
 * The separator characters themselves are not validated; exactly one
 * character is skipped between consecutive fields. If the string ends
 * before all four fields were read the parse fails instead of reading
 * beyond the terminating NUL.
 *
 * Returns 0 on success, -1 on failure (@bdf may be partially filled).
 */
int
virPCIDeviceAddressParse(char *address,
                         virPCIDeviceAddress *bdf)
{
    char *p = NULL;

    if (address == NULL ||
        logStrToLong_ui(address, &p, 16, &bdf->domain) < 0) {
        return -1;
    }

    /* p points at the separator; if it points at the NUL terminator
     * instead, p + 1 would be outside of the string */
    if (p == NULL || *p == '\0' ||
        logStrToLong_ui(p + 1, &p, 16, &bdf->bus) < 0) {
        return -1;
    }

    if (p == NULL || *p == '\0' ||
        logStrToLong_ui(p + 1, &p, 16, &bdf->slot) < 0) {
        return -1;
    }

    if (p == NULL || *p == '\0' ||
        logStrToLong_ui(p + 1, &p, 16, &bdf->function) < 0) {
        return -1;
    }

    return 0;
}


/* Returns true unless both the uid and the fid of @addr are set. */
bool
virZPCIDeviceAddressIsIncomplete(const virZPCIDeviceAddress *addr)
{
    return !(addr->uid.isSet && addr->fid.isSet);
}


/* Returns true if at least one of uid and fid of @addr is set. */
bool
virZPCIDeviceAddressIsPresent(const virZPCIDeviceAddress *addr)
{
    if (addr->uid.isSet)
        return true;

    return addr->fid.isSet;
}


/*
 * virPCIVirtualFunctionListFree:
 * @list: list to free, may be NULL
 *
 * Frees the address and interface name of every entry, the array of
 * entries itself, and finally @list.
 */
void
virPCIVirtualFunctionListFree(virPCIVirtualFunctionList *list)
{
    size_t i;

    if (!list)
        return;

    for (i = 0; i < list->nfunctions; i++) {
        g_free(list->functions[i].addr);
        g_free(list->functions[i].ifname);
    }

    /* the array holding the entries is a separate allocation */
    g_free(list->functions);
    g_free(list);
}


/*
 * virPCIGetVirtualFunctions:
 * @sysfs_path: passed through to virPCIGetVirtualFunctionsFull()
 * @vfs: passed through to virPCIGetVirtualFunctionsFull()
 *
 * Wrapper around virPCIGetVirtualFunctionsFull() which passes NULL as the
 * PF netdev name.
 *
 * Returns the value returned by virPCIGetVirtualFunctionsFull().
 */
int
virPCIGetVirtualFunctions(const char *sysfs_path,
                          virPCIVirtualFunctionList **vfs)
{
    return virPCIGetVirtualFunctionsFull(sysfs_path, vfs, NULL);
}


#ifdef __linux__

virPCIDeviceAddress *
virPCIGetDeviceAddressFromSysfsLink(const char *device_link)
{
    g_autofree virPCIDeviceAddress *bdf = NULL;
    g_autofree char *config_address = NULL;
    g_autofree char *device_path = NULL;

    if (!virFileExists(device_link)) {
        VIR_DEBUG("'%s' does not exist", device_link);
        return NULL;
    }

    device_path = virFileCanonicalizePath(device_link);
    if (device_path == NULL) {
        virReportSystemError(errno,
                             _("Failed to resolve device link '%s'"),
                             device_link);
        return NULL;
    }

    config_address = g_path_get_basename(device_path);
    bdf = g_new0(virPCIDeviceAddress, 1);

    if (virPCIDeviceAddressParse(config_address, bdf) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Failed to parse PCI config address '%s'"),
                       config_address);
        return NULL;
    }

    return g_steal_pointer(&bdf);
}

/**
 * virPCIGetPhysicalFunction:
 * @vf_sysfs_path: sysfs path for the virtual function
 * @pf: where to store the physical function's address
 *
 * Looks up the "physfn" entry below @vf_sysfs_path and stores a pointer
 * to a newly-allocated virPCIDeviceAddress describing it in @pf.
 *
 * @pf is set to NULL when the address cannot be obtained, e.g. because
 * @vf_sysfs_path does not point to a virtual function. If it's not NULL,
 * it should be freed by the caller when no longer needed.
 *
 * Returns: >=0 on success, <0 on failure (this implementation always
 * returns 0; callers need to check @pf)
 */
int
virPCIGetPhysicalFunction(const char *vf_sysfs_path,
                          virPCIDeviceAddress **pf)
{
    g_autofree char *physfnLink = NULL;
    virPCIDeviceAddress *addr = NULL;

    *pf = NULL;

    virBuildPath(&physfnLink, vf_sysfs_path, "physfn");

    addr = virPCIGetDeviceAddressFromSysfsLink(physfnLink);
    *pf = addr;

    if (addr) {
        VIR_DEBUG("PF for VF device '%s': " VIR_PCI_DEVICE_ADDRESS_FMT,
                  vf_sysfs_path,
                  addr->domain, addr->bus, addr->slot, addr->function);
    }

    return 0;
}


/**
 * virPCIGetVirtualFunctionsFull:
 * @sysfs_path: path to physical function sysfs entry
 * @vfs: filled with the virtual function data
 * @pfNetDevName: Optional netdev name of this PF. If provided, the netdev
 *                names of the VFs are queried too.
 *
 * Collects the virtual functions of a physical function by following the
 * "virtfnN" entries below @sysfs_path, starting at N = 0, until one is
 * missing. If "sriov_totalvfs" exists its value is stored as the maximum
 * number of functions.
 *
 * Returns 0 on success (with @vfs set to a list the caller owns),
 * -1 on failure (with @vfs set to NULL).
 */
int
virPCIGetVirtualFunctionsFull(const char *sysfs_path,
                              virPCIVirtualFunctionList **vfs,
                              const char *pfNetDevName)
{
    g_autofree char *totalvfsPath = NULL;
    g_autofree char *totalvfsData = NULL;
    g_autoptr(virPCIVirtualFunctionList) found = g_new0(virPCIVirtualFunctionList, 1);

    *vfs = NULL;

    totalvfsPath = g_strdup_printf("%s/sriov_totalvfs", sysfs_path);
    if (virFileExists(totalvfsPath)) {
        /* passing an end pointer keeps the terminating \n from being
         * treated as an error */
        char *tail = NULL;
        unsigned long long total = 0;

        if (virFileReadAll(totalvfsPath, 16, &totalvfsData) < 0)
            return -1;

        if (virStrToLong_ull(totalvfsData, &tail, 10, &total) < 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Unrecognized value in %s: %s"),
                           totalvfsPath, totalvfsData);
            return -1;
        }

        found->maxfunctions = total;
    }

    /* look for virtfn%d links until one isn't found */
    for (;;) {
        g_autofree char *vfLink = NULL;
        struct virPCIVirtualFunction vf = { NULL, NULL };

        vfLink = g_strdup_printf("%s/virtfn%zu", sysfs_path, found->nfunctions);
        if (!virFileExists(vfLink))
            break;

        vf.addr = virPCIGetDeviceAddressFromSysfsLink(vfLink);
        if (!vf.addr) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Failed to get SRIOV function from device link '%s'"),
                           vfLink);
            return -1;
        }

        if (pfNetDevName &&
            virPCIGetNetName(vfLink, 0, pfNetDevName, &vf.ifname) < 0) {
            /* not yet part of the list, so it has to be released here */
            g_free(vf.addr);
            return -1;
        }

        VIR_APPEND_ELEMENT(found->functions, found->nfunctions, vf);
    }

    VIR_DEBUG("Found %zu virtual functions for %s", found->nfunctions, sysfs_path);

    *vfs = g_steal_pointer(&found);
    return 0;
}


/*
 * Returns 1 if vf device is a virtual function, 0 if not, -1 on error
 */
int
virPCIIsVirtualFunction(const char *vf_sysfs_device_link)
{
    g_autofree char *vf_sysfs_physfn_link = NULL;

    vf_sysfs_physfn_link = g_strdup_printf("%s/physfn", vf_sysfs_device_link);

    return virFileExists(vf_sysfs_physfn_link);
}

/*
 * Returns the sriov virtual function index of vf given its pf
 */
int
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link,
                              const char *vf_sysfs_device_link,
                              int *vf_index)
{
    size_t i;
    g_autofree virPCIDeviceAddress *vf_bdf = NULL;
    g_autoptr(virPCIVirtualFunctionList) virt_fns = NULL;

    if (!(vf_bdf = virPCIGetDeviceAddressFromSysfsLink(vf_sysfs_device_link)))
        return -1;

    if (virPCIGetVirtualFunctions(pf_sysfs_device_link, &virt_fns) < 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("Error getting physical function's '%s' "
                         "virtual_functions"), pf_sysfs_device_link);
        return -1;
    }

    for (i = 0; i < virt_fns->nfunctions; i++) {
        if (virPCIDeviceAddressEqual(vf_bdf, virt_fns->functions[i].addr)) {
            *vf_index = i;
            return 0;
        }
    }

    return -1;
}

/*
 * Stores in @pci_sysfs_device_link a newly allocated path to the PCI
 * sysfs entry of the function addressed by @addr (domain, bus, slot and
 * function). Always returns 0.
 */

int
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddress *addr,
                                char **pci_sysfs_device_link)
{
    char *sysfsPath = g_strdup_printf(PCI_SYSFS "devices/" VIR_PCI_DEVICE_ADDRESS_FMT,
                                      addr->domain, addr->bus,
                                      addr->slot, addr->function);

    *pci_sysfs_device_link = sysfsPath;
    return 0;
}

/**
 * virPCIGetNetName:
 * @device_link_sysfs_path: sysfs path to the PCI device
 * @idx: used to choose which netdev when there are several
 *       (ignored if physPortID is set or physPortName is available)
 * @physPortNetDevName: if non-null, attempt to learn the phys_port_id
 *                      of the netdev interface named
 *                      @physPortNetDevName, and find a netdev for
 *                      this PCI device that has the same
 *                      phys_port_id. if @physPortNetDevName is NULL,
 *                      or has no phys_port_id, then use
 *                      phys_port_name or idx to determine which
 *                      netdev to return. (NB: as of today, only mlx
 *                      drivers/cards can have multiple phys_ports for
 *                      a single PCI device; on all other devices
 *                      there is only a single choice of netdev, and
 *                      phys_port_id, phys_port_name, and idx are
 *                      unavailable/unused)
 * @netname: used to return the name of the netdev
 *       (set to NULL (but returns success) if the "net" directory of
 *       the device cannot be opened)
 *
 * The entries of the "net" directory below @device_link_sysfs_path are
 * examined one by one. If none of them is selected by the criteria
 * above, the first entry that was seen is returned instead. If the
 * directory can be opened but yields no entry at all, an error is
 * reported.
 *
 * Returns 0 on success, -1 on error (error has been logged)
 */
int
virPCIGetNetName(const char *device_link_sysfs_path,
                 size_t idx,
                 const char *physPortNetDevName,
                 char **netname)
{
    g_autofree char *physPortID = NULL;
    g_autofree char *pcidev_sysfs_net_path = NULL;
    g_autofree char *firstEntryName = NULL;
    g_autoptr(DIR) dir = NULL;
    struct dirent *entry = NULL;
    size_t i = 0;

    *netname = NULL;

    /* physPortID stays NULL when no reference netdev was given */
    if (physPortNetDevName &&
        virNetDevGetPhysPortID(physPortNetDevName, &physPortID) < 0) {
        return -1;
    }

    virBuildPath(&pcidev_sysfs_net_path, device_link_sysfs_path, "net");

    if (virDirOpenQuiet(&dir, pcidev_sysfs_net_path) < 0) {
        /* this *isn't* an error - caller needs to check for netname == NULL */
        return 0;
    }

    while (virDirRead(dir, &entry, pcidev_sysfs_net_path) > 0) {
        /* save the first entry we find to use as a failsafe
         * in case we don't match the phys_port_id. This is
         * needed because some NIC drivers (e.g. i40e)
         * implement phys_port_id for PFs, but not for VFs
         */
        if (!firstEntryName)
            firstEntryName = g_strdup(entry->d_name);

        /* if the caller sent a physPortID, compare it to the
         * physportID of this netdev. If not, look for entry[idx].
         */
        if (physPortID) {
            g_autofree char *thisPhysPortID = NULL;

            if (virNetDevGetPhysPortID(entry->d_name, &thisPhysPortID) < 0)
                return -1;

            /* if this one doesn't match, keep looking */
            if (STRNEQ_NULLABLE(physPortID, thisPhysPortID))
                continue;

        } else {
            /* Most switch devices use phys_port_name instead of
             * phys_port_id.
             * NOTE: VFs' representors net devices can be linked to PF's PCI
             * device, which mean that there'll be multiple net devices
             * instances and to get a proper net device need to match on
             * specific regex.
             * To get PF netdev, for ex., used following regex:
             * "(p[0-9]+$)|(p[0-9]+s[0-9]+$)"
             * or to get exact VF's netdev next regex is used:
             * "pf0vf1$"
             */
            g_autofree char *thisPhysPortName = NULL;

            if (virNetDevGetPhysPortName(entry->d_name, &thisPhysPortName) < 0)
                return -1;

            if (thisPhysPortName) {

                /* if this one doesn't match, keep looking */
                if (!virStringMatch(thisPhysPortName, VIR_PF_PHYS_PORT_NAME_REGEX))
                    continue;

            } else {

                /* no phys_port_name: skip entries until @idx of them
                 * have been passed over */
                if (i++ < idx)
                    continue;
            }
        }

        /* reaching this point means the current entry was selected */
        *netname = g_strdup(entry->d_name);
        return 0;
    }

    if (firstEntryName) {
        /* we didn't match the provided phys_port_id / find a
         * phys_port_name matching VIR_PF_PHYS_PORT_NAME_REGEX / find
         * as many net devices as the value of idx, but this is
         * probably because phys_port_id / phys_port_name isn't
         * implemented for this NIC driver, so just return the first
         * (probably only) netname we found.
         */
        *netname = g_steal_pointer(&firstEntryName);
        return 0;
    }

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("Could not find any network device under PCI device at %s"),
                   device_link_sysfs_path);
    return -1;
}

int
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path,
                             int pfNetDevIdx,
                             char **pfname,
                             int *vf_index)
{
    g_autofree virPCIDeviceAddress *pf_config_address = NULL;
    g_autofree char *pf_sysfs_device_path = NULL;
    g_autofree char *vfname = NULL;

    if (virPCIGetPhysicalFunction(vf_sysfs_device_path, &pf_config_address) < 0)
        return -1;

    if (!pf_config_address)
        return -1;

    if (virPCIDeviceAddressGetSysfsFile(pf_config_address,
                                        &pf_sysfs_device_path) < 0) {
        return -1;
    }

    if (virPCIGetVirtualFunctionIndex(pf_sysfs_device_path,
                                      vf_sysfs_device_path, vf_index) < 0) {
        return -1;
    }

    /* If the caller hasn't asked for a specific pfNetDevIdx, and VF
     * is bound to a netdev, learn that netdev's phys_port_id (if
     * available). This can be used to disambiguate when the PF has
     * multiple netdevs. If the VF isn't bound to a netdev, then we
     * return netdev[pfNetDevIdx] on the PF, which may or may not be
     * correct.
     */
    if (pfNetDevIdx == -1) {
        if (virPCIGetNetName(vf_sysfs_device_path, 0, NULL, &vfname) < 0)
            return -1;

        pfNetDevIdx = 0;
    }

    if (virPCIGetNetName(pf_sysfs_device_path, pfNetDevIdx, vfname, pfname) < 0)
        return -1;

    if (!*pfname) {
        /* this shouldn't be possible. A VF can't exist unless its
         * PF device is bound to a network driver
         */
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("The PF device for VF %s has no network device name"),
                       vf_sysfs_device_path);
        return -1;
    }

    return 0;
}


/*
 * virPCIDeviceHasVPD:
 * @dev: PCI device to check
 *
 * Returns true if the "vpd" file of @dev exists and is a regular file,
 * false otherwise (a message is logged in that case).
 */
bool
virPCIDeviceHasVPD(virPCIDevice *dev)
{
    g_autofree char *path = virPCIFile(dev->name, "vpd");

    if (!virFileExists(path)) {
        VIR_INFO("Device VPD file does not exist %s", path);
        return false;
    }

    if (!virFileIsRegular(path)) {
        VIR_WARN("VPD path does not point to a regular file %s", path);
        return false;
    }

    return true;
}

/**
 * virPCIDeviceGetVPD:
 * @dev: a PCI device to get a PCI VPD for.
 *
 * Obtain a PCI device's Vital Product Data (VPD). VPD is optional in
 * both PCI Local Bus and PCIe specifications so there is no guarantee it
 * will be there for a particular device.
 *
 * Returns: a pointer to virPCIVPDResource which needs to be freed by the caller
 * or NULL if getting it failed for some reason (e.g. invalid format, I/O error).
 */
virPCIVPDResource *
virPCIDeviceGetVPD(virPCIDevice *dev)
{
    g_autofree char *vpdPath = NULL;
    int fd;
    g_autoptr(virPCIVPDResource) res = NULL;

    vpdPath = virPCIFile(dev->name, "vpd");
    if (!virPCIDeviceHasVPD(dev)) {
        virReportError(VIR_ERR_INTERNAL_ERROR, _("Device %s does not have a VPD"),
                       virPCIDeviceGetName(dev));
        return NULL;
    }
    if ((fd = open(vpdPath, O_RDONLY)) < 0) {
        /* open() signals failure with -1; the reason is in errno, not
         * in the returned value */
        virReportSystemError(errno, _("Failed to open a VPD file '%s'"), vpdPath);
        return NULL;
    }
    res = virPCIVPDParse(fd);

    /* a close failure discards whatever was parsed */
    if (VIR_CLOSE(fd) < 0) {
        /* NOTE(review): VIR_CLOSE may already have reset fd by the time
         * it is printed here - confirm against its definition */
        virReportSystemError(errno, _("Unable to close the VPD file, fd: %d"), fd);
        return NULL;
    }

    return g_steal_pointer(&res);
}

#else
/* Error text shared by the stub implementations used when __linux__ is
 * not defined; marked with N_() here and passed through _() at each use. */
static const char *unsupported = N_("not supported on non-linux platforms");

/* Non-Linux stub: reports the "unsupported" error and returns NULL. */
virPCIDeviceAddress *
virPCIGetDeviceAddressFromSysfsLink(const char *device_link G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return NULL;
}


/* Non-Linux stub: reports the "unsupported" error and returns -1;
 * @pf is not touched. */
int
virPCIGetPhysicalFunction(const char *vf_sysfs_path G_GNUC_UNUSED,
                          virPCIDeviceAddress **pf G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return -1;
}

/* Non-Linux stub: reports the "unsupported" error and returns -1;
 * @vfs is not touched. */
int
virPCIGetVirtualFunctionsFull(const char *sysfs_path G_GNUC_UNUSED,
                              virPCIVirtualFunctionList **vfs G_GNUC_UNUSED,
                              const char *pfNetDevName G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return -1;
}

/* Non-Linux stub: reports the "unsupported" error and returns -1. */
int
virPCIIsVirtualFunction(const char *vf_sysfs_device_link G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return -1;
}

/* Non-Linux stub: reports the "unsupported" error and returns -1;
 * @vf_index is not touched. */
int
virPCIGetVirtualFunctionIndex(const char *pf_sysfs_device_link G_GNUC_UNUSED,
                              const char *vf_sysfs_device_link G_GNUC_UNUSED,
                              int *vf_index G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return -1;

}


/* Non-Linux stub: reports the "unsupported" error and returns -1;
 * @pci_sysfs_device_link is not touched. */
int
virPCIDeviceAddressGetSysfsFile(virPCIDeviceAddress *dev G_GNUC_UNUSED,
                                char **pci_sysfs_device_link G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return -1;
}

/* Non-Linux stub: reports the "unsupported" error and returns -1;
 * @netname is not touched. */
int
virPCIGetNetName(const char *device_link_sysfs_path G_GNUC_UNUSED,
                 size_t idx G_GNUC_UNUSED,
                 const char *physPortNetDevName G_GNUC_UNUSED,
                 char **netname G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return -1;
}

/* Non-Linux stub: reports the "unsupported" error and returns -1;
 * neither @pfname nor @vf_index is touched. */
int
virPCIGetVirtualFunctionInfo(const char *vf_sysfs_device_path G_GNUC_UNUSED,
                             int pfNetDevIdx G_GNUC_UNUSED,
                             char **pfname G_GNUC_UNUSED,
                             int *vf_index G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return -1;
}

/* Non-Linux stub: reports the "unsupported" error and returns false. */
bool
virPCIDeviceHasVPD(virPCIDevice *dev G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    /* the return type is bool, so return a boolean rather than a
     * null pointer constant */
    return false;
}

/* Non-Linux stub: reports the "unsupported" error and returns NULL. */
virPCIVPDResource *
virPCIDeviceGetVPD(virPCIDevice *dev G_GNUC_UNUSED)
{
    virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _(unsupported));
    return NULL;
}
#endif /* __linux__ */

/*
 * virPCIDeviceIsPCIExpress:
 * @dev: PCI device to check
 *
 * Returns the device's is_pcie value after initializing it from config
 * space, or -1 if the config space cannot be opened or the
 * initialization fails.
 */
int
virPCIDeviceIsPCIExpress(virPCIDevice *dev)
{
    int result = -1;
    int cfgfd = virPCIDeviceConfigOpen(dev);

    if (cfgfd < 0)
        return -1;

    if (virPCIDeviceInit(dev, cfgfd) >= 0)
        result = dev->is_pcie;

    virPCIDeviceConfigClose(dev, cfgfd);
    return result;
}

/*
 * virPCIDeviceHasPCIExpressLink:
 * @dev: PCI device to check
 *
 * Returns 0 if @dev has no PCIe capability or if the type stored in its
 * PCIe capability flags is PCI_EXP_TYPE_ROOT_INT_EP or
 * PCI_EXP_TYPE_ROOT_EC, 1 for any other type, and -1 if the config
 * space cannot be opened or the device cannot be initialized.
 */
int
virPCIDeviceHasPCIExpressLink(virPCIDevice *dev)
{
    int result = -1;
    int cfgfd = virPCIDeviceConfigOpen(dev);

    if (cfgfd < 0)
        return -1;

    if (virPCIDeviceInit(dev, cfgfd) >= 0) {
        if (dev->pcie_cap_pos == 0) {
            result = 0;
        } else {
            uint16_t capFlags;
            uint16_t portType;

            capFlags = virPCIDeviceRead16(dev, cfgfd,
                                          dev->pcie_cap_pos + PCI_CAP_FLAGS);
            portType = (capFlags & PCI_EXP_FLAGS_TYPE) >> 4;

            result = (portType != PCI_EXP_TYPE_ROOT_INT_EP &&
                      portType != PCI_EXP_TYPE_ROOT_EC);
        }
    }

    virPCIDeviceConfigClose(dev, cfgfd);
    return result;
}

/*
 * virPCIDeviceGetLinkCapSta:
 * @dev: PCI device to query
 * @cap_port: filled with bits 24 and up of the link capabilities register
 * @cap_speed: filled with the PCI_EXP_LNKCAP_SPEED field
 * @cap_width: filled with the PCI_EXP_LNKCAP_WIDTH field
 * @sta_speed: filled with the PCI_EXP_LNKSTA_SPEED field
 * @sta_width: filled with the PCI_EXP_LNKSTA_WIDTH field
 *
 * Returns 0 on success, -1 on failure (including when @dev has no PCIe
 * capability); the output arguments are only written on success.
 */
int
virPCIDeviceGetLinkCapSta(virPCIDevice *dev,
                          int *cap_port,
                          unsigned int *cap_speed,
                          unsigned int *cap_width,
                          unsigned int *sta_speed,
                          unsigned int *sta_width)
{
    uint32_t lnkcap;
    uint32_t lnksta;
    int rc = -1;
    int cfgfd = virPCIDeviceConfigOpen(dev);

    if (cfgfd < 0)
        return -1;

    if (virPCIDeviceInit(dev, cfgfd) < 0)
        goto cleanup;

    if (dev->pcie_cap_pos == 0) {
        virReportError(VIR_ERR_INTERNAL_ERROR,
                       _("pci device %s is not a PCI-Express device"),
                       dev->name);
        goto cleanup;
    }

    /* link capabilities: 32 bit register */
    lnkcap = virPCIDeviceRead32(dev, cfgfd, dev->pcie_cap_pos + PCI_EXP_LNKCAP);
    *cap_port = lnkcap >> 24;
    *cap_speed = lnkcap & PCI_EXP_LNKCAP_SPEED;
    *cap_width = (lnkcap & PCI_EXP_LNKCAP_WIDTH) >> 4;

    /* link status: 16 bit register */
    lnksta = virPCIDeviceRead16(dev, cfgfd, dev->pcie_cap_pos + PCI_EXP_LNKSTA);
    *sta_speed = lnksta & PCI_EXP_LNKSTA_SPEED;
    *sta_width = (lnksta & PCI_EXP_LNKSTA_WIDTH) >> 4;

    rc = 0;

 cleanup:
    virPCIDeviceConfigClose(dev, cfgfd);
    return rc;
}


/*
 * virPCIGetHeaderType:
 * @dev: PCI device to query
 * @hdrType: filled with the header type masked by PCI_HEADER_TYPE_MASK,
 *           or -1 on failure
 *
 * Returns 0 on success, -1 if the config space cannot be opened or the
 * masked value is not below VIR_PCI_HEADER_LAST.
 */
int virPCIGetHeaderType(virPCIDevice *dev, int *hdrType)
{
    uint8_t hdr;
    int cfgfd;

    *hdrType = -1;

    cfgfd = virPCIDeviceConfigOpen(dev);
    if (cfgfd < 0)
        return -1;

    hdr = virPCIDeviceRead8(dev, cfgfd, PCI_HEADER_TYPE);
    virPCIDeviceConfigClose(dev, cfgfd);

    hdr &= PCI_HEADER_TYPE_MASK;

    if (hdr < VIR_PCI_HEADER_LAST) {
        *hdrType = hdr;
        return 0;
    }

    virReportError(VIR_ERR_INTERNAL_ERROR,
                   _("Unknown PCI header type '%d' for device '%s'"),
                   hdr, dev->name);
    return -1;
}


/* Frees @dev together with its link_cap and link_sta members; a NULL
 * @dev is accepted and ignored. */
void
virPCIEDeviceInfoFree(virPCIEDeviceInfo *dev)
{
    if (dev) {
        g_free(dev->link_cap);
        g_free(dev->link_sta);
        g_free(dev);
    }
}

/* Releases @address by handing it to g_free(). */
void
virPCIDeviceAddressFree(virPCIDeviceAddress *address)
{
    g_free(address);
}