diff --git a/ChangeLog b/ChangeLog index d4c583c9e2..2dfcb14127 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Mon Mar 2 17:07:44 CET 2009 Daniel Veillard + + * configure.in po/POTFILES.in src/Makefile.am src/libvirt_private.syms + src/pci.c src/pci.h: Add implementations of dettach, reattach and + reset for PCI devices, patch by Mark McLoughlin + Mon Mar 2 12:12:11 CET 2009 Daniel Veillard * qemud/qemud.c src/node_device_hal.c: activate DBus multithreading diff --git a/configure.in b/configure.in index d2e8252abc..30e4b3281e 100644 --- a/configure.in +++ b/configure.in @@ -128,6 +128,8 @@ AC_PATH_PROG([UDEVADM], [udevadm], [], [/sbin:/usr/sbin:/usr/local/sbin:$PATH]) AC_PATH_PROG([UDEVSETTLE], [udevsettle], [], [/sbin:/usr/sbin:/usr/local/sbin:$PATH]) +AC_PATH_PROG([MODPROBE], [modprobe], [], + [/sbin:/usr/sbin:/usr/local/sbin:$PATH]) AC_DEFINE_UNQUOTED([DNSMASQ],["$DNSMASQ"], [Location or name of the dnsmasq program]) @@ -141,6 +143,10 @@ if test -n "$UDEVSETTLE"; then AC_DEFINE_UNQUOTED([UDEVSETTLE],["$UDEVSETTLE"], [Location or name of the udevsettle program]) fi +if test -n "$MODPROBE"; then + AC_DEFINE_UNQUOTED([MODPROBE],["$MODPROBE"], + [Location or name of the modprobe program]) +fi dnl Specific dir for HTML output ? AC_ARG_WITH([html-dir], [AC_HELP_STRING([--with-html-dir=path], diff --git a/po/POTFILES.in b/po/POTFILES.in index 7461f93ff0..237d462017 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -18,6 +18,7 @@ src/node_device_conf.c src/nodeinfo.c src/openvz_conf.c src/openvz_driver.c +src/pci.c src/proxy_internal.c src/qemu_conf.c src/qemu_driver.c diff --git a/src/Makefile.am b/src/Makefile.am index 3a798d22e1..a636e8aa3c 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -48,6 +48,7 @@ UTIL_SOURCES = \ iptables.c iptables.h \ logging.c logging.h \ memory.c memory.h \ + pci.c pci.h \ qparams.c qparams.h \ threads.c threads.h \ threads-pthread.h \ diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c38e142d19..7ad7e669ee 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -229,6 +229,14 @@ virNodeDeviceObjUnlock; virNodeDeviceAssignDef; +# pci.h +pciGetDevice; +pciFreeDevice; +pciDettachDevice; +pciReAttachDevice; +pciResetDevice; + + # qparams.h qparam_get_query; qparam_query_parse; diff --git a/src/pci.c b/src/pci.c new file mode 100644 index 0000000000..2343be3e74 --- /dev/null +++ b/src/pci.c @@ -0,0 +1,829 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: + * Mark McLoughlin + */ + +#include + +#include "pci.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "logging.h" +#include "memory.h" +#include "util.h" +#include "virterror_internal.h" + +#define PCI_SYSFS "/sys/bus/pci/" +#define PCI_ID_LEN 10 /* "XXXX XXXX" */ +#define PCI_ADDR_LEN 13 /* "XXXX:XX:XX.X" */ + +struct _pciDevice { + unsigned domain; + unsigned bus; + unsigned slot; + unsigned function; + + char name[PCI_ADDR_LEN]; /* domain:bus:slot.function */ + char id[PCI_ID_LEN]; /* product vendor */ + char path[PATH_MAX]; + int fd; + + unsigned initted; + unsigned pcie_cap_pos; + unsigned pci_pm_cap_pos; + unsigned has_flr : 1; + unsigned has_pm_reset : 1; +}; + +/* For virReportOOMError() and virReportSystemError() */ +#define VIR_FROM_THIS VIR_FROM_NONE + +#define pciReportError(conn, code, fmt...) \ + virReportErrorHelper(conn, VIR_FROM_NONE, code, __FILE__, \ + __FUNCTION__, __LINE__, fmt) + +/* Specifications referenced in comments: + * PCI30 - PCI Local Bus Specification 3.0 + * PCIe20 - PCI Express Base Specification 2.0 + * BR12 - PCI-to-PCI Bridge Architecture Specification 1.2 + * PM12 - PCI Bus Power Management Interface Specification 1.2 + * ECN_AF - Advanced Capabilities for Conventional PCI ECN + */ + +/* Type 0 config space header length; PCI30 Section 6.1 Configuration Space Organization */ +#define PCI_CONF_LEN 0x100 +#define PCI_CONF_HEADER_LEN 0x40 + +/* PCI30 6.2.1 */ +#define PCI_HEADER_TYPE 0x0e /* Header type */ +#define PCI_HEADER_TYPE_BRIDGE 0x1 +#define PCI_HEADER_TYPE_MASK 0x7f +#define PCI_HEADER_TYPE_MULTI 0x80 + +/* PCI30 6.2.1 Device Identification */ +#define PCI_CLASS_DEVICE 0x0a /* Device class */ + +/* Class Code for bridge; PCI30 D.7 Base Class 06h */ +#define PCI_CLASS_BRIDGE_PCI 0x0604 + +/* PCI30 6.2.3 Device Status */ +#define PCI_STATUS 0x06 /* 16 bits */ +#define PCI_STATUS_CAP_LIST 0x10 /* Support Capability List */ + +/* PCI30 6.7 Capabilities List */ +#define PCI_CAPABILITY_LIST 0x34 /* Offset of first capability list entry */ + +/* PM12 3.2.1 Capability Identifier */ +#define PCI_CAP_ID_PM 0x01 /* Power Management */ +/* PCI30 H Capability IDs */ +#define PCI_CAP_ID_EXP 0x10 /* PCI Express */ +/* ECN_AF 6.x.1.1 Capability ID for AF */ +#define PCI_CAP_ID_AF 0x13 /* Advanced Features */ + +/* PCIe20 7.8.3 Device Capabilities Register (Offset 04h) */ +#define PCI_EXP_DEVCAP 0x4 /* Device capabilities */ +#define PCI_EXP_DEVCAP_FLR (1<<28) /* Function Level Reset */ + +/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */ +#define PCI_PRIMARY_BUS 0x18 /* BR12 3.2.5.2 Primary bus number */ +#define PCI_SECONDARY_BUS 0x19 /* BR12 3.2.5.3 Secondary bus number */ +#define PCI_SUBORDINATE_BUS 0x1a /* BR12 3.2.5.4 Highest bus number behind the bridge */ +#define PCI_BRIDGE_CONTROL 0x3e +/* BR12 3.2.5.18 Bridge Control Register */ +#define PCI_BRIDGE_CTL_RESET 0x40 /* Secondary bus reset */ + +/* PM12 3.2.4 Power Management Control/Status (Offset = 4) */ +#define PCI_PM_CTRL 4 /* PM control and status register */ +#define PCI_PM_CTRL_STATE_MASK 0x3 /* Current power state (D0 to D3) */ +#define PCI_PM_CTRL_STATE_D0 0x0 /* D0 state */ +#define PCI_PM_CTRL_STATE_D3hot 0x3 /* D3 state */ +#define PCI_PM_CTRL_NO_SOFT_RESET 0x8 /* No reset for D3hot->D0 */ + +/* ECN_AF 6.x.1 Advanced Features Capability Structure */ +#define PCI_AF_CAP 0x3 /* Advanced features capabilities */ +#define PCI_AF_CAP_FLR 0x2 /* Function Level Reset */ + +static int +pciOpenConfig(pciDevice *dev) +{ + int fd; + + if (dev->fd > 0) + return 0; + + fd = open(dev->path, O_RDWR); + if (fd < 0) { + char ebuf[1024]; + VIR_WARN(_("Failed to open config space file '%s': %s"), + dev->path, virStrerror(errno, ebuf, sizeof(ebuf))); + return -1; + } + VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path); + dev->fd = fd; + return 0; +} + +static int +pciRead(pciDevice *dev, unsigned pos, uint8_t *buf, unsigned buflen) +{ + memset(buf, 0, buflen); + + if (pciOpenConfig(dev) < 0) + return -1; + + if (pread(dev->fd, buf, buflen, pos) < 0) { + char ebuf[1024]; + VIR_WARN(_("Failed to read from '%s' : %s"), dev->path, + virStrerror(errno, ebuf, sizeof(ebuf))); + return -1; + } + return 0; +} + +static uint8_t +pciRead8(pciDevice *dev, unsigned pos) +{ + uint8_t buf; + pciRead(dev, pos, &buf, sizeof(buf)); + return buf; +} + +static uint16_t +pciRead16(pciDevice *dev, unsigned pos) +{ + uint8_t buf[2]; + pciRead(dev, pos, &buf[0], sizeof(buf)); + return (buf[0] << 0) | (buf[1] << 8); +} + +static uint32_t +pciRead32(pciDevice *dev, unsigned pos) +{ + uint8_t buf[4]; + pciRead(dev, pos, &buf[0], sizeof(buf)); + return (buf[0] << 0) | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); +} + +static int +pciWrite(pciDevice *dev, unsigned pos, uint8_t *buf, unsigned buflen) +{ + if (pciOpenConfig(dev) < 0) + return -1; + + if (pwrite(dev->fd, buf, buflen, pos) < 0) { + char ebuf[1024]; + VIR_WARN(_("Failed to write to '%s' : %s"), dev->path, + virStrerror(errno, ebuf, sizeof(ebuf))); + return -1; + } + return 0; +} + +static void +pciWrite16(pciDevice *dev, unsigned pos, uint16_t val) +{ + uint8_t buf[2] = { (val >> 0), (val >> 8) }; + pciWrite(dev, pos, &buf[0], sizeof(buf)); +} + +static void +pciWrite32(pciDevice *dev, unsigned pos, uint32_t val) +{ + uint8_t buf[4] = { (val >> 0), (val >> 8), (val >> 16), (val >> 14) }; + pciWrite(dev, pos, &buf[0], sizeof(buf)); +} + +typedef int (*pciIterPredicate)(pciDevice *, pciDevice *); + +/* Iterate over available PCI devices calling @predicate + * to compare each one to @dev. + * Return -1 on error since we don't want to assume it is + * safe to reset if there is an error. + */ +static int +pciIterDevices(virConnectPtr conn, + pciIterPredicate predicate, + pciDevice *dev, + pciDevice **matched) +{ + DIR *dir; + struct dirent *entry; + + *matched = NULL; + + VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name); + + dir = opendir(PCI_SYSFS "devices"); + if (!dir) { + VIR_WARN0("Failed to open " PCI_SYSFS "devices"); + return -1; + } + + while ((entry = readdir(dir))) { + unsigned domain, bus, slot, function; + pciDevice *try; + + /* Ignore '.' and '..' */ + if (entry->d_name[0] == '.') + continue; + + if (sscanf(entry->d_name, "%x:%x:%x.%x", &domain, &bus, &slot, &function) < 4) { + VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name); + continue; + } + + try = pciGetDevice(conn, domain, bus, slot, function); + if (!try) + return -1; + + if (predicate(try, dev)) { + VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, try->name); + *matched = try; + break; + } + pciFreeDevice(conn, try); + } + closedir(dir); + return 0; +} + +static uint8_t +pciFindCapabilityOffset(pciDevice *dev, unsigned capability) +{ + uint16_t status; + uint8_t pos; + + status = pciRead16(dev, PCI_STATUS); + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; + + pos = pciRead8(dev, PCI_CAPABILITY_LIST); + + /* Zero indicates last capability, capabilities can't + * be in the config space header and 0xff is returned + * by the kernel if we don't have access to this region + * + * Note: we're not handling loops or extended + * capabilities here. + */ + while (pos >= PCI_CONF_HEADER_LEN && pos != 0xff) { + uint8_t capid = pciRead8(dev, pos); + if (capid == capability) { + VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x", + dev->id, dev->name, capability, pos); + return pos; + } + + pos = pciRead8(dev, pos + 1); + } + + VIR_DEBUG("%s %s: failed to find cap 0x%.2x", dev->id, dev->name, capability); + + return 0; +} + +static unsigned +pciDetectFunctionLevelReset(pciDevice *dev) +{ + uint16_t caps; + uint8_t pos; + + /* The PCIe Function Level Reset capability allows + * individual device functions to be reset without + * affecting any other functions on the device or + * any other devices on the bus. This is only common + * on SR-IOV NICs at the moment. + */ + if (dev->pcie_cap_pos) { + caps = pciRead16(dev, dev->pcie_cap_pos + PCI_EXP_DEVCAP); + if (caps & PCI_EXP_DEVCAP_FLR) { + VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name); + return 1; + } + } + + /* The PCI AF Function Level Reset capability is + * the same thing, except for conventional PCI + * devices. This is not common yet. + */ + pos = pciFindCapabilityOffset(dev, PCI_CAP_ID_AF); + if (pos) { + caps = pciRead16(dev, pos + PCI_AF_CAP); + if (caps & PCI_AF_CAP_FLR) { + VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name); + return 1; + } + } + + VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name); + + return 0; +} + +/* Require the device has the PCI Power Management capability + * and that a D3hot->D0 transition will results in a full + * internal reset, not just a soft reset. + */ +static unsigned +pciDetectPowerManagementReset(pciDevice *dev) +{ + if (dev->pci_pm_cap_pos) { + uint32_t ctl; + + /* require the NO_SOFT_RESET bit is clear */ + ctl = pciRead32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL); + if (!(ctl & PCI_PM_CTRL_NO_SOFT_RESET)) { + VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name); + return 1; + } + } + + VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name); + + return 0; +} + +/* Any devices other than the one supplied on the same domain/bus ? */ +static int +pciSharesBus(pciDevice *a, pciDevice *b) +{ + return + a->domain == b->domain && + a->bus == b->bus && + (a->slot != b->slot || + a->function != b->function); +} + +static int +pciBusContainsOtherDevices(virConnectPtr conn, pciDevice *dev) +{ + pciDevice *matched = NULL; + if (pciIterDevices(conn, pciSharesBus, dev, &matched) < 0) + return 1; + if (!matched) + return 0; + pciFreeDevice(conn, matched); + return 1; +} + +/* Any other functions on this device ? */ +static int +pciSharesDevice(pciDevice *a, pciDevice *b) +{ + return + a->domain == b->domain && + a->bus == b->bus && + a->slot == b->slot && + a->function != b->function; +} + +static int +pciDeviceContainsOtherFunctions(virConnectPtr conn, pciDevice *dev) +{ + pciDevice *matched = NULL; + if (pciIterDevices(conn, pciSharesDevice, dev, &matched) < 0) + return 1; + if (!matched) + return 0; + pciFreeDevice(conn, matched); + return 1; +} + +/* Is @a the parent of @b ? */ +static int +pciIsParent(pciDevice *a, pciDevice *b) +{ + uint16_t device_class; + uint8_t header_type, secondary, subordinate; + + if (a->domain != b->domain) + return 0; + + /* Is it a bridge? */ + device_class = pciRead16(a, PCI_CLASS_DEVICE); + if (device_class != PCI_CLASS_BRIDGE_PCI) + return 0; + + /* Is it a plane? */ + header_type = pciRead8(a, PCI_HEADER_TYPE); + if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE) + return 0; + + secondary = pciRead8(a, PCI_SECONDARY_BUS); + subordinate = pciRead8(a, PCI_SUBORDINATE_BUS); + + VIR_DEBUG("%s %s: found parent device %s\n", b->id, b->name, a->name); + + /* No, it's superman! */ + return (b->bus >= secondary && b->bus <= subordinate); +} + +static pciDevice * +pciGetParentDevice(virConnectPtr conn, pciDevice *dev) +{ + pciDevice *parent = NULL; + pciIterDevices(conn, pciIsParent, dev, &parent); + return parent; +} + +/* Secondary Bus Reset is our sledgehammer - it resets all + * devices behind a bus. + */ +static int +pciTrySecondaryBusReset(virConnectPtr conn, pciDevice *dev) +{ + pciDevice *parent; + uint8_t config_space[PCI_CONF_LEN]; + uint16_t ctl; + int ret = -1; + + /* For now, we just refuse to do a secondary bus reset + * if there are other devices/functions behind the bus. + * In future, we could allow it so long as those devices + * are not in use by the host or other guests. + */ + if (pciBusContainsOtherDevices(conn, dev)) { + VIR_WARN("Other devices on bus with %s, not doing bus reset", + dev->name); + return -1; + } + + /* Find the parent bus */ + parent = pciGetParentDevice(conn, dev); + if (!parent) { + VIR_WARN("Failed to find parent device for %s", dev->name); + return -1; + } + + VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name); + + /* Save and restore the device's config space; we only do this + * for the supplied device since we refuse to do a reset if there + * are multiple devices/functions + */ + if (pciRead(dev, 0, config_space, PCI_CONF_LEN) < 0) { + VIR_WARN("Failed to save PCI config space for %s", dev->name); + goto out; + } + + /* Read the control register, set the reset flag, wait 200ms, + * unset the reset flag and wait 200ms. + */ + ctl = pciRead16(dev, PCI_BRIDGE_CONTROL); + + pciWrite16(parent, PCI_BRIDGE_CONTROL, ctl | PCI_BRIDGE_CTL_RESET); + + usleep(200 * 1000); /* sleep 200ms */ + + pciWrite16(parent, PCI_BRIDGE_CONTROL, ctl); + + usleep(200 * 1000); /* sleep 200ms */ + + if (pciWrite(dev, 0, config_space, PCI_CONF_LEN) < 0) + VIR_WARN("Failed to restore PCI config space for %s", dev->name); + + ret = 0; +out: + pciFreeDevice(conn, parent); + return ret; +} + +/* Power management reset attempts to reset a device using a + * D-state transition from D3hot to D0. Note, in detect_pm_reset() + * above we require the device supports a full internal reset. + */ +static int +pciTryPowerManagementReset(virConnectPtr conn, pciDevice *dev) +{ + uint8_t config_space[PCI_CONF_LEN]; + uint32_t ctl; + + if (!dev->pci_pm_cap_pos) + return -1; + + /* For now, we just refuse to do a power management reset + * if there are other functions on this device. + * In future, we could allow it so long as those functions + * are not in use by the host or other guests. + */ + if (pciDeviceContainsOtherFunctions(conn, dev)) { + VIR_WARN("%s contains other functions, not resetting", dev->name); + return -1; + } + + /* Save and restore the device's config space. */ + if (pciRead(dev, 0, &config_space[0], PCI_CONF_LEN) < 0) { + VIR_WARN("Failed to save PCI config space for %s", dev->name); + return -1; + } + + VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name); + + ctl = pciRead32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL); + ctl &= ~PCI_PM_CTRL_STATE_MASK; + + pciWrite32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL, ctl|PCI_PM_CTRL_STATE_D3hot); + + usleep(10 * 1000); /* sleep 10ms */ + + pciWrite32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL, ctl|PCI_PM_CTRL_STATE_D0); + + usleep(10 * 1000); /* sleep 10ms */ + + if (pciWrite(dev, 0, &config_space[0], PCI_CONF_LEN) < 0) + VIR_WARN("Failed to restore PCI config space for %s", dev->name); + + return 0; +} + +static int +pciInitDevice(virConnectPtr conn, pciDevice *dev) +{ + if (pciOpenConfig(dev) < 0) { + virReportSystemError(conn, errno, + _("Failed to open config space file '%s'"), + dev->path); + return -1; + } + + dev->pcie_cap_pos = pciFindCapabilityOffset(dev, PCI_CAP_ID_EXP); + dev->pci_pm_cap_pos = pciFindCapabilityOffset(dev, PCI_CAP_ID_PM); + dev->has_flr = pciDetectFunctionLevelReset(dev); + dev->has_pm_reset = pciDetectPowerManagementReset(dev); + dev->initted = 1; + return 0; +} + +int +pciResetDevice(virConnectPtr conn, pciDevice *dev) +{ + int ret = -1; + + if (!dev->initted && pciInitDevice(conn, dev) < 0) + return -1; + + /* KVM will perform FLR when starting and stopping + * a guest, so there is no need for us to do it here. + */ + if (dev->has_flr) + return 0; + + /* Bus reset is not an option with the root bus */ + if (dev->bus != 0) + ret = pciTrySecondaryBusReset(conn, dev); + + /* Next best option is a PCI power management reset */ + if (ret < 0 && dev->has_pm_reset) + ret = pciTryPowerManagementReset(conn, dev); + + if (ret < 0) + pciReportError(conn, VIR_ERR_NO_SUPPORT, + _("No PCI reset capability available for %s"), + dev->name); + return ret; +} + +static int +pciBindDeviceToStub(virConnectPtr conn, pciDevice *dev, const char *stub_module) +{ + char stub_dir[PATH_MAX]; + char path[PATH_MAX]; + + snprintf(stub_dir, sizeof(stub_dir), PCI_SYSFS "drivers/%s", stub_module); + + /* Try loading the stub module if it isn't already loaded; + * Do not return an error if the stub module is not available. + */ + if (!virFileExists(stub_dir)) { + const char *const modprobeargv[] = { MODPROBE, stub_module, NULL }; + + if (virRun(conn, modprobeargv, NULL) < 0) { + char ebuf[1024]; + VIR_WARN(_("modprobe %s failed: %s"), stub_module, + virStrerror(errno, ebuf, sizeof ebuf)); + } + } + + if (!virFileExists(stub_dir)) { + VIR_WARN(_("%s module not available, cannot bind device %s to it"), + stub_module, dev->name); + } else { + /* Add the PCI device ID to the stub's dynamic ID table; + * this is needed to allow us to bind the device to the stub. + * Note: if the device is not currently bound to any driver, + * stub will immediately be bound to the device. Also, note + * that if a new device with this ID is hotplugged, or if a probe + * is triggered for such a device, it will also be immediately + * bound by the stub. + */ + snprintf(path, sizeof(path), "%s/new_id", stub_dir); + if (virFileWriteStr(path, dev->id) < 0) { + virReportSystemError(conn, errno, + _("Failed to add PCI device ID '%s' to %s"), + dev->id, stub_module); + return -1; + } + } + + /* If the device is already bound to a driver, unbind it. + * Note, this will have rather unpleasant side effects if this + * PCI device happens to be IDE controller for the disk hosting + * your root filesystem. + */ + snprintf(path, sizeof(path), + PCI_SYSFS "devices/%s/driver/unbind", dev->name); + if (virFileExists(path) && virFileWriteStr(path, dev->name) < 0) { + virReportSystemError(conn, errno, + _("Failed to unbind PCI device '%s'"), dev->name); + return -1; + } + + if (virFileExists(stub_dir)) { + /* If the device isn't already bound to pci-stub, try binding it now. + */ + snprintf(path, sizeof(path), PCI_SYSFS "devices/%s/driver", dev->name); + if (!virFileLinkPointsTo(path, stub_dir)) { + snprintf(path, sizeof(path), "%s/bind", stub_dir); + if (virFileWriteStr(path, dev->name) < 0) { + virReportSystemError(conn, errno, + _("Failed to bind PCI device '%s' to %s"), + dev->name, stub_module); + return -1; + } + } + + /* If 'remove_id' exists, remove the device id from pci-stub's dynamic + * ID table so that 'drivers_probe' works below. + */ + snprintf(path, sizeof(path), "%s/remove_id", stub_dir); + if (virFileExists(path) && virFileWriteStr(path, dev->id) < 0) { + virReportSystemError(conn, errno, + _("Failed to remove PCI ID '%s' from %s"), + dev->id, stub_module); + return -1; + } + } + + return 0; +} + +int +pciDettachDevice(virConnectPtr conn, pciDevice *dev) +{ + return pciBindDeviceToStub(conn, dev, "pci-stub"); +} + +static int +pciUnBindDeviceFromStub(virConnectPtr conn, pciDevice *dev, const char *stub_module) +{ + char stub_dir[PATH_MAX]; + char path[PATH_MAX]; + + snprintf(stub_dir, sizeof(stub_dir), PCI_SYSFS "drivers/%s", stub_module); + + /* If the device is bound to stub, unbind it. + */ + snprintf(path, sizeof(path), PCI_SYSFS "devices/%s/driver", dev->name); + if (virFileExists(stub_dir) && virFileLinkPointsTo(path, stub_dir)) { + snprintf(path, sizeof(path), "%s/unbind", stub_dir); + if (virFileWriteStr(path, dev->name) < 0) { + virReportSystemError(conn, errno, + _("Failed to bind PCI device '%s' to %s"), + dev->name, stub_module); + return -1; + } + } + + /* Trigger a re-probe of the device is not in the stub's dynamic + * ID table. If the stub is available, but 'remove_id' isn't + * available, then re-probing would just cause the device to be + * re-bound to the stub. + */ + snprintf(path, sizeof(path), "%s/remove_id", stub_dir); + if (!virFileExists(stub_dir) || virFileExists(path)) { + if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name) < 0) { + virReportSystemError(conn, errno, + _("Failed to trigger a re-probe for PCI device '%s'"), + dev->name); + return -1; + } + } + + return 0; +} + +int +pciReAttachDevice(virConnectPtr conn, pciDevice *dev) +{ + return pciUnBindDeviceFromStub(conn, dev, "pci-stub"); +} + +static char * +pciReadDeviceID(pciDevice *dev, const char *id_name) +{ + char path[PATH_MAX]; + char *id_str; + + snprintf(path, sizeof(path), PCI_SYSFS "devices/%s/%s", + dev->name, id_name); + + /* ID string is '0xNNNN\n' ... i.e. 7 bytes */ + if (virFileReadAll(path, 7, &id_str) < 7) { + VIR_FREE(id_str); + return NULL; + } + + /* Check for 0x suffix */ + if (id_str[0] != '0' || id_str[1] != 'x') { + VIR_FREE(id_str); + return NULL; + } + + /* Chop off the newline; we know the string is 7 bytes */ + id_str[6] = '\0'; + + return id_str; +} + +pciDevice * +pciGetDevice(virConnectPtr conn, + unsigned domain, + unsigned bus, + unsigned slot, + unsigned function) +{ + pciDevice *dev; + char *vendor, *product; + + if (VIR_ALLOC(dev) < 0) { + virReportOOMError(conn); + return NULL; + } + + dev->domain = domain; + dev->bus = bus; + dev->slot = slot; + dev->function = function; + + snprintf(dev->name, sizeof(dev->name), "%.4x:%.2x:%.2x.%.1x", + dev->domain, dev->bus, dev->slot, dev->function); + snprintf(dev->path, sizeof(dev->path), + PCI_SYSFS "devices/%s/config", dev->name); + + vendor = pciReadDeviceID(dev, "vendor"); + product = pciReadDeviceID(dev, "device"); + + if (!vendor || !product) { + pciReportError(conn, VIR_ERR_NO_SUPPORT, + _("Failed to read product/vendor ID for %s"), + dev->name); + VIR_FREE(product); + VIR_FREE(vendor); + pciFreeDevice(conn, dev); + return NULL; + } + + /* strings contain '0x' prefix */ + snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2], &product[2]); + + VIR_FREE(product); + VIR_FREE(vendor); + + VIR_DEBUG("%s %s: initialized", dev->id, dev->name); + + return dev; +} + +void +pciFreeDevice(virConnectPtr conn ATTRIBUTE_UNUSED, pciDevice *dev) +{ + VIR_DEBUG("%s %s: freeing", dev->id, dev->name); + if (dev->fd) + close(dev->fd); + VIR_FREE(dev); +} diff --git a/src/pci.h b/src/pci.h new file mode 100644 index 0000000000..47882efb4d --- /dev/null +++ b/src/pci.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: + * Mark McLoughlin + */ + +#ifndef __VIR_PCI_H__ +#define __VIR_PCI_H__ + +#include +#include "internal.h" + +typedef struct _pciDevice pciDevice; + +pciDevice *pciGetDevice (virConnectPtr conn, + unsigned domain, + unsigned bus, + unsigned slot, + unsigned function); +void pciFreeDevice (virConnectPtr conn, + pciDevice *dev); +int pciDettachDevice (virConnectPtr conn, + pciDevice *dev); +int pciReAttachDevice (virConnectPtr conn, + pciDevice *dev); +int pciResetDevice (virConnectPtr conn, + pciDevice *dev); + +#endif /* __VIR_PCI_H__ */