Diffstat (limited to 'hw/i386')
-rw-r--r--   hw/i386/Makefile.objs            |    1
-rw-r--r--   hw/i386/acpi-build.c             |  214
-rw-r--r--   hw/i386/amd_iommu.c              | 1212
-rw-r--r--   hw/i386/amd_iommu.h              |  289
-rw-r--r--   hw/i386/intel_iommu.c            |  119
-rw-r--r--   hw/i386/intel_iommu_internal.h   |    2
-rw-r--r--   hw/i386/kvm/apic.c               |   66
-rw-r--r--   hw/i386/kvm/i8259.c              |    2
-rw-r--r--   hw/i386/kvm/pci-assign.c         |    4
-rw-r--r--   hw/i386/kvmvapic.c               |   20
-rw-r--r--   hw/i386/pc.c                     |  185
-rw-r--r--   hw/i386/pc_piix.c                |   33
-rw-r--r--   hw/i386/pc_q35.c                 |   30
-rw-r--r--   hw/i386/trace-events             |   33
-rw-r--r--   hw/i386/x86-iommu.c              |    6
-rw-r--r--   hw/i386/xen/xen_apic.c           |    6
-rw-r--r--   hw/i386/xen/xen_platform.c       |   33
17 files changed, 2040 insertions, 215 deletions
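Most of the 2040 added lines are the new AMD-Vi model in hw/i386/amd_iommu.c below. Its MMIO registers are backed by three shadow byte arrays in AMDVIState (the live value mmior, a read-only mask romask, and a write-1-to-clear mask w1cmask), and amdvi_writew/writel/writeq all funnel guest writes through one masking expression. A minimal standalone sketch of that rule follows; the Reg16 struct and main() harness are illustrative stand-ins for QEMU's lduw_le_p()/stw_le_p() byte-array helpers, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* One 16-bit register cell plus its masks, mirroring s->mmior,
     * s->romask and s->w1cmask in AMDVIState (hypothetical harness). */
    typedef struct {
        uint16_t val;     /* current register contents */
        uint16_t romask;  /* guest-immutable bits */
        uint16_t w1cmask; /* bits cleared by writing 1 */
    } Reg16;

    /* The same expression amdvi_writew() stores back to s->mmior. */
    static void reg16_write(Reg16 *r, uint16_t val)
    {
        r->val = ((r->val & r->romask) | (val & ~r->romask))
                 & ~(val & r->w1cmask);
    }

    int main(void)
    {
        Reg16 status = { .val = 0x0005, .romask = 0x0001, .w1cmask = 0x0004 };
        reg16_write(&status, 0x0006);   /* try to set bit 1; write 1 to w1c bit 2 */
        printf("0x%04x\n", status.val); /* 0x0003: bit 0 kept (ro), bit 1 set, bit 2 cleared */
        return 0;
    }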
diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs index 90e94ffefd..909ead6a77 100644 --- a/hw/i386/Makefile.objs +++ b/hw/i386/Makefile.objs @@ -3,6 +3,7 @@ obj-y += multiboot.o obj-y += pc.o pc_piix.o pc_q35.o obj-y += pc_sysfw.o obj-y += x86-iommu.o intel_iommu.o +obj-y += amd_iommu.o obj-$(CONFIG_XEN) += ../xenpv/ xen/ obj-y += kvmvapic.o diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index a26a4bb03f..9708cdc463 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -53,13 +53,13 @@ #include "hw/pci/pci_bus.h" #include "hw/pci-host/q35.h" #include "hw/i386/x86-iommu.h" -#include "hw/timer/hpet.h" #include "hw/acpi/aml-build.h" #include "qapi/qmp/qint.h" #include "qom/qom-qobject.h" -#include "hw/i386/x86-iommu.h" +#include "hw/i386/amd_iommu.h" +#include "hw/i386/intel_iommu.h" #include "hw/acpi/ipmi.h" @@ -339,24 +339,38 @@ build_fadt(GArray *table_data, BIOSLinker *linker, AcpiPmInfo *pm, void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, CPUArchIdList *apic_ids, GArray *entry) { - int apic_id; - AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic); + uint32_t apic_id = apic_ids->cpus[uid].arch_id; - apic_id = apic_ids->cpus[uid].arch_id; - apic->type = ACPI_APIC_PROCESSOR; - apic->length = sizeof(*apic); - apic->processor_id = uid; - apic->local_apic_id = apic_id; - if (apic_ids->cpus[uid].cpu != NULL) { - apic->flags = cpu_to_le32(1); + /* ACPI spec says that LAPIC entry for non present + * CPU may be omitted from MADT or it must be marked + * as disabled. However omitting non present CPU from + * MADT breaks hotplug on linux. So possible CPUs + * should be put in MADT but kept disabled. + */ + if (apic_id < 255) { + AcpiMadtProcessorApic *apic = acpi_data_push(entry, sizeof *apic); + + apic->type = ACPI_APIC_PROCESSOR; + apic->length = sizeof(*apic); + apic->processor_id = uid; + apic->local_apic_id = apic_id; + if (apic_ids->cpus[uid].cpu != NULL) { + apic->flags = cpu_to_le32(1); + } else { + apic->flags = cpu_to_le32(0); + } } else { - /* ACPI spec says that LAPIC entry for non present - * CPU may be omitted from MADT or it must be marked - * as disabled. However omitting non present CPU from - * MADT breaks hotplug on linux. So possible CPUs - * should be put in MADT but kept disabled. 
- */ - apic->flags = cpu_to_le32(0); + AcpiMadtProcessorX2Apic *apic = acpi_data_push(entry, sizeof *apic); + + apic->type = ACPI_APIC_LOCAL_X2APIC; + apic->length = sizeof(*apic); + apic->uid = cpu_to_le32(uid); + apic->x2apic_id = cpu_to_le32(apic_id); + if (apic_ids->cpus[uid].cpu != NULL) { + apic->flags = cpu_to_le32(1); + } else { + apic->flags = cpu_to_le32(0); + } } } @@ -368,11 +382,11 @@ build_madt(GArray *table_data, BIOSLinker *linker, PCMachineState *pcms) int madt_start = table_data->len; AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(pcms->acpi_dev); AcpiDeviceIf *adev = ACPI_DEVICE_IF(pcms->acpi_dev); + bool x2apic_mode = false; AcpiMultipleApicTable *madt; AcpiMadtIoApic *io_apic; AcpiMadtIntsrcovr *intsrcovr; - AcpiMadtLocalNmi *local_nmi; int i; madt = acpi_data_push(table_data, sizeof *madt); @@ -381,6 +395,9 @@ build_madt(GArray *table_data, BIOSLinker *linker, PCMachineState *pcms) for (i = 0; i < apic_ids->len; i++) { adevc->madt_cpu(adev, i, apic_ids, table_data); + if (apic_ids->cpus[i].arch_id > 254) { + x2apic_mode = true; + } } g_free(apic_ids); @@ -413,12 +430,25 @@ build_madt(GArray *table_data, BIOSLinker *linker, PCMachineState *pcms) intsrcovr->flags = cpu_to_le16(0xd); /* active high, level triggered */ } - local_nmi = acpi_data_push(table_data, sizeof *local_nmi); - local_nmi->type = ACPI_APIC_LOCAL_NMI; - local_nmi->length = sizeof(*local_nmi); - local_nmi->processor_id = 0xff; /* all processors */ - local_nmi->flags = cpu_to_le16(0); - local_nmi->lint = 1; /* ACPI_LINT1 */ + if (x2apic_mode) { + AcpiMadtLocalX2ApicNmi *local_nmi; + + local_nmi = acpi_data_push(table_data, sizeof *local_nmi); + local_nmi->type = ACPI_APIC_LOCAL_X2APIC_NMI; + local_nmi->length = sizeof(*local_nmi); + local_nmi->uid = 0xFFFFFFFF; /* all processors */ + local_nmi->flags = cpu_to_le16(0); + local_nmi->lint = 1; /* ACPI_LINT1 */ + } else { + AcpiMadtLocalNmi *local_nmi; + + local_nmi = acpi_data_push(table_data, sizeof *local_nmi); + local_nmi->type = ACPI_APIC_LOCAL_NMI; + local_nmi->length = sizeof(*local_nmi); + local_nmi->processor_id = 0xff; /* all processors */ + local_nmi->flags = cpu_to_le16(0); + local_nmi->lint = 1; /* ACPI_LINT1 */ + } build_header(linker, table_data, (void *)(table_data->data + madt_start), "APIC", @@ -789,7 +819,7 @@ static gint crs_range_compare(gconstpointer a, gconstpointer b) static void crs_replace_with_free_ranges(GPtrArray *ranges, uint64_t start, uint64_t end) { - GPtrArray *free_ranges = g_ptr_array_new_with_free_func(crs_range_free); + GPtrArray *free_ranges = g_ptr_array_new(); uint64_t free_base = start; int i; @@ -813,7 +843,7 @@ static void crs_replace_with_free_ranges(GPtrArray *ranges, g_ptr_array_add(ranges, g_ptr_array_index(free_ranges, i)); } - g_ptr_array_free(free_ranges, false); + g_ptr_array_free(free_ranges, true); } /* @@ -2038,6 +2068,13 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, method = aml_method("_E03", 0, AML_NOTSERIALIZED); aml_append(method, aml_call0(MEMORY_HOTPLUG_HANDLER_PATH)); aml_append(scope, method); + + if (pcms->acpi_nvdimm_state.is_enabled) { + method = aml_method("_E04", 0, AML_NOTSERIALIZED); + aml_append(method, aml_notify(aml_name("\\_SB.NVDR"), + aml_int(0x80))); + aml_append(scope, method); + } } aml_append(dsdt, scope); @@ -2390,7 +2427,6 @@ static void build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) { AcpiSystemResourceAffinityTable *srat; - AcpiSratProcessorAffinity *core; AcpiSratMemoryAffinity *numamem; int i; @@ -2409,22 +2445,34 @@ 
build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) srat->reserved1 = cpu_to_le32(1); for (i = 0; i < apic_ids->len; i++) { - int j; - int apic_id = apic_ids->cpus[i].arch_id; - - core = acpi_data_push(table_data, sizeof *core); - core->type = ACPI_SRAT_PROCESSOR_APIC; - core->length = sizeof(*core); - core->local_apic_id = apic_id; - for (j = 0; j < nb_numa_nodes; j++) { - if (test_bit(i, numa_info[j].node_cpu)) { + int j = numa_get_node_for_cpu(i); + uint32_t apic_id = apic_ids->cpus[i].arch_id; + + if (apic_id < 255) { + AcpiSratProcessorAffinity *core; + + core = acpi_data_push(table_data, sizeof *core); + core->type = ACPI_SRAT_PROCESSOR_APIC; + core->length = sizeof(*core); + core->local_apic_id = apic_id; + if (j < nb_numa_nodes) { core->proximity_lo = j; - break; } + memset(core->proximity_hi, 0, 3); + core->local_sapic_eid = 0; + core->flags = cpu_to_le32(1); + } else { + AcpiSratProcessorX2ApicAffinity *core; + + core = acpi_data_push(table_data, sizeof *core); + core->type = ACPI_SRAT_PROCESSOR_x2APIC; + core->length = sizeof(*core); + core->x2apic_id = cpu_to_le32(apic_id); + if (j < nb_numa_nodes) { + core->proximity_domain = cpu_to_le32(j); + } + core->flags = cpu_to_le32(1); } - memset(core->proximity_hi, 0, 3); - core->local_sapic_eid = 0; - core->flags = cpu_to_le32(1); } @@ -2557,11 +2605,68 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker) scope->length = ioapic_scope_size; scope->enumeration_id = ACPI_BUILD_IOAPIC_ID; scope->bus = Q35_PSEUDO_BUS_PLATFORM; - scope->path[0] = cpu_to_le16(Q35_PSEUDO_DEVFN_IOAPIC); + scope->path[0].device = PCI_SLOT(Q35_PSEUDO_DEVFN_IOAPIC); + scope->path[0].function = PCI_FUNC(Q35_PSEUDO_DEVFN_IOAPIC); build_header(linker, table_data, (void *)(table_data->data + dmar_start), "DMAR", table_data->len - dmar_start, 1, NULL, NULL); } +/* + * IVRS table as specified in AMD IOMMU Specification v2.62, Section 5.2 + * accessible here http://support.amd.com/TechDocs/48882_IOMMU.pdf + */ +static void +build_amd_iommu(GArray *table_data, BIOSLinker *linker) +{ + int iommu_start = table_data->len; + AMDVIState *s = AMD_IOMMU_DEVICE(x86_iommu_get_default()); + + /* IVRS header */ + acpi_data_push(table_data, sizeof(AcpiTableHeader)); + /* IVinfo - IO virtualization information common to all + * IOMMU units in a system + */ + build_append_int_noprefix(table_data, 40UL << 8/* PASize */, 4); + /* reserved */ + build_append_int_noprefix(table_data, 0, 8); + + /* IVHD definition - type 10h */ + build_append_int_noprefix(table_data, 0x10, 1); + /* virtualization flags */ + build_append_int_noprefix(table_data, + (1UL << 0) | /* HtTunEn */ + (1UL << 4) | /* iotblSup */ + (1UL << 6) | /* PrefSup */ + (1UL << 7), /* PPRSup */ + 1); + /* IVHD length */ + build_append_int_noprefix(table_data, 0x24, 2); + /* DeviceID */ + build_append_int_noprefix(table_data, s->devid, 2); + /* Capability offset */ + build_append_int_noprefix(table_data, s->capab_offset, 2); + /* IOMMU base address */ + build_append_int_noprefix(table_data, s->mmio.addr, 8); + /* PCI Segment Group */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU info */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU Feature Reporting */ + build_append_int_noprefix(table_data, + (48UL << 30) | /* HATS */ + (48UL << 28) | /* GATS */ + (1UL << 2), /* GTSup */ + 4); + /* + * Type 1 device entry reporting all devices + * These are 4-byte device entries currently reporting the range of + * Refer to Spec - Table 95:IVHD Device Entry Type Codes(4-byte) + */ + 
build_append_int_noprefix(table_data, 0x0000001, 4); + + build_header(linker, table_data, (void *)(table_data->data + iommu_start), + "IVRS", table_data->len - iommu_start, 1, NULL, NULL); +} static GArray * build_rsdp(GArray *rsdp_table, BIOSLinker *linker, unsigned rsdt_tbl_offset) @@ -2622,11 +2727,6 @@ static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) return true; } -static bool acpi_has_iommu(void) -{ - return !!x86_iommu_get_default(); -} - static void acpi_build(AcpiBuildTables *tables, MachineState *machine) { @@ -2706,13 +2806,19 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) acpi_add_table(table_offsets, tables_blob); build_mcfg_q35(tables_blob, tables->linker, &mcfg); } - if (acpi_has_iommu()) { - acpi_add_table(table_offsets, tables_blob); - build_dmar_q35(tables_blob, tables->linker); + if (x86_iommu_get_default()) { + IommuType IOMMUType = x86_iommu_get_type(); + if (IOMMUType == TYPE_AMD) { + acpi_add_table(table_offsets, tables_blob); + build_amd_iommu(tables_blob, tables->linker); + } else if (IOMMUType == TYPE_INTEL) { + acpi_add_table(table_offsets, tables_blob); + build_dmar_q35(tables_blob, tables->linker); + } } if (pcms->acpi_nvdimm_state.is_enabled) { nvdimm_build_acpi(table_offsets, tables_blob, tables->linker, - pcms->acpi_nvdimm_state.dsm_mem); + &pcms->acpi_nvdimm_state, machine->ram_slots); } /* Add tables supplied by user (if any) */ @@ -2754,7 +2860,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) */ int legacy_aml_len = pcmc->legacy_acpi_table_size + - ACPI_BUILD_LEGACY_CPU_AML_SIZE * max_cpus; + ACPI_BUILD_LEGACY_CPU_AML_SIZE * pcms->apic_id_limit; int legacy_table_size = ROUND_UP(tables_blob->len - aml_len + legacy_aml_len, ACPI_BUILD_ALIGN_SIZE); @@ -2830,7 +2936,7 @@ static MemoryRegion *acpi_add_rom_blob(AcpiBuildState *build_state, uint64_t max_size) { return rom_add_blob(name, blob->data, acpi_data_len(blob), max_size, -1, - name, acpi_build_update, build_state); + name, acpi_build_update, build_state, NULL); } static const VMStateDescription vmstate_acpi_build = { @@ -2855,7 +2961,7 @@ void acpi_setup(void) return; } - if (!pcmc->has_acpi_build) { + if (!pcms->acpi_build_enabled) { ACPI_BUILD_DPRINTF("ACPI build disabled. Bailing out.\n"); return; } diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c new file mode 100644 index 0000000000..47b79d9112 --- /dev/null +++ b/hw/i386/amd_iommu.c @@ -0,0 +1,1212 @@ +/* + * QEMU emulation of AMD IOMMU (AMD-Vi) + * + * Copyright (C) 2011 Eduard - Gabriel Munteanu + * Copyright (C) 2015 David Kiarie, <davidkiarie4@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * Cache implementation inspired by hw/i386/intel_iommu.c + */ +#include "qemu/osdep.h" +#include "hw/i386/amd_iommu.h" +#include "qemu/error-report.h" +#include "trace.h" + +/* used AMD-Vi MMIO registers */ +const char *amdvi_mmio_low[] = { + "AMDVI_MMIO_DEVTAB_BASE", + "AMDVI_MMIO_CMDBUF_BASE", + "AMDVI_MMIO_EVTLOG_BASE", + "AMDVI_MMIO_CONTROL", + "AMDVI_MMIO_EXCL_BASE", + "AMDVI_MMIO_EXCL_LIMIT", + "AMDVI_MMIO_EXT_FEATURES", + "AMDVI_MMIO_PPR_BASE", + "UNHANDLED" +}; +const char *amdvi_mmio_high[] = { + "AMDVI_MMIO_COMMAND_HEAD", + "AMDVI_MMIO_COMMAND_TAIL", + "AMDVI_MMIO_EVTLOG_HEAD", + "AMDVI_MMIO_EVTLOG_TAIL", + "AMDVI_MMIO_STATUS", + "AMDVI_MMIO_PPR_HEAD", + "AMDVI_MMIO_PPR_TAIL", + "UNHANDLED" +}; + +struct AMDVIAddressSpace { + uint8_t bus_num; /* bus number */ + uint8_t devfn; /* device function */ + AMDVIState *iommu_state; /* AMDVI - one per machine */ + MemoryRegion iommu; /* Device's address translation region */ + MemoryRegion iommu_ir; /* Device's interrupt remapping region */ + AddressSpace as; /* device's corresponding address space */ +}; + +/* AMDVI cache entry */ +typedef struct AMDVIIOTLBEntry { + uint16_t domid; /* assigned domain id */ + uint16_t devid; /* device owning entry */ + uint64_t perms; /* access permissions */ + uint64_t translated_addr; /* translated address */ + uint64_t page_mask; /* physical page size */ +} AMDVIIOTLBEntry; + +/* configure MMIO registers at startup/reset */ +static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val, + uint64_t romask, uint64_t w1cmask) +{ + stq_le_p(&s->mmior[addr], val); + stq_le_p(&s->romask[addr], romask); + stq_le_p(&s->w1cmask[addr], w1cmask); +} + +static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr) +{ + return lduw_le_p(&s->mmior[addr]); +} + +static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr) +{ + return ldl_le_p(&s->mmior[addr]); +} + +static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr) +{ + return ldq_le_p(&s->mmior[addr]); +} + +/* internal write */ +static void amdvi_writeq_raw(AMDVIState *s, uint64_t val, hwaddr addr) +{ + stq_le_p(&s->mmior[addr], val); +} + +/* external write */ +static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val) +{ + uint16_t romask = lduw_le_p(&s->romask[addr]); + uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]); + uint16_t oldval = lduw_le_p(&s->mmior[addr]); + stw_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val) +{ + uint32_t romask = ldl_le_p(&s->romask[addr]); + uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); + uint32_t oldval = ldl_le_p(&s->mmior[addr]); + stl_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + uint64_t romask = ldq_le_p(&s->romask[addr]); + uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); + uint64_t oldval = ldq_le_p(&s->mmior[addr]); + stq_le_p(&s->mmior[addr], + ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); +} + +/* test (AND) a 64-bit register against a 64-bit mask */ +static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val) +{ + return amdvi_readq(s, addr) & val; +} + +/* OR a 64-bit register with a 64-bit value storing result in the register */ +static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val); +} + +/* AND a 64-bit register with a 64-bit value storing result in the register */ +static void
amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val) +{ + amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val); +} + +static void amdvi_generate_msi_interrupt(AMDVIState *s) +{ + MSIMessage msg = {}; + MemTxAttrs attrs = { + .requester_id = pci_requester_id(&s->pci.dev) + }; + + if (msi_enabled(&s->pci.dev)) { + msg = msi_get_message(&s->pci.dev, 0); + address_space_stl_le(&address_space_memory, msg.address, msg.data, + attrs, NULL); + } +} + +static void amdvi_log_event(AMDVIState *s, uint64_t *evt) +{ + /* event logging not enabled */ + if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS, + AMDVI_MMIO_STATUS_EVT_OVF)) { + return; + } + + /* event log buffer full */ + if (s->evtlog_tail >= s->evtlog_len) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF); + /* generate interrupt */ + amdvi_generate_msi_interrupt(s); + return; + } + + if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail, + &evt, AMDVI_EVENT_LEN)) { + trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail); + } + + s->evtlog_tail += AMDVI_EVENT_LEN; + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT); + amdvi_generate_msi_interrupt(s); +} + +static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start, + int length) +{ + int index = start / 64, bitpos = start % 64; + uint64_t mask = MAKE_64BIT_MASK(start, length); + buffer[index] &= ~mask; + buffer[index] |= (value << bitpos) & mask; +} +/* + * AMDVi event structure + * 0:15 -> DeviceID + * 55:63 -> event type + miscellaneous info + * 63:127 -> related address + */ +static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr, + uint16_t info) +{ + amdvi_setevent_bits(evt, devid, 0, 16); + amdvi_setevent_bits(evt, info, 55, 8); + amdvi_setevent_bits(evt, addr, 63, 64); +} +/* log an error encountered during a page walk + * + * @addr: virtual address in translation request + */ +static void amdvi_page_fault(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* + * log a master abort accessing device table + * @devtab : address of device table entry + * @info : error flags + */ +static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, + hwaddr devtab, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_DEV_TAB_HW_ERROR; + + amdvi_encode_event(evt, devid, devtab, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* log an event trying to access command buffer + * @addr : address that couldn't be accessed + */ +static void amdvi_log_command_error(AMDVIState *s, hwaddr addr) +{ + uint64_t evt[4], info = AMDVI_EVENT_COMMAND_HW_ERROR; + + amdvi_encode_event(evt, 0, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} +/* log an illegal command event + * @addr : address of illegal command + */ +static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info, + hwaddr addr) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_ILLEGAL_COMMAND_ERROR; + amdvi_encode_event(evt, 0, addr, info); + amdvi_log_event(s, evt); +} +/* log an error accessing device table + * + * @devid : device owning the table entry + * @devtab : address of device table entry + * @info :
error flags + */ +static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); +} +/* log an error accessing a PTE entry + * @addr : address that couldn't be accessed + */ +static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, + hwaddr addr, uint16_t info) +{ + uint64_t evt[4]; + + info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); + pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); +} + +static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2) +{ + return *((const uint64_t *)v1) == *((const uint64_t *)v2); +} + +static guint amdvi_uint64_hash(gconstpointer v) +{ + return (guint)*(const uint64_t *)v; +} + +static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr, + uint64_t devid) +{ + uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | + ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + return g_hash_table_lookup(s->iotlb, &key); +} + +static void amdvi_iotlb_reset(AMDVIState *s) +{ + assert(s->iotlb); + trace_amdvi_iotlb_reset(); + g_hash_table_remove_all(s->iotlb); +} + +static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value, + gpointer user_data) +{ + AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value; + uint16_t devid = *(uint16_t *)user_data; + return entry->devid == devid; +} + +static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr, + uint64_t devid) +{ + uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | + ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + g_hash_table_remove(s->iotlb, &key); +} + +static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, + uint64_t gpa, IOMMUTLBEntry to_cache, + uint16_t domid) +{ + AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); + uint64_t *key = g_new(uint64_t, 1); + uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + + /* don't cache erroneous translations */ + if (to_cache.perm != IOMMU_NONE) { + trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid), gpa, to_cache.translated_addr); + + if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) { + amdvi_iotlb_reset(s); + } + + entry->domid = domid; + entry->perms = to_cache.perm; + entry->translated_addr = to_cache.translated_addr; + entry->page_mask = to_cache.addr_mask; + *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + g_hash_table_replace(s->iotlb, key, entry); + } +} + +static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd) +{ + /* pad the last 3 bits */ + hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3; + uint64_t data = cpu_to_le64(cmd[1]); + + if (extract64(cmd[0], 51, 8)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + if (extract64(cmd[0], 0, 1)) { + if (dma_memory_write(&address_space_memory, addr, &data, + AMDVI_COMPLETION_DATA_SIZE)) { + trace_amdvi_completion_wait_fail(addr); + } + } + /* set completion interrupt */ + if (extract64(cmd[0], 1, 1)) { + amdvi_test_mask(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT); + /* generate interrupt */ + amdvi_generate_msi_interrupt(s); + } + trace_amdvi_completion_wait(addr, data); +} + +/* log error without aborting since linux seems to be using reserved bits */ +static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd) +{ + uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16)); + + /* This 
command should invalidate internal caches of which there isn't */ + if (extract64(cmd[0], 15, 16) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid)); +} + +static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 15, 16) || extract64(cmd[0], 19, 8) || + extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29) + || extract64(cmd[1], 47, 16)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + trace_amdvi_ppr_exec(); +} + +static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 0, 60) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + amdvi_iotlb_reset(s); + trace_amdvi_all_inval(); +} + +static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value, + gpointer user_data) +{ + AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value; + uint16_t domid = *(uint16_t *)user_data; + return entry->domid == domid; +} + +/* we don't have devid - we can't remove pages by address */ +static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) +{ + uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16)); + + if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 16, 12) || + extract64(cmd[0], 3, 10)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid, + &domid); + trace_amdvi_pages_inval(domid); +} + +static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 20, 8) || + extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) || + extract64(cmd[1], 5, 7)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } + + trace_amdvi_prefetch_pages(); +} + +static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd) +{ + if (extract64(cmd[0], 16, 16) || cmd[1]) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + return; + } + + trace_amdvi_intr_inval(); +} + +/* FIXME: Try to work with the specified size instead of all the pages + * when the S bit is on + */ +static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd) +{ + + uint16_t devid = extract64(cmd[0], 0, 16); + if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 9)) { + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + return; + } + + if (extract64(cmd[1], 0, 1)) { + g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid, + &devid); + } else { + amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12, + cpu_to_le16(extract64(cmd[1], 0, 16))); + } + trace_amdvi_iotlb_inval(); +} + +/* not honouring reserved bits is regarded as an illegal command */ +static void amdvi_cmdbuf_exec(AMDVIState *s) +{ + uint64_t cmd[2]; + + if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head, + cmd, AMDVI_COMMAND_SIZE)) { + trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head); + amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head); + return; + } + + switch (extract64(cmd[0], 60, 4)) { + case AMDVI_CMD_COMPLETION_WAIT: + amdvi_completion_wait(s, cmd); + break; + case AMDVI_CMD_INVAL_DEVTAB_ENTRY: + amdvi_inval_devtab_entry(s, cmd); + break; + case AMDVI_CMD_INVAL_AMDVI_PAGES: + amdvi_inval_pages(s, cmd); + break; + case 
AMDVI_CMD_INVAL_IOTLB_PAGES: + iommu_inval_iotlb(s, cmd); + break; + case AMDVI_CMD_INVAL_INTR_TABLE: + amdvi_inval_inttable(s, cmd); + break; + case AMDVI_CMD_PREFETCH_AMDVI_PAGES: + amdvi_prefetch_pages(s, cmd); + break; + case AMDVI_CMD_COMPLETE_PPR_REQUEST: + amdvi_complete_ppr(s, cmd); + break; + case AMDVI_CMD_INVAL_AMDVI_ALL: + amdvi_inval_all(s, cmd); + break; + default: + trace_amdvi_unhandled_command(extract64(cmd[0], 60, 4)); + /* log illegal command */ + amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), + s->cmdbuf + s->cmdbuf_head); + } +} + +static void amdvi_cmdbuf_run(AMDVIState *s) +{ + if (!s->cmdbuf_enabled) { + trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL)); + return; + } + + /* check if there is work to do. */ + while (s->cmdbuf_head != s->cmdbuf_tail) { + trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf); + amdvi_cmdbuf_exec(s); + s->cmdbuf_head += AMDVI_COMMAND_SIZE; + amdvi_writeq_raw(s, s->cmdbuf_head, AMDVI_MMIO_COMMAND_HEAD); + + /* wrap head pointer */ + if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) { + s->cmdbuf_head = 0; + } + } +} + +static void amdvi_mmio_trace(hwaddr addr, unsigned size) +{ + uint8_t index = (addr & ~0x2000) / 8; + + if ((addr & 0x2000)) { + /* high table */ + index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index; + trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); + } else { + /* low table */ + index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index; + trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07); + } +} + +static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size) +{ + AMDVIState *s = opaque; + + uint64_t val = -1; + if (addr + size > AMDVI_MMIO_SIZE) { + trace_amdvi_mmio_read("error: addr outside region: max ", + (uint64_t)AMDVI_MMIO_SIZE, addr, size); + return (uint64_t)-1; + } + + if (size == 2) { + val = amdvi_readw(s, addr); + } else if (size == 4) { + val = amdvi_readl(s, addr); + } else if (size == 8) { + val = amdvi_readq(s, addr); + } + amdvi_mmio_trace(addr, size); + + return val; +} + +static void amdvi_handle_control_write(AMDVIState *s) +{ + unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL); + s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN); + + s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN); + s->evtlog_enabled = s->enabled && !!(control & + AMDVI_MMIO_CONTROL_EVENTLOGEN); + + s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN); + s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN); + s->cmdbuf_enabled = s->enabled && !!(control & + AMDVI_MMIO_CONTROL_CMDBUFLEN); + + /* update the flags depending on the control register */ + if (s->cmdbuf_enabled) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN); + } else { + amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN); + } + if (s->evtlog_enabled) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN); + } else { + amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN); + } + + trace_amdvi_control_status(control); + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_devtab_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE); + s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK); + + /* set device table length */ + s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1) * + (AMDVI_MMIO_DEVTAB_SIZE_UNIT / + AMDVI_MMIO_DEVTAB_ENTRY_SIZE); +} + +static inline void amdvi_handle_cmdhead_write(AMDVIState *s)
+{ + s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD) + & AMDVI_MMIO_CMDBUF_HEAD_MASK; + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_cmdbase_write(AMDVIState *s) +{ + s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE) + & AMDVI_MMIO_CMDBUF_BASE_MASK; + s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE) + & AMDVI_MMIO_CMDBUF_SIZE_MASK); + s->cmdbuf_head = s->cmdbuf_tail = 0; +} + +static inline void amdvi_handle_cmdtail_write(AMDVIState *s) +{ + s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL) + & AMDVI_MMIO_CMDBUF_TAIL_MASK; + amdvi_cmdbuf_run(s); +} + +static inline void amdvi_handle_excllim_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT); + s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) | + AMDVI_MMIO_EXCL_LIMIT_LOW; +} + +static inline void amdvi_handle_evtbase_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE); + s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK; + s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE) + & AMDVI_MMIO_EVTLOG_SIZE_MASK); +} + +static inline void amdvi_handle_evttail_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL); + s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK; +} + +static inline void amdvi_handle_evthead_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD); + s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK; +} + +static inline void amdvi_handle_pprbase_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE); + s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK; + s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE) + & AMDVI_MMIO_PPRLOG_SIZE_MASK); +} + +static inline void amdvi_handle_pprhead_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD); + s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK; +} + +static inline void amdvi_handle_pprtail_write(AMDVIState *s) +{ + uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL); + s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK; +} + +/* FIXME: something might go wrong if System Software writes in chunks + * of one byte but linux writes in chunks of 4 bytes so currently it + * works correctly with linux but will definitely be busted if software + * reads/writes 8 bytes + */ +static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val, + hwaddr addr) +{ + if (size == 2) { + amdvi_writew(s, addr, val); + } else if (size == 4) { + amdvi_writel(s, addr, val); + } else if (size == 8) { + amdvi_writeq(s, addr, val); + } +} + +static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + AMDVIState *s = opaque; + unsigned long offset = addr & 0x07; + + if (addr + size > AMDVI_MMIO_SIZE) { + trace_amdvi_mmio_write("error: addr outside region: max ", + (uint64_t)AMDVI_MMIO_SIZE, size, val, offset); + return; + } + + amdvi_mmio_trace(addr, size); + switch (addr & ~0x07) { + case AMDVI_MMIO_CONTROL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_control_write(s); + break; + case AMDVI_MMIO_DEVICE_TABLE: + amdvi_mmio_reg_write(s, size, val, addr); + /* set device table address + * This also suffers from inability to tell whether software + * is done writing + */ + if (offset || (size == 8)) { + amdvi_handle_devtab_write(s); + } + break; + case AMDVI_MMIO_COMMAND_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_cmdhead_write(s); + break; + case AMDVI_MMIO_COMMAND_BASE: + amdvi_mmio_reg_write(s, size, val, 
addr); + /* FIXME - make sure System Software has finished writing in case + * it writes in chunks less than 8 bytes in a robust way. As for + * now, this hack works for the linux driver + */ + if (offset || (size == 8)) { + amdvi_handle_cmdbase_write(s); + } + break; + case AMDVI_MMIO_COMMAND_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_cmdtail_write(s); + break; + case AMDVI_MMIO_EVENT_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evtbase_write(s); + break; + case AMDVI_MMIO_EVENT_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evthead_write(s); + break; + case AMDVI_MMIO_EVENT_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_evttail_write(s); + break; + case AMDVI_MMIO_EXCL_LIMIT: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_excllim_write(s); + break; + /* PPR log base - unused for now */ + case AMDVI_MMIO_PPR_BASE: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprbase_write(s); + break; + /* PPR log head - also unused for now */ + case AMDVI_MMIO_PPR_HEAD: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprhead_write(s); + break; + /* PPR log tail - unused for now */ + case AMDVI_MMIO_PPR_TAIL: + amdvi_mmio_reg_write(s, size, val, addr); + amdvi_handle_pprtail_write(s); + break; + } +} + +static inline uint64_t amdvi_get_perms(uint64_t entry) +{ + return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> + AMDVI_DEV_PERM_SHIFT; +} + +/* a valid entry should have V = 1 and reserved bits honoured */ +static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, + uint64_t *dte) +{ + if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED) + || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED) + || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + return dte[0] & AMDVI_DEV_VALID; +} + +/* get a device table entry given the devid */ +static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +{ + uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + + if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, + AMDVI_DEVTAB_ENTRY_SIZE)) { + trace_amdvi_dte_get_fail(s->devtab, offset); + /* log error accessing dte */ + amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); + return false; + } + + *entry = le64_to_cpu(*entry); + if (!amdvi_validate_dte(s, devid, entry)) { + trace_amdvi_invalid_dte(entry[0]); + return false; + } + + return true; +} + +/* get pte translation mode */ +static inline uint8_t get_pte_translation_mode(uint64_t pte) +{ + return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; +} + +static inline uint64_t pte_override_page_mask(uint64_t pte) +{ + uint8_t page_mask = 12; + uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) ^ AMDVI_DEV_PT_ROOT_MASK; + /* find the first zero bit */ + while (addr & 1) { + page_mask++; + addr = addr >> 1; + } + + return ~((1ULL << page_mask) - 1); +} + +static inline uint64_t pte_get_page_mask(uint64_t oldlevel) +{ + return ~((1UL << ((oldlevel * 9) + 3)) - 1); +} + +static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, + uint16_t devid) +{ + uint64_t pte; + + if (dma_memory_read(&address_space_memory, pte_addr, &pte, sizeof(pte))) { + trace_amdvi_get_pte_hwerror(pte_addr); + amdvi_log_pagetab_error(s, devid, pte_addr, 0); + pte = 0; + return pte; + } + + pte = le64_to_cpu(pte); + return pte; +} + +static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, + IOMMUTLBEntry
*ret, unsigned perms, + hwaddr addr) +{ + unsigned level, present, pte_perms, oldlevel; + uint64_t pte = dte[0], pte_addr, page_mask; + + /* make sure the DTE has TV = 1 */ + if (pte & AMDVI_DEV_TRANSLATION_VALID) { + level = get_pte_translation_mode(pte); + if (level >= 7) { + trace_amdvi_mode_invalid(level, addr); + return; + } + if (level == 0) { + goto no_remap; + } + + /* we are at the leaf page table or page table encodes a huge page */ + while (level > 0) { + pte_perms = amdvi_get_perms(pte); + present = pte & 1; + if (!present || perms != (perms & pte_perms)) { + amdvi_page_fault(as->iommu_state, as->devfn, addr, perms); + trace_amdvi_page_fault(addr); + return; + } + + /* go to the next lower level */ + pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK; + /* add offset and load pte */ + pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3; + pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); + if (!pte) { + return; + } + oldlevel = level; + level = get_pte_translation_mode(pte); + if (level == 0x7) { + break; + } + } + + if (level == 0x7) { + page_mask = pte_override_page_mask(pte); + } else { + page_mask = pte_get_page_mask(oldlevel); + } + + /* get access permissions from pte */ + ret->iova = addr & page_mask; + ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask; + ret->addr_mask = ~page_mask; + ret->perm = amdvi_get_perms(pte); + return; + } +no_remap: + ret->iova = addr & AMDVI_PAGE_MASK_4K; + ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret->addr_mask = ~AMDVI_PAGE_MASK_4K; + ret->perm = amdvi_get_perms(pte); +} + +static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, + bool is_write, IOMMUTLBEntry *ret) +{ + AMDVIState *s = as->iommu_state; + uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); + uint64_t entry[4]; + + if (iotlb_entry) { + trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid), addr, iotlb_entry->translated_addr); + ret->iova = addr & ~iotlb_entry->page_mask; + ret->translated_addr = iotlb_entry->translated_addr; + ret->addr_mask = iotlb_entry->page_mask; + ret->perm = iotlb_entry->perms; + return; + } + + /* devices with V = 0 are not translated */ + if (!amdvi_get_dte(s, devid, entry)) { + goto out; + } + + amdvi_page_walk(as, entry, ret, + is_write ? 
AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr); + + amdvi_update_iotlb(s, devid, addr, *ret, + entry[1] & AMDVI_DEV_DOMID_ID_MASK); + return; + +out: + ret->iova = addr & AMDVI_PAGE_MASK_4K; + ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret->addr_mask = ~AMDVI_PAGE_MASK_4K; + ret->perm = IOMMU_RW; +} + +static inline bool amdvi_is_interrupt_addr(hwaddr addr) +{ + return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST; +} + +static IOMMUTLBEntry amdvi_translate(MemoryRegion *iommu, hwaddr addr, + bool is_write) +{ + AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + AMDVIState *s = as->iommu_state; + IOMMUTLBEntry ret = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = ~(hwaddr)0, + .perm = IOMMU_NONE + }; + + if (!s->enabled) { + /* AMDVI disabled - corresponds to iommu=off not + * failure to provide any parameter + */ + ret.iova = addr & AMDVI_PAGE_MASK_4K; + ret.translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret.addr_mask = ~AMDVI_PAGE_MASK_4K; + ret.perm = IOMMU_RW; + return ret; + } else if (amdvi_is_interrupt_addr(addr)) { + ret.iova = addr & AMDVI_PAGE_MASK_4K; + ret.translated_addr = addr & AMDVI_PAGE_MASK_4K; + ret.addr_mask = ~AMDVI_PAGE_MASK_4K; + ret.perm = IOMMU_WO; + return ret; + } + + amdvi_do_translate(as, addr, is_write, &ret); + trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn), addr, ret.translated_addr); + return ret; +} + +static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) +{ + AMDVIState *s = opaque; + AMDVIAddressSpace **iommu_as; + int bus_num = pci_bus_num(bus); + + iommu_as = s->address_spaces[bus_num]; + + /* allocate memory during the first run */ + if (!iommu_as) { + iommu_as = g_malloc0(sizeof(AMDVIAddressSpace *) * PCI_DEVFN_MAX); + s->address_spaces[bus_num] = iommu_as; + } + + /* set up AMD-Vi region */ + if (!iommu_as[devfn]) { + iommu_as[devfn] = g_malloc0(sizeof(AMDVIAddressSpace)); + iommu_as[devfn]->bus_num = (uint8_t)bus_num; + iommu_as[devfn]->devfn = (uint8_t)devfn; + iommu_as[devfn]->iommu_state = s; + + memory_region_init_iommu(&iommu_as[devfn]->iommu, OBJECT(s), + &s->iommu_ops, "amd-iommu", UINT64_MAX); + address_space_init(&iommu_as[devfn]->as, &iommu_as[devfn]->iommu, + "amd-iommu"); + } + return &iommu_as[devfn]->as; +} + +static const MemoryRegionOps mmio_mem_ops = { + .read = amdvi_mmio_read, + .write = amdvi_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + } +}; + +static void amdvi_iommu_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) +{ + AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + + if (new & IOMMU_NOTIFIER_MAP) { + error_report("device %02x.%02x.%x requires iommu notifier which is not " + "currently supported", as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn)); + exit(1); + } +} + +static void amdvi_init(AMDVIState *s) +{ + amdvi_iotlb_reset(s); + + s->iommu_ops.translate = amdvi_translate; + s->iommu_ops.notify_flag_changed = amdvi_iommu_notify_flag_changed; + s->devtab_len = 0; + s->cmdbuf_len = 0; + s->cmdbuf_head = 0; + s->cmdbuf_tail = 0; + s->evtlog_head = 0; + s->evtlog_tail = 0; + s->excl_enabled = false; + s->excl_allow = false; + s->mmio_enabled = false; + s->enabled = false; + s->ats_enabled = false; + s->cmdbuf_enabled = false; + + /* reset MMIO */ + 
memset(s->mmior, 0, AMDVI_MMIO_SIZE); + amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES, + 0xffffffffffffffef, 0); + amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67); + + /* reset device ident */ + pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD); + pci_config_set_prog_interface(s->pci.dev.config, 00); + pci_config_set_device_id(s->pci.dev.config, s->devid); + pci_config_set_class(s->pci.dev.config, 0x0806); + + /* reset AMDVI specific capabilities, all r/o */ + pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW, + s->mmio.addr & ~(0xffff0000)); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH, + (s->mmio.addr & ~(0xffff)) >> 16); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE, + 0xff000000); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, 0); + pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, + AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR); +} + +static void amdvi_reset(DeviceState *dev) +{ + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + + msi_reset(&s->pci.dev); + amdvi_init(s); +} + +static void amdvi_realize(DeviceState *dev, Error **err) +{ + int ret = 0; + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); + PCIBus *bus = PC_MACHINE(qdev_get_machine())->bus; + s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, + amdvi_uint64_equal, g_free, g_free); + + /* This device should take care of IOMMU PCI properties */ + x86_iommu->type = TYPE_AMD; + qdev_set_parent_bus(DEVICE(&s->pci), &bus->qbus); + object_property_set_bool(OBJECT(&s->pci), true, "realized", err); + s->capab_offset = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0, + AMDVI_CAPAB_SIZE); + assert(s->capab_offset > 0); + ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0, AMDVI_CAPAB_REG_SIZE); + assert(ret > 0); + ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0, AMDVI_CAPAB_REG_SIZE); + assert(ret > 0); + + /* set up MMIO */ + memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio", + AMDVI_MMIO_SIZE); + + sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio); + sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR); + pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err); + msi_init(&s->pci.dev, 0, 1, true, false, err); + amdvi_init(s); +} + +static const VMStateDescription vmstate_amdvi = { + .name = "amd-iommu", + .unmigratable = 1 +}; + +static void amdvi_instance_init(Object *klass) +{ + AMDVIState *s = AMD_IOMMU_DEVICE(klass); + + object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI); +} + +static void amdvi_class_init(ObjectClass *klass, void* data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + X86IOMMUClass *dc_class = X86_IOMMU_CLASS(klass); + + dc->reset = amdvi_reset; + dc->vmsd = &vmstate_amdvi; + dc->hotpluggable = false; + dc_class->realize = amdvi_realize; +} + +static const TypeInfo amdvi = { + .name = TYPE_AMD_IOMMU_DEVICE, + .parent = TYPE_X86_IOMMU_DEVICE, + .instance_size = sizeof(AMDVIState), + .instance_init = amdvi_instance_init, + .class_init = amdvi_class_init +}; + +static const TypeInfo amdviPCI = { + .name = "AMDVI-PCI", + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(AMDVIPCIState), +}; + +static void amdviPCI_register_types(void) +{ + type_register_static(&amdviPCI); + type_register_static(&amdvi); +} + 
+type_init(amdviPCI_register_types); diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h new file mode 100644 index 0000000000..884926e9e7 --- /dev/null +++ b/hw/i386/amd_iommu.h @@ -0,0 +1,289 @@ +/* + * QEMU emulation of an AMD IOMMU (AMD-Vi) + * + * Copyright (C) 2011 Eduard - Gabriel Munteanu + * Copyright (C) 2015 David Kiarie, <davidkiarie4@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef AMD_IOMMU_H_ +#define AMD_IOMMU_H_ + +#include "hw/hw.h" +#include "hw/pci/pci.h" +#include "hw/pci/msi.h" +#include "hw/sysbus.h" +#include "sysemu/dma.h" +#include "hw/i386/pc.h" +#include "hw/pci/pci_bus.h" +#include "hw/i386/x86-iommu.h" + +/* Capability registers */ +#define AMDVI_CAPAB_BAR_LOW 0x04 +#define AMDVI_CAPAB_BAR_HIGH 0x08 +#define AMDVI_CAPAB_RANGE 0x0C +#define AMDVI_CAPAB_MISC 0x10 + +#define AMDVI_CAPAB_SIZE 0x18 +#define AMDVI_CAPAB_REG_SIZE 0x04 + +/* Capability header data */ +#define AMDVI_CAPAB_ID_SEC 0xf +#define AMDVI_CAPAB_FLAT_EXT (1 << 28) +#define AMDVI_CAPAB_EFR_SUP (1 << 27) +#define AMDVI_CAPAB_FLAG_NPCACHE (1 << 26) +#define AMDVI_CAPAB_FLAG_HTTUNNEL (1 << 25) +#define AMDVI_CAPAB_FLAG_IOTLBSUP (1 << 24) +#define AMDVI_CAPAB_INIT_TYPE (3 << 16) + +/* No. 
of used MMIO registers */ +#define AMDVI_MMIO_REGS_HIGH 8 +#define AMDVI_MMIO_REGS_LOW 7 + +/* MMIO registers */ +#define AMDVI_MMIO_DEVICE_TABLE 0x0000 +#define AMDVI_MMIO_COMMAND_BASE 0x0008 +#define AMDVI_MMIO_EVENT_BASE 0x0010 +#define AMDVI_MMIO_CONTROL 0x0018 +#define AMDVI_MMIO_EXCL_BASE 0x0020 +#define AMDVI_MMIO_EXCL_LIMIT 0x0028 +#define AMDVI_MMIO_EXT_FEATURES 0x0030 +#define AMDVI_MMIO_COMMAND_HEAD 0x2000 +#define AMDVI_MMIO_COMMAND_TAIL 0x2008 +#define AMDVI_MMIO_EVENT_HEAD 0x2010 +#define AMDVI_MMIO_EVENT_TAIL 0x2018 +#define AMDVI_MMIO_STATUS 0x2020 +#define AMDVI_MMIO_PPR_BASE 0x0038 +#define AMDVI_MMIO_PPR_HEAD 0x2030 +#define AMDVI_MMIO_PPR_TAIL 0x2038 + +#define AMDVI_MMIO_SIZE 0x4000 + +#define AMDVI_MMIO_DEVTAB_SIZE_MASK ((1ULL << 12) - 1) +#define AMDVI_MMIO_DEVTAB_BASE_MASK (((1ULL << 52) - 1) & ~ \ + AMDVI_MMIO_DEVTAB_SIZE_MASK) +#define AMDVI_MMIO_DEVTAB_ENTRY_SIZE 32 +#define AMDVI_MMIO_DEVTAB_SIZE_UNIT 4096 + +/* some of this are similar but just for readability */ +#define AMDVI_MMIO_CMDBUF_SIZE_BYTE (AMDVI_MMIO_COMMAND_BASE + 7) +#define AMDVI_MMIO_CMDBUF_SIZE_MASK 0x0f +#define AMDVI_MMIO_CMDBUF_BASE_MASK AMDVI_MMIO_DEVTAB_BASE_MASK +#define AMDVI_MMIO_CMDBUF_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) +#define AMDVI_MMIO_CMDBUF_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK + +#define AMDVI_MMIO_EVTLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) +#define AMDVI_MMIO_EVTLOG_SIZE_MASK AMDVI_MMIO_CMDBUF_SIZE_MASK +#define AMDVI_MMIO_EVTLOG_BASE_MASK AMDVI_MMIO_CMDBUF_BASE_MASK +#define AMDVI_MMIO_EVTLOG_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) +#define AMDVI_MMIO_EVTLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK + +#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) +#define AMDVI_MMIO_PPRLOG_HEAD_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_PPRLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_PPRLOG_BASE_MASK AMDVI_MMIO_EVTLOG_BASE_MASK +#define AMDVI_MMIO_PPRLOG_SIZE_MASK AMDVI_MMIO_EVTLOG_SIZE_MASK + +#define AMDVI_MMIO_EXCL_ENABLED_MASK (1ULL << 0) +#define AMDVI_MMIO_EXCL_ALLOW_MASK (1ULL << 1) +#define AMDVI_MMIO_EXCL_LIMIT_MASK AMDVI_MMIO_DEVTAB_BASE_MASK +#define AMDVI_MMIO_EXCL_LIMIT_LOW 0xfff + +/* mmio control register flags */ +#define AMDVI_MMIO_CONTROL_AMDVIEN (1ULL << 0) +#define AMDVI_MMIO_CONTROL_HTTUNEN (1ULL << 1) +#define AMDVI_MMIO_CONTROL_EVENTLOGEN (1ULL << 2) +#define AMDVI_MMIO_CONTROL_EVENTINTEN (1ULL << 3) +#define AMDVI_MMIO_CONTROL_COMWAITINTEN (1ULL << 4) +#define AMDVI_MMIO_CONTROL_CMDBUFLEN (1ULL << 12) + +/* MMIO status register bits */ +#define AMDVI_MMIO_STATUS_CMDBUF_RUN (1 << 4) +#define AMDVI_MMIO_STATUS_EVT_RUN (1 << 3) +#define AMDVI_MMIO_STATUS_COMP_INT (1 << 2) +#define AMDVI_MMIO_STATUS_EVT_OVF (1 << 0) + +#define AMDVI_CMDBUF_ID_BYTE 0x07 +#define AMDVI_CMDBUF_ID_RSHIFT 4 + +#define AMDVI_CMD_COMPLETION_WAIT 0x01 +#define AMDVI_CMD_INVAL_DEVTAB_ENTRY 0x02 +#define AMDVI_CMD_INVAL_AMDVI_PAGES 0x03 +#define AMDVI_CMD_INVAL_IOTLB_PAGES 0x04 +#define AMDVI_CMD_INVAL_INTR_TABLE 0x05 +#define AMDVI_CMD_PREFETCH_AMDVI_PAGES 0x06 +#define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07 +#define AMDVI_CMD_INVAL_AMDVI_ALL 0x08 + +#define AMDVI_DEVTAB_ENTRY_SIZE 32 + +/* Device table entry bits 0:63 */ +#define AMDVI_DEV_VALID (1ULL << 0) +#define AMDVI_DEV_TRANSLATION_VALID (1ULL << 1) +#define AMDVI_DEV_MODE_MASK 0x7 +#define AMDVI_DEV_MODE_RSHIFT 9 +#define AMDVI_DEV_PT_ROOT_MASK 0xffffffffff000 +#define AMDVI_DEV_PT_ROOT_RSHIFT 12 +#define AMDVI_DEV_PERM_SHIFT 61 +#define AMDVI_DEV_PERM_READ (1ULL << 61) +#define AMDVI_DEV_PERM_WRITE 
(1ULL << 62) + +/* Device table entry bits 64:127 */ +#define AMDVI_DEV_DOMID_ID_MASK ((1ULL << 16) - 1) + +/* Event codes and flags, as stored in the info field */ +#define AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY (0x1U << 12) +#define AMDVI_EVENT_IOPF (0x2U << 12) +#define AMDVI_EVENT_IOPF_I (1U << 3) +#define AMDVI_EVENT_DEV_TAB_HW_ERROR (0x3U << 12) +#define AMDVI_EVENT_PAGE_TAB_HW_ERROR (0x4U << 12) +#define AMDVI_EVENT_ILLEGAL_COMMAND_ERROR (0x5U << 12) +#define AMDVI_EVENT_COMMAND_HW_ERROR (0x6U << 12) + +#define AMDVI_EVENT_LEN 16 +#define AMDVI_PERM_READ (1 << 0) +#define AMDVI_PERM_WRITE (1 << 1) + +#define AMDVI_FEATURE_PREFETCH (1ULL << 0) /* page prefetch */ +#define AMDVI_FEATURE_PPR (1ULL << 1) /* PPR Support */ +#define AMDVI_FEATURE_GT (1ULL << 4) /* Guest Translation */ +#define AMDVI_FEATURE_IA (1ULL << 6) /* inval all support */ +#define AMDVI_FEATURE_GA (1ULL << 7) /* guest VAPIC support */ +#define AMDVI_FEATURE_HE (1ULL << 8) /* hardware error regs */ +#define AMDVI_FEATURE_PC (1ULL << 9) /* Perf counters */ + +/* reserved DTE bits */ +#define AMDVI_DTE_LOWER_QUAD_RESERVED 0x80300000000000fc +#define AMDVI_DTE_MIDDLE_QUAD_RESERVED 0x0000000000000100 +#define AMDVI_DTE_UPPER_QUAD_RESERVED 0x08f0000000000000 + +/* AMDVI paging mode */ +#define AMDVI_GATS_MODE (6ULL << 12) +#define AMDVI_HATS_MODE (6ULL << 10) + +/* IOTLB */ +#define AMDVI_IOTLB_MAX_SIZE 1024 +#define AMDVI_DEVID_SHIFT 36 + +/* extended feature support */ +#define AMDVI_EXT_FEATURES (AMDVI_FEATURE_PREFETCH | AMDVI_FEATURE_PPR | \ + AMDVI_FEATURE_IA | AMDVI_FEATURE_GT | AMDVI_FEATURE_HE | \ + AMDVI_GATS_MODE | AMDVI_HATS_MODE) + +/* capabilities header */ +#define AMDVI_CAPAB_FEATURES (AMDVI_CAPAB_FLAT_EXT | \ + AMDVI_CAPAB_FLAG_NPCACHE | AMDVI_CAPAB_FLAG_IOTLBSUP \ + | AMDVI_CAPAB_ID_SEC | AMDVI_CAPAB_INIT_TYPE | \ + AMDVI_CAPAB_FLAG_HTTUNNEL | AMDVI_CAPAB_EFR_SUP) + +/* AMDVI default address */ +#define AMDVI_BASE_ADDR 0xfed80000 + +/* page management constants */ +#define AMDVI_PAGE_SHIFT 12 +#define AMDVI_PAGE_SIZE (1ULL << AMDVI_PAGE_SHIFT) + +#define AMDVI_PAGE_SHIFT_4K 12 +#define AMDVI_PAGE_MASK_4K (~((1ULL << AMDVI_PAGE_SHIFT_4K) - 1)) + +#define AMDVI_MAX_VA_ADDR (48UL << 5) +#define AMDVI_MAX_PH_ADDR (40UL << 8) +#define AMDVI_MAX_GVA_ADDR (48UL << 15) + +/* Completion Wait data size */ +#define AMDVI_COMPLETION_DATA_SIZE 8 + +#define AMDVI_COMMAND_SIZE 16 +/* Completion Wait data size */ +#define AMDVI_COMPLETION_DATA_SIZE 8 + +#define AMDVI_COMMAND_SIZE 16 + +#define AMDVI_INT_ADDR_FIRST 0xfee00000 +#define AMDVI_INT_ADDR_LAST 0xfeefffff + +#define TYPE_AMD_IOMMU_DEVICE "amd-iommu" +#define AMD_IOMMU_DEVICE(obj)\ + OBJECT_CHECK(AMDVIState, (obj), TYPE_AMD_IOMMU_DEVICE) + +#define TYPE_AMD_IOMMU_PCI "AMDVI-PCI" + +typedef struct AMDVIAddressSpace AMDVIAddressSpace; + +/* functions to steal PCI config space */ +typedef struct AMDVIPCIState { + PCIDevice dev; /* The PCI device itself */ +} AMDVIPCIState; + +typedef struct AMDVIState { + X86IOMMUState iommu; /* IOMMU bus device */ + AMDVIPCIState pci; /* IOMMU PCI device */ + + uint32_t version; + uint32_t capab_offset; /* capability offset pointer */ + + uint64_t mmio_addr; + + uint32_t devid; /* auto-assigned devid */ + + bool enabled; /* IOMMU enabled */ + bool ats_enabled; /* address translation enabled */ + bool cmdbuf_enabled; /* command buffer enabled */ + bool evtlog_enabled; /* event log enabled */ + bool excl_enabled; + + hwaddr devtab; /* base address device table */ + size_t devtab_len; /* device table length */ + + hwaddr cmdbuf; /* command 
buffer base address */ + uint64_t cmdbuf_len; /* command buffer length */ + uint32_t cmdbuf_head; /* current IOMMU read position */ + uint32_t cmdbuf_tail; /* next Software write position */ + bool completion_wait_intr; + + hwaddr evtlog; /* base address event log */ + bool evtlog_intr; + uint32_t evtlog_len; /* event log length */ + uint32_t evtlog_head; /* current IOMMU write position */ + uint32_t evtlog_tail; /* current Software read position */ + + /* unused for now */ + hwaddr excl_base; /* base DVA - IOMMU exclusion range */ + hwaddr excl_limit; /* limit of IOMMU exclusion range */ + bool excl_allow; /* translate accesses to the exclusion range */ + bool excl_enable; /* exclusion range enabled */ + + hwaddr ppr_log; /* base address ppr log */ + uint32_t pprlog_len; /* ppr log len */ + uint32_t pprlog_head; /* ppr log head */ + uint32_t pprlog_tail; /* ppr log tail */ + + MemoryRegion mmio; /* MMIO region */ + uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */ + uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */ + uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */ + bool mmio_enabled; + + /* IOMMU function */ + MemoryRegionIOMMUOps iommu_ops; + + /* for each served device */ + AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + + /* IOTLB */ + GHashTable *iotlb; +} AMDVIState; + +#endif diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 28c31a2cdf..5f3e35123d 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -21,16 +21,20 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" +#include "qapi/error.h" #include "hw/sysbus.h" #include "exec/address-spaces.h" #include "intel_iommu_internal.h" #include "hw/pci/pci.h" #include "hw/pci/pci_bus.h" #include "hw/i386/pc.h" +#include "hw/i386/apic-msidef.h" #include "hw/boards.h" #include "hw/i386/x86-iommu.h" #include "hw/pci-host/q35.h" #include "sysemu/kvm.h" +#include "hw/i386/apic_internal.h" +#include "kvm_i386.h" /*#define DEBUG_INTEL_IOMMU*/ #ifdef DEBUG_INTEL_IOMMU @@ -214,7 +218,7 @@ static void vtd_reset_iotlb(IntelIOMMUState *s) g_hash_table_remove_all(s->iotlb); } -static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint8_t source_id, +static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | @@ -279,18 +283,17 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, hwaddr mesg_data_reg) { - hwaddr addr; - uint32_t data; + MSIMessage msi; assert(mesg_data_reg < DMAR_REG_SIZE); assert(mesg_addr_reg < DMAR_REG_SIZE); - addr = vtd_get_long_raw(s, mesg_addr_reg); - data = vtd_get_long_raw(s, mesg_data_reg); + msi.address = vtd_get_long_raw(s, mesg_addr_reg); + msi.data = vtd_get_long_raw(s, mesg_data_reg); - VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, addr, data); - address_space_stl_le(&address_space_memory, addr, data, - MEMTXATTRS_UNSPECIFIED, NULL); + VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32, + msi.address, msi.data); + apic_get_class()->send_msi(&msi); } /* Generate a fault event to software via MSI if conditions are met. 
@@ -985,6 +988,7 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, mask = 7; /* Mask bit 2:0 in the SID field */ break; } + mask = ~mask; VTD_DPRINTF(INV, "device-selective invalidation source 0x%"PRIx16 " mask %"PRIu16, source_id, mask); vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); @@ -1974,14 +1978,20 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, return ret; } -static void vtd_iommu_notify_started(MemoryRegion *iommu) +static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); - hw_error("Device at bus %s addr %02x.%d requires iommu notifier which " - "is currently not supported by intel-iommu emulation", - vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), - PCI_FUNC(vtd_as->devfn)); + if (new & IOMMU_NOTIFIER_MAP) { + error_report("Device at bus %s addr %02x.%d requires iommu " + "notifier which is currently not supported by " + "intel-iommu emulation", + vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + exit(1); + } } static const VMStateDescription vtd_vmstate = { @@ -2005,6 +2015,9 @@ static const MemoryRegionOps vtd_mem_ops = { static Property vtd_properties[] = { DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), + DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, + ON_OFF_AUTO_AUTO), + DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), DEFINE_PROP_END_OF_LIST(), }; @@ -2127,6 +2140,7 @@ static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out) msg.dest_mode = irq->dest_mode; msg.redir_hint = irq->redir_hint; msg.dest = irq->dest; + msg.__addr_hi = irq->dest & 0xffffff00; msg.__addr_head = cpu_to_le32(0xfee); /* Keep this from original MSI address bits */ msg.__not_used = irq->msi_addr_last_bits; @@ -2167,7 +2181,7 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, } addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; - if (le16_to_cpu(addr.addr.__head) != 0xfee) { + if (addr.addr.__head != 0xfee) { VTD_DPRINTF(GENERAL, "error: MSI addr low 32 bits invalid: " "0x%"PRIx32, addr.data); return -VTD_FR_IR_REQ_RSVD; @@ -2203,6 +2217,8 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, } } else { uint8_t vector = origin->data & 0xff; + uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; + VTD_DPRINTF(IR, "received IOAPIC interrupt"); /* IOAPIC entry vector should be aligned with IRTE vector * (see vt-d spec 5.1.5.1). */ @@ -2211,6 +2227,15 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, "entry: %d, IRTE: %d, index: %d", vector, irq.vector, index); } + + /* The Trigger Mode field must match the Trigger Mode in the IRTE. + * (see vt-d spec 5.1.5.1). 
*/
+ if (trigger_mode != irq.trigger_mode) {
+ VTD_DPRINTF(GENERAL, "IOAPIC trigger mode inconsistent: "
+ "entry: %u, IRTE: %u, index: %d",
+ trigger_mode, irq.trigger_mode, index);
+ }
+
 }
 /*
@@ -2275,11 +2300,7 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
 " for device sid 0x%04x",
 to.address, to.data, sid);
- if (dma_memory_write(&address_space_memory, to.address,
- &to.data, size)) {
- VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64
- " value 0x%"PRIx32, to.address, to.data);
- }
+ apic_get_class()->send_msi(&to);
 return MEMTX_OK;
 }
@@ -2348,7 +2369,7 @@ static void vtd_init(IntelIOMMUState *s)
 memset(s->womask, 0, DMAR_REG_SIZE);
 s->iommu_ops.translate = vtd_iommu_translate;
- s->iommu_ops.notify_started = vtd_iommu_notify_started;
+ s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed;
 s->root = 0;
 s->root_extended = false;
 s->dmar_enabled = false;
@@ -2364,7 +2385,11 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 if (x86_iommu->intr_supported) {
- s->ecap |= VTD_ECAP_IR | VTD_ECAP_EIM | VTD_ECAP_MHMV;
+ s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
+ if (s->intr_eim == ON_OFF_AUTO_ON) {
+ s->ecap |= VTD_ECAP_EIM;
+ }
+ assert(s->intr_eim != ON_OFF_AUTO_AUTO);
 }
 vtd_reset_context_cache(s);
@@ -2439,12 +2464,48 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
 IntelIOMMUState *s = opaque;
 VTDAddressSpace *vtd_as;
- assert(0 <= devfn && devfn <= X86_IOMMU_PCI_DEVFN_MAX);
+ assert(0 <= devfn && devfn < X86_IOMMU_PCI_DEVFN_MAX);
 vtd_as = vtd_find_add_as(s, bus, devfn);
 return &vtd_as->as;
 }
+static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
+{
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+ /* Currently Intel IOMMU IR only support "kernel-irqchip={off|split}" */
+ if (x86_iommu->intr_supported && kvm_irqchip_in_kernel() &&
+ !kvm_irqchip_is_split()) {
+ error_setg(errp, "Intel Interrupt Remapping cannot work with "
+ "kernel-irqchip=on, please use 'split|off'.");
+ return false;
+ }
+ if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu->intr_supported) {
+ error_setg(errp, "eim=on cannot be selected without intremap=on");
+ return false;
+ }
+
+ if (s->intr_eim == ON_OFF_AUTO_AUTO) {
+ s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
+ && x86_iommu->intr_supported ?
+ ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+ }
+ if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
+ if (!kvm_irqchip_in_kernel()) {
+ error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
+ return false;
+ }
+ if (!kvm_enable_x2apic()) {
+ error_setg(errp, "eim=on requires support on the KVM side "
+ "(X2APIC_API, first shipped in v4.7)");
+ return false;
+ }
+ }
+
+ return true;
+}
+
 static void vtd_realize(DeviceState *dev, Error **errp)
 {
 PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
@@ -2453,6 +2514,12 @@ static void vtd_realize(DeviceState *dev, Error **errp)
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
 VTD_DPRINTF(GENERAL, "");
+ x86_iommu->type = TYPE_INTEL;
+
+ if (!vtd_decide_config(s, errp)) {
+ return;
+ }
+
 memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
 memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
 "intel_iommu", DMAR_REG_SIZE);
@@ -2467,14 +2534,6 @@ static void vtd_realize(DeviceState *dev, Error **errp)
 pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
 /* Pseudo address space under root PCI bus.
*/ pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); - - /* Currently Intel IOMMU IR only support "kernel-irqchip={off|split}" */ - if (x86_iommu->intr_supported && kvm_irqchip_in_kernel() && - !kvm_irqchip_is_split()) { - error_report("Intel Interrupt Remapping cannot work with " - "kernel-irqchip=on, please use 'split|off'."); - exit(1); - } } static void vtd_class_init(ObjectClass *klass, void *data) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 0829a5064f..11abfa2233 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -115,7 +115,7 @@ /* The shift of source_id in the key of IOTLB hash table */ #define VTD_IOTLB_SID_SHIFT 36 -#define VTD_IOTLB_LVL_SHIFT 44 +#define VTD_IOTLB_LVL_SHIFT 52 #define VTD_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */ /* IOTLB_REG */ diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index 2bd0de82b4..01cbaa88d2 100644 --- a/hw/i386/kvm/apic.c +++ b/hw/i386/kvm/apic.c @@ -15,6 +15,7 @@ #include "hw/i386/apic_internal.h" #include "hw/pci/msi.h" #include "sysemu/kvm.h" +#include "target-i386/kvm_i386.h" static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic, int reg_id, uint32_t val) @@ -28,13 +29,16 @@ static inline uint32_t kvm_apic_get_reg(struct kvm_lapic_state *kapic, return *((uint32_t *)(kapic->regs + (reg_id << 4))); } -void kvm_put_apic_state(DeviceState *dev, struct kvm_lapic_state *kapic) +static void kvm_put_apic_state(APICCommonState *s, struct kvm_lapic_state *kapic) { - APICCommonState *s = APIC_COMMON(dev); int i; memset(kapic, 0, sizeof(*kapic)); - kvm_apic_set_reg(kapic, 0x2, s->id << 24); + if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) { + kvm_apic_set_reg(kapic, 0x2, s->initial_apic_id); + } else { + kvm_apic_set_reg(kapic, 0x2, s->id << 24); + } kvm_apic_set_reg(kapic, 0x8, s->tpr); kvm_apic_set_reg(kapic, 0xd, s->log_dest << 24); kvm_apic_set_reg(kapic, 0xe, s->dest_mode << 28 | 0x0fffffff); @@ -59,7 +63,11 @@ void kvm_get_apic_state(DeviceState *dev, struct kvm_lapic_state *kapic) APICCommonState *s = APIC_COMMON(dev); int i, v; - s->id = kvm_apic_get_reg(kapic, 0x2) >> 24; + if (kvm_has_x2apic_api() && s->apicbase & MSR_IA32_APICBASE_EXTD) { + assert(kvm_apic_get_reg(kapic, 0x2) == s->initial_apic_id); + } else { + s->id = kvm_apic_get_reg(kapic, 0x2) >> 24; + } s->tpr = kvm_apic_get_reg(kapic, 0x8); s->arb_id = kvm_apic_get_reg(kapic, 0x9); s->log_dest = kvm_apic_get_reg(kapic, 0xd) >> 24; @@ -125,10 +133,30 @@ static void kvm_apic_vapic_base_update(APICCommonState *s) } } -static void do_inject_external_nmi(void *data) +static void kvm_apic_put(CPUState *cs, run_on_cpu_data data) +{ + APICCommonState *s = data.host_ptr; + struct kvm_lapic_state kapic; + int ret; + + kvm_put_apicbase(s->cpu, s->apicbase); + kvm_put_apic_state(s, &kapic); + + ret = kvm_vcpu_ioctl(CPU(s->cpu), KVM_SET_LAPIC, &kapic); + if (ret < 0) { + fprintf(stderr, "KVM_SET_LAPIC failed: %s\n", strerror(ret)); + abort(); + } +} + +static void kvm_apic_post_load(APICCommonState *s) +{ + run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s)); +} + +static void do_inject_external_nmi(CPUState *cpu, run_on_cpu_data data) { - APICCommonState *s = data; - CPUState *cpu = CPU(s->cpu); + APICCommonState *s = data.host_ptr; uint32_t lvt; int ret; @@ -146,7 +174,18 @@ static void do_inject_external_nmi(void *data) static void kvm_apic_external_nmi(APICCommonState *s) { - run_on_cpu(CPU(s->cpu), do_inject_external_nmi, s); + 
run_on_cpu(CPU(s->cpu), do_inject_external_nmi, RUN_ON_CPU_HOST_PTR(s)); +} + +static void kvm_send_msi(MSIMessage *msg) +{ + int ret; + + ret = kvm_irqchip_send_msi(kvm_state, *msg); + if (ret < 0) { + fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n", + strerror(-ret)); + } } static uint64_t kvm_apic_mem_read(void *opaque, hwaddr addr, @@ -159,13 +198,8 @@ static void kvm_apic_mem_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { MSIMessage msg = { .address = addr, .data = data }; - int ret; - ret = kvm_irqchip_send_msi(kvm_state, msg); - if (ret < 0) { - fprintf(stderr, "KVM: injection failed, MSI lost (%s)\n", - strerror(-ret)); - } + kvm_send_msi(&msg); } static const MemoryRegionOps kvm_apic_io_ops = { @@ -178,6 +212,8 @@ static void kvm_apic_reset(APICCommonState *s) { /* Not used by KVM, which uses the CPU mp_state instead. */ s->wait_for_sipi = 0; + + run_on_cpu(CPU(s->cpu), kvm_apic_put, RUN_ON_CPU_HOST_PTR(s)); } static void kvm_apic_realize(DeviceState *dev, Error **errp) @@ -206,9 +242,11 @@ static void kvm_apic_class_init(ObjectClass *klass, void *data) k->set_base = kvm_apic_set_base; k->set_tpr = kvm_apic_set_tpr; k->get_tpr = kvm_apic_get_tpr; + k->post_load = kvm_apic_post_load; k->enable_tpr_reporting = kvm_apic_enable_tpr_reporting; k->vapic_base_update = kvm_apic_vapic_base_update; k->external_nmi = kvm_apic_external_nmi; + k->send_msi = kvm_send_msi; } static const TypeInfo kvm_apic_info = { diff --git a/hw/i386/kvm/i8259.c b/hw/i386/kvm/i8259.c index 2b207de01b..11d1b726b6 100644 --- a/hw/i386/kvm/i8259.c +++ b/hw/i386/kvm/i8259.c @@ -92,7 +92,7 @@ static void kvm_pic_put(PICCommonState *s) ret = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, &chip); if (ret < 0) { - fprintf(stderr, "KVM_GET_IRQCHIP failed: %s\n", strerror(ret)); + fprintf(stderr, "KVM_SET_IRQCHIP failed: %s\n", strerror(ret)); abort(); } } diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c index 8238fbc630..87dcbdd51a 100644 --- a/hw/i386/kvm/pci-assign.c +++ b/hw/i386/kvm/pci-assign.c @@ -1251,6 +1251,7 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev, Error **errp) error_propagate(errp, local_err); return -ENOTSUP; } + dev->dev.cap_present |= QEMU_PCI_CAP_MSI; dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI; /* Only 32-bit/no-mask currently supported */ ret = pci_add_capability2(pci_dev, PCI_CAP_ID_MSI, pos, 10, @@ -1285,6 +1286,7 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev, Error **errp) error_propagate(errp, local_err); return -ENOTSUP; } + dev->dev.cap_present |= QEMU_PCI_CAP_MSIX; dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX; ret = pci_add_capability2(pci_dev, PCI_CAP_ID_MSIX, pos, 12, &local_err); @@ -1648,6 +1650,7 @@ static void assigned_dev_register_msix_mmio(AssignedDevice *dev, Error **errp) dev->msix_table = NULL; return; } + dev->dev.msix_table = (uint8_t *)dev->msix_table; assigned_dev_msix_reset(dev); @@ -1665,6 +1668,7 @@ static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev) error_report("error unmapping msix_table! 
%s", strerror(errno)); } dev->msix_table = NULL; + dev->dev.msix_table = NULL; } static const VMStateDescription vmstate_assigned_device = { diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index 3bf1ddd976..b30d1b90c6 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -17,6 +17,7 @@ #include "sysemu/kvm.h" #include "hw/i386/apic_internal.h" #include "hw/sysbus.h" +#include "tcg/tcg.h" #define VAPIC_IO_PORT 0x7e @@ -449,6 +450,9 @@ static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip) resume_all_vcpus(); if (!kvm_enabled()) { + /* tb_lock will be reset when cpu_loop_exit_noexc longjmps + * back into the cpu_exec loop. */ + tb_lock(); tb_gen_code(cs, current_pc, current_cs_base, current_flags, 1); cpu_loop_exit_noexc(cs); } @@ -483,10 +487,9 @@ typedef struct VAPICEnableTPRReporting { bool enable; } VAPICEnableTPRReporting; -static void vapic_do_enable_tpr_reporting(void *data) +static void vapic_do_enable_tpr_reporting(CPUState *cpu, run_on_cpu_data data) { - VAPICEnableTPRReporting *info = data; - + VAPICEnableTPRReporting *info = data.host_ptr; apic_enable_tpr_access_reporting(info->apic, info->enable); } @@ -501,7 +504,7 @@ static void vapic_enable_tpr_reporting(bool enable) CPU_FOREACH(cs) { cpu = X86_CPU(cs); info.apic = cpu->apic_state; - run_on_cpu(cs, vapic_do_enable_tpr_reporting, &info); + run_on_cpu(cs, vapic_do_enable_tpr_reporting, RUN_ON_CPU_HOST_PTR(&info)); } } @@ -734,10 +737,10 @@ static void vapic_realize(DeviceState *dev, Error **errp) nb_option_roms++; } -static void do_vapic_enable(void *data) +static void do_vapic_enable(CPUState *cs, run_on_cpu_data data) { - VAPICROMState *s = data; - X86CPU *cpu = X86_CPU(first_cpu); + VAPICROMState *s = data.host_ptr; + X86CPU *cpu = X86_CPU(cs); static const uint8_t enabled = 1; cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled), @@ -758,7 +761,7 @@ static void kvmvapic_vm_state_change(void *opaque, int running, if (s->state == VAPIC_ACTIVE) { if (smp_cpus == 1) { - run_on_cpu(first_cpu, do_vapic_enable, s); + run_on_cpu(first_cpu, do_vapic_enable, RUN_ON_CPU_HOST_PTR(s)); } else { zero = g_malloc0(s->rom_state.vapic_size); cpu_physical_memory_write(s->vapic_paddr, zero, @@ -768,6 +771,7 @@ static void kvmvapic_vm_state_change(void *opaque, int running, } qemu_del_vm_change_state_handler(s->vmsentry); + s->vmsentry = NULL; } static int vapic_post_load(void *opaque, int version_id) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0daa4d1f7f..0779fa2639 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -68,6 +68,7 @@ #include "qapi-visit.h" #include "qom/cpu.h" #include "hw/nmi.h" +#include "hw/i386/intel_iommu.h" #include "sysemu/hax.h" @@ -163,13 +164,15 @@ int cpu_get_pic_interrupt(CPUX86State *env) X86CPU *cpu = x86_env_get_cpu(env); int intno; - intno = apic_get_interrupt(cpu->apic_state); - if (intno >= 0) { - return intno; - } - /* read the irq from the PIC */ - if (!apic_accept_pic_intr(cpu->apic_state)) { - return -1; + if (!kvm_irqchip_in_kernel()) { + intno = apic_get_interrupt(cpu->apic_state); + if (intno >= 0) { + return intno; + } + /* read the irq from the PIC */ + if (!apic_accept_pic_intr(cpu->apic_state)) { + return -1; + } } intno = pic_read_irq(isa_pic); @@ -182,7 +185,7 @@ static void pic_irq_request(void *opaque, int irq, int level) X86CPU *cpu = X86_CPU(cs); DPRINTF("pic_irqs: %s irq %d\n", level? 
"raise" : "lower", irq); - if (cpu->apic_state) { + if (cpu->apic_state && !kvm_irqchip_in_kernel()) { CPU_FOREACH(cs) { cpu = X86_CPU(cs); if (apic_accept_pic_intr(cpu->apic_state)) { @@ -532,9 +535,9 @@ static uint64_t port92_read(void *opaque, hwaddr addr, return ret; } -static void port92_init(ISADevice *dev, qemu_irq *a20_out) +static void port92_init(ISADevice *dev, qemu_irq a20_out) { - qdev_connect_gpio_out_named(DEVICE(dev), PORT92_A20_LINE, 0, *a20_out); + qdev_connect_gpio_out_named(DEVICE(dev), PORT92_A20_LINE, 0, a20_out); } static const VMStateDescription vmstate_port92_isa = { @@ -743,20 +746,19 @@ static FWCfgState *bochs_bios_init(AddressSpace *as, PCMachineState *pcms) int i, j; fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4, as); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, pcms->boot_cpus); /* FW_CFG_MAX_CPUS is a bit confusing/problematic on x86: * - * SeaBIOS needs FW_CFG_MAX_CPUS for CPU hotplug, but the CPU hotplug - * QEMU<->SeaBIOS interface is not based on the "CPU index", but on the APIC - * ID of hotplugged CPUs[1]. This means that FW_CFG_MAX_CPUS is not the - * "maximum number of CPUs", but the "limit to the APIC ID values SeaBIOS - * may see". + * For machine types prior to 1.8, SeaBIOS needs FW_CFG_MAX_CPUS for + * building MPTable, ACPI MADT, ACPI CPU hotplug and ACPI SRAT table, + * that tables are based on xAPIC ID and QEMU<->SeaBIOS interface + * for CPU hotplug also uses APIC ID and not "CPU index". + * This means that FW_CFG_MAX_CPUS is not the "maximum number of CPUs", + * but the "limit to the APIC ID values SeaBIOS may see". * - * So, this means we must not use max_cpus, here, but the maximum possible - * APIC ID value, plus one. - * - * [1] The only kind of "CPU identifier" used between SeaBIOS and QEMU is - * the APIC ID, not the "CPU index" + * So for compatibility reasons with old BIOSes we are stuck with + * "etc/max-cpus" actually being apic_id_limit */ fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, (uint16_t)pcms->apic_id_limit); fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size); @@ -779,11 +781,9 @@ static FWCfgState *bochs_bios_init(AddressSpace *as, PCMachineState *pcms) for (i = 0; i < max_cpus; i++) { unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < pcms->apic_id_limit); - for (j = 0; j < nb_numa_nodes; j++) { - if (test_bit(i, numa_info[j].node_cpu)) { - numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); - break; - } + j = numa_get_node_for_cpu(i); + if (j < nb_numa_nodes) { + numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); } } for (i = 0; i < nb_numa_nodes; i++) { @@ -1093,17 +1093,6 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int level) } } -static int pc_present_cpus_count(PCMachineState *pcms) -{ - int i, boot_cpus = 0; - for (i = 0; i < pcms->possible_cpus->len; i++) { - if (pcms->possible_cpus->cpus[i].cpu) { - boot_cpus++; - } - } - return boot_cpus; -} - static X86CPU *pc_new_cpu(const char *typename, int64_t apic_id, Error **errp) { @@ -1196,12 +1185,6 @@ void pc_cpus_init(PCMachineState *pcms) * This is used for FW_CFG_MAX_CPUS. See comments on bochs_bios_init(). */ pcms->apic_id_limit = x86_cpu_apic_id_from_index(max_cpus - 1) + 1; - if (pcms->apic_id_limit > ACPI_CPU_HOTPLUG_ID_LIMIT) { - error_report("max_cpus is too large. 
APIC ID of last CPU is %u", - pcms->apic_id_limit - 1); - exit(1); - } - pcms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + sizeof(CPUArchId) * max_cpus); for (i = 0; i < max_cpus; i++) { @@ -1246,6 +1229,19 @@ static void pc_build_feature_control_file(PCMachineState *pcms) fw_cfg_add_file(pcms->fw_cfg, "etc/msr_feature_control", val, sizeof(*val)); } +static void rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count) +{ + if (cpus_count > 0xff) { + /* If the number of CPUs can't be represented in 8 bits, the + * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just + * to make old BIOSes fail more predictably. + */ + rtc_set_memory(rtc, 0x5f, 0); + } else { + rtc_set_memory(rtc, 0x5f, cpus_count - 1); + } +} + static void pc_machine_done(Notifier *notifier, void *data) { @@ -1254,7 +1250,7 @@ void pc_machine_done(Notifier *notifier, void *data) PCIBus *bus = pcms->bus; /* set the number of CPUs */ - rtc_set_memory(pcms->rtc, 0x5f, pc_present_cpus_count(pcms) - 1); + rtc_set_cpus_count(pcms->rtc, pcms->boot_cpus); if (bus) { int extra_hosts = 0; @@ -1277,6 +1273,21 @@ void pc_machine_done(Notifier *notifier, void *data) if (pcms->fw_cfg) { pc_build_smbios(pcms->fw_cfg); pc_build_feature_control_file(pcms); + /* update FW_CFG_NB_CPUS to account for -device added CPUs */ + fw_cfg_modify_i16(pcms->fw_cfg, FW_CFG_NB_CPUS, pcms->boot_cpus); + } + + if (pcms->apic_id_limit > 255) { + IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); + + if (!iommu || !iommu->x86_iommu.intr_supported || + iommu->intr_eim != ON_OFF_AUTO_ON) { + error_report("current -smp configuration requires " + "Extended Interrupt Mode enabled. " + "You can add an IOMMU using: " + "-device intel-iommu,intremap=on,eim=on"); + exit(EXIT_FAILURE); + } } } @@ -1341,6 +1352,7 @@ void xen_load_linux(PCMachineState *pcms) assert(MACHINE(pcms)->kernel_filename != NULL); fw_cfg = fw_cfg_init_io(FW_CFG_IO_BASE); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, pcms->boot_cpus); rom_set_fw(fw_cfg); load_linux(pcms, fw_cfg); @@ -1595,12 +1607,12 @@ void pc_basic_device_init(ISABus *isa_bus, qemu_irq *gsi, pcspk_init(isa_bus, pit); } - serial_hds_isa_init(isa_bus, MAX_SERIAL_PORTS); + serial_hds_isa_init(isa_bus, 0, MAX_SERIAL_PORTS); parallel_hds_isa_init(isa_bus, MAX_PARALLEL_PORTS); a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 2); i8042 = isa_create_simple(isa_bus, "i8042"); - i8042_setup_a20_line(i8042, &a20_line[0]); + i8042_setup_a20_line(i8042, a20_line[0]); if (!no_vmport) { vmport_init(isa_bus); vmmouse = isa_try_create(isa_bus, "vmmouse"); @@ -1613,7 +1625,8 @@ void pc_basic_device_init(ISABus *isa_bus, qemu_irq *gsi, qdev_init_nofail(dev); } port92 = isa_create_simple(isa_bus, "port92"); - port92_init(port92, &a20_line[1]); + port92_init(port92, a20_line[1]); + g_free(a20_line); DMA_init(isa_bus, 0); @@ -1705,6 +1718,10 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, goto out; } + if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { + nvdimm_plug(&pcms->acpi_nvdimm_state); + } + hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev); hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &error_abort); out: @@ -1724,6 +1741,12 @@ static void pc_dimm_unplug_request(HotplugHandler *hotplug_dev, goto out; } + if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { + error_setg(&local_err, + "nvdimm device hot unplug is not supported yet."); + goto out; + } + hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev); hhc->unplug_request(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &local_err); @@ -1799,9 +1822,11 @@ 
static void pc_cpu_plug(HotplugHandler *hotplug_dev, } } + /* increment the number of CPUs */ + pcms->boot_cpus++; if (dev->hotplugged) { - /* increment the number of CPUs */ - rtc_set_memory(pcms->rtc, 0x5f, rtc_get_memory(pcms->rtc, 0x5f) + 1); + rtc_set_cpus_count(pcms->rtc, pcms->boot_cpus); + fw_cfg_modify_i16(pcms->fw_cfg, FW_CFG_NB_CPUS, pcms->boot_cpus); } found_cpu = pc_find_cpu_slot(pcms, CPU(dev), NULL); @@ -1855,7 +1880,11 @@ static void pc_cpu_unplug_cb(HotplugHandler *hotplug_dev, found_cpu->cpu = NULL; object_unparent(OBJECT(dev)); - rtc_set_memory(pcms->rtc, 0x5f, rtc_get_memory(pcms->rtc, 0x5f) - 1); + /* decrement the number of CPUs */ + pcms->boot_cpus--; + /* Update the number of CPUs in CMOS */ + rtc_set_cpus_count(pcms->rtc, pcms->boot_cpus); + fw_cfg_modify_i16(pcms->fw_cfg, FW_CFG_NB_CPUS, pcms->boot_cpus); out: error_propagate(errp, local_err); } @@ -2141,41 +2170,13 @@ static void pc_machine_initfn(Object *obj) { PCMachineState *pcms = PC_MACHINE(obj); - object_property_add(obj, PC_MACHINE_MEMHP_REGION_SIZE, "int", - pc_machine_get_hotplug_memory_region_size, - NULL, NULL, NULL, &error_abort); - pcms->max_ram_below_4g = 0; /* use default */ - object_property_add(obj, PC_MACHINE_MAX_RAM_BELOW_4G, "size", - pc_machine_get_max_ram_below_4g, - pc_machine_set_max_ram_below_4g, - NULL, NULL, &error_abort); - object_property_set_description(obj, PC_MACHINE_MAX_RAM_BELOW_4G, - "Maximum ram below the 4G boundary (32bit boundary)", - &error_abort); - pcms->smm = ON_OFF_AUTO_AUTO; - object_property_add(obj, PC_MACHINE_SMM, "OnOffAuto", - pc_machine_get_smm, - pc_machine_set_smm, - NULL, NULL, &error_abort); - object_property_set_description(obj, PC_MACHINE_SMM, - "Enable SMM (pc & q35)", - &error_abort); - pcms->vmport = ON_OFF_AUTO_AUTO; - object_property_add(obj, PC_MACHINE_VMPORT, "OnOffAuto", - pc_machine_get_vmport, - pc_machine_set_vmport, - NULL, NULL, &error_abort); - object_property_set_description(obj, PC_MACHINE_VMPORT, - "Enable vmport (pc & q35)", - &error_abort); - /* nvdimm is disabled on default. 
*/ pcms->acpi_nvdimm_state.is_enabled = false; - object_property_add_bool(obj, PC_MACHINE_NVDIMM, pc_machine_get_nvdimm, - pc_machine_set_nvdimm, &error_abort); + /* acpi build is enabled by default if machine supports it */ + pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build; } static void pc_machine_reset(void) @@ -2310,6 +2311,32 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) hc->unplug_request = pc_machine_device_unplug_request_cb; hc->unplug = pc_machine_device_unplug_cb; nc->nmi_monitor_handler = x86_nmi; + + object_class_property_add(oc, PC_MACHINE_MEMHP_REGION_SIZE, "int", + pc_machine_get_hotplug_memory_region_size, NULL, + NULL, NULL, &error_abort); + + object_class_property_add(oc, PC_MACHINE_MAX_RAM_BELOW_4G, "size", + pc_machine_get_max_ram_below_4g, pc_machine_set_max_ram_below_4g, + NULL, NULL, &error_abort); + + object_class_property_set_description(oc, PC_MACHINE_MAX_RAM_BELOW_4G, + "Maximum ram below the 4G boundary (32bit boundary)", &error_abort); + + object_class_property_add(oc, PC_MACHINE_SMM, "OnOffAuto", + pc_machine_get_smm, pc_machine_set_smm, + NULL, NULL, &error_abort); + object_class_property_set_description(oc, PC_MACHINE_SMM, + "Enable SMM (pc & q35)", &error_abort); + + object_class_property_add(oc, PC_MACHINE_VMPORT, "OnOffAuto", + pc_machine_get_vmport, pc_machine_set_vmport, + NULL, NULL, &error_abort); + object_class_property_set_description(oc, PC_MACHINE_VMPORT, + "Enable vmport (pc & q35)", &error_abort); + + object_class_property_add_bool(oc, PC_MACHINE_NVDIMM, + pc_machine_get_nvdimm, pc_machine_set_nvdimm, &error_abort); } static const TypeInfo pc_machine_info = { diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 84241296a8..68835dde0b 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -74,7 +74,6 @@ static void pc_init1(MachineState *machine, ISABus *isa_bus; PCII440FXState *i440fx_state; int piix3_devfn = -1; - qemu_irq *gsi; qemu_irq *i8259; qemu_irq smi_irq; GSIState *gsi_state; @@ -185,16 +184,16 @@ static void pc_init1(MachineState *machine, gsi_state = g_malloc0(sizeof(*gsi_state)); if (kvm_ioapic_in_kernel()) { kvm_pc_setup_irq_routing(pcmc->pci_enabled); - gsi = qemu_allocate_irqs(kvm_pc_gsi_handler, gsi_state, - GSI_NUM_PINS); + pcms->gsi = qemu_allocate_irqs(kvm_pc_gsi_handler, gsi_state, + GSI_NUM_PINS); } else { - gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS); + pcms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS); } if (pcmc->pci_enabled) { pci_bus = i440fx_init(host_type, pci_type, - &i440fx_state, &piix3_devfn, &isa_bus, gsi, + &i440fx_state, &piix3_devfn, &isa_bus, pcms->gsi, system_memory, system_io, machine->ram_size, pcms->below_4g_mem_size, pcms->above_4g_mem_size, @@ -207,7 +206,7 @@ static void pc_init1(MachineState *machine, &error_abort); no_hpet = 1; } - isa_bus_irqs(isa_bus, gsi); + isa_bus_irqs(isa_bus, pcms->gsi); if (kvm_pic_in_kernel()) { i8259 = kvm_i8259_init(isa_bus); @@ -225,7 +224,7 @@ static void pc_init1(MachineState *machine, ioapic_init_gsi(gsi_state, "i440fx"); } - pc_register_ferr_irq(gsi[13]); + pc_register_ferr_irq(pcms->gsi[13]); pc_vga_init(isa_bus, pcmc->pci_enabled ? 
pci_bus : NULL); @@ -235,7 +234,7 @@ static void pc_init1(MachineState *machine, } /* init basic PC hardware */ - pc_basic_device_init(isa_bus, gsi, &rtc_state, true, + pc_basic_device_init(isa_bus, pcms->gsi, &rtc_state, true, (pcms->vmport != ON_OFF_AUTO_ON), 0x4); pc_nic_init(isa_bus, pci_bus); @@ -279,7 +278,7 @@ static void pc_init1(MachineState *machine, smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); /* TODO: Populate SPD eeprom data. */ smbus = piix4_pm_init(pci_bus, piix3_devfn + 3, 0xb100, - gsi[9], smi_irq, + pcms->gsi[9], smi_irq, pc_machine_is_smm_enabled(pcms), &piix4_pm); smbus_eeprom_init(smbus, 8, NULL, 0); @@ -447,13 +446,25 @@ static void pc_i440fx_machine_options(MachineClass *m) m->default_display = "std"; } -static void pc_i440fx_2_7_machine_options(MachineClass *m) +static void pc_i440fx_2_8_machine_options(MachineClass *m) { pc_i440fx_machine_options(m); m->alias = "pc"; m->is_default = 1; } +DEFINE_I440FX_MACHINE(v2_8, "pc-i440fx-2.8", NULL, + pc_i440fx_2_8_machine_options); + + +static void pc_i440fx_2_7_machine_options(MachineClass *m) +{ + pc_i440fx_2_8_machine_options(m); + m->is_default = 0; + m->alias = NULL; + SET_MACHINE_COMPAT(m, PC_COMPAT_2_7); +} + DEFINE_I440FX_MACHINE(v2_7, "pc-i440fx-2.7", NULL, pc_i440fx_2_7_machine_options); @@ -462,8 +473,6 @@ static void pc_i440fx_2_6_machine_options(MachineClass *m) { PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_i440fx_2_7_machine_options(m); - m->is_default = 0; - m->alias = NULL; pcmc->legacy_cpu_hotplug = true; SET_MACHINE_COMPAT(m, PC_COMPAT_2_6); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index c0b9961928..b40d19ee00 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -69,7 +69,6 @@ static void pc_q35_init(MachineState *machine) MemoryRegion *ram_memory; GSIState *gsi_state; ISABus *isa_bus; - qemu_irq *gsi; qemu_irq *i8259; int i; ICH9LPCState *ich9_lpc; @@ -153,10 +152,10 @@ static void pc_q35_init(MachineState *machine) gsi_state = g_malloc0(sizeof(*gsi_state)); if (kvm_ioapic_in_kernel()) { kvm_pc_setup_irq_routing(pcmc->pci_enabled); - gsi = qemu_allocate_irqs(kvm_pc_gsi_handler, gsi_state, - GSI_NUM_PINS); + pcms->gsi = qemu_allocate_irqs(kvm_pc_gsi_handler, gsi_state, + GSI_NUM_PINS); } else { - gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS); + pcms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS); } /* create pci host bus */ @@ -195,7 +194,7 @@ static void pc_q35_init(MachineState *machine) ich9_lpc = ICH9_LPC_DEVICE(lpc); lpc_dev = DEVICE(lpc); for (i = 0; i < GSI_NUM_PINS; i++) { - qdev_connect_gpio_out_named(lpc_dev, ICH9_GPIO_GSI, i, gsi[i]); + qdev_connect_gpio_out_named(lpc_dev, ICH9_GPIO_GSI, i, pcms->gsi[i]); } pci_bus_irqs(host_bus, ich9_lpc_set_irq, ich9_lpc_map_irq, ich9_lpc, ICH9_LPC_NB_PIRQS); @@ -213,11 +212,13 @@ static void pc_q35_init(MachineState *machine) for (i = 0; i < ISA_NUM_IRQS; i++) { gsi_state->i8259_irq[i] = i8259[i]; } + g_free(i8259); + if (pcmc->pci_enabled) { ioapic_init_gsi(gsi_state, "q35"); } - pc_register_ferr_irq(gsi[13]); + pc_register_ferr_irq(pcms->gsi[13]); assert(pcms->vmport != ON_OFF_AUTO__MAX); if (pcms->vmport == ON_OFF_AUTO_AUTO) { @@ -225,7 +226,7 @@ static void pc_q35_init(MachineState *machine) } /* init basic PC hardware */ - pc_basic_device_init(isa_bus, gsi, &rtc_state, !mc->no_floppy, + pc_basic_device_init(isa_bus, pcms->gsi, &rtc_state, !mc->no_floppy, (pcms->vmport != ON_OFF_AUTO_ON), 0xff0104); /* connect pm stuff to lpc */ @@ -290,14 +291,26 @@ static void 
pc_q35_machine_options(MachineClass *m) m->default_display = "std"; m->no_floppy = 1; m->has_dynamic_sysbus = true; + m->max_cpus = 288; } -static void pc_q35_2_7_machine_options(MachineClass *m) +static void pc_q35_2_8_machine_options(MachineClass *m) { pc_q35_machine_options(m); m->alias = "q35"; } +DEFINE_Q35_MACHINE(v2_8, "pc-q35-2.8", NULL, + pc_q35_2_8_machine_options); + +static void pc_q35_2_7_machine_options(MachineClass *m) +{ + pc_q35_2_8_machine_options(m); + m->alias = NULL; + m->max_cpus = 255; + SET_MACHINE_COMPAT(m, PC_COMPAT_2_7); +} + DEFINE_Q35_MACHINE(v2_7, "pc-q35-2.7", NULL, pc_q35_2_7_machine_options); @@ -305,7 +318,6 @@ static void pc_q35_2_6_machine_options(MachineClass *m) { PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_q35_2_7_machine_options(m); - m->alias = NULL; pcmc->legacy_cpu_hotplug = true; SET_MACHINE_COMPAT(m, PC_COMPAT_2_6); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 7735e46eaf..d2b497327e 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -7,9 +7,34 @@ xen_platform_log(char *s) "xen platform: %s" xen_pv_mmio_read(uint64_t addr) "WARNING: read from Xen PV Device MMIO space (address %"PRIx64")" xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (address %"PRIx64")" -# hw/i386/pc.c -mhp_pc_dimm_assigned_slot(int slot) "0x%d" -mhp_pc_dimm_assigned_address(uint64_t addr) "0x%"PRIx64 - # hw/i386/x86-iommu.c x86_iommu_iec_notify(bool global, uint32_t index, uint32_t mask) "Notify IEC invalidation: global=%d index=%" PRIu32 " mask=%" PRIu32 + +# hw/i386/amd_iommu.c +amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 +amdvi_cache_update(uint16_t domid, uint8_t bus, uint8_t slot, uint8_t func, uint64_t gpa, uint64_t txaddr) " update iotlb domid 0x%"PRIx16" devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_completion_wait_fail(uint64_t addr) "error: fail to write at address 0x%"PRIx64 +amdvi_mmio_write(const char *reg, uint64_t addr, unsigned size, uint64_t val, uint64_t offset) "%s write addr 0x%"PRIx64", size %u, val 0x%"PRIx64", offset 0x%"PRIx64 +amdvi_mmio_read(const char *reg, uint64_t addr, unsigned size, uint64_t offset) "%s read addr 0x%"PRIx64", size %u offset 0x%"PRIx64 +amdvi_command_error(uint64_t status) "error: Executing commands with command buffer disabled 0x%"PRIx64 +amdvi_command_read_fail(uint64_t addr, uint32_t head) "error: fail to access memory at 0x%"PRIx64" + 0x%"PRIx32 +amdvi_command_exec(uint32_t head, uint32_t tail, uint64_t buf) "command buffer head at 0x%"PRIx32" command buffer tail at 0x%"PRIx32" command buffer base at 0x%"PRIx64 +amdvi_unhandled_command(uint8_t type) "unhandled command 0x%"PRIx8 +amdvi_intr_inval(void) "Interrupt table invalidated" +amdvi_iotlb_inval(void) "IOTLB pages invalidated" +amdvi_prefetch_pages(void) "Pre-fetch of AMD-Vi pages requested" +amdvi_pages_inval(uint16_t domid) "AMD-Vi pages for domain 0x%"PRIx16 " invalidated" +amdvi_all_inval(void) "Invalidation of all AMD-Vi cache requested " +amdvi_ppr_exec(void) "Execution of PPR queue requested " +amdvi_devtab_inval(uint8_t bus, uint8_t slot, uint8_t func) "device table entry for devid: %02x:%02x.%x invalidated" +amdvi_completion_wait(uint64_t addr, uint64_t data) "completion wait requested with store address 0x%"PRIx64" and store data 0x%"PRIx64 +amdvi_control_status(uint64_t val) "MMIO_STATUS state 0x%"PRIx64 +amdvi_iotlb_reset(void) "IOTLB exceed size limit - reset " +amdvi_completion_wait_exec(uint64_t addr, uint64_t data) 
"completion wait requested with store address 0x%"PRIx64" and store data 0x%"PRIx64 +amdvi_dte_get_fail(uint64_t addr, uint32_t offset) "error: failed to access Device Entry devtab 0x%"PRIx64" offset 0x%"PRIx32 +amdvi_invalid_dte(uint64_t addr) "PTE entry at 0x%"PRIx64" is invalid " +amdvi_get_pte_hwerror(uint64_t addr) "hardware error eccessing PTE at addr 0x%"PRIx64 +amdvi_mode_invalid(uint8_t level, uint64_t addr)"error: translation level 0x%"PRIx8" translating addr 0x%"PRIx64 +amdvi_page_fault(uint64_t addr) "error: page fault accessing guest physical address 0x%"PRIx64 +amdvi_iotlb_hit(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "hit iotlb devid %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 +amdvi_translation_result(uint8_t bus, uint8_t slot, uint8_t func, uint64_t addr, uint64_t txaddr) "devid: %02x:%02x.%x gpa 0x%"PRIx64" hpa 0x%"PRIx64 diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index ce26b2a71d..2278af7c32 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -71,6 +71,11 @@ X86IOMMUState *x86_iommu_get_default(void) return x86_iommu_default; } +IommuType x86_iommu_get_type(void) +{ + return x86_iommu_default->type; +} + static void x86_iommu_realize(DeviceState *dev, Error **errp) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); @@ -79,6 +84,7 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp) if (x86_class->realize) { x86_class->realize(dev, errp); } + x86_iommu_set_default(X86_IOMMU_DEVICE(dev)); } diff --git a/hw/i386/xen/xen_apic.c b/hw/i386/xen/xen_apic.c index 21d68ee04b..55769eba7e 100644 --- a/hw/i386/xen/xen_apic.c +++ b/hw/i386/xen/xen_apic.c @@ -68,6 +68,11 @@ static void xen_apic_external_nmi(APICCommonState *s) { } +static void xen_send_msi(MSIMessage *msi) +{ + xen_hvm_inject_msi(msi->address, msi->data); +} + static void xen_apic_class_init(ObjectClass *klass, void *data) { APICCommonClass *k = APIC_COMMON_CLASS(klass); @@ -78,6 +83,7 @@ static void xen_apic_class_init(ObjectClass *klass, void *data) k->get_tpr = xen_apic_get_tpr; k->vapic_base_update = xen_apic_vapic_base_update; k->external_nmi = xen_apic_external_nmi; + k->send_msi = xen_send_msi; } static const TypeInfo xen_apic_info = { diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c index aa7839324c..2e1e543881 100644 --- a/hw/i386/xen/xen_platform.c +++ b/hw/i386/xen/xen_platform.c @@ -114,6 +114,10 @@ static void unplug_disks(PCIBus *b, PCIDevice *d, void *o) PCI_CLASS_STORAGE_IDE && strcmp(d->name, "xen-pci-passthrough") != 0) { pci_piix3_xen_ide_unplug(DEVICE(d)); + } else if (pci_get_word(d->config + PCI_CLASS_DEVICE) == + PCI_CLASS_STORAGE_SCSI + && strcmp(d->name, "xen-pci-passthrough") != 0) { + object_unparent(OBJECT(d)); } } @@ -134,8 +138,6 @@ static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t v devices, and bit 2 the non-primary-master IDE devices. 
*/
 if (val & UNPLUG_ALL_IDE_DISKS) {
 DPRINTF("unplug disks\n");
- blk_drain_all();
- blk_flush_all();
 pci_unplug_disks(pci_dev->bus);
 }
 if (val & UNPLUG_ALL_NICS) {
@@ -309,13 +311,38 @@ static void xen_platform_ioport_writeb(void *opaque, hwaddr addr,
 uint64_t val, unsigned int size)
 {
 PCIXenPlatformState *s = opaque;
+ PCIDevice *pci_dev = PCI_DEVICE(s);
 switch (addr) {
 case 0: /* Platform flags */
 platform_fixed_ioport_writeb(opaque, 0, (uint32_t)val);
 break;
+ case 4:
+ if (val == 1) {
+ /*
+ * SUSE unplug for Xenlinux
+ * xen-kmp used this since xen-3.0.4, instead of the official
+ * protocol from xen-3.3+. It did an unconditional
+ * "outl(1, (ioaddr + 4));"
+ * Pre VMDP 1.7 used 4 and 8 depending on how VMDP was configured.
+ * If VMDP was to control both disk and LAN it would use 4.
+ * If it controlled just disk or just LAN, it would use 8 below.
+ */
+ pci_unplug_disks(pci_dev->bus);
+ pci_unplug_nics(pci_dev->bus);
+ }
+ break;
 case 8:
- log_writeb(s, (uint32_t)val);
+ switch (val) {
+ case 1:
+ pci_unplug_disks(pci_dev->bus);
+ break;
+ case 2:
+ pci_unplug_nics(pci_dev->bus);
+ break;
+ default:
+ log_writeb(s, (uint32_t)val);
+ break;
+ }
 break;
 default:
 break;
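For reference, the unplug writes that xen_platform_ioport_writeb() now accepts look like this from the guest side; a rough sketch only, assuming ioaddr holds the base of the xen-platform device's PCI I/O BAR and Linux-style port-I/O helpers:

    #include <sys/io.h>

    /* pre-xen-3.3 Xenlinux/xen-kmp, or VMDP driving both disk and LAN:
     * one unconditional write to offset 4 unplugs disks and NICs */
    static void unplug_disks_and_nics(unsigned short ioaddr)
    {
        outl(1, ioaddr + 4);
    }

    /* VMDP driving a single device class: offset 8, val 1 = disks, 2 = NICs */
    static void unplug_one_class(unsigned short ioaddr, int nics)
    {
        outb(nics ? 2 : 1, ioaddr + 8);
    }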
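Taken together, the interrupt-remapping pieces of this diff (vtd_decide_config(), the q35 max_cpus bump to 288, and the apic_id_limit check in pc_machine_done()) mean that a guest with APIC IDs above 255 needs an invocation along these lines; illustrative only, assuming a host kernel with the KVM X2APIC_API (v4.7 or newer):

    qemu-system-x86_64 -machine q35,accel=kvm,kernel-irqchip=split \
        -smp 288 \
        -device intel-iommu,intremap=on,eim=on \
        ...

With kernel-irqchip=split and intremap=on, vtd_decide_config() resolves eim=auto to on anyway; spelling it out on the command line simply makes the requirement explicit.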