diff --git a/arch/src/x86_64/gdt.rs b/arch/src/x86_64/gdt.rs index 81a72682d..71ee6ca2d 100644 --- a/arch/src/x86_64/gdt.rs +++ b/arch/src/x86_64/gdt.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -24,8 +26,34 @@ fn get_base(entry: u64) -> u64 { | (((entry) & 0x00000000FFFF0000) >> 16) } +// Extract the segment limit from the GDT segment descriptor. +// +// In a segment descriptor, the limit field is 20 bits, so it can directly describe +// a range from 0 to 0xFFFFF (1MByte). When G flag is set (4-KByte page granularity) it +// scales the value in the limit field by a factor of 2^12 (4Kbytes), making the effective +// limit range from 0xFFF (4 KBytes) to 0xFFFF_FFFF (4 GBytes). +// +// However, the limit field in the VMCS definition is a 32 bit field, and the limit value is not +// automatically scaled using the G flag. This means that for a desired range of 4GB for a +// given segment, its limit must be specified as 0xFFFF_FFFF. Therefore the method of obtaining +// the limit from the GDT entry is not sufficient, since it only provides 20 bits when 32 bits +// are necessary. Fortunately, we can check if the G flag is set when extracting the limit since +// the full GDT entry is passed as an argument, and perform the scaling of the limit value to +// return the full 32 bit value. +// +// The scaling mentioned above is required when using PVH boot, since the guest boots in protected +// (32-bit) mode and must be able to access the entire 32-bit address space. It does not cause issues +// for the case of direct boot to 64-bit (long) mode, since in 64-bit mode the processor does not +// perform runtime limit checking on code or data segments. fn get_limit(entry: u64) -> u32 { - ((((entry) & 0x000F000000000000) >> 32) | ((entry) & 0x000000000000FFFF)) as u32 + let limit: u32 = + ((((entry) & 0x000F000000000000) >> 32) | ((entry) & 0x000000000000FFFF)) as u32; + + // Perform manual limit scaling if G flag is set + match get_g(entry) { + 0 => limit, + _ => ((limit << 12) | 0xFFF), // G flag is either 0 or 1 + } } fn get_g(entry: u64) -> u8 { @@ -109,7 +137,7 @@ mod tests { assert_eq!(0xB, seg.type_); // base and limit assert_eq!(0x100000, seg.base); - assert_eq!(0xfffff, seg.limit); + assert_eq!(0xffffffff, seg.limit); assert_eq!(0x0, seg.unusable); } } diff --git a/arch/src/x86_64/regs.rs b/arch/src/x86_64/regs.rs index c963f7533..ac5835fea 100644 --- a/arch/src/x86_64/regs.rs +++ b/arch/src/x86_64/regs.rs @@ -1,3 +1,5 @@ +// Copyright © 2020, Oracle and/or its affiliates. +// // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 // @@ -8,10 +10,11 @@ use std::{mem, result}; use super::gdt::{gdt_entry, kvm_segment_from_gdt}; +use super::BootProtocol; use arch_gen::x86::msr_index; use kvm_bindings::{kvm_fpu, kvm_msr_entry, kvm_regs, kvm_sregs, Msrs}; use kvm_ioctls::VcpuFd; -use layout::{BOOT_GDT_START, BOOT_IDT_START, PDE_START, PDPTE_START, PML4_START}; +use layout::{BOOT_GDT_START, BOOT_IDT_START, PDE_START, PDPTE_START, PML4_START, PVH_INFO_START}; use vm_memory::{Address, Bytes, GuestMemory, GuestMemoryError, GuestMemoryMmap}; // MTRR constants @@ -81,16 +84,31 @@ pub fn setup_msrs(vcpu: &VcpuFd) -> Result<()> { /// * `boot_ip` - Starting instruction pointer. /// * `boot_sp` - Starting stack pointer. /// * `boot_si` - Must point to zero page address per Linux ABI. -pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64, boot_sp: u64, boot_si: u64) -> Result<()> { - let regs: kvm_regs = kvm_regs { - rflags: 0x0000000000000002u64, - rip: boot_ip, - rsp: boot_sp, - rbp: boot_sp, - rsi: boot_si, - ..Default::default() +pub fn setup_regs( + vcpu: &VcpuFd, + boot_ip: u64, + boot_sp: u64, + boot_si: u64, + boot_prot: BootProtocol, +) -> Result<()> { + let regs: kvm_regs = match boot_prot { + // Configure regs as required by PVH boot protocol. + BootProtocol::PvhBoot => kvm_regs { + rflags: 0x0000000000000002u64, + rbx: PVH_INFO_START.raw_value(), + rip: boot_ip, + ..Default::default() + }, + // Configure regs as required by Linux 64-bit boot protocol. + BootProtocol::LinuxBoot => kvm_regs { + rflags: 0x0000000000000002u64, + rip: boot_ip, + rsp: boot_sp, + rbp: boot_sp, + rsi: boot_si, + ..Default::default() + }, }; - vcpu.set_regs(®s).map_err(Error::SetBaseRegisters) } @@ -100,11 +118,14 @@ pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64, boot_sp: u64, boot_si: u64) -> Re /// /// * `mem` - The memory that will be passed to the guest. /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. -pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &VcpuFd) -> Result<()> { +pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &VcpuFd, boot_prot: BootProtocol) -> Result<()> { let mut sregs: kvm_sregs = vcpu.get_sregs().map_err(Error::GetStatusRegisters)?; - configure_segments_and_sregs(mem, &mut sregs)?; - setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead? + configure_segments_and_sregs(mem, &mut sregs, boot_prot)?; + + if let BootProtocol::LinuxBoot = boot_prot { + setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead? + } vcpu.set_sregs(&sregs).map_err(Error::SetStatusRegisters) } @@ -136,13 +157,31 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<()> { .map_err(Error::WriteIDT) } -fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()> { - let gdt_table: [u64; BOOT_GDT_MAX as usize] = [ - gdt_entry(0, 0, 0), // NULL - gdt_entry(0xa09b, 0, 0xfffff), // CODE - gdt_entry(0xc093, 0, 0xfffff), // DATA - gdt_entry(0x808b, 0, 0xfffff), // TSS - ]; +fn configure_segments_and_sregs( + mem: &GuestMemoryMmap, + sregs: &mut kvm_sregs, + boot_prot: BootProtocol, +) -> Result<()> { + let gdt_table: [u64; BOOT_GDT_MAX as usize] = match boot_prot { + BootProtocol::PvhBoot => { + // Configure GDT entries as specified by PVH boot protocol + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xc09b, 0, 0xffffffff), // CODE + gdt_entry(0xc093, 0, 0xffffffff), // DATA + gdt_entry(0x008b, 0, 0x67), // TSS + ] + } + BootProtocol::LinuxBoot => { + // Configure GDT entries as specified by Linux 64bit boot protocol + [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ] + } + }; let code_seg = kvm_segment_from_gdt(gdt_table[1], 1); let data_seg = kvm_segment_from_gdt(gdt_table[2], 2); @@ -165,9 +204,17 @@ fn configure_segments_and_sregs(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> sregs.ss = data_seg; sregs.tr = tss_seg; - /* 64-bit protected mode */ - sregs.cr0 |= X86_CR0_PE; - sregs.efer |= EFER_LME | EFER_LMA; + match boot_prot { + BootProtocol::PvhBoot => { + sregs.cr0 = X86_CR0_PE; + sregs.cr4 = 0; + } + BootProtocol::LinuxBoot => { + /* 64-bit protected mode */ + sregs.cr0 |= X86_CR0_PE; + sregs.efer |= EFER_LME | EFER_LMA; + } + } Ok(()) } @@ -280,7 +327,7 @@ mod tests { fn segments_and_sregs() { let mut sregs: kvm_sregs = Default::default(); let gm = create_guest_mem(); - configure_segments_and_sregs(&gm, &mut sregs).unwrap(); + configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::LinuxBoot).unwrap(); assert_eq!(0x0, read_u64(&gm, BOOT_GDT_START)); assert_eq!( @@ -298,13 +345,13 @@ mod tests { assert_eq!(0x0, read_u64(&gm, BOOT_IDT_START)); assert_eq!(0, sregs.cs.base); - assert_eq!(0xfffff, sregs.ds.limit); + assert_eq!(0xffffffff, sregs.ds.limit); assert_eq!(0x10, sregs.es.selector); assert_eq!(1, sregs.fs.present); assert_eq!(1, sregs.gs.g); assert_eq!(0, sregs.ss.avl); assert_eq!(0, sregs.tr.base); - assert_eq!(0xfffff, sregs.tr.limit); + assert_eq!(0xffffffff, sregs.tr.limit); assert_eq!(0, sregs.tr.avl); assert_eq!(X86_CR0_PE, sregs.cr0); assert_eq!(EFER_LME | EFER_LMA, sregs.efer); @@ -398,6 +445,7 @@ mod tests { expected_regs.rip, expected_regs.rsp, expected_regs.rsi, + BootProtocol::LinuxBoot, ) .unwrap(); @@ -413,10 +461,10 @@ mod tests { let mut expected_sregs: kvm_sregs = vcpu.get_sregs().unwrap(); let gm = create_guest_mem(); - configure_segments_and_sregs(&gm, &mut expected_sregs).unwrap(); + configure_segments_and_sregs(&gm, &mut expected_sregs, BootProtocol::LinuxBoot).unwrap(); setup_page_tables(&gm, &mut expected_sregs).unwrap(); - setup_sregs(&gm, &vcpu).unwrap(); + setup_sregs(&gm, &vcpu, BootProtocol::LinuxBoot).unwrap(); let actual_sregs: kvm_sregs = vcpu.get_sregs().unwrap(); assert_eq!(expected_sregs, actual_sregs); } diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index d7fca698b..2a5707386 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -296,11 +296,16 @@ impl Vcpu { kernel_entry_point.entry_addr.raw_value(), arch::x86_64::layout::BOOT_STACK_POINTER.raw_value(), arch::x86_64::layout::ZERO_PAGE_START.raw_value(), + kernel_entry_point.protocol, ) .map_err(Error::REGSConfiguration)?; arch::x86_64::regs::setup_fpu(&self.fd).map_err(Error::FPUConfiguration)?; - arch::x86_64::regs::setup_sregs(&vm_memory.memory(), &self.fd) - .map_err(Error::SREGSConfiguration)?; + arch::x86_64::regs::setup_sregs( + &vm_memory.memory(), + &self.fd, + kernel_entry_point.protocol, + ) + .map_err(Error::SREGSConfiguration)?; } arch::x86_64::interrupts::set_lint(&self.fd).map_err(Error::LocalIntConfiguration)?; Ok(())