arch: x86_64: Add 5th level of paging when needed

For correctness, when CPUID reports the LA57 feature, the VMM sets the
CR4.LA57 bit, which means a fifth level of page tables might be
needed. Even if it's not needed because the kernel should not use
addresses over 1GiB, it's better to define this new level anyway.

This patch only applies to the Linux boot codepath, which means it
affects both vmlinux without PVH and bzImage binaries, although bzImage
does not need this since the page tables and the CR4 register are set
up by the kernel's decompression code.

For vmlinux with PVH, if we follow the PVH specification, the kernel
is responsible for setting things up, but the implementation is
missing. For now, this means that PVH does not support LA57 with 5
levels of paging.

Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
This commit is contained in:
Sebastien Boeuf 2020-06-10 19:03:10 +02:00 committed by Samuel Ortiz
parent abd6204d27
commit bf37ebdcb6
2 changed files with 32 additions and 13 deletions

View File

@ -47,9 +47,10 @@ pub const BOOT_STACK_START: GuestAddress = GuestAddress(0x8000);
pub const BOOT_STACK_POINTER: GuestAddress = GuestAddress(0x8ff0);
// Initial pagetables.
pub const PML4_START: GuestAddress = GuestAddress(0x9000);
pub const PDPTE_START: GuestAddress = GuestAddress(0xa000);
pub const PDE_START: GuestAddress = GuestAddress(0xb000);
pub const PML5_START: GuestAddress = GuestAddress(0x9000);
pub const PML4_START: GuestAddress = GuestAddress(0xa000);
pub const PDPTE_START: GuestAddress = GuestAddress(0xb000);
pub const PDE_START: GuestAddress = GuestAddress(0xc000);
/// Kernel command line start address.
pub const CMDLINE_START: GuestAddress = GuestAddress(0x20000);

View File

@ -14,7 +14,9 @@ use super::BootProtocol;
use arch_gen::x86::msr_index;
use kvm_bindings::{kvm_fpu, kvm_msr_entry, kvm_regs, kvm_sregs, Msrs};
use kvm_ioctls::VcpuFd;
use layout::{BOOT_GDT_START, BOOT_IDT_START, PDE_START, PDPTE_START, PML4_START, PVH_INFO_START};
use layout::{
BOOT_GDT_START, BOOT_IDT_START, PDE_START, PDPTE_START, PML4_START, PML5_START, PVH_INFO_START,
};
use vm_memory::{Address, Bytes, GuestMemory, GuestMemoryError, GuestMemoryMmap};
// MTRR constants
@ -45,6 +47,8 @@ pub enum Error {
WritePDEAddress(GuestMemoryError),
/// Writing PML4 to RAM failed.
WritePML4Address(GuestMemoryError),
/// Writing PML5 to RAM failed.
WritePML5Address(GuestMemoryError),
}
pub type Result<T> = result::Result<T, Error>;
@ -221,7 +225,18 @@ fn configure_segments_and_sregs(
}
fn setup_page_tables(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()> {
// Puts PML4 right after zero page but aligned to 4k.
// Puts PML5 or PML4 right after zero page but aligned to 4k.
if unsafe { std::arch::x86_64::__cpuid(7).ecx } & (1 << 16) != 0 {
// Entry covering VA [0..256TB)
mem.write_obj(PML4_START.raw_value() | 0x03, PML5_START)
.map_err(Error::WritePML5Address)?;
sregs.cr3 = PML5_START.raw_value();
sregs.cr4 |= X86_CR4_LA57;
} else {
sregs.cr3 = PML4_START.raw_value();
}
// Entry covering VA [0..512GB)
mem.write_obj(PDPTE_START.raw_value() | 0x03, PML4_START)
@ -230,6 +245,7 @@ fn setup_page_tables(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()>
// Entry covering VA [0..1GB)
mem.write_obj(PDE_START.raw_value() | 0x03, PDPTE_START)
.map_err(Error::WritePDPTEAddress)?;
// 512 2MB entries together covering VA [0..1GB). Note we are assuming
// CPU supports 2MB pages (/proc/cpuinfo has 'pse'). All modern CPUs do.
for i in 0..512 {
@ -237,14 +253,9 @@ fn setup_page_tables(mem: &GuestMemoryMmap, sregs: &mut kvm_sregs) -> Result<()>
.map_err(Error::WritePDEAddress)?;
}
sregs.cr3 = PML4_START.raw_value();
sregs.cr4 |= X86_CR4_PAE;
sregs.cr0 |= X86_CR0_PG;
if unsafe { std::arch::x86_64::__cpuid(7).ecx } & (1 << 16) != 0 {
sregs.cr4 |= X86_CR4_LA57;
}
Ok(())
}
@ -374,8 +385,11 @@ mod tests {
let gm = create_guest_mem();
setup_page_tables(&gm, &mut sregs).unwrap();
assert_eq!(0xa003, read_u64(&gm, PML4_START));
assert_eq!(0xb003, read_u64(&gm, PDPTE_START));
if unsafe { std::arch::x86_64::__cpuid(7).ecx } & (1 << 16) != 0 {
assert_eq!(0xa003, read_u64(&gm, PML5_START));
}
assert_eq!(0xb003, read_u64(&gm, PML4_START));
assert_eq!(0xc003, read_u64(&gm, PDPTE_START));
for i in 0..512 {
assert_eq!(
(i << 21) + 0x83u64,
@ -383,7 +397,11 @@ mod tests {
);
}
if unsafe { std::arch::x86_64::__cpuid(7).ecx } & (1 << 16) != 0 {
assert_eq!(PML5_START.raw_value(), sregs.cr3);
} else {
assert_eq!(PML4_START.raw_value(), sregs.cr3);
}
assert_eq!(X86_CR4_PAE, sregs.cr4);
assert_eq!(X86_CR0_PG, sregs.cr0);
}