
Commit 49a6cc3

tranji-cloudgvisor-bot authored and committed
kvm: pagetables: Enable 5-level paging
This change enables 5-level paging by adding support for the P4D table level. During PageTables.InitArch(), we detect the CPU's capabilities and set a flag that determines the paging mode. iterateRangeCanonical() is updated to handle the P4D level. We avoid a separate walkP4D function because the resulting call chain would be too deep for the nosplit attribute; instead, the P4D-level logic is wrapped in an if/else statement for clarity.

PiperOrigin-RevId: 800161142
1 parent a6229c3 commit 49a6cc3
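For context on the detection step the message describes: the check boils down to asking CPUID whether the LA57 feature is present. Below is a minimal standalone sketch that reuses the same gvisor.dev/gvisor/pkg/cpuid calls the diffs below use; the program itself is illustrative and not part of the commit.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/cpuid"
)

// Illustrative only: report which paging mode the pagetables code in this
// commit would select on the current host.
func main() {
	cpuid.Initialize()
	fs := cpuid.HostFeatureSet()
	if fs.HasFeature(cpuid.X86FeatureLA57) {
		fmt.Println("LA57 present: 5-level paging, 57-bit virtual addresses")
	} else {
		fmt.Println("LA57 absent: 4-level paging, 48-bit virtual addresses")
	}
}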

File tree

9 files changed: +181 −56 lines


pkg/ring0/kernel_amd64.go

Lines changed: 3 additions & 0 deletions
@@ -205,6 +205,9 @@ func (c *CPU) CR4() uint64 {
 	if hasUMIP {
 		cr4 |= _CR4_UMIP
 	}
+	if hasLA57 {
+		cr4 |= _CR4_LA57
+	}
 	return cr4
 }

pkg/ring0/lib_amd64.go

Lines changed: 2 additions & 7 deletions
@@ -88,6 +88,7 @@ var (
 	hasXSAVEOPT bool
 	hasXSAVE bool
 	hasFSGSBASE bool
+	hasLA57 bool
 	validXCR0Mask uintptr
 	localXCR0 uintptr
 )
@@ -100,19 +101,12 @@
 func Init(fs cpuid.FeatureSet) {
 	// Initialize all sizes.
 	VirtualAddressBits = uintptr(fs.VirtualAddressBits())
-	// TODO(gvisor.dev/issue/7349): introduce support for 5-level paging.
-	// Four-level page tables allows to address up to 48-bit virtual
-	// addresses.
-	if VirtualAddressBits > 48 {
-		VirtualAddressBits = 48
-	}
 	if PhysicalAddressBits == 0 {
 		PhysicalAddressBits = uintptr(fs.PhysicalAddressBits())
 	}
 	UserspaceSize = uintptr(1) << (VirtualAddressBits - 1)
 	MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(hostarch.PageSize-1)
 	KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
-
 	// Initialize all functions.
 	hasSMEP = fs.HasFeature(cpuid.X86FeatureSMEP)
 	hasSMAP = fs.HasFeature(cpuid.X86FeatureSMAP)
@@ -121,6 +115,7 @@ func Init(fs cpuid.FeatureSet) {
 	hasXSAVEOPT = fs.UseXsaveopt()
 	hasXSAVE = fs.UseXsave()
 	hasFSGSBASE = fs.HasFeature(cpuid.X86FeatureFSGSBase)
+	hasLA57 = fs.HasFeature(cpuid.X86FeatureLA57)
 	validXCR0Mask = uintptr(fs.ValidXCR0Mask())
 	if hasXSAVE {
 		XCR0DisabledMask := uintptr((1 << 9) | (1 << 17) | (1 << 18))

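With the 48-bit clamp removed above, UserspaceSize and MaximumUserAddress now scale with whatever VirtualAddressBits the CPU reports. A rough standalone sketch of that arithmetic follows; it assumes a 4 KiB page size (what hostarch.PageSize is on amd64), and the helper name is made up for illustration.

// userspaceLayout mirrors the two computations above for a given number of
// virtual address bits; it is an illustration, not code from this change.
func userspaceLayout(virtualAddressBits uint) (userspaceSize, maxUserAddr uintptr) {
	const pageSize = 1 << 12 // assumed: hostarch.PageSize on amd64
	userspaceSize = uintptr(1) << (virtualAddressBits - 1)
	maxUserAddr = (userspaceSize - 1) &^ uintptr(pageSize-1)
	return
}

// userspaceLayout(48) => 0x0000800000000000, 0x00007ffffffff000 (4-level paging)
// userspaceLayout(57) => 0x0100000000000000, 0x00fffffffffff000 (5-level paging)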
pkg/ring0/pagetables/BUILD

Lines changed: 5 additions & 1 deletion
@@ -71,6 +71,7 @@ go_library(
         "//pkg/sentry/platform/kvm:__subpackages__",
     ],
     deps = [
+        "//pkg/cpuid",
         "//pkg/hostarch",
         "//pkg/sync",
     ],
@@ -87,5 +88,8 @@ go_test(
         ":walker_check_arm64",
     ],
     library = ":pagetables",
-    deps = ["//pkg/hostarch"],
+    deps = [
+        "//pkg/cpuid",
+        "//pkg/hostarch",
+    ],
 )

pkg/ring0/pagetables/pagetables.go

Lines changed: 4 additions & 0 deletions
@@ -56,6 +56,10 @@ type PageTables struct {
 	// readOnlyShared indicates the Pagetables are read-only and
 	// own the ranges that are shared with other Pagetables.
 	readOnlyShared bool
+
+	// largeAddressesEnabled indicates the Pagetables support addresses
+	// larger than 48 bits.
+	largeAddressesEnabled bool
 }
 
 // Init initializes a set of PageTables.

pkg/ring0/pagetables/pagetables_amd64.go

Lines changed: 27 additions & 11 deletions
@@ -14,28 +14,34 @@
 
 package pagetables
 
+import (
+	"gvisor.dev/gvisor/pkg/cpuid"
+)
+
 // Address constraints.
-//
-// The lowerTop and upperBottom currently apply to four-level pagetables;
-// additional refactoring would be necessary to support five-level pagetables.
-const (
-	lowerTop = 0x00007fffffffffff
-	upperBottom = 0xffff800000000000
+var (
+	lowerTop uintptr = 0x00007fffffffffff
+	upperBottom uintptr = 0xffff800000000000
+	pgdShift = 39
+	pgdMask uintptr = 0x1ff << pgdShift
+	pgdSize uintptr = 1 << pgdShift
+)
 
+const (
 	pteShift = 12
 	pmdShift = 21
 	pudShift = 30
-	pgdShift = 39
+	p4dShift = 39
 
 	pteMask = 0x1ff << pteShift
 	pmdMask = 0x1ff << pmdShift
 	pudMask = 0x1ff << pudShift
-	pgdMask = 0x1ff << pgdShift
+	p4dMask = 0x1ff << p4dShift
 
 	pteSize = 1 << pteShift
 	pmdSize = 1 << pmdShift
 	pudSize = 1 << pudShift
-	pgdSize = 1 << pgdShift
+	p4dSize = 1 << p4dShift
 
 	executeDisable = 1 << 63
 	entriesPerPage = 512
@@ -47,6 +53,16 @@ const (
 //
 //go:nosplit
 func (p *PageTables) InitArch(allocator Allocator) {
+	featureSet := cpuid.HostFeatureSet()
+	if featureSet.HasFeature(cpuid.X86FeatureLA57) {
+		p.largeAddressesEnabled = true
+		lowerTop = 0x00FFFFFFFFFFFFFF
+		upperBottom = 0xFF00000000000000
+		pgdShift = 48
+		pgdMask = 0x1ff << pgdShift
+		pgdSize = 1 << pgdShift
+	}
+
 	if p.upperSharedPageTables != nil {
 		p.cloneUpperShared()
 	}
@@ -58,10 +74,10 @@ func pgdIndex(upperStart uintptr) uintptr {
 		panic("upperStart should be pgd size aligned")
 	}
 	if upperStart >= upperBottom {
-		return entriesPerPage/2 + (upperStart-upperBottom)/pgdSize
+		return entriesPerPage/2 + (upperStart-upperBottom)>>pgdShift
 	}
 	if upperStart < lowerTop {
-		return upperStart / pgdSize
+		return upperStart >> pgdShift
 	}
 	panic("upperStart should be in canonical range")
 }

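With the constants above, the PGD index moves up to bits 48..56 when LA57 is active and the old 39-bit slot becomes the P4D index. A hypothetical helper (not in the diff) showing how a virtual address decomposes into per-level indices under 5-level paging:

// splitAddress5Level is an illustrative decomposition of a virtual address
// into per-level table indices; each level selects one of 512 entries.
func splitAddress5Level(addr uintptr) (pgd, p4d, pud, pmd, pte uintptr) {
	pgd = (addr >> 48) & 0x1ff // PGD index: bits 48..56 (pgdShift with LA57)
	p4d = (addr >> 39) & 0x1ff // P4D index: bits 39..47 (p4dShift)
	pud = (addr >> 30) & 0x1ff // PUD index: bits 30..38 (pudShift)
	pmd = (addr >> 21) & 0x1ff // PMD index: bits 21..29 (pmdShift)
	pte = (addr >> 12) & 0x1ff // PTE index: bits 12..20 (pteShift)
	return
}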
pkg/ring0/pagetables/pagetables_amd64_test.go

Lines changed: 55 additions & 17 deletions
@@ -20,62 +20,100 @@ package pagetables
 import (
 	"testing"
 
+	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/hostarch"
 )
 
-func Test2MAnd4K(t *testing.T) {
-	pt := New(NewRuntimeAllocator())
+var (
+	lowerTopAligned uintptr = 0x00007f0000000000
+	pt *PageTables
+)
+
+func getLargeAddressesEnabled() bool {
+	featureSet := cpuid.HostFeatureSet()
+	return featureSet.HasFeature(cpuid.X86FeatureLA57)
+}
+
+func getLowerTopAligned() uintptr {
+	if getLargeAddressesEnabled() {
+		return 0x00FF000000000000
+	}
+	return lowerTopAligned
+}
+
+func InitTest() {
+	cpuid.Initialize()
+	pt = New(NewRuntimeAllocator())
+	pt.InitArch(NewRuntimeAllocator())
+}
+
+func TestLargeAddresses(t *testing.T) {
+	InitTest()
+	if !getLargeAddressesEnabled() {
+		t.Skip("Large addresses are not supported on this platform")
+	}
+	pt.Map(hostarch.Addr(1<<50), pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
+	pt.Map(hostarch.Addr(1<<54), pmdSize, MapOpts{AccessType: hostarch.Read}, pmdSize*42)
+
+	checkMappings(t, pt, []mapping{
+		{uintptr(1 << 50), pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
+		{uintptr(1 << 54), pmdSize, pmdSize * 42, MapOpts{AccessType: hostarch.Read}},
+	})
+}
 
+func Test2MAnd4K(t *testing.T) {
+	InitTest()
 	// Map a small page and a huge page.
 	pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
-	pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: hostarch.Read}, pmdSize*47)
+	pt.Map(hostarch.Addr(getLowerTopAligned()), pmdSize, MapOpts{AccessType: hostarch.Read}, pmdSize*47)
 
 	checkMappings(t, pt, []mapping{
 		{0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
-		{0x00007f0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: hostarch.Read}},
+		{getLowerTopAligned(), pmdSize, pmdSize * 47, MapOpts{AccessType: hostarch.Read}},
 	})
 }
 
 func Test1GAnd4K(t *testing.T) {
-	pt := New(NewRuntimeAllocator())
+	InitTest()
 
 	// Map a small page and a super page.
 	pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
-	pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: hostarch.Read}, pudSize*47)
+	pt.Map(hostarch.Addr(getLowerTopAligned()), pudSize, MapOpts{AccessType: hostarch.Read}, pudSize*47)
 
 	checkMappings(t, pt, []mapping{
 		{0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
-		{0x00007f0000000000, pudSize, pudSize * 47, MapOpts{AccessType: hostarch.Read}},
+		{getLowerTopAligned(), pudSize, pudSize * 47, MapOpts{AccessType: hostarch.Read}},
 	})
 }
 
 func TestSplit1GPage(t *testing.T) {
-	pt := New(NewRuntimeAllocator())
+	InitTest()
 
 	// Map a super page and knock out the middle.
-	pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: hostarch.Read}, pudSize*42)
-	pt.Unmap(hostarch.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize))
+	pt.Map(hostarch.Addr(getLowerTopAligned()), pudSize, MapOpts{AccessType: hostarch.Read}, pudSize*42)
+	pt.Unmap(hostarch.Addr(getLowerTopAligned()+pteSize), pudSize-(2*pteSize))
 
 	checkMappings(t, pt, []mapping{
-		{0x00007f0000000000, pteSize, pudSize * 42, MapOpts{AccessType: hostarch.Read}},
-		{0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: hostarch.Read}},
+		{getLowerTopAligned(), pteSize, pudSize * 42, MapOpts{AccessType: hostarch.Read}},
+		{getLowerTopAligned() + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: hostarch.Read}},
 	})
 }
 
 func TestSplit2MPage(t *testing.T) {
-	pt := New(NewRuntimeAllocator())
+	InitTest()
 
 	// Map a huge page and knock out the middle.
-	pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: hostarch.Read}, pmdSize*42)
-	pt.Unmap(hostarch.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize))
+	pt.Map(hostarch.Addr(getLowerTopAligned()), pmdSize, MapOpts{AccessType: hostarch.Read}, pmdSize*42)
+	pt.Unmap(hostarch.Addr(getLowerTopAligned()+pteSize), pmdSize-(2*pteSize))
 
 	checkMappings(t, pt, []mapping{
-		{0x00007f0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: hostarch.Read}},
-		{0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: hostarch.Read}},
+		{getLowerTopAligned(), pteSize, pmdSize * 42, MapOpts{AccessType: hostarch.Read}},
+		{getLowerTopAligned() + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: hostarch.Read}},
 	})
 }
 
 func TestNumMemoryTypes(t *testing.T) {
+	InitTest()
 	// The PAT accommodates up to 8 entries. However, PTE.Set() currently
 	// assumes that NumMemoryTypes <= 4, since the location of the most
 	// significant bit of the PAT index in page table entries varies depending

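Worth noting about TestLargeAddresses above: 1<<50 and 1<<54 both lie beyond what 4-level paging can map, which is why the test skips itself when LA57 is absent. A small illustrative check (the helper is hypothetical, not part of the test file):

// aboveFourLevelLimit reports whether addr cannot be mapped with 4-level
// paging, whose lower half tops out at 0x00007fffffffffff (just under 1<<47).
func aboveFourLevelLimit(addr uintptr) bool {
	const fourLevelLowerTop = 0x00007fffffffffff
	return addr > fourLevelLowerTop
}

// aboveFourLevelLimit(1<<50) == true (0x0004000000000000)
// aboveFourLevelLimit(1<<54) == true (0x0040000000000000)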
pkg/ring0/pagetables/walker_amd64.go

Lines changed: 74 additions & 19 deletions
@@ -262,30 +262,85 @@ func (w *Walker) iterateRangeCanonical(start, end uintptr) bool {
 		nextBoundary := addrEnd(start, end, pgdSize)
 		pgdIndex := uint16((start & pgdMask) >> pgdShift)
 		pgdEntry := &w.pageTables.root[pgdIndex]
-		if !pgdEntry.Valid() {
-			if !w.visitor.requiresAlloc() {
-				// Skip over this entry.
-				start = nextBoundary
-				continue
+		if !w.pageTables.largeAddressesEnabled {
+			if !pgdEntry.Valid() {
+				if !w.visitor.requiresAlloc() {
+					// Skip over this entry.
+					start = nextBoundary
+					continue
+				}
+
+				// Allocate a new pgd.
+				pudEntries = w.pageTables.Allocator.NewPTEs() // escapes: depends on allocator.
+				pgdEntry.setPageTable(w.pageTables, pudEntries)
+			} else {
+				pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) // escapes: see above.
+			}
+			// Map the next level.
+			ok, clearPUDEntries := w.walkPUDs(pudEntries, start, nextBoundary)
+			if !ok {
+				return false
 			}
 
-			// Allocate a new pgd.
-			pudEntries = w.pageTables.Allocator.NewPTEs() // escapes: depends on allocator.
-			pgdEntry.setPageTable(w.pageTables, pudEntries)
+			// Check if we no longer need this page table.
+			if clearPUDEntries == entriesPerPage {
+				pgdEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pudEntries) // escapes: see above.
+			}
 		} else {
-			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) // escapes: see above.
-		}
+			var p4dEntries *PTEs
+			if !pgdEntry.Valid() {
+				if !w.visitor.requiresAlloc() {
+					// Skip over this entry.
+					start = nextBoundary
+					continue
+				}
 
-		// Map the next level.
-		ok, clearPUDEntries := w.walkPUDs(pudEntries, start, nextBoundary)
-		if !ok {
-			return false
-		}
+				// Allocate a new pgd.
+				p4dEntries = w.pageTables.Allocator.NewPTEs() // escapes: depends on allocator.
+				pgdEntry.setPageTable(w.pageTables, p4dEntries)
+			} else {
+				p4dEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) // escapes: see above.
+			}
+			var clearP4DEntries uint16 = 0
+			p4dStart := start
+			p4dEnd := nextBoundary
+			for p4dStart < p4dEnd {
+				nextP4DBoundary := addrEnd(p4dStart, p4dEnd, p4dSize)
+				p4dIndex := uint16((p4dStart & p4dMask) >> p4dShift)
+				p4dEntry := &p4dEntries[p4dIndex]
+				if !p4dEntry.Valid() {
+					if !w.visitor.requiresAlloc() {
+						// Skip over this entry.
+						clearP4DEntries++
+						p4dStart = nextP4DBoundary
+						continue
+					}
+					// Allocate a new pud.
+					pudEntries = w.pageTables.Allocator.NewPTEs() // escapes: depends on allocator.
+					p4dEntry.setPageTable(w.pageTables, pudEntries)
+				} else {
+					pudEntries = w.pageTables.Allocator.LookupPTEs(p4dEntry.Address()) // escapes: see above.
+				}
+
+				ok, clearPUDEntries := w.walkPUDs(pudEntries, p4dStart, nextP4DBoundary)
+				if !ok {
+					return false
+				}
+				if clearPUDEntries == entriesPerPage {
+					p4dEntry.Clear()
+					w.pageTables.Allocator.FreePTEs(pudEntries) // escapes: see above.
+					clearP4DEntries++
+				}
 
-		// Check if we no longer need this page table.
-		if clearPUDEntries == entriesPerPage {
-			pgdEntry.Clear()
-			w.pageTables.Allocator.FreePTEs(pudEntries) // escapes: see above.
+				p4dStart = nextP4DBoundary
+			}
+
+			// Check if we no longer need this page table.
+			if clearP4DEntries == entriesPerPage {
+				pgdEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(p4dEntries) // escapes: see above.
+			}
 		}
 
 		// Advance to the next PGD entry's range for the next loop.

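Stripped of allocation, visitor callbacks, and cleanup, the LA57 branch above adds one level of iteration between the PGD entry and the existing PUD walk; inlining it (rather than introducing a walkP4D function) keeps the nosplit call chain shallow. A simplified, self-contained sketch of just that control flow, with stand-in helpers rather than the walker's real ones:

const (
	p4dShiftSketch = 39 // each P4D entry covers 512 GiB
	pgdShiftSketch = 48 // each PGD entry covers 256 TiB when LA57 is active
)

// nextBoundarySketch advances to the next size-aligned boundary, clamped to
// end; it approximates what the walker's addrEnd helper is used for here.
func nextBoundarySketch(start, end, size uintptr) uintptr {
	if next := (start + size) &^ (size - 1); next < end {
		return next
	}
	return end
}

// walk5LevelSketch visits every P4D-sized sub-range of [start, end), grouped
// by PGD entry; visitPUDRange stands in for the real walkPUDs call.
func walk5LevelSketch(start, end uintptr, visitPUDRange func(start, end uintptr)) {
	for start < end {
		pgdBoundary := nextBoundarySketch(start, end, 1<<pgdShiftSketch)
		for p4dStart := start; p4dStart < pgdBoundary; {
			p4dBoundary := nextBoundarySketch(p4dStart, pgdBoundary, 1<<p4dShiftSketch)
			visitPUDRange(p4dStart, p4dBoundary)
			p4dStart = p4dBoundary
		}
		start = pgdBoundary
	}
}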
pkg/ring0/x86.go

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ const (
 	_CR4_OSFXSR = 1 << 9
 	_CR4_OSXMMEXCPT = 1 << 10
 	_CR4_UMIP = 1 << 11
+	_CR4_LA57 = 1 << 12
 	_CR4_FSGSBASE = 1 << 16
 	_CR4_PCIDE = 1 << 17
 	_CR4_OSXSAVE = 1 << 18
