1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * AMD SVM-SEV Host Support.
4 *
5 * Copyright (C) 2023 Advanced Micro Devices, Inc.
6 *
7 * Author: Ashish Kalra <[email protected]>
8 *
9 */
10
11 #include <linux/cc_platform.h>
12 #include <linux/printk.h>
13 #include <linux/mm_types.h>
14 #include <linux/set_memory.h>
15 #include <linux/memblock.h>
16 #include <linux/kernel.h>
17 #include <linux/mm.h>
18 #include <linux/cpumask.h>
19 #include <linux/iommu.h>
20 #include <linux/amd-iommu.h>
21 #include <linux/nospec.h>
22
23 #include <asm/sev.h>
24 #include <asm/processor.h>
25 #include <asm/setup.h>
26 #include <asm/svm.h>
27 #include <asm/smp.h>
28 #include <asm/cpu.h>
29 #include <asm/apic.h>
30 #include <asm/cpuid.h>
31 #include <asm/cmdline.h>
32 #include <asm/iommu.h>
33
34 /*
35 * The RMP entry information as returned by the RMPREAD instruction.
36 */
37 struct rmpentry {
38 u64 gpa;
39 u8 assigned :1,
40 rsvd1 :7;
41 u8 pagesize :1,
42 hpage_region_status :1,
43 rsvd2 :6;
44 u8 immutable :1,
45 rsvd3 :7;
46 u8 rsvd4;
47 u32 asid;
48 } __packed;
49
50 /*
51 * The raw RMP entry format is not architectural. The format is defined in PPR
52 * Family 19h Model 01h, Rev B1 processor. This format represents the actual
53 * entry in the RMP table memory. The bitfield definitions are used for machines
54 * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo"
55 * fields are only used for dumping the raw data.
56 */
57 struct rmpentry_raw {
58 union {
59 struct {
60 u64 assigned : 1,
61 pagesize : 1,
62 immutable : 1,
63 rsvd1 : 9,
64 gpa : 39,
65 asid : 10,
66 vmsa : 1,
67 validated : 1,
68 rsvd2 : 1;
69 };
70 u64 lo;
71 };
72 u64 hi;
73 } __packed;
74
75 /*
76 * The first 16KB from the RMP_BASE is used by the processor for the
77 * bookkeeping, the range needs to be added during the RMP entry lookup.
78 */
79 #define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000
80
81 /*
82 * For a non-segmented RMP table, use the maximum physical addressing as the
83 * segment size in order to always arrive at index 0 in the table.
84 */
85 #define RMPTABLE_NON_SEGMENTED_SHIFT 52
86
87 struct rmp_segment_desc {
88 struct rmpentry_raw *rmp_entry;
89 u64 max_index;
90 u64 size;
91 };
92
93 /*
94 * Segmented RMP Table support.
95 * - The segment size is used for two purposes:
96 * - Identify the amount of memory covered by an RMP segment
97 * - Quickly locate an RMP segment table entry for a physical address
98 *
99 * - The RMP segment table contains pointers to an RMP table that covers
100 * a specific portion of memory. There can be up to 512 8-byte entries,
101 * one pages worth.
102 */
103 #define RST_ENTRY_MAPPED_SIZE(x) ((x) & GENMASK_ULL(19, 0))
104 #define RST_ENTRY_SEGMENT_BASE(x) ((x) & GENMASK_ULL(51, 20))
105
106 #define RST_SIZE SZ_4K
107 static struct rmp_segment_desc **rmp_segment_table __ro_after_init;
108 static unsigned int rst_max_index __ro_after_init = 512;
109
110 static unsigned int rmp_segment_shift;
111 static u64 rmp_segment_size;
112 static u64 rmp_segment_mask;
113
114 #define RST_ENTRY_INDEX(x) ((x) >> rmp_segment_shift)
115 #define RMP_ENTRY_INDEX(x) ((u64)(PHYS_PFN((x) & rmp_segment_mask)))
116
117 static u64 rmp_cfg;
118
119 /* Mask to apply to a PFN to get the first PFN of a 2MB page */
120 #define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
121
122 static u64 probed_rmp_base, probed_rmp_size;
123
124 static LIST_HEAD(snp_leaked_pages_list);
125 static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
126
127 static unsigned long snp_nr_leaked_pages;
128
129 #undef pr_fmt
130 #define pr_fmt(fmt) "SEV-SNP: " fmt
131
__mfd_enable(unsigned int cpu)132 static int __mfd_enable(unsigned int cpu)
133 {
134 u64 val;
135
136 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
137 return 0;
138
139 rdmsrl(MSR_AMD64_SYSCFG, val);
140
141 val |= MSR_AMD64_SYSCFG_MFDM;
142
143 wrmsrl(MSR_AMD64_SYSCFG, val);
144
145 return 0;
146 }
147
mfd_enable(void * arg)148 static __init void mfd_enable(void *arg)
149 {
150 __mfd_enable(smp_processor_id());
151 }
152
__snp_enable(unsigned int cpu)153 static int __snp_enable(unsigned int cpu)
154 {
155 u64 val;
156
157 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
158 return 0;
159
160 rdmsrl(MSR_AMD64_SYSCFG, val);
161
162 val |= MSR_AMD64_SYSCFG_SNP_EN;
163 val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
164
165 wrmsrl(MSR_AMD64_SYSCFG, val);
166
167 return 0;
168 }
169
snp_enable(void * arg)170 static __init void snp_enable(void *arg)
171 {
172 __snp_enable(smp_processor_id());
173 }
174
__snp_fixup_e820_tables(u64 pa)175 static void __init __snp_fixup_e820_tables(u64 pa)
176 {
177 if (IS_ALIGNED(pa, PMD_SIZE))
178 return;
179
180 /*
181 * Handle cases where the RMP table placement by the BIOS is not
182 * 2M aligned and the kexec kernel could try to allocate
183 * from within that chunk which then causes a fatal RMP fault.
184 *
185 * The e820_table needs to be updated as it is converted to
186 * kernel memory resources and used by KEXEC_FILE_LOAD syscall
187 * to load kexec segments.
188 *
189 * The e820_table_firmware needs to be updated as it is exposed
190 * to sysfs and used by the KEXEC_LOAD syscall to load kexec
191 * segments.
192 *
193 * The e820_table_kexec needs to be updated as it passed to
194 * the kexec-ed kernel.
195 */
196 pa = ALIGN_DOWN(pa, PMD_SIZE);
197 if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
198 pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
199 e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
200 e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
201 e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
202 if (!memblock_is_region_reserved(pa, PMD_SIZE))
203 memblock_reserve(pa, PMD_SIZE);
204 }
205 }
206
fixup_e820_tables_for_segmented_rmp(void)207 static void __init fixup_e820_tables_for_segmented_rmp(void)
208 {
209 u64 pa, *rst, size, mapped_size;
210 unsigned int i;
211
212 __snp_fixup_e820_tables(probed_rmp_base);
213
214 pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
215
216 __snp_fixup_e820_tables(pa + RST_SIZE);
217
218 rst = early_memremap(pa, RST_SIZE);
219 if (!rst)
220 return;
221
222 for (i = 0; i < rst_max_index; i++) {
223 pa = RST_ENTRY_SEGMENT_BASE(rst[i]);
224 mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
225 if (!mapped_size)
226 continue;
227
228 __snp_fixup_e820_tables(pa);
229
230 /*
231 * Mapped size in GB. Mapped size is allowed to exceed
232 * the segment coverage size, but gets reduced to the
233 * segment coverage size.
234 */
235 mapped_size <<= 30;
236 if (mapped_size > rmp_segment_size)
237 mapped_size = rmp_segment_size;
238
239 /* Calculate the RMP segment size (16 bytes/page mapped) */
240 size = PHYS_PFN(mapped_size) << 4;
241
242 __snp_fixup_e820_tables(pa + size);
243 }
244
245 early_memunmap(rst, RST_SIZE);
246 }
247
fixup_e820_tables_for_contiguous_rmp(void)248 static void __init fixup_e820_tables_for_contiguous_rmp(void)
249 {
250 __snp_fixup_e820_tables(probed_rmp_base);
251 __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
252 }
253
snp_fixup_e820_tables(void)254 void __init snp_fixup_e820_tables(void)
255 {
256 if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
257 fixup_e820_tables_for_segmented_rmp();
258 } else {
259 fixup_e820_tables_for_contiguous_rmp();
260 }
261 }
262
clear_rmptable_bookkeeping(void)263 static bool __init clear_rmptable_bookkeeping(void)
264 {
265 void *bk;
266
267 bk = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB);
268 if (!bk) {
269 pr_err("Failed to map RMP bookkeeping area\n");
270 return false;
271 }
272
273 memset(bk, 0, RMPTABLE_CPU_BOOKKEEPING_SZ);
274
275 memunmap(bk);
276
277 return true;
278 }
279
alloc_rmp_segment_desc(u64 segment_pa,u64 segment_size,u64 pa)280 static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa)
281 {
282 u64 rst_index, rmp_segment_size_max;
283 struct rmp_segment_desc *desc;
284 void *rmp_segment;
285
286 /* Calculate the maximum size an RMP can be (16 bytes/page mapped) */
287 rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4;
288
289 /* Validate the RMP segment size */
290 if (segment_size > rmp_segment_size_max) {
291 pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n",
292 segment_size, rmp_segment_size_max);
293 return false;
294 }
295
296 /* Validate the RMP segment table index */
297 rst_index = RST_ENTRY_INDEX(pa);
298 if (rst_index >= rst_max_index) {
299 pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n",
300 pa, rmp_segment_size);
301 return false;
302 }
303
304 if (rmp_segment_table[rst_index]) {
305 pr_err("RMP segment descriptor already exists at index %llu\n", rst_index);
306 return false;
307 }
308
309 rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB);
310 if (!rmp_segment) {
311 pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n",
312 segment_pa, segment_size);
313 return false;
314 }
315
316 desc = kzalloc(sizeof(*desc), GFP_KERNEL);
317 if (!desc) {
318 memunmap(rmp_segment);
319 return false;
320 }
321
322 desc->rmp_entry = rmp_segment;
323 desc->max_index = segment_size / sizeof(*desc->rmp_entry);
324 desc->size = segment_size;
325
326 rmp_segment_table[rst_index] = desc;
327
328 return true;
329 }
330
free_rmp_segment_table(void)331 static void __init free_rmp_segment_table(void)
332 {
333 unsigned int i;
334
335 for (i = 0; i < rst_max_index; i++) {
336 struct rmp_segment_desc *desc;
337
338 desc = rmp_segment_table[i];
339 if (!desc)
340 continue;
341
342 memunmap(desc->rmp_entry);
343
344 kfree(desc);
345 }
346
347 free_page((unsigned long)rmp_segment_table);
348
349 rmp_segment_table = NULL;
350 }
351
352 /* Allocate the table used to index into the RMP segments */
alloc_rmp_segment_table(void)353 static bool __init alloc_rmp_segment_table(void)
354 {
355 struct page *page;
356
357 page = alloc_page(__GFP_ZERO);
358 if (!page)
359 return false;
360
361 rmp_segment_table = page_address(page);
362
363 return true;
364 }
365
setup_contiguous_rmptable(void)366 static bool __init setup_contiguous_rmptable(void)
367 {
368 u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end;
369
370 if (!probed_rmp_size)
371 return false;
372
373 rmp_end = probed_rmp_base + probed_rmp_size - 1;
374
375 /*
376 * Calculate the amount of memory that must be reserved by the BIOS to
377 * address the whole RAM, including the bookkeeping area. The RMP itself
378 * must also be covered.
379 */
380 max_rmp_pfn = max_pfn;
381 if (PFN_UP(rmp_end) > max_pfn)
382 max_rmp_pfn = PFN_UP(rmp_end);
383
384 calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
385 if (calc_rmp_sz > probed_rmp_size) {
386 pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
387 calc_rmp_sz, probed_rmp_size);
388 return false;
389 }
390
391 if (!alloc_rmp_segment_table())
392 return false;
393
394 /* Map only the RMP entries */
395 rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
396 rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
397
398 if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) {
399 free_rmp_segment_table();
400 return false;
401 }
402
403 return true;
404 }
405
setup_segmented_rmptable(void)406 static bool __init setup_segmented_rmptable(void)
407 {
408 u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max;
409 unsigned int i, max_index;
410
411 if (!probed_rmp_base)
412 return false;
413
414 if (!alloc_rmp_segment_table())
415 return false;
416
417 rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ;
418 rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB);
419 if (!rst) {
420 pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa);
421 goto e_free;
422 }
423
424 pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30);
425
426 ram_pa_max = max_pfn << PAGE_SHIFT;
427
428 max_index = 0;
429 ram_pa_end = 0;
430 for (i = 0; i < rst_max_index; i++) {
431 u64 rmp_segment, rmp_size, mapped_size;
432
433 mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]);
434 if (!mapped_size)
435 continue;
436
437 max_index = i;
438
439 /*
440 * Mapped size in GB. Mapped size is allowed to exceed the
441 * segment coverage size, but gets reduced to the segment
442 * coverage size.
443 */
444 mapped_size <<= 30;
445 if (mapped_size > rmp_segment_size) {
446 pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n",
447 i, mapped_size, rmp_segment_size);
448 mapped_size = rmp_segment_size;
449 }
450
451 rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]);
452
453 /* Calculate the RMP segment size (16 bytes/page mapped) */
454 rmp_size = PHYS_PFN(mapped_size) << 4;
455
456 pa = (u64)i << rmp_segment_shift;
457
458 /*
459 * Some segments may be for MMIO mapped above system RAM. These
460 * segments are used for Trusted I/O.
461 */
462 if (pa < ram_pa_max)
463 ram_pa_end = pa + mapped_size;
464
465 if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa))
466 goto e_unmap;
467
468 pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n",
469 i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1);
470 }
471
472 if (ram_pa_max > ram_pa_end) {
473 pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
474 ram_pa_max, ram_pa_end);
475 goto e_unmap;
476 }
477
478 /* Adjust the maximum index based on the found segments */
479 rst_max_index = max_index + 1;
480
481 memunmap(rst);
482
483 return true;
484
485 e_unmap:
486 memunmap(rst);
487
488 e_free:
489 free_rmp_segment_table();
490
491 return false;
492 }
493
setup_rmptable(void)494 static bool __init setup_rmptable(void)
495 {
496 if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) {
497 return setup_segmented_rmptable();
498 } else {
499 return setup_contiguous_rmptable();
500 }
501 }
502
503 /*
504 * Do the necessary preparations which are verified by the firmware as
505 * described in the SNP_INIT_EX firmware command description in the SNP
506 * firmware ABI spec.
507 */
snp_rmptable_init(void)508 int __init snp_rmptable_init(void)
509 {
510 unsigned int i;
511 u64 val;
512
513 if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP)))
514 return -ENOSYS;
515
516 if (WARN_ON_ONCE(!amd_iommu_snp_en))
517 return -ENOSYS;
518
519 if (!setup_rmptable())
520 return -ENOSYS;
521
522 /*
523 * Check if SEV-SNP is already enabled, this can happen in case of
524 * kexec boot.
525 */
526 rdmsrl(MSR_AMD64_SYSCFG, val);
527 if (val & MSR_AMD64_SYSCFG_SNP_EN)
528 goto skip_enable;
529
530 /* Zero out the RMP bookkeeping area */
531 if (!clear_rmptable_bookkeeping()) {
532 free_rmp_segment_table();
533 return -ENOSYS;
534 }
535
536 /* Zero out the RMP entries */
537 for (i = 0; i < rst_max_index; i++) {
538 struct rmp_segment_desc *desc;
539
540 desc = rmp_segment_table[i];
541 if (!desc)
542 continue;
543
544 memset(desc->rmp_entry, 0, desc->size);
545 }
546
547 /* Flush the caches to ensure that data is written before SNP is enabled. */
548 wbinvd_on_all_cpus();
549
550 /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
551 on_each_cpu(mfd_enable, NULL, 1);
552
553 on_each_cpu(snp_enable, NULL, 1);
554
555 skip_enable:
556 cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
557
558 /*
559 * Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
560 * notifier is invoked to do SNP IOMMU shutdown before kdump.
561 */
562 crash_kexec_post_notifiers = true;
563
564 return 0;
565 }
566
set_rmp_segment_info(unsigned int segment_shift)567 static void set_rmp_segment_info(unsigned int segment_shift)
568 {
569 rmp_segment_shift = segment_shift;
570 rmp_segment_size = 1ULL << rmp_segment_shift;
571 rmp_segment_mask = rmp_segment_size - 1;
572 }
573
574 #define RMP_ADDR_MASK GENMASK_ULL(51, 13)
575
probe_contiguous_rmptable_info(void)576 static bool probe_contiguous_rmptable_info(void)
577 {
578 u64 rmp_sz, rmp_base, rmp_end;
579
580 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
581 rdmsrl(MSR_AMD64_RMP_END, rmp_end);
582
583 if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
584 pr_err("Memory for the RMP table has not been reserved by BIOS\n");
585 return false;
586 }
587
588 if (rmp_base > rmp_end) {
589 pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
590 return false;
591 }
592
593 rmp_sz = rmp_end - rmp_base + 1;
594
595 /* Treat the contiguous RMP table as a single segment */
596 rst_max_index = 1;
597
598 set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT);
599
600 probed_rmp_base = rmp_base;
601 probed_rmp_size = rmp_sz;
602
603 pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
604 rmp_base, rmp_end);
605
606 return true;
607 }
608
probe_segmented_rmptable_info(void)609 static bool probe_segmented_rmptable_info(void)
610 {
611 unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max;
612 u64 rmp_base, rmp_end;
613
614 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
615 if (!(rmp_base & RMP_ADDR_MASK)) {
616 pr_err("Memory for the RMP table has not been reserved by BIOS\n");
617 return false;
618 }
619
620 rdmsrl(MSR_AMD64_RMP_END, rmp_end);
621 WARN_ONCE(rmp_end & RMP_ADDR_MASK,
622 "Segmented RMP enabled but RMP_END MSR is non-zero\n");
623
624 /* Obtain the min and max supported RMP segment size */
625 eax = cpuid_eax(0x80000025);
626 segment_shift_min = eax & GENMASK(5, 0);
627 segment_shift_max = (eax & GENMASK(11, 6)) >> 6;
628
629 /* Verify the segment size is within the supported limits */
630 segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg);
631 if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) {
632 pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n",
633 segment_shift, segment_shift_min, segment_shift_max);
634 return false;
635 }
636
637 /* Override the max supported RST index if a hardware limit exists */
638 ebx = cpuid_ebx(0x80000025);
639 if (ebx & BIT(10))
640 rst_max_index = ebx & GENMASK(9, 0);
641
642 set_rmp_segment_info(segment_shift);
643
644 probed_rmp_base = rmp_base;
645 probed_rmp_size = 0;
646
647 pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n",
648 rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE);
649
650 return true;
651 }
652
snp_probe_rmptable_info(void)653 bool snp_probe_rmptable_info(void)
654 {
655 if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP))
656 rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg);
657
658 if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)
659 return probe_segmented_rmptable_info();
660 else
661 return probe_contiguous_rmptable_info();
662 }
663
664 /*
665 * About the array_index_nospec() usage below:
666 *
667 * This function can get called by exported functions like
668 * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among
669 * others, and since the @pfn passed in cannot always be trusted,
670 * speculation should be stopped as a protective measure.
671 */
get_raw_rmpentry(u64 pfn)672 static struct rmpentry_raw *get_raw_rmpentry(u64 pfn)
673 {
674 u64 paddr, rst_index, segment_index;
675 struct rmp_segment_desc *desc;
676
677 if (!rmp_segment_table)
678 return ERR_PTR(-ENODEV);
679
680 paddr = pfn << PAGE_SHIFT;
681
682 rst_index = RST_ENTRY_INDEX(paddr);
683 if (unlikely(rst_index >= rst_max_index))
684 return ERR_PTR(-EFAULT);
685
686 rst_index = array_index_nospec(rst_index, rst_max_index);
687
688 desc = rmp_segment_table[rst_index];
689 if (unlikely(!desc))
690 return ERR_PTR(-EFAULT);
691
692 segment_index = RMP_ENTRY_INDEX(paddr);
693 if (unlikely(segment_index >= desc->max_index))
694 return ERR_PTR(-EFAULT);
695
696 segment_index = array_index_nospec(segment_index, desc->max_index);
697
698 return desc->rmp_entry + segment_index;
699 }
700
/*
 * Read the RMP entry for @pfn into @e.
 *
 * Uses the RMPREAD instruction when the CPU has it and returns the
 * instruction's result. Otherwise the entry is decoded from the raw RMP
 * table; in that case 0 is returned on success or the negative errno from
 * get_raw_rmpentry().
 */
static int get_rmpentry(u64 pfn, struct rmpentry *e)
{
	struct rmpentry_raw *raw;
	int ret;

	if (!cpu_feature_enabled(X86_FEATURE_RMPREAD)) {
		raw = get_raw_rmpentry(pfn);
		if (IS_ERR(raw))
			return PTR_ERR(raw);

		/*
		 * Map the raw RMP table entry onto the RMPREAD output format.
		 * The 2MB region status indicator (hpage_region_status field)
		 * is not calculated, since the overhead could be significant
		 * and the field is not used.
		 */
		memset(e, 0, sizeof(*e));
		e->assigned  = raw->assigned;
		e->pagesize  = raw->pagesize;
		e->immutable = raw->immutable;
		e->asid      = raw->asid;
		e->gpa       = raw->gpa << PAGE_SHIFT;

		return 0;
	}

	/* Binutils version 2.44 supports the RMPREAD mnemonic. */
	asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd"
		     : "=a" (ret)
		     : "a" (pfn << PAGE_SHIFT), "c" (e)
		     : "memory", "cc");

	return ret;
}
736
/*
 * Look up the RMP entry for @pfn and the page level that governs it.
 *
 * @e receives the entry for @pfn itself. @level is derived from the page
 * size of the entry for the first PFN of the surrounding 2MB region.
 *
 * Returns 0 on success, -ENODEV when not running as an SNP host, or the
 * error reported by get_rmpentry().
 */
static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level)
{
	struct rmpentry region_entry;
	int err;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return -ENODEV;

	err = get_rmpentry(pfn, e);
	if (err)
		return err;

	/*
	 * Find the authoritative RMP entry for a PFN. This can be either a 4K
	 * RMP entry or a special large RMP entry that is authoritative for a
	 * whole 2M area.
	 */
	err = get_rmpentry(pfn & PFN_PMD_MASK, &region_entry);
	if (err)
		return err;

	*level = RMP_TO_PG_LEVEL(region_entry.pagesize);

	return 0;
}
762
/*
 * Report whether @pfn is assigned in the RMP and at which page level.
 *
 * @assigned and @level are only written on success. Returns 0 on success or
 * the error from __snp_lookup_rmpentry().
 */
int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
{
	struct rmpentry entry;
	int err = __snp_lookup_rmpentry(pfn, &entry, level);

	if (err)
		return err;

	*assigned = !!entry.assigned;

	return 0;
}
EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
776
777 /*
778 * Dump the raw RMP entry for a particular PFN. These bits are documented in the
779 * PPR for a particular CPU model and provide useful information about how a
780 * particular PFN is being utilized by the kernel/firmware at the time certain
781 * unexpected events occur, such as RMP faults.
782 */
dump_rmpentry(u64 pfn)783 static void dump_rmpentry(u64 pfn)
784 {
785 struct rmpentry_raw *e_raw;
786 u64 pfn_i, pfn_end;
787 struct rmpentry e;
788 int level, ret;
789
790 ret = __snp_lookup_rmpentry(pfn, &e, &level);
791 if (ret) {
792 pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n",
793 pfn, ret);
794 return;
795 }
796
797 if (e.assigned) {
798 e_raw = get_raw_rmpentry(pfn);
799 if (IS_ERR(e_raw)) {
800 pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n",
801 pfn, PTR_ERR(e_raw));
802 return;
803 }
804
805 pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
806 pfn, e_raw->lo, e_raw->hi);
807 return;
808 }
809
810 /*
811 * If the RMP entry for a particular PFN is not in an assigned state,
812 * then it is sometimes useful to get an idea of whether or not any RMP
813 * entries for other PFNs within the same 2MB region are assigned, since
814 * those too can affect the ability to access a particular PFN in
815 * certain situations, such as when the PFN is being accessed via a 2MB
816 * mapping in the host page table.
817 */
818 pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
819 pfn_end = pfn_i + PTRS_PER_PMD;
820
821 pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
822 pfn, pfn_i, pfn_end);
823
824 while (pfn_i < pfn_end) {
825 e_raw = get_raw_rmpentry(pfn_i);
826 if (IS_ERR(e_raw)) {
827 pr_err("Error %ld reading RMP contents for PFN 0x%llx\n",
828 PTR_ERR(e_raw), pfn_i);
829 pfn_i++;
830 continue;
831 }
832
833 if (e_raw->lo || e_raw->hi)
834 pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi);
835 pfn_i++;
836 }
837 }
838
snp_dump_hva_rmpentry(unsigned long hva)839 void snp_dump_hva_rmpentry(unsigned long hva)
840 {
841 unsigned long paddr;
842 unsigned int level;
843 pgd_t *pgd;
844 pte_t *pte;
845
846 pgd = __va(read_cr3_pa());
847 pgd += pgd_index(hva);
848 pte = lookup_address_in_pgd(pgd, hva, &level);
849
850 if (!pte) {
851 pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
852 return;
853 }
854
855 paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
856 dump_rmpentry(PHYS_PFN(paddr));
857 }
858
859 /*
860 * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
861 * Validated bit.
862 */
psmash(u64 pfn)863 int psmash(u64 pfn)
864 {
865 unsigned long paddr = pfn << PAGE_SHIFT;
866 int ret;
867
868 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
869 return -ENODEV;
870
871 if (!pfn_valid(pfn))
872 return -EINVAL;
873
874 /* Binutils version 2.36 supports the PSMASH mnemonic. */
875 asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
876 : "=a" (ret)
877 : "a" (paddr)
878 : "memory", "cc");
879
880 return ret;
881 }
882 EXPORT_SYMBOL_GPL(psmash);
883
884 /*
885 * If the kernel uses a 2MB or larger directmap mapping to write to an address,
886 * and that mapping contains any 4KB pages that are set to private in the RMP
887 * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
888 * owns the PFNs being transitioned will never attempt such a write, but other
889 * kernel tasks writing to other PFNs in the range may trigger these checks
890 * inadvertently due a large directmap mapping that happens to overlap such a
891 * PFN.
892 *
893 * Prevent this by splitting any 2MB+ mappings that might end up containing a
894 * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
895 * PFN/rmp_level passed in.
896 *
897 * Note that there is no attempt here to scan all the RMP entries for the 2MB
898 * physical range, since it would only be worthwhile in determining if a
899 * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
900 * the same shared/private state, thus avoiding the need to split the mapping.
901 * But that would mean the entries are currently in a mixed state, and so the
902 * mapping would have already been split as a result of prior transitions.
903 * And since the 4K split is only done if the mapping is 2MB+, and there isn't
904 * currently a mechanism in place to restore 2MB+ mappings, such a check would
905 * not provide any usable benefit.
906 *
907 * More specifics on how these checks are carried out can be found in APM
908 * Volume 2, "RMP and VMPL Access Checks".
909 */
adjust_direct_map(u64 pfn,int rmp_level)910 static int adjust_direct_map(u64 pfn, int rmp_level)
911 {
912 unsigned long vaddr;
913 unsigned int level;
914 int npages, ret;
915 pte_t *pte;
916
917 /*
918 * pfn_to_kaddr() will return a vaddr only within the direct
919 * map range.
920 */
921 vaddr = (unsigned long)pfn_to_kaddr(pfn);
922
923 /* Only 4KB/2MB RMP entries are supported by current hardware. */
924 if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
925 return -EINVAL;
926
927 if (!pfn_valid(pfn))
928 return -EINVAL;
929
930 if (rmp_level == PG_LEVEL_2M &&
931 (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
932 return -EINVAL;
933
934 /*
935 * If an entire 2MB physical range is being transitioned, then there is
936 * no risk of RMP #PFs due to write accesses from overlapping mappings,
937 * since even accesses from 1GB mappings will be treated as 2MB accesses
938 * as far as RMP table checks are concerned.
939 */
940 if (rmp_level == PG_LEVEL_2M)
941 return 0;
942
943 pte = lookup_address(vaddr, &level);
944 if (!pte || pte_none(*pte))
945 return 0;
946
947 if (level == PG_LEVEL_4K)
948 return 0;
949
950 npages = page_level_size(rmp_level) / PAGE_SIZE;
951 ret = set_memory_4k(vaddr, npages);
952 if (ret)
953 pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
954 pfn, ret);
955
956 return ret;
957 }
958
959 /*
960 * It is expected that those operations are seldom enough so that no mutual
961 * exclusion of updaters is needed and thus the overlap error condition below
962 * should happen very rarely and would get resolved relatively quickly by
963 * the firmware.
964 *
965 * If not, one could consider introducing a mutex or so here to sync concurrent
966 * RMP updates and thus diminish the amount of cases where firmware needs to
967 * lock 2M ranges to protect against concurrent updates.
968 *
969 * The optimal solution would be range locking to avoid locking disjoint
970 * regions unnecessarily but there's no support for that yet.
971 */
rmpupdate(u64 pfn,struct rmp_state * state)972 static int rmpupdate(u64 pfn, struct rmp_state *state)
973 {
974 unsigned long paddr = pfn << PAGE_SHIFT;
975 int ret, level;
976
977 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
978 return -ENODEV;
979
980 level = RMP_TO_PG_LEVEL(state->pagesize);
981
982 if (adjust_direct_map(pfn, level))
983 return -EFAULT;
984
985 do {
986 /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
987 asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
988 : "=a" (ret)
989 : "a" (paddr), "c" ((unsigned long)state)
990 : "memory", "cc");
991 } while (ret == RMPUPDATE_FAIL_OVERLAP);
992
993 if (ret) {
994 pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
995 pfn, level, ret);
996 dump_rmpentry(pfn);
997 dump_stack();
998 return -EFAULT;
999 }
1000
1001 return 0;
1002 }
1003
1004 /* Transition a page to guest-owned/private state in the RMP table. */
rmp_make_private(u64 pfn,u64 gpa,enum pg_level level,u32 asid,bool immutable)1005 int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
1006 {
1007 struct rmp_state state;
1008
1009 memset(&state, 0, sizeof(state));
1010 state.assigned = 1;
1011 state.asid = asid;
1012 state.immutable = immutable;
1013 state.gpa = gpa;
1014 state.pagesize = PG_LEVEL_TO_RMP(level);
1015
1016 return rmpupdate(pfn, &state);
1017 }
1018 EXPORT_SYMBOL_GPL(rmp_make_private);
1019
1020 /* Transition a page to hypervisor-owned/shared state in the RMP table. */
rmp_make_shared(u64 pfn,enum pg_level level)1021 int rmp_make_shared(u64 pfn, enum pg_level level)
1022 {
1023 struct rmp_state state;
1024
1025 memset(&state, 0, sizeof(state));
1026 state.pagesize = PG_LEVEL_TO_RMP(level);
1027
1028 return rmpupdate(pfn, &state);
1029 }
1030 EXPORT_SYMBOL_GPL(rmp_make_shared);
1031
/*
 * Record @npages pages starting at @pfn as leaked: chain them onto the
 * leaked pages list, dump their RMP entries and count them.
 */
void snp_leak_pages(u64 pfn, unsigned int npages)
{
	struct page *pg = pfn_to_page(pfn);

	pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);

	spin_lock(&snp_leaked_pages_list_lock);
	while (npages--) {
		bool chainable;

		/*
		 * Reuse the page's buddy list for chaining into the leaked
		 * pages list. This page should not be on a free list currently
		 * and is also unsafe to be added to a free list.
		 *
		 * Skip inserting tail pages of compound page as
		 * page->buddy_list of tail pages is not usable.
		 */
		chainable = likely(!PageCompound(pg)) ||
			    (PageHead(pg) && compound_nr(pg) <= npages);
		if (chainable)
			list_add_tail(&pg->buddy_list, &snp_leaked_pages_list);

		dump_rmpentry(pfn);
		snp_nr_leaked_pages++;

		pg++;
		pfn++;
	}
	spin_unlock(&snp_leaked_pages_list_lock);
}
EXPORT_SYMBOL_GPL(snp_leak_pages);
1063
kdump_sev_callback(void)1064 void kdump_sev_callback(void)
1065 {
1066 /*
1067 * Do wbinvd() on remote CPUs when SNP is enabled in order to
1068 * safely do SNP_SHUTDOWN on the local CPU.
1069 */
1070 if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
1071 wbinvd();
1072 }
1073