Some details about Linux memory addressing and page table processing

It is easy to learn from Linux kernel books that the 32-bit x86 architecture uses a 2-level (10-10-12) page table, and that x86-64 uses a 4-level (9-9-9-9-12) page table or even a 5-level one (a p4d_t level is inserted between pgd_t and pud_t), but some subtler questions are often overlooked. For example: Is the page table of each process stored in kernel space? Why is adding the offset PAGE_OFFSET enough to convert the physical address of the page frame that holds a page table into a kernel virtual address? Do the CR3 register and task_struct->mm->pgd both hold the physical address of the page global directory? Why do page table helper functions such as pud_offset() work with addresses that have been passed through __va(), and why are these functions needed at all when there is an MMU?

First, if you forget the details of multi-level page table memory addressing, the following figure can quickly remind you.  

 

  Taking the x86-64 architecture as an example, the figure above describes a 4-level page table. Note that under Linux the logical address is identical to the virtual (linear) address, because the base of every segment descriptor is 0. The p**_index() macros extract from the virtual address the index of the entry at each level, relative to the base address of that level's page directory or page table; that base address is stored in the CR3 register (for the top level) or in the physical-address field of the page directory entry or page table entry one level above.

 

The following describes the characteristics of memory addressing:

  •   The kernel allocates a page table for each process, and it is stored in kernel space. On a process switch (and at certain other times), the CR3 register is loaded with the physical base address of the page global directory of the process that is becoming active, so every subsequent virtual address is translated through the page table of the current process.
  • Writing a value to the CR3 register automatically flushes the non-global entries of the TLB (translation lookaside buffer).
  • The CR3 register stores the physical base address of the process's page global directory, whereas task_struct->mm->pgd stores the virtual address of that same page global directory.
  • Each page directory entry or page table entry has a 40-bit field (slightly different depending on the kernel version) that stores the physical address of the next-level table. Because the MMU is enabled, the CPU can only dereference virtual addresses, so code that traverses the page table inside the kernel must first convert that physical address into a virtual address with "__va(x)"; the translation back to the physical address on the actual memory access is then performed by the MMU.
  • The lowest 12 bits of the virtual address (with a 4 KB page size) are identical to the lowest 12 bits of the physical address. Each of the four index fields of the virtual address (the values returned by p**_index()) can be regarded as an index into the page table at the corresponding level.
  • The page table helper functions shown in the figure above allow us to traverse the page table. For example, we can get the process descriptor through the "current" pointer and then the "pgd" pointer in its memory descriptor, and walk down from there to obtain the physical address that corresponds to a virtual address (at the last level in the figure above, page frame number + offset gives the physical address). The page descriptor of the page containing that physical address can be obtained from the upper 52 bits of the physical address (the page frame number), because the page descriptors of all page frames are elements of the array mem_map[], and the linear layout of that array makes it possible to look up a page descriptor by page frame number.
  • A simple example of a page table walk:
 1 static unsigned long vaddr2paddr(unsigned long vaddr)
 2 {
 3     pgd_t *pgd;
 4     p4d_t *p4d;
 5     pud_t *pud;
 6     pmd_t *pmd;
 7     pte_t *pte;
 8     unsigned long paddr = 0;
 9     unsigned long page_addr = 0;
10     unsigned long page_offset = 0;
11 
12     pgd = pgd_offset(current->mm, vaddr);
13     if (!pgtable_l5_enabled())
14         printk("pgtable_l5 is not enabled\n");
15     p4d = p4d_offset(pgd, vaddr);
16     pud = pud_offset(p4d, vaddr);
17     pmd = pmd_offset(pud, vaddr);
18     pte = pte_offset_kernel(pmd, vaddr); 
19     page_addr = pte_val(*pte) & PAGE_MASK;
20     page_offset = vaddr & ~PAGE_MASK;
21     paddr = page_addr | page_offset;
22 
23     return paddr;
24 }
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <asm/pgtable.h>
#include <asm/page.h>
  7 
  8 unsigned long vaddr = 0;
  9 
 10 MODULE_LICENSE("GPL");
 11 MODULE_AUTHOR("ShieldQiQi");
 12 MODULE_DESCRIPTION("Test page table walk");
 13 
 14 static void get_pgtable_macro(void)
 15 {
 16     printk("PAGE_OFFSET = 0x%lx\n", PAGE_OFFSET);
 17     printk("PGDIR_SHIFT = %d\n", PGDIR_SHIFT); 
 18     printk("P4D_SHIFT = %d\n", P4D_SHIFT);
 19     printk("PUD_SHIFT = %d\n", PUD_SHIFT);
 20     printk("PMD_SHIFT = %d\n", PMD_SHIFT);
 21     printk("PAGE_SHIFT = %d\n", PAGE_SHIFT);
 22 
 23     printk("PTRS_PER_PGD = %d\n", PTRS_PER_PGD);
 24     printk("PTRS_PER_P4D = %d\n", PTRS_PER_P4D);
 25     printk("PTRS_PER_PUD = %d\n", PTRS_PER_PUD);
 26     printk("PTRS_PER_PMD = %d\n", PTRS_PER_PMD);
 27     printk("PTRS_PER_PTE = %d\n", PTRS_PER_PTE);
 28 
 29     printk("PAGE_MASK = 0x%lx\n", PAGE_MASK);
 30 }
 31 
 32 static unsigned long vaddr2paddr(unsigned long vaddr)
 33 {
 34     pgd_t *pgd;
 35     p4d_t *p4d;
 36     pud_t *pud;
 37     pmd_t *pmd;
 38     pte_t *pte;
 39     unsigned long paddr = 0;
 40     unsigned long page_addr = 0;
 41     unsigned long page_offset = 0;
 42 
 43     pgd = pgd_offset(current->mm, vaddr);
 44     printk("current->mm->pgd = 0x%lx\n", (unsigned long)current->mm->pgd);
 45     printk("pgd = 0x%lx\n", (unsigned long)pgd);
 46     printk("pgd_val = 0x%lx\n", pgd_val(*pgd));
 47     printk("pgd_index = %lu\n", pgd_index(vaddr));
 48     if (pgd_none(*pgd)) {
 49         printk("not mapped in pgd\n");
 50         return -1;
 51     }
 52 
 53     if (!pgtable_l5_enabled())
 54         printk("pgtable_l5 is not enabled\n");
 55     
 56     p4d = p4d_offset(pgd, vaddr);
 57     printk("p4d_val = 0x%lx\n", p4d_val(*p4d));
 58     printk("p4d_index = %lu\n", p4d_index(vaddr));
 59     if (p4d_none(*p4d)) {
 60         printk("not mapped in p4d\n");
 61         return -1;
 62     }
 63 
 64     pud = pud_offset(p4d, vaddr);
 65     printk("p4d_pfn_mask = 0x%lx\n", p4d_pfn_mask(*p4d));
 66     printk("p4d_page_vaddr = 0x%lx\n", p4d_page_vaddr(*p4d));
 67     printk("pud_index = 0x%lx\n", pud_index(vaddr));
 68     printk("pud = 0x%lx\n", (unsigned long)pud);
 69     
 70     printk("pud_val = 0x%lx\n", pud_val(*pud));
 71     if (pud_none(*pud)) {
 72         printk("not mapped in pud\n");
 73         return -1;
 74     }
 75 
 76     pmd = pmd_offset(pud, vaddr);
 77     printk("pmd_val = 0x%lx\n", pmd_val(*pmd));
 78     printk("pmd_index = %lu\n", pmd_index(vaddr));
 79     printk("pmd = 0x%lx\n", (unsigned long)pmd);
 80     if (pmd_none(*pmd)) {
 81         printk("not mapped in pmd\n");
 82         return -1;
 83     }
 84 
 85     pte = pte_offset_kernel(pmd, vaddr); 
 86     printk("pte = 0x%lx\n", (unsigned long)pte);
 87     printk("pte_val = 0x%lx\n", pte_val(*pte));
 88     printk("pte_index = %lu\n", pte_index(vaddr));
 89     if (pte_none(*pte)) {
 90         printk("not mapped in pte\n");
 91         return -1;
 92     }
 93 
 94     /* Page frame physical address mechanism | offset */
 95     page_addr = pte_val(*pte) & PAGE_MASK;
 96     page_offset = vaddr & ~PAGE_MASK;
 97     paddr = page_addr | page_offset;
 98     printk("page_addr = %lx, page_offset = %lx\n", page_addr, page_offset);
 99     printk("vaddr = %lx, paddr = %lx\n", vaddr, paddr);
100 
101     return paddr;
102 }
103 
104 static int __init v2p_init(void)
105 {
106 
107     printk("vaddr to paddr module is running..\n");
108     get_pgtable_macro();
109     printk("\n");
110 
111     vaddr = (unsigned long)vmalloc(1000 * sizeof(char));
112     if (vaddr == 0) {
113         printk("vmalloc failed..\n");
114         return 0;
115     }
116     printk("vmalloc_vaddr=0x%lx\n", vaddr);
117     vaddr2paddr(vaddr);
118     vfree((void *)vaddr);
119     
120     printk("\n\n");
121     vaddr = __get_free_page(GFP_KERNEL);
122     if (vaddr == 0) {
123         printk("__get_free_page failed..\n");
124         return 0;
125     }
126     printk("get_page_vaddr=0x%lx\n", vaddr);
127     vaddr2paddr(vaddr);
128     free_page(vaddr);
129     
130     return 0;
131 }
132 
133 static void __exit v2p_exit(void)
134 {
135     printk("vaddr to paddr module is leaving..\n");
136 }
137 
138 module_init(v2p_init);
139 module_exit(v2p_exit);
  •   If you look closely at how "p**_offset" is implemented, you will find that its result is obtained by adding "p**_index" to the virtual address of the page frame that holds the current level's page table. This point is easy to misunderstand. In Linux kernel 5.4.0, "p**_offset" is implemented as follows:
 1   static inline unsigned long pud_page_vaddr(pud_t pud)
 2   {
 3       return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); /* mask the entry with the pfn mask, then convert the resulting physical address to a kernel virtual address with __va() */
 4   }
 5   
 6   /*
 7    * Currently stuck as a macro due to indirect forward reference to
 8    * linux/mmzone.h's __section_mem_map_addr() definition:
 9    */
10   #define pud_page(pud)   pfn_to_page(pud_pfn(pud))
11   
12   /* Find an entry in the second-level page table.. */
13   static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
14   {
15      return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); /* virtual address derived from *pud, used as a pmd_t array and advanced by the pmd index of address */
16   }
  • You can see that "pmd_offset" ultimately returns a virtual address produced by __va(), so its input parameter "pud_t *pud" is itself a virtual address as well. The function "pud_page_vaddr" takes the contents of the page upper directory entry (note the dereference operator "*" in "*pud") and converts the physical address stored there into a virtual address. Some readers may ask why, when the physical address is already known, we still use the virtual address obtained with __va(); or they may object that, since what we are doing is converting a virtual address into a physical address, how can virtual addresses be used directly along the way? In fact, by the time these macros are used the system is already running normally with paging enabled: every address the CPU dereferences is a virtual address, and the translation is left to the MMU. We use virtual addresses simply because virtual addresses are all we can use.
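  • Note that computing the page frame address as
    page_addr = pte_val(*pte) & PAGE_MASK
  • gives a wrong physical address, because only the middle 40 bits of the entry (bits 12-51) are the physical page frame number. PAGE_MASK clears just the low 12 flag bits and leaves the high flag bits (for example the NX bit, bit 63) in the result, which is why the output below shows 0x8000000204610000 where the real physical address is 0x204610000. Mask with PTE_PFN_MASK to keep only the page frame bits. (The line page_offset = vaddr & ~PAGE_MASK, by contrast, is correct.)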

The actual output of the above example is as follows:

[598341.980621] vaddr to paddr module is running..
[598341.980622] PAGE_OFFSET = 0xffff90a240000000
[598341.980623] PGDIR_SHIFT = 39
[598341.980623] P4D_SHIFT = 39
[598341.980623] PUD_SHIFT = 30
[598341.980624] PMD_SHIFT = 21
[598341.980624] PAGE_SHIFT = 12
[598341.980624] PTRS_PER_PGD = 512
[598341.980624] PTRS_PER_P4D = 1
[598341.980625] PTRS_PER_PUD = 512
[598341.980625] PTRS_PER_PMD = 512
[598341.980625] PTRS_PER_PTE = 512
[598341.980626] PAGE_MASK = 0xfffffffffffff000

[598341.980628] vmalloc_vaddr=0xffffbaddc02cb000
[598341.980628] current->mm->pgd = 0xffff90a3c83e4000
[598341.980629] pgd = 0xffff90a3c83e4ba8
[598341.980629] pgd_val = 0x2b5155067
[598341.980629] pgd_index = 373
[598341.980630] pgtable_l5 is not enabled
[598341.980630] p4d_val = 0x2b5155067
[598341.980630] p4d_index = 0
[598341.980631] p4d_pfn_mask = 0xffffffffff000
[598341.980631] p4d_page_vaddr = 0xffff90a4f5155000
[598341.980631] pud_index = 0x177
[598341.980631] pud = 0xffff90a4f5155bb8
[598341.980632] pud_val = 0x2b5158067
[598341.980632] pmd_val = 0x2b4a9e067
[598341.980632] pmd_index = 1
[598341.980633] pmd = 0xffff90a4f5158008
[598341.980633] pte = 0xffff90a4f4a9e658
[598341.980633] pte_val = 0x8000000204610063
[598341.980634] pte_index = 203
[598341.980634] page_addr = 8000000204610000, page_offset = 0
[598341.980634] vaddr = ffffbaddc02cb000, paddr = 8000000204610000
[598341.980635] 

[598341.980635] get_page_vaddr=0xffff90a444610000
[598341.980636] current->mm->pgd = 0xffff90a3c83e4000
[598341.980636] pgd = 0xffff90a3c83e4908
[598341.980636] pgd_val = 0x1cfe01067
[598341.980636] pgd_index = 289
[598341.980637] pgtable_l5 is not enabled
[598341.980637] p4d_val = 0x1cfe01067
[598341.980637] p4d_index = 0
[598341.980638] p4d_pfn_mask = 0xffffffffff000
[598341.980638] p4d_page_vaddr = 0xffff90a40fe01000
[598341.980638] pud_index = 0x91
[598341.980638] pud = 0xffff90a40fe01488
[598341.980639] pud_val = 0x24baa9063
[598341.980639] pmd_val = 0x204680063
[598341.980639] pmd_index = 35
[598341.980640] pmd = 0xffff90a48baa9118
[598341.980640] pte = 0xffff90a444680080
[598341.980640] pte_val = 0x8000000204610063
[598341.980640] pte_index = 16
[598341.980641] page_addr = 8000000204610000, page_offset = 0
[598341.980641] vaddr = ffff90a444610000, paddr = 8000000204610000
[598346.531714] vaddr to paddr module is leaving..

 

If there is any mistake, please point it out and correct it. Thank you!

 

Posted on Wed, 01 Dec 2021 15:01:16 -0500 by youngloopy