Linux 内核启动时为什么要做线性映射?
Linux 内核启动之后,对于 32 位的系统来说,它会把 0 ~ 896M 这部分低端内存(low memory)都做线性映射,不管这部分内存是否需要用到。对于 64 位的系统,内核会把所有的物理内存都映射出来(一般情况如此,除非物理内存特别大)。这么做的目的是什么?这里先说结论,然后分析代码。
这么做是为了提高访问效率:内核直接使用这些地址时,不需要再重新建立映射。并且这些地址会尽量采用大页映射,TLB miss 的概率更低。一般来说,x86 和 arm64 使用的都是 1G 或者 2M 的大页。采用大页映射的另一个好处是:页表的开销也会小很多。
注意:Linux 内核虽然在开机的时候映射了(对于 64 位平台来说)所有物理内存,但是它并没有占用这些内存,只是为了访问方便。
以下代码来自于:linux-5.15,ARM64 架构。首先 map_mem 函数会遍历所有的 memory banks,对它们做线性映射。
// arch/arm64/mm/mmu.c 510 static void __init map_mem(pgd_t *pgdp) 511 { 512 static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); 513 phys_addr_t kernel_start = __pa_symbol(_stext); 514 phys_addr_t kernel_end = __pa_symbol(__init_begin); 515 phys_addr_t start, end; ... 550 /* map all the memory banks */ 551 for_each_mem_range(i, &start, &end) { 552 if (start >= end) 553 break; 554 /* 555 * The linear map must allow allocation tags reading/writing 556 * if MTE is present. Otherwise, it has the same attributes as 557 * PAGE_KERNEL. 558 */ 559 __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL), 560 flags); 561 }
下面只是一个过渡的函数。
/*
 * __map_memblock - thin wrapper that forwards one physical range to
 * __create_pgd_mapping().
 *
 * pgdp  : passed through as the page-table directory argument.
 * start : start of the physical range; its virtual address is obtained
 *         with __phys_to_virt(start).
 * end   : end of the physical range; the mapped size is (end - start).
 * prot  : protection attributes, passed through unchanged.
 * flags : mapping flags, passed through unchanged.
 *
 * early_pgtable_alloc is always supplied as the page-table allocator.
 *
 * NOTE(review): the numbers 478..483 embedded in the line below appear to
 * be line numbers from the quoted listing, not C tokens.
 */
// arch/arm64/mm/mmu.c 478 static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, 479 phys_addr_t end, pgprot_t prot, int flags) 480 { 481 __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, 482 prot, early_pgtable_alloc, flags); 483 }
创建pgd全局页表。
/*
 * __create_pgd_mapping - walk a virtual range one PGD entry at a time and
 * hand each sub-range to alloc_init_pud().
 *
 * pgdir         : page-table directory; the first entry is located with
 *                 pgd_offset_pgd(pgdir, virt).
 * phys          : physical start address of the region.
 * virt          : virtual start address of the region.
 * size          : length of the region.
 * prot          : protection attributes, forwarded to alloc_init_pud().
 * pgtable_alloc : allocator callback, forwarded to alloc_init_pud().
 * flags         : mapping flags, forwarded to alloc_init_pud().
 *
 * Steps visible in the listing:
 *  - If phys and virt differ in the bits selected by ~PAGE_MASK, WARN_ON
 *    fires and the function returns without mapping anything.
 *  - phys and the start address are masked with PAGE_MASK; the end address
 *    is PAGE_ALIGN(virt + size).
 *  - The do/while body runs at least once. Each iteration takes
 *    next = pgd_addr_end(addr, end), calls alloc_init_pud() for
 *    [addr, next), and advances phys by (next - addr). The loop condition
 *    advances pgdp and addr and stops when addr == end.
 *
 * NOTE(review): the numbers 372..398 embedded in the line below appear to
 * be line numbers from the quoted listing, not C tokens.
 */
// arch/arm64/mm/mmu.c 372 static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, 373 unsigned long virt, phys_addr_t size, 374 pgprot_t prot, 375 phys_addr_t (*pgtable_alloc)(int), 376 int flags) 377 { 378 unsigned long addr, end, next; 379 pgd_t *pgdp = pgd_offset_pgd(pgdir, virt); 380 381 /* 382 * If the virtual and physical address don't have the same offset 383 * within a page, we cannot map the region as the caller expects. 384 */ 385 if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) 386 return; 387 388 phys &= PAGE_MASK; 389 addr = virt & PAGE_MASK; 390 end = PAGE_ALIGN(virt + size); 391 392 do { 393 next = pgd_addr_end(addr, end); 394 alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, 395 flags); 396 phys += next - addr; 397 } while (pgdp++, addr = next, addr != end); 398 }
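以下函数用来分配并填充 pud 页表项。注意它会调用 use_1G_block 来判断是否可以使用 1G 的块映射(block mapping):如果条件为真(并且没有设置 NO_BLOCK_MAPPINGS),那么这 1G 范围映射完毕,不需要再走 PMD 和 PTE 了;否则调用 alloc_init_cont_pmd 继续建立 PMD 级页表。注意,对于一个典型 ARM64 Linux 架构(4K 页)来说,一张 PTE 页表(2^9 个表项)能映射 2^9 * 4K = 2M 地址空间。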
/*
 * alloc_init_pud - populate the PUD-level entries covering [addr, end).
 *
 * pgdp          : PGD entry; the P4D entry is found with p4d_offset(pgdp, addr).
 * addr, end     : virtual range to map.
 * phys          : physical address corresponding to addr.
 * prot          : protection attributes for the mapping.
 * pgtable_alloc : allocator callback, called as pgtable_alloc(PUD_SHIFT)
 *                 when the P4D entry is empty; BUG_ON if it is NULL then.
 * flags         : NO_EXEC_MAPPINGS adds P4D_TABLE_PXN to a newly installed
 *                 table entry; NO_BLOCK_MAPPINGS disables the 1GB block path.
 *
 * Steps visible in the listing:
 *  - If p4d_none(), a table is allocated and installed with
 *    __p4d_populate() using P4D_TYPE_TABLE | P4D_TABLE_UXN (plus PXN as
 *    above). BUG_ON if the resulting entry is p4d_bad().
 *  - fixmap_lock is taken and released only when
 *    system_state != SYSTEM_BOOTING.
 *  - For each step bounded by pud_addr_end(): when use_1G_block() is true
 *    and NO_BLOCK_MAPPINGS is clear, pud_set_huge() writes a block entry
 *    and pgattr_change_is_safe(old, new) must hold. Otherwise
 *    alloc_init_cont_pmd() handles the sub-range, and a previously non-zero
 *    PUD value must be unchanged afterwards. Violations hit BUG_ON.
 *  - phys advances by (next - addr) per step; pud_clear_fixmap() is called
 *    after the loop.
 *
 * NOTE(review): the numbers 309..370 embedded in the line below appear to
 * be line numbers from the quoted listing, not C tokens.
 */
309 static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, 310 phys_addr_t phys, pgprot_t prot, 311 phys_addr_t (*pgtable_alloc)(int), 312 int flags) 313 { 314 unsigned long next; 315 pud_t *pudp; 316 p4d_t *p4dp = p4d_offset(pgdp, addr); 317 p4d_t p4d = READ_ONCE(*p4dp); 318 319 if (p4d_none(p4d)) { 320 p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN; 321 phys_addr_t pud_phys; 322 323 if (flags & NO_EXEC_MAPPINGS) 324 p4dval |= P4D_TABLE_PXN; 325 BUG_ON(!pgtable_alloc); 326 pud_phys = pgtable_alloc(PUD_SHIFT); 327 __p4d_populate(p4dp, pud_phys, p4dval); 328 p4d = READ_ONCE(*p4dp); 329 } 330 BUG_ON(p4d_bad(p4d)); 331 332 /* 333 * No need for locking during early boot. And it doesn't work as 334 * expected with KASLR enabled. 335 */ 336 if (system_state != SYSTEM_BOOTING) 337 mutex_lock(&fixmap_lock); 338 pudp = pud_set_fixmap_offset(p4dp, addr); 339 do { 340 pud_t old_pud = READ_ONCE(*pudp); 341 342 next = pud_addr_end(addr, end); 343 344 /* 345 * For 4K granule only, attempt to put down a 1GB block 346 */ 347 if (use_1G_block(addr, next, phys) && 348 (flags & NO_BLOCK_MAPPINGS) == 0) { 349 pud_set_huge(pudp, phys, prot); 350 351 /* 352 * After the PUD entry has been populated once, we 353 * only allow updates to the permission attributes. 354 */ 355 BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), 356 READ_ONCE(pud_val(*pudp)))); 357 } else { 358 alloc_init_cont_pmd(pudp, addr, next, phys, prot, 359 pgtable_alloc, flags); 360 361 BUG_ON(pud_val(old_pud) != 0 && 362 pud_val(old_pud) != READ_ONCE(pud_val(*pudp))); 363 } 364 phys += next - addr; 365 } while (pudp++, addr = next, addr != end); 366 367 pud_clear_fixmap(); 368 if (system_state != SYSTEM_BOOTING) 369 mutex_unlock(&fixmap_lock); 370 }
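以下函数用来填充 pmd 页表项:如果当前这段虚拟地址的起止以及对应的物理地址都满足 2M 对齐(并且没有设置 NO_BLOCK_MAPPINGS),那么采用 2M 的块映射;否则调用 alloc_init_cont_pte 继续建立 PTE 级页表。注意,对于一个典型 ARM64 Linux 架构(4K 页)来说,一张 PMD 页表(2^9 个表项)能映射 2^9 * 2M = 1G 地址空间。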
/*
 * init_pmd - populate the PMD-level entries covering [addr, end).
 *
 * pudp          : PUD entry; the first PMD entry is obtained with
 *                 pmd_set_fixmap_offset(pudp, addr).
 * addr, end     : virtual range to map.
 * phys          : physical address corresponding to addr.
 * prot          : protection attributes for the mapping.
 * pgtable_alloc : allocator callback, forwarded to alloc_init_cont_pte().
 * flags         : NO_BLOCK_MAPPINGS disables the section (block) path.
 *
 * Steps visible in the listing:
 *  - Each step is bounded by pmd_addr_end(addr, end).
 *  - Section path: taken when addr, next and phys all have zero bits under
 *    ~PMD_MASK and NO_BLOCK_MAPPINGS is clear. pmd_set_huge() writes the
 *    entry, then pgattr_change_is_safe(old, new) must hold.
 *  - Fallback path: alloc_init_cont_pte() handles the sub-range, and a
 *    previously non-zero PMD value must be unchanged afterwards.
 *  - Violations of either check hit BUG_ON.
 *  - phys advances by (next - addr) per step; pmd_clear_fixmap() is called
 *    after the loop.
 *
 * NOTE(review): the numbers 219..254 embedded in the line below appear to
 * be line numbers from the quoted listing, not C tokens.
 */
219 static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, 220 phys_addr_t phys, pgprot_t prot, 221 phys_addr_t (*pgtable_alloc)(int), int flags) 222 { 223 unsigned long next; 224 pmd_t *pmdp; 225 226 pmdp = pmd_set_fixmap_offset(pudp, addr); 227 do { 228 pmd_t old_pmd = READ_ONCE(*pmdp); 229 230 next = pmd_addr_end(addr, end); 231 232 /* try section mapping first */ 233 if (((addr | next | phys) & ~PMD_MASK) == 0 && 234 (flags & NO_BLOCK_MAPPINGS) == 0) { 235 pmd_set_huge(pmdp, phys, prot); 236 237 /* 238 * After the PMD entry has been populated once, we 239 * only allow updates to the permission attributes. 240 */ 241 BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), 242 READ_ONCE(pmd_val(*pmdp)))); 243 } else { 244 alloc_init_cont_pte(pmdp, addr, next, phys, prot, 245 pgtable_alloc, flags); 246 247 BUG_ON(pmd_val(old_pmd) != 0 && 248 pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp))); 249 } 250 phys += next - addr; 251 } while (pmdp++, addr = next, addr != end); 252 253 pmd_clear_fixmap(); 254 }
最后一个问题:64 位的 ARM 平台,能够直接(线性)映射的物理内存有多大呢?如下截图是 ARM64 架构 4K page + 4 级页表(48 位虚拟地址)的结构:内核空间共 256T,其中一半用作线性映射区,所以答案是:128T。
此图来自于:https://www.kernel.org/doc/html/latest/arm64/memory.html
来自公众号:人人极客社区。作者简介:周文嘉,曾服务于 ARM、阿里系子公司、HTC 等公司。10 年以上工作经验,主要从事系统软件开发,涵盖:系统库开发、指令集优化、Linux 内核开发等。累计为某些开源社区贡献过一定数量的 patch。