Linux 内核启动之后,对于 32 位的系统来说,它会把 0 ~ 896M 这部分低端内存(low memory)都做线性映射,不管这部分内存是否需要用到。对于 64 位的系统,内核一般会把所有的物理内存都映射出来(除非物理内存特别大)。这么做的目的是什么?这里先说结论,然后分析代码。
这么做的原因是为了访问效率:内核直接使用这些地址时,不需要重新映射。并且这些地址采用大页映射,TLB miss 的概率会降低。一般来说,x86 和 arm64 使用的都是 1G 或者 2M 的大页。采用大页映射的另一个好处是:页表的开销也会小很多。
注意:Linux 内核虽然在开机的时候映射了(对于 64 位平台来说)所有物理内存,但是它并没有占用这些内存,只是为了访问方便。
以下代码来自于:linux-5.15,ARM64 架构。首先 map_mem 函数会遍历所有的 memory banks,对它们做线性映射。
// arch/arm64/mm/mmu.c
510 static void __init map_mem(pgd_t *pgdp)
511 {
512 static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
513 phys_addr_t kernel_start = __pa_symbol(_stext);
514 phys_addr_t kernel_end = __pa_symbol(__init_begin);
515 phys_addr_t start, end;
...
550 /* map all the memory banks */
551 for_each_mem_range(i, &start, &end) {
552 if (start >= end)
553 break;
554 /*
555 * The linear map must allow allocation tags reading/writing
556 * if MTE is present. Otherwise, it has the same attributes as
557 * PAGE_KERNEL.
558 */
559 __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
560 flags);
561 }
下面只是一个过渡的函数。
// arch/arm64/mm/mmu.c
/*
 * Thin wrapper around __create_pgd_mapping(): maps the physical range
 * [start, end) into the page tables rooted at pgdp. The virtual address is
 * __phys_to_virt(start), the length is end - start, and early_pgtable_alloc
 * is passed down as the allocator used when new page-table pages are needed.
 * prot and flags are forwarded unchanged.
 */
478 static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
479 phys_addr_t end, pgprot_t prot, int flags)
480 {
481 __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
482 prot, early_pgtable_alloc, flags);
483 }
以下函数从 PGD(页全局目录)这一级开始建立映射:它按 PGD 表项的边界把虚拟地址区间切成若干段,并对每一段调用 alloc_init_pud。
// arch/arm64/mm/mmu.c
/*
 * Map `size` bytes of physical memory starting at `phys` to the virtual
 * address `virt` in the page tables rooted at `pgdir`.
 *
 * The range is walked in pieces whose boundaries come from pgd_addr_end();
 * each piece is handed to alloc_init_pud() together with the matching
 * physical address. prot, pgtable_alloc and flags are forwarded unchanged.
 * Nothing is mapped (a warning is raised instead) when phys and virt do not
 * share the same offset within a page.
 */
372 static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
373 unsigned long virt, phys_addr_t size,
374 pgprot_t prot,
375 phys_addr_t (*pgtable_alloc)(int),
376 int flags)
377 {
378 unsigned long addr, end, next;
379 pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
380
381 /*
382 * If the virtual and physical address don't have the same offset
383 * within a page, we cannot map the region as the caller expects.
384 */
385 if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
386 return;
387
/* Work on whole pages: round the start down and the end up. */
388 phys &= PAGE_MASK;
389 addr = virt & PAGE_MASK;
390 end = PAGE_ALIGN(virt + size);
391
392 do {
393 next = pgd_addr_end(addr, end);
394 alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
395 flags);
/* Keep phys in step with the virtual range just handled. */
396 phys += next - addr;
397 } while (pgdp++, addr = next, addr != end);
398 }
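以下函数用来填充 PUD 页表项。注意它会调用 use_1G_block 来判断是否可以使用 1G 的块映射:如果条件为真(并且 flags 中没有设置 NO_BLOCK_MAPPINGS),那么这一段映射完毕,不需要再走 PMD 和 PTE 了;否则会调用 alloc_init_cont_pmd 继续向下建立 PMD 级的映射。注意,对于一个典型的 ARM64 Linux 配置(4K 页)来说,一张 PTE 页表有 2^9 个表项,能映射 2^9*4K = 2M 地址空间。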
/*
 * Populate the PUD level for the virtual range [addr, end), mapping it to
 * physical memory starting at `phys` with protection `prot`.
 *
 * If the covering p4d entry is empty, a PUD table is first obtained from
 * pgtable_alloc() and installed. The range is then walked in steps bounded
 * by pud_addr_end(): a step is mapped with a single huge PUD entry when
 * use_1G_block() accepts it and NO_BLOCK_MAPPINGS is not set in flags;
 * otherwise the step is passed on to alloc_init_cont_pmd().
 *
 * The PUD entries are accessed via pud_set_fixmap_offset() and released with
 * pud_clear_fixmap(); fixmap_lock is held around that section unless
 * system_state is SYSTEM_BOOTING.
 */
309 static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
310 phys_addr_t phys, pgprot_t prot,
311 phys_addr_t (*pgtable_alloc)(int),
312 int flags)
313 {
314 unsigned long next;
315 pud_t *pudp;
316 p4d_t *p4dp = p4d_offset(pgdp, addr);
317 p4d_t p4d = READ_ONCE(*p4dp);
318
/* Empty p4d entry: allocate a PUD table and hook it in. */
319 if (p4d_none(p4d)) {
320 p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
321 phys_addr_t pud_phys;
322
323 if (flags & NO_EXEC_MAPPINGS)
324 p4dval |= P4D_TABLE_PXN;
325 BUG_ON(!pgtable_alloc);
326 pud_phys = pgtable_alloc(PUD_SHIFT);
327 __p4d_populate(p4dp, pud_phys, p4dval);
328 p4d = READ_ONCE(*p4dp);
329 }
330 BUG_ON(p4d_bad(p4d));
331
332 /*
333 * No need for locking during early boot. And it doesn't work as
334 * expected with KASLR enabled.
335 */
336 if (system_state != SYSTEM_BOOTING)
337 mutex_lock(&fixmap_lock);
338 pudp = pud_set_fixmap_offset(p4dp, addr);
339 do {
/* Snapshot the entry so the BUG_ON checks below can compare old vs. new. */
340 pud_t old_pud = READ_ONCE(*pudp);
341
342 next = pud_addr_end(addr, end);
343
344 /*
345 * For 4K granule only, attempt to put down a 1GB block
346 */
347 if (use_1G_block(addr, next, phys) &&
348 (flags & NO_BLOCK_MAPPINGS) == 0) {
349 pud_set_huge(pudp, phys, prot);
350
351 /*
352 * After the PUD entry has been populated once, we
353 * only allow updates to the permission attributes.
354 */
355 BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
356 READ_ONCE(pud_val(*pudp))));
357 } else {
358 alloc_init_cont_pmd(pudp, addr, next, phys, prot,
359 pgtable_alloc, flags);
360
/* A previously populated entry must not have been changed by the call above. */
361 BUG_ON(pud_val(old_pud) != 0 &&
362 pud_val(old_pud) !=
READ_ONCE(pud_val(*pudp)));
363 }
/* Keep phys in step with the virtual range just handled. */
364 phys += next - addr;
365 } while (pudp++, addr = next, addr != end);
366
367 pud_clear_fixmap();
368 if (system_state != SYSTEM_BOOTING)
369 mutex_unlock(&fixmap_lock);
370 }
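以下函数用来填充 PMD 页表项(alloc_init_pud 通过 alloc_init_cont_pmd 最终会调用到它):如果当前这一段的虚拟起始地址、结束地址以及物理地址都满足 2M 对齐,并且 flags 中没有设置 NO_BLOCK_MAPPINGS,那么采用 2M 的块映射;否则调用 alloc_init_cont_pte 建立 PTE 级的映射。注意,对于一个典型的 ARM64 Linux 配置(4K 页)来说,一张 PMD 页表有 2^9 个表项,能映射 2^9*2M = 1G 地址空间。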
/*
 * Populate the PMD level for the virtual range [addr, end), mapping it to
 * physical memory starting at `phys` with protection `prot`.
 *
 * The range is walked in steps bounded by pmd_addr_end(). A step is mapped
 * with a single huge PMD entry when addr, next and phys all have no bits set
 * outside PMD_MASK and NO_BLOCK_MAPPINGS is not set in flags; otherwise the
 * step is passed on to alloc_init_cont_pte().
 *
 * The PMD entries are accessed via pmd_set_fixmap_offset() and released with
 * pmd_clear_fixmap().
 */
219 static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
220 phys_addr_t phys, pgprot_t prot,
221 phys_addr_t (*pgtable_alloc)(int), int flags)
222 {
223 unsigned long next;
224 pmd_t *pmdp;
225
226 pmdp = pmd_set_fixmap_offset(pudp, addr);
227 do {
/* Snapshot the entry so the BUG_ON checks below can compare old vs. new. */
228 pmd_t old_pmd = READ_ONCE(*pmdp);
229
230 next = pmd_addr_end(addr, end);
231
232 /* try section mapping first */
233 if (((addr | next | phys) & ~PMD_MASK) == 0 &&
234 (flags & NO_BLOCK_MAPPINGS) == 0) {
235 pmd_set_huge(pmdp, phys, prot);
236
237 /*
238 * After the PMD entry has been populated once, we
239 * only allow updates to the permission attributes.
240 */
241 BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
242 READ_ONCE(pmd_val(*pmdp))));
243 } else {
244 alloc_init_cont_pte(pmdp, addr, next, phys, prot,
245 pgtable_alloc, flags);
246
/* A previously populated entry must not have been changed by the call above. */
247 BUG_ON(pmd_val(old_pmd) != 0 &&
248 pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
249 }
/* Keep phys in step with the virtual range just handled. */
250 phys += next - addr;
251 } while (pmdp++, addr = next, addr != end);
252
253 pmd_clear_fixmap();
254 }
最后一个问题:64 位的 ARM 平台,能够直接(线性)映射的物理内存有多大呢?如下截图是 ARM64 架构 4K page + 4 级页表(48 位虚拟地址)的结构,其中内核线性映射区的大小就是答案:128T。
此图来自于:https://www.kernel.org/doc/html/latest/arm64/memory.html
来自公众号:人人极客社区。作者简介:周文嘉,曾服务于 ARM、阿里系子公司、HTC 等公司。10 年以上工作经验,主要从事系统软件开发,涵盖:系统库开发、指令集优化、Linux 内核开发等。累计为某些开源社区贡献过一定数量的 patch。