@ -9,6 +9,16 @@
/* SPSR_EL2 value built from the D, A, I and F flag bits plus the EL1h mode selector.
 * Judging by the name it is the saved program status used for the EL2 -> EL1 drop;
 * the individual SPSR_EL2_* constants are defined earlier in this file. */
# define SPSR_EL2_TO_EL1_VAL \
( SPSR_EL2_D | SPSR_EL2_A | SPSR_EL2_I | SPSR_EL2_F | SPSR_EL2_MODE_EL1H )
/*
 * SCTLR_EL1 (System Control Register, EL1) field values.
 *
 * Every macro is fully parenthesized so that it can be combined with any operator
 * (for example ~SCTLR_EL1_RESERVED or X & SCTLR_EL1_RESERVED) without precedence surprises.
 */

/* Bits this file always writes as 1: [29:28], [23:22], [20] and [11]. */
#define SCTLR_EL1_RESERVED          ((3 << 28) | (3 << 22) | (1 << 20) | (1 << 11))

/* Bit 25: endianness selector named EE here; 0 = little endian. */
#define SCTLR_EL1_EE_LITTLE_ENDIAN  (0 << 25)
/* Bit 24: second endianness selector (the name is kept as EOE for compatibility); 0 = little endian. */
#define SCTLR_EL1_EOE_LITTLE_ENDIAN (0 << 24)
/* Bit 12: instruction cache enable; 0 = disabled. */
#define SCTLR_EL1_I_CACHE_DISABLED  (0 << 12)
/* Bit 2: data cache enable; 0 = disabled. */
#define SCTLR_EL1_D_CACHE_DISABLED  (0 << 2)
/* Bit 0: MMU enable. */
#define SCTLR_EL1_MMU_DISABLED      (0 << 0)
#define SCTLR_EL1_MMU_ENABLED       (1 << 0)

/* Complete SCTLR_EL1 value: little endian, both caches off, MMU off (0x30D00800). */
#define SCTLR_EL1_VAL_MMU_DISABLED \
    (SCTLR_EL1_RESERVED | SCTLR_EL1_EE_LITTLE_ENDIAN | SCTLR_EL1_EOE_LITTLE_ENDIAN | \
     SCTLR_EL1_I_CACHE_DISABLED | SCTLR_EL1_D_CACHE_DISABLED | SCTLR_EL1_MMU_DISABLED)
.section ".text.boot" / / Put this at start of kernel
.global _start
@ -87,6 +97,124 @@ _start:
/ / cmp x1 , # 2
/ / b.ne 1 b
/ /
/ / Set up virtual memory
/ /
// Configure TCR_EL1 (Translation Control Register): translation granule and address-space size.
// Scratch registers used: x1 (value being assembled), x2 (temporary).
//
// (todo performance) Set translation table shareability / cacheability as per
// ARM DEN0024A 12-18 section 12.5
//
// Granule size: 64 KiB (2^16 bytes) for both translation regimes.
//   "TCR_EL1 has two separate fields that control the granule size for the kernel space
//    and the user space virtual address ranges. These fields are called TG1 for kernel
//    space and TG0 for user space. A potential problem for programmers is that these
//    two fields have different encodings." 101811_0103_01_en
//   TG1, bits [31:30], TTBR1_EL1 region: 0b11 = 64 KiB
//   TG0, bits [15:14], TTBR0_EL1 region: 0b01 = 64 KiB (0b11 is a reserved TG0 encoding)
    ldr     x1, =(3 << 30)              // TG1 = 0b11 -> 64 KiB granule
    ldr     x2, =(1 << 14)              // TG0 = 0b01 -> 64 KiB granule
    orr     x1, x1, x2

// Set number of significant bits. We want to eliminate table level 0 and 1 since they are
// unnecessarily large for 64 KiB blocks, so we set the TnSZ to skip it.
// See R FMBKV in ARM DDI 0487J.a
//   "The size offset of the memory region addressed by TTBR1_EL1. The region size is
//    2^(64-T1SZ) bytes."
//   "Next, imagine you set T0SZ to 34:
//    64 - T0SZ = 30-bit address space (address bits 29:0)"
// Granule size is 64 KiB, which according to ARM DEN0024A 12-16:
//   "Bits 47:42 of the Virtual Address select a descriptor from the 64 entry L1 table.
//    Each of these table entries spans a 4 TB range and points to an L2 table. Within
//    that 8192 entry L2 table, bits 41:29 are used as index to select an entry and each
//    entry points to either a 512 MB block or an L2 table. At the final translation stage,
//    bits 28:16 index into an 8192 entry L3 table and each entry points to a 64 kB
//    block."
// The ARM document 101811_0103_01_en ("Learn the architecture - AArch64 memory management")
// helps clear this up for me.
//
// We ignore [47:42] (the level 1 lookup) by reducing our address space with TnSZ.
// Only use [32:29] (table level 2), 4 bits to index 16 blocks of 512 MiB each covering 8 GiB,
// and [28:16] (table level 3) to index 8192 pages of 64 KiB each,
// which allows us to either have 512 MiB blocks resolved straight from level 2 or 64 KiB
// pages from level 3.
// These numbers were picked based on the following:
// - Arm recommends the largest pages possible in order to reduce TLB cache misses and table
//   sizes. This is intuitive. The drawback is lower granularity, but the type of processes I am
//   imagining (few, large processes, possibly with many "threads" which share memory) mean the
//   expense of the smallest page being 64 KiB is not a concern. (If you had a million tiny
//   processes, it might make sense going with more granular pages.)
// - The Raspberry Pi 4B has a maximum of 8 GiB physical RAM. I do not think it is a good idea to
//   abstract away memory such that you do not know exactly when your pages are in RAM or on
//   disk. If you suddenly have to load from disk, it could cause frame stutters etc. With this
//   in mind, it does not make sense to have a larger address space than is possible in RAM.
//   The argument could be made to have 16 GiB space to leave room for separate device memory,
//   which I may find necessary. At this level it will not be hard to increase the space slightly.
//
// Number of bits of address = 64 - TnSZ; we want a 33 bit address space (8 GiB), so TnSZ = 31.
//   T0SZ lives in bits [5:0], T1SZ in bits [21:16].
    ldr     x2, =31                     // TnSZ for a 33-bit virtual address space
    orr     x1, x1, x2                  // T0SZ = 31
    orr     x1, x1, x2, lsl #16         // T1SZ = 31

// NOTE(review): every other TCR_EL1 field is written as 0. In particular IPS (bits [34:32])
// stays 0b000, which selects a 32-bit (4 GiB) physical address size - revisit before mapping
// the full 8 GiB. Shareability/cacheability fields are covered by the performance todo above.

// Translation control register: write the combined granule + size value.
    msr     tcr_el1, x1
    isb                                 // Force changes to registers to be seen before we enable the MMU
// Program MAIR_EL1 (Memory Attribute Indirection Register), the table of memory behaviors
// that translation table entries select by index.
// Device memory types are described by three properties:
//   (G)athering: allow multiple accesses to be merged into a single transaction
//   (R)e-ordering: allow accesses to the same device to be re-ordered with respect to each other
//   (E)arly write acknowledgement: whether an intermediate write buffer between processor and
//       slave can send an acknowledgement of write completion rather than wait for the device
//       to confirm the write.
// We have 8 possible types available (one byte each). See ARM DDI 0487J.a D19-6894
// Scratch registers: x1 and x2 both end up holding 0x4400.
//
// [1] Normal memory, for now (todo performance) non-cacheable (both inner and outer) = 0b01000100
    mov     x2, #(0x44 << (8 * 1))
// [0] Device memory, non-gather, no re-order, no early write acknowledgement
//     (a.k.a. strict order, slow!)
    mov     x1, #(0x00 << (8 * 0))
// Merge the two attribute bytes and install them; attribute slots 2-7 are left as 0.
    orr     x1, x1, x2
    msr     mair_el1, x1
/ / Set up page tables
// Table descriptor for the 64 KiB granule, 48-bit address (since TCR_EL1.DS is 0, otherwise it'd be 52 bits)
// [63:59] attributes, [58:51] ignored, [50:48] reserved 0, [47:16] next level table address,
// [15:12] reserved 0, [11:2] ignored, [1] is block entry (0) or table entry (1), [0] is valid
// The [63:59] attributes seem to be dealing with execute-never and virtualization stuff.
// Block descriptor for the 64 KiB granule, 48-bit address
// [63:50] attributes, [49:48] reserved 0, [47:29] block address,
// [28:17] reserved 0, [16] nT, [15:12] reserved 0, [11:2] lower attributes,
// [1] is block entry (0) or table entry (1), [0] is valid
// Page descriptor for the 64 KiB granule, 48-bit address
// [63:50] attributes, [49:48] reserved 0, [47:16] output address,
// [15:12] reserved 0, [11:2] lower attributes,
// [1] is block entry (0) or table entry (1), [0] is valid
/ / In block and page descriptors
/ / See ARM DAI 0527 A Setting up the MMU
/ / adr x0 , ttb0_base / / must be 64 KiB aligned
/ / / / Set the translation table for addresses with upper bits 0 to our level 1 table stored at ttb0_base
/ / TODO: I think ttbr1_el1 needs to be set as well !
/ / msr ttbr0_el1 , x0
/ / / / Level 1 translation table
/ / ldr x1 , = level2_pagetable / / must be 64 KiB aligned
/ / ldr x2 , = 0xffff0000 / / Mask to 64 KiB just in case the alignment is off ( not sure if this is necessary )
/ / and x2 , x1 , x2
/ / orr x2 , x2 , 0x3 / / Set [ 1 ] table entry [ 0 ] valid
/ / / / All upper attributes are 0
/ / str x2 , [ x0 ], # 8 / / Set the first entry in our table
/ / All other entries are invalid ( our first entry represents 64 GiB of space over the 4 TiB total range )
// As per I_FTBXR in ARM DDI 0487J.a, I should be able to reduce this by setting TCR_ELx.TnSZ
// (TnSZ minimum: 16, maximum: 39).
// At stage 2, the effective minimum is 32 for a 32-bit PA.
/ / Level 2 translation table
/ /
/ / Final setup before EL2- > EL1
/ /
/ * setup SCTLR_EL1 * /
ldr x0 , = SCTLR_EL1_VAL_MMU_DISABLED
msr SCTLR_EL1 , x0
/ * Enable MMU virtual memory * /
/ / mrs x0 , sctlr_el1
/ / orr x0 , x0 , # 1 / / Set [ M ] bit to enable MMU
/ / msr sctlr_el1 , x0
/ / isb / / Force changes for next instruction
/ / Tell the hypervisor we want AArch64 in all lower ELs
mrs x0 , hcr_el2
orr x0 , x0 , # ( 1 < < 31 )
@ -104,7 +232,8 @@ _start:
el1_entry:
// We MUST initialize the stack AFTER the EL has changed, because we're using a new stack
// pointer at this EL. Took me way too long to figure this out.
/ / TODO What should this value actually be ?
/ / TODO What should this value actually be ? The stack seems to grow down , so this would start
/ / at 0x80000 and work towards 0x0 .
ldr x1 , = _start
mov sp , x1
/ / Jump to main ( note that we never expect this to return )
@ -117,3 +246,10 @@ get_el:
mrs x0 , CurrentEL
lsr x0 , x0 , # 2
ret
//-----------------------------------------------------------------------
// get_implemented_physical_address_size
// In:    nothing
// Out:   x0 = bits [3:0] of ID_AA64MMFR0_EL1 (all upper bits cleared)
// Clobb: x0 only; flags are not modified
//
// The returned value is the raw 4-bit field, not a byte count. Presumably this is the
// PARange encoding that the caller decodes - confirm against the Arm ARM.
// On Raspberry Pi 4B, it reported 16 TiB.
//-----------------------------------------------------------------------
.global get_implemented_physical_address_size
get_implemented_physical_address_size:
    mrs     x0, ID_AA64MMFR0_EL1        // read the memory model feature register
    ubfx    x0, x0, #0, #4              // keep only the low four bits
    ret