diff mbox

[3/6] 8xx: get rid of _PAGE_HWWRITE dependency in MMU.

Message ID 1254744999-3158-4-git-send-email-Joakim.Tjernlund@transmode.se (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Joakim Tjernlund Oct. 5, 2009, 12:16 p.m. UTC
Update the TLB asm to make proper use of _PAGE_DIRTY and _PAGE_ACCESSED.
Pros:
 - I/D TLB Miss never needs to write to the linux pte.
 - _PAGE_ACCESSED is only set on I/D TLB Error fixing accounting
 - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
    when a page has been made dirty.
 - Proper RO/RW mapping of user space.
 - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
Cons:
 - 4 more instructions in I/D TLB Miss, but since the linux pte is
   not written anymore, it should still be a win.
---
 arch/powerpc/include/asm/pte-8xx.h |    9 +-
 arch/powerpc/kernel/head_8xx.S     |  163 ++++++++++++++++++++++++++----------
 2 files changed, 122 insertions(+), 50 deletions(-)

Comments

Benjamin Herrenschmidt Oct. 5, 2009, 8:17 p.m. UTC | #1
On Mon, 2009-10-05 at 14:16 +0200, Joakim Tjernlund wrote:
> Update the TLB asm to make proper use of _PAGE_DIRY and _PAGE_ACCESSED.
> Pros:
>  - I/D TLB Miss never needs to write to the linux pte.
>  - _PAGE_ACCESSED is only set on I/D TLB Error fixing accounting
>  - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
>     when a page has been made dirty.

Not sure here. You seem to still set those from asm. Is that necessary ?

The approach I take on BookE is to simply not set these from asm, -and-
(this is important) not set write permission if dirty is not set in
the TLB miss and set no access permission at all when accessed is not
set. This is important or we'll miss dirty updates which can
be fatal.

The C code in handle_pte_fault() will fixup missing access and dirty
if necessary and flush.

Also look at the 440 code, I think you could simplify your
implementation using similar techniques, such as andc of PTE against
requested bits etc... and thus maybe save a couple of insns.

Cheers,
Ben.

>  - Proper RO/RW mapping of user space.
>  - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
> Cons:
>  - 4 more instructions in I/D TLB Miss, but the since the linux pte is
>    not written anymore, it should still be a win.
> ---
>  arch/powerpc/include/asm/pte-8xx.h |    9 +-
>  arch/powerpc/kernel/head_8xx.S     |  163 ++++++++++++++++++++++++++----------
>  2 files changed, 122 insertions(+), 50 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
> index 8c6e312..af541a2 100644
> --- a/arch/powerpc/include/asm/pte-8xx.h
> +++ b/arch/powerpc/include/asm/pte-8xx.h
> @@ -32,22 +32,21 @@
>  #define _PAGE_FILE	0x0002	/* when !present: nonlinear file mapping */
>  #define _PAGE_NO_CACHE	0x0002	/* I: cache inhibit */
>  #define _PAGE_SHARED	0x0004	/* No ASID (context) compare */
> +#define _PAGE_DIRTY	0x0100	/* C: page changed */
>  
>  /* These five software bits must be masked out when the entry is loaded
>   * into the TLB.
>   */
>  #define _PAGE_EXEC	0x0008	/* software: i-cache coherency required */
>  #define _PAGE_GUARDED	0x0010	/* software: guarded access */
> -#define _PAGE_DIRTY	0x0020	/* software: page changed */
> -#define _PAGE_RW	0x0040	/* software: user write access allowed */
> -#define _PAGE_ACCESSED	0x0080	/* software: page referenced */
> +#define _PAGE_USER	0x0020	/* software: User space access */
>  
>  /* Setting any bits in the nibble with the follow two controls will
>   * require a TLB exception handler change.  It is assumed unused bits
>   * are always zero.
>   */
> -#define _PAGE_HWWRITE	0x0100	/* h/w write enable: never set in Linux PTE */
> -#define _PAGE_USER	0x0800	/* One of the PP bits, the other is USER&~RW */
> +#define _PAGE_RW	0x0400	/* lsb PP bits */
> +#define _PAGE_ACCESSED	0x0800	/* msb PP bits */
>  
>  #define _PMD_PRESENT	0x0001
>  #define _PMD_BAD	0x0ff0
> diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
> index 118bb05..b1f72d9 100644
> --- a/arch/powerpc/kernel/head_8xx.S
> +++ b/arch/powerpc/kernel/head_8xx.S
> @@ -333,21 +333,15 @@ InstructionTLBMiss:
>  	mfspr	r11, SPRN_MD_TWC	/* ....and get the pte address */
>  	lwz	r10, 0(r11)	/* Get the pte */
>  
> -#ifdef CONFIG_SWAP
> -	/* do not set the _PAGE_ACCESSED bit of a non-present page */
> -	andi.	r11, r10, _PAGE_PRESENT
> -	beq	4f
> -	ori	r10, r10, _PAGE_ACCESSED
> -	mfspr	r11, SPRN_MD_TWC	/* get the pte address again */
> -	stw	r10, 0(r11)
> -4:
> -#else
> -	ori	r10, r10, _PAGE_ACCESSED
> -	stw	r10, 0(r11)
> -#endif
> -
> +	andi.	r11, r10, _PAGE_USER | _PAGE_ACCESSED
> +	cmpwi	cr0, r11, _PAGE_USER | _PAGE_ACCESSED
> +	beq+	cr0, 5f	/* branch if access allowed */
> +	rlwinm	r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
> +	b	6f
> +5:	xori	r10, r10, _PAGE_RW /* invert RW bit */
> +6:
>  	/* The Linux PTE won't go exactly into the MMU TLB.
> -	 * Software indicator bits 21, 22 and 28 must be clear.
> +	 * Software indicator bit 28 must be clear.
>  	 * Software indicator bits 24, 25, 26, and 27 must be
>  	 * set.  All other Linux PTE bits control the behavior
>  	 * of the MMU.
> @@ -409,21 +403,15 @@ DataStoreTLBMiss:
>  	DO_8xx_CPU6(0x3b80, r3)
>  	mtspr	SPRN_MD_TWC, r11
>  
> -#ifdef CONFIG_SWAP
> -	/* do not set the _PAGE_ACCESSED bit of a non-present page */
> -	andi.	r11, r10, _PAGE_PRESENT
> -	beq	4f
> -	ori	r10, r10, _PAGE_ACCESSED
> -4:
> -	/* and update pte in table */
> -#else
> -	ori	r10, r10, _PAGE_ACCESSED
> -#endif
> -	mfspr	r11, SPRN_MD_TWC	/* get the pte address again */
> -	stw	r10, 0(r11)
> -
> +	andi.	r11, r10, _PAGE_USER | _PAGE_ACCESSED
> +	cmpwi	cr0, r11, _PAGE_USER | _PAGE_ACCESSED
> +	beq+	cr0, 5f	/* branch if access allowed */
> +	rlwinm	r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
> +	b	6f
> +5:	xori	r10, r10, _PAGE_RW /* invert RW bit */
> +6:
>  	/* The Linux PTE won't go exactly into the MMU TLB.
> -	 * Software indicator bits 21, 22 and 28 must be clear.
> +	 * Software indicator bit 28 must be clear.
>  	 * Software indicator bits 24, 25, 26, and 27 must be
>  	 * set.  All other Linux PTE bits control the behavior
>  	 * of the MMU.
> @@ -448,6 +436,91 @@ DataStoreTLBMiss:
>   */
>  	. = 0x1300
>  InstructionTLBError:
> +#ifdef CONFIG_8xx_CPU6
> +	stw	r3, 8(r0)
> +#endif
> +	DO_8xx_CPU6(0x3f80, r3)
> +	mtspr	SPRN_M_TW, r10	/* Save a couple of working registers */
> +	mfcr	r10
> +	stw	r10, 0(r0)
> +	stw	r11, 4(r0)
> +
> +	mfspr	r11, SPRN_SRR1
> +	andis.	r11, r11, 0x5000	/* no translation, guarded */
> +	bne	2f
> +
> +	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
> +#ifdef CONFIG_8xx_CPU15
> +	addi	r11, r10, 0x1000
> +	tlbie	r11
> +	addi	r11, r10, -0x1000
> +	tlbie	r11
> +#endif
> +	DO_8xx_CPU6(0x3780, r3)
> +	mtspr	SPRN_MD_EPN, r10	/* Have to use MD_EPN for walk, MI_EPN can't */
> +	mfspr	r10, SPRN_M_TWB	/* Get level 1 table entry address */
> +
> +	/* If we are faulting a kernel address, we have to use the
> +	 * kernel page tables.
> +	 */
> +	andi.	r11, r10, 0x0800	/* Address >= 0x80000000 */
> +	beq	3f
> +	lis	r11, swapper_pg_dir@h
> +	ori	r11, r11, swapper_pg_dir@l
> +	rlwimi	r10, r11, 0, 2, 19
> +3:
> +	lwz	r11, 0(r10)	/* Get the level 1 entry */
> +	rlwinm.	r10, r11,0,0,19	/* Extract page descriptor page address */
> +	beq	2f		/* If zero, don't try to find a pte */
> +
> +	/* We have a pte table, so load the MI_TWC with the attributes
> +	 * for this "segment."
> +	 */
> +	ori	r11,r11,1		/* Set valid bit */
> +	DO_8xx_CPU6(0x2b80, r3)
> +	mtspr	SPRN_MI_TWC, r11	/* Set segment attributes */
> +	DO_8xx_CPU6(0x3b80, r3)
> +	mtspr	SPRN_MD_TWC, r11	/* Load pte table base address */
> +
> +	mfspr	r11, SPRN_SRR1
> +	andi.	r11, r11, 0x4000 /* MSR[PR] */
> +	mfspr	r11, SPRN_MD_TWC	/* ....and get the pte address */
> +	lwz	r10, 0(r11)	/* Get the pte */
> +	beq	5f /* Kernel access always OK */
> +	andi.	r11,r10, _PAGE_USER
> +	beq	2f
> +5:	ori	r10, r10, _PAGE_ACCESSED
> +	mfspr	r21, SPRN_MD_TWC	/* ....and get the pte address */
> +	stw	r10, 0(r11)
> +	xori	r10, r10, _PAGE_RW /* invert RW bit */
> +
> +	/* The Linux PTE won't go exactly into the MMU TLB.
> +	 * Software indicator bit 28 must be clear.
> +	 * Software indicator bits 24, 25, 26, and 27 must be
> +	 * set.  All other Linux PTE bits control the behavior
> +	 * of the MMU.
> +	 */
> +	li	r11, 0x00f0
> +	rlwimi	r10, r11, 0, 24, 28	/* Set 24-27, clear 28 */
> +	DO_8xx_CPU6(0x2d80, r3)
> +	mtspr	SPRN_MI_RPN, r10	/* Update TLB entry */
> +
> +	mfspr	r10, SPRN_M_TW	/* Restore registers */
> +	lwz	r11, 0(r0)
> +	mtcr	r11
> +	lwz	r11, 4(r0)
> +#ifdef CONFIG_8xx_CPU6
> +	lwz	r3, 8(r0)
> +#endif
> +	rfi
> +
> +2:	mfspr	r10, SPRN_M_TW	/* Restore registers */
> +	lwz	r11, 0(r0)
> +	mtcr	r11
> +	lwz	r11, 4(r0)
> +#ifdef CONFIG_8xx_CPU6
> +	lwz	r3, 8(r0)
> +#endif
>  	b	InstructionAccess
>  
>  /* This is the data TLB error on the MPC8xx.  This could be due to
> @@ -472,8 +545,8 @@ DataTLBError:
>  	/* First, make sure this was a store operation.
>  	*/
>  	mfspr	r10, SPRN_DSISR
> -	andis.	r11, r10, 0x4800 /* no translation, no permission. */
> -	bne	2f	/* branch if either is set */
> +	andis.	r11, r10, 0x4000 /* no translation */
> +	bne	2f	/* branch if set */
>  
>  	/* The EA of a data TLB miss is automatically stored in the MD_EPN
>  	 * register.  The EA of a data TLB error is automatically stored in
> @@ -522,26 +595,26 @@ DataTLBError:
>  	mfspr	r11, SPRN_MD_TWC		/* ....and get the pte address */
>  	lwz	r10, 0(r11)		/* Get the pte */
>  
> -	andi.	r11, r10, _PAGE_RW	/* Is it writeable? */
> -	beq	2f			/* Bail out if not */
> +	mfspr	r11, SPRN_SRR1
> +	andi.	r11, r11, 0x4000 /* MSR[PR] */
> +	beq	5f /* Kernel access always OK */
> +	andi.	r11,r10, _PAGE_USER
> +	beq	2f
> +5:	mfspr	r11, SPRN_DSISR
> +	andis.	r11, r11, 0x0200	/* store */
> +	beq	6f
> +	andi.	r11, r10, _PAGE_RW	/* writeable? */
> +	beq	2f /* branch if not */
> +	ori	r10, r10, _PAGE_DIRTY | _PAGE_HWWRITE
> +6:	ori	r10, r10, _PAGE_ACCESSED
>  
> -	/* Update 'changed', among others.
> -	*/
> -#ifdef CONFIG_SWAP
> -	ori	r10, r10, _PAGE_DIRTY|_PAGE_HWWRITE
> -	/* do not set the _PAGE_ACCESSED bit of a non-present page */
> -	andi.	r11, r10, _PAGE_PRESENT
> -	beq	4f
> -	ori	r10, r10, _PAGE_ACCESSED
> -4:
> -#else
> -	ori	r10, r10, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
> -#endif
>  	mfspr	r11, SPRN_MD_TWC		/* Get pte address again */
>  	stw	r10, 0(r11)		/* and update pte in table */
>  
> +	xori	r10, r10, _PAGE_RW	/* Invert RW bit */
> +
>  	/* The Linux PTE won't go exactly into the MMU TLB.
> -	 * Software indicator bits 21, 22 and 28 must be clear.
> +	 * Software indicator bit 28 must be clear.
>  	 * Software indicator bits 24, 25, 26, and 27 must be
>  	 * set.  All other Linux PTE bits control the behavior
>  	 * of the MMU.
Joakim Tjernlund Oct. 5, 2009, 9:25 p.m. UTC | #2
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 22:17:04:
>
> On Mon, 2009-10-05 at 14:16 +0200, Joakim Tjernlund wrote:
> > Update the TLB asm to make proper use of _PAGE_DIRY and _PAGE_ACCESSED.
> > Pros:
> >  - I/D TLB Miss never needs to write to the linux pte.
> >  - _PAGE_ACCESSED is only set on I/D TLB Error fixing accounting
> >  - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
> >     when a page has been made dirty.
>
> Not sure here. You seem to still set those from asm. Is that necessary ?

Well, yes. head_32.S also sets ACCESSED and DIRTY from asm so why not me?
The basic method I use is:
TLB gets mapped with NoAccess, then the first access will trap
into TLB error, where ACCESSED will be set if permission to access
the page is OK (and RW too). On first store 8xx will trap
into DTLB error and permissions is OK, DIRTY will be set too.
Is this wrong?
Do I have to trap to C for first store?

>
> The approach I take on BookE is to simply not set these from asm, -and-
> (this is important) not set write permission if dirty is not set in
> the TLB miss and set no access permission at all when accessed is not
> set. This is important or we'll miss dirty updates which can
> be fatal.

not sure, but this seems similar to what I do. DIRTY will be set,
in asm, on first store.
ACCESSED will only be set iff (USER && VALID)

>
> The C code in handle_pte_fault() will fixup missing access and dirty
> if necessary and flush.
>
> Also look at the 440 code, I think you could simplify your
> implementation using similar techniques, such as andc of PTE against
> requested bits etc... and thus maybe save a couple of insns.

Great, but first I want to make sure I'm doing it right :)

So is there some golden rule I have to follow?

 Jocke

>
> Cheers,
> Ben.
>
> >  - Proper RO/RW mapping of user space.
> >  - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
> > Cons:
> >  - 4 more instructions in I/D TLB Miss, but the since the linux pte is
> >    not written anymore, it should still be a win.
> > ---
> >  arch/powerpc/include/asm/pte-8xx.h |    9 +-
> >  arch/powerpc/kernel/head_8xx.S     |  163 ++++++++++++++++++++++++++----------
> >  2 files changed, 122 insertions(+), 50 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
> > index 8c6e312..af541a2 100644
> > --- a/arch/powerpc/include/asm/pte-8xx.h
> > +++ b/arch/powerpc/include/asm/pte-8xx.h
> > @@ -32,22 +32,21 @@
> >  #define _PAGE_FILE   0x0002   /* when !present: nonlinear file mapping */
> >  #define _PAGE_NO_CACHE   0x0002   /* I: cache inhibit */
> >  #define _PAGE_SHARED   0x0004   /* No ASID (context) compare */
> > +#define _PAGE_DIRTY   0x0100   /* C: page changed */
> >
> >  /* These five software bits must be masked out when the entry is loaded
> >   * into the TLB.
> >   */
> >  #define _PAGE_EXEC   0x0008   /* software: i-cache coherency required */
> >  #define _PAGE_GUARDED   0x0010   /* software: guarded access */
> > -#define _PAGE_DIRTY   0x0020   /* software: page changed */
> > -#define _PAGE_RW   0x0040   /* software: user write access allowed */
> > -#define _PAGE_ACCESSED   0x0080   /* software: page referenced */
> > +#define _PAGE_USER   0x0020   /* software: User space access */
> >
> >  /* Setting any bits in the nibble with the follow two controls will
> >   * require a TLB exception handler change.  It is assumed unused bits
> >   * are always zero.
> >   */
> > -#define _PAGE_HWWRITE   0x0100   /* h/w write enable: never set in Linux PTE */
> > -#define _PAGE_USER   0x0800   /* One of the PP bits, the other is USER&~RW */
> > +#define _PAGE_RW   0x0400   /* lsb PP bits */
> > +#define _PAGE_ACCESSED   0x0800   /* msb PP bits */
> >
> >  #define _PMD_PRESENT   0x0001
> >  #define _PMD_BAD   0x0ff0
> > diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
> > index 118bb05..b1f72d9 100644
> > --- a/arch/powerpc/kernel/head_8xx.S
> > +++ b/arch/powerpc/kernel/head_8xx.S
> > @@ -333,21 +333,15 @@ InstructionTLBMiss:
> >     mfspr   r11, SPRN_MD_TWC   /* ....and get the pte address */
> >     lwz   r10, 0(r11)   /* Get the pte */
> >
> > -#ifdef CONFIG_SWAP
> > -   /* do not set the _PAGE_ACCESSED bit of a non-present page */
> > -   andi.   r11, r10, _PAGE_PRESENT
> > -   beq   4f
> > -   ori   r10, r10, _PAGE_ACCESSED
> > -   mfspr   r11, SPRN_MD_TWC   /* get the pte address again */
> > -   stw   r10, 0(r11)
> > -4:
> > -#else
> > -   ori   r10, r10, _PAGE_ACCESSED
> > -   stw   r10, 0(r11)
> > -#endif
> > -
> > +   andi.   r11, r10, _PAGE_USER | _PAGE_ACCESSED
> > +   cmpwi   cr0, r11, _PAGE_USER | _PAGE_ACCESSED
> > +   beq+   cr0, 5f   /* branch if access allowed */
> > +   rlwinm   r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
> > +   b   6f
> > +5:   xori   r10, r10, _PAGE_RW /* invert RW bit */
> > +6:
> >     /* The Linux PTE won't go exactly into the MMU TLB.
> > -    * Software indicator bits 21, 22 and 28 must be clear.
> > +    * Software indicator bit 28 must be clear.
> >      * Software indicator bits 24, 25, 26, and 27 must be
> >      * set.  All other Linux PTE bits control the behavior
> >      * of the MMU.
> > @@ -409,21 +403,15 @@ DataStoreTLBMiss:
> >     DO_8xx_CPU6(0x3b80, r3)
> >     mtspr   SPRN_MD_TWC, r11
> >
> > -#ifdef CONFIG_SWAP
> > -   /* do not set the _PAGE_ACCESSED bit of a non-present page */
> > -   andi.   r11, r10, _PAGE_PRESENT
> > -   beq   4f
> > -   ori   r10, r10, _PAGE_ACCESSED
> > -4:
> > -   /* and update pte in table */
> > -#else
> > -   ori   r10, r10, _PAGE_ACCESSED
> > -#endif
> > -   mfspr   r11, SPRN_MD_TWC   /* get the pte address again */
> > -   stw   r10, 0(r11)
> > -
> > +   andi.   r11, r10, _PAGE_USER | _PAGE_ACCESSED
> > +   cmpwi   cr0, r11, _PAGE_USER | _PAGE_ACCESSED
> > +   beq+   cr0, 5f   /* branch if access allowed */
> > +   rlwinm   r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
> > +   b   6f
> > +5:   xori   r10, r10, _PAGE_RW /* invert RW bit */
> > +6:
> >     /* The Linux PTE won't go exactly into the MMU TLB.
> > -    * Software indicator bits 21, 22 and 28 must be clear.
> > +    * Software indicator bit 28 must be clear.
> >      * Software indicator bits 24, 25, 26, and 27 must be
> >      * set.  All other Linux PTE bits control the behavior
> >      * of the MMU.
> > @@ -448,6 +436,91 @@ DataStoreTLBMiss:
> >   */
> >     . = 0x1300
> >  InstructionTLBError:
> > +#ifdef CONFIG_8xx_CPU6
> > +   stw   r3, 8(r0)
> > +#endif
> > +   DO_8xx_CPU6(0x3f80, r3)
> > +   mtspr   SPRN_M_TW, r10   /* Save a couple of working registers */
> > +   mfcr   r10
> > +   stw   r10, 0(r0)
> > +   stw   r11, 4(r0)
> > +
> > +   mfspr   r11, SPRN_SRR1
> > +   andis.   r11, r11, 0x5000   /* no translation, guarded */
> > +   bne   2f
> > +
> > +   mfspr   r10, SPRN_SRR0   /* Get effective address of fault */
> > +#ifdef CONFIG_8xx_CPU15
> > +   addi   r11, r10, 0x1000
> > +   tlbie   r11
> > +   addi   r11, r10, -0x1000
> > +   tlbie   r11
> > +#endif
> > +   DO_8xx_CPU6(0x3780, r3)
> > +   mtspr   SPRN_MD_EPN, r10   /* Have to use MD_EPN for walk, MI_EPN can't */
> > +   mfspr   r10, SPRN_M_TWB   /* Get level 1 table entry address */
> > +
> > +   /* If we are faulting a kernel address, we have to use the
> > +    * kernel page tables.
> > +    */
> > +   andi.   r11, r10, 0x0800   /* Address >= 0x80000000 */
> > +   beq   3f
> > +   lis   r11, swapper_pg_dir@h
> > +   ori   r11, r11, swapper_pg_dir@l
> > +   rlwimi   r10, r11, 0, 2, 19
> > +3:
> > +   lwz   r11, 0(r10)   /* Get the level 1 entry */
> > +   rlwinm.   r10, r11,0,0,19   /* Extract page descriptor page address */
> > +   beq   2f      /* If zero, don't try to find a pte */
> > +
> > +   /* We have a pte table, so load the MI_TWC with the attributes
> > +    * for this "segment."
> > +    */
> > +   ori   r11,r11,1      /* Set valid bit */
> > +   DO_8xx_CPU6(0x2b80, r3)
> > +   mtspr   SPRN_MI_TWC, r11   /* Set segment attributes */
> > +   DO_8xx_CPU6(0x3b80, r3)
> > +   mtspr   SPRN_MD_TWC, r11   /* Load pte table base address */
> > +
> > +   mfspr   r11, SPRN_SRR1
> > +   andi.   r11, r11, 0x4000 /* MSR[PR] */
> > +   mfspr   r11, SPRN_MD_TWC   /* ....and get the pte address */
> > +   lwz   r10, 0(r11)   /* Get the pte */
> > +   beq   5f /* Kernel access always OK */
> > +   andi.   r11,r10, _PAGE_USER
> > +   beq   2f
> > +5:   ori   r10, r10, _PAGE_ACCESSED
> > +   mfspr   r21, SPRN_MD_TWC   /* ....and get the pte address */
> > +   stw   r10, 0(r11)
> > +   xori   r10, r10, _PAGE_RW /* invert RW bit */
> > +
> > +   /* The Linux PTE won't go exactly into the MMU TLB.
> > +    * Software indicator bit 28 must be clear.
> > +    * Software indicator bits 24, 25, 26, and 27 must be
> > +    * set.  All other Linux PTE bits control the behavior
> > +    * of the MMU.
> > +    */
> > +   li   r11, 0x00f0
> > +   rlwimi   r10, r11, 0, 24, 28   /* Set 24-27, clear 28 */
> > +   DO_8xx_CPU6(0x2d80, r3)
> > +   mtspr   SPRN_MI_RPN, r10   /* Update TLB entry */
> > +
> > +   mfspr   r10, SPRN_M_TW   /* Restore registers */
> > +   lwz   r11, 0(r0)
> > +   mtcr   r11
> > +   lwz   r11, 4(r0)
> > +#ifdef CONFIG_8xx_CPU6
> > +   lwz   r3, 8(r0)
> > +#endif
> > +   rfi
> > +
> > +2:   mfspr   r10, SPRN_M_TW   /* Restore registers */
> > +   lwz   r11, 0(r0)
> > +   mtcr   r11
> > +   lwz   r11, 4(r0)
> > +#ifdef CONFIG_8xx_CPU6
> > +   lwz   r3, 8(r0)
> > +#endif
> >     b   InstructionAccess
> >
> >  /* This is the data TLB error on the MPC8xx.  This could be due to
> > @@ -472,8 +545,8 @@ DataTLBError:
> >     /* First, make sure this was a store operation.
> >     */
> >     mfspr   r10, SPRN_DSISR
> > -   andis.   r11, r10, 0x4800 /* no translation, no permission. */
> > -   bne   2f   /* branch if either is set */
> > +   andis.   r11, r10, 0x4000 /* no translation */
> > +   bne   2f   /* branch if set */
> >
> >     /* The EA of a data TLB miss is automatically stored in the MD_EPN
> >      * register.  The EA of a data TLB error is automatically stored in
> > @@ -522,26 +595,26 @@ DataTLBError:
> >     mfspr   r11, SPRN_MD_TWC      /* ....and get the pte address */
> >     lwz   r10, 0(r11)      /* Get the pte */
> >
> > -   andi.   r11, r10, _PAGE_RW   /* Is it writeable? */
> > -   beq   2f         /* Bail out if not */
> > +   mfspr   r11, SPRN_SRR1
> > +   andi.   r11, r11, 0x4000 /* MSR[PR] */
> > +   beq   5f /* Kernel access always OK */
> > +   andi.   r11,r10, _PAGE_USER
> > +   beq   2f
> > +5:   mfspr   r11, SPRN_DSISR
> > +   andis.   r11, r11, 0x0200   /* store */
> > +   beq   6f
> > +   andi.   r11, r10, _PAGE_RW   /* writeable? */
> > +   beq   2f /* branch if not */
> > +   ori   r10, r10, _PAGE_DIRTY | _PAGE_HWWRITE
> > +6:   ori   r10, r10, _PAGE_ACCESSED
> >
> > -   /* Update 'changed', among others.
> > -   */
> > -#ifdef CONFIG_SWAP
> > -   ori   r10, r10, _PAGE_DIRTY|_PAGE_HWWRITE
> > -   /* do not set the _PAGE_ACCESSED bit of a non-present page */
> > -   andi.   r11, r10, _PAGE_PRESENT
> > -   beq   4f
> > -   ori   r10, r10, _PAGE_ACCESSED
> > -4:
> > -#else
> > -   ori   r10, r10, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
> > -#endif
> >     mfspr   r11, SPRN_MD_TWC      /* Get pte address again */
> >     stw   r10, 0(r11)      /* and update pte in table */
> >
> > +   xori   r10, r10, _PAGE_RW   /* Invert RW bit */
> > +
> >     /* The Linux PTE won't go exactly into the MMU TLB.
> > -    * Software indicator bits 21, 22 and 28 must be clear.
> > +    * Software indicator bit 28 must be clear.
> >      * Software indicator bits 24, 25, 26, and 27 must be
> >      * set.  All other Linux PTE bits control the behavior
> >      * of the MMU.
>
>
>
>
Benjamin Herrenschmidt Oct. 5, 2009, 9:37 p.m. UTC | #3
On Mon, 2009-10-05 at 23:25 +0200, Joakim Tjernlund wrote:
> 
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 22:17:04:
> >
> > On Mon, 2009-10-05 at 14:16 +0200, Joakim Tjernlund wrote:
> > > Update the TLB asm to make proper use of _PAGE_DIRY and _PAGE_ACCESSED.
> > > Pros:
> > >  - I/D TLB Miss never needs to write to the linux pte.
> > >  - _PAGE_ACCESSED is only set on I/D TLB Error fixing accounting
> > >  - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
> > >     when a page has been made dirty.
> >
> > Not sure here. You seem to still set those from asm. Is that necessary ?
> 
> Well, yes. head_32.S also sets ACCESSED and DIRTY from asm so why not me?

Don't look at the hash 32 code :-)

> The basic method I use is:
> TLB gets mapped with NoAccess, then the first access will trap
> into TLB error, where ACCESSED will be set if permission to access
> the page is OK (and RW too). On first store 8xx will trap
> into DTLB error and permissions is OK, DIRTY will be set too.
> Is this wrong?

From your explanation it looks ok. IE. as long as !DIRTY -> not
writeable (will fault on store) and !ACCESSED means not accessible (will
fault on any access) then you are ok.

> Do I have to trap to C for first store?

Most of the time you do anyways since the PTE isn't populated at all. At
which point, Linux will populate a PTE with DIRTY and ACCESSED already
set. It should be reasonably rare to actually fault because DIRTY and/or
ACCESSED are missing.

> > The approach I take on BookE is to simply not set these from asm, -and-
> > (this is important) not set write permission if dirty is not set in
> > the TLB miss and set no access permission at all when accessed is not
> > set. This is important or we'll miss dirty updates which can
> > be fatal.
> 
> not sure, but this seems similar to what I do. DIRTY will be set,
> in asm, on first store.

Don't set DIRTY if !VALID

> ACCESSED will only be set iff (USER && VALID)

My point is that you should be able to simplify the code here, have only
the TLB miss look at the PTE, not the data access and instruction
access, and have the later be a boring exception going straight to C.

> >
> > The C code in handle_pte_fault() will fixup missing access and dirty
> > if necessary and flush.
> >
> > Also look at the 440 code, I think you could simplify your
> > implementation using similar techniques, such as andc of PTE against
> > requested bits etc... and thus maybe save a couple of insns.
> 
> Great, but first I want to make sure I doing it right :)
> 
> So is there some golden rule I have to follow?

Mostly only !ACCESSED -> no access permitted and !DIRTY -> no store
permitted and don't write anything back if !VALID.

Cheers,
Ben.

>  Jocke
> 
> >
> > Cheers,
> > Ben.
> >
> > >  - Proper RO/RW mapping of user space.
> > >  - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
> > > Cons:
> > >  - 4 more instructions in I/D TLB Miss, but the since the linux pte is
> > >    not written anymore, it should still be a win.
> > > ---
> > >  arch/powerpc/include/asm/pte-8xx.h |    9 +-
> > >  arch/powerpc/kernel/head_8xx.S     |  163 ++++++++++++++++++++++++++----------
> > >  2 files changed, 122 insertions(+), 50 deletions(-)
> > >
> > > diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
> > > index 8c6e312..af541a2 100644
> > > --- a/arch/powerpc/include/asm/pte-8xx.h
> > > +++ b/arch/powerpc/include/asm/pte-8xx.h
> > > @@ -32,22 +32,21 @@
> > >  #define _PAGE_FILE   0x0002   /* when !present: nonlinear file mapping */
> > >  #define _PAGE_NO_CACHE   0x0002   /* I: cache inhibit */
> > >  #define _PAGE_SHARED   0x0004   /* No ASID (context) compare */
> > > +#define _PAGE_DIRTY   0x0100   /* C: page changed */
> > >
> > >  /* These five software bits must be masked out when the entry is loaded
> > >   * into the TLB.
> > >   */
> > >  #define _PAGE_EXEC   0x0008   /* software: i-cache coherency required */
> > >  #define _PAGE_GUARDED   0x0010   /* software: guarded access */
> > > -#define _PAGE_DIRTY   0x0020   /* software: page changed */
> > > -#define _PAGE_RW   0x0040   /* software: user write access allowed */
> > > -#define _PAGE_ACCESSED   0x0080   /* software: page referenced */
> > > +#define _PAGE_USER   0x0020   /* software: User space access */
> > >
> > >  /* Setting any bits in the nibble with the follow two controls will
> > >   * require a TLB exception handler change.  It is assumed unused bits
> > >   * are always zero.
> > >   */
> > > -#define _PAGE_HWWRITE   0x0100   /* h/w write enable: never set in Linux PTE */
> > > -#define _PAGE_USER   0x0800   /* One of the PP bits, the other is USER&~RW */
> > > +#define _PAGE_RW   0x0400   /* lsb PP bits */
> > > +#define _PAGE_ACCESSED   0x0800   /* msb PP bits */
> > >
> > >  #define _PMD_PRESENT   0x0001
> > >  #define _PMD_BAD   0x0ff0
> > > diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
> > > index 118bb05..b1f72d9 100644
> > > --- a/arch/powerpc/kernel/head_8xx.S
> > > +++ b/arch/powerpc/kernel/head_8xx.S
> > > @@ -333,21 +333,15 @@ InstructionTLBMiss:
> > >     mfspr   r11, SPRN_MD_TWC   /* ....and get the pte address */
> > >     lwz   r10, 0(r11)   /* Get the pte */
> > >
> > > -#ifdef CONFIG_SWAP
> > > -   /* do not set the _PAGE_ACCESSED bit of a non-present page */
> > > -   andi.   r11, r10, _PAGE_PRESENT
> > > -   beq   4f
> > > -   ori   r10, r10, _PAGE_ACCESSED
> > > -   mfspr   r11, SPRN_MD_TWC   /* get the pte address again */
> > > -   stw   r10, 0(r11)
> > > -4:
> > > -#else
> > > -   ori   r10, r10, _PAGE_ACCESSED
> > > -   stw   r10, 0(r11)
> > > -#endif
> > > -
> > > +   andi.   r11, r10, _PAGE_USER | _PAGE_ACCESSED
> > > +   cmpwi   cr0, r11, _PAGE_USER | _PAGE_ACCESSED
> > > +   beq+   cr0, 5f   /* branch if access allowed */
> > > +   rlwinm   r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
> > > +   b   6f
> > > +5:   xori   r10, r10, _PAGE_RW /* invert RW bit */
> > > +6:
> > >     /* The Linux PTE won't go exactly into the MMU TLB.
> > > -    * Software indicator bits 21, 22 and 28 must be clear.
> > > +    * Software indicator bit 28 must be clear.
> > >      * Software indicator bits 24, 25, 26, and 27 must be
> > >      * set.  All other Linux PTE bits control the behavior
> > >      * of the MMU.
> > > @@ -409,21 +403,15 @@ DataStoreTLBMiss:
> > >     DO_8xx_CPU6(0x3b80, r3)
> > >     mtspr   SPRN_MD_TWC, r11
> > >
> > > -#ifdef CONFIG_SWAP
> > > -   /* do not set the _PAGE_ACCESSED bit of a non-present page */
> > > -   andi.   r11, r10, _PAGE_PRESENT
> > > -   beq   4f
> > > -   ori   r10, r10, _PAGE_ACCESSED
> > > -4:
> > > -   /* and update pte in table */
> > > -#else
> > > -   ori   r10, r10, _PAGE_ACCESSED
> > > -#endif
> > > -   mfspr   r11, SPRN_MD_TWC   /* get the pte address again */
> > > -   stw   r10, 0(r11)
> > > -
> > > +   andi.   r11, r10, _PAGE_USER | _PAGE_ACCESSED
> > > +   cmpwi   cr0, r11, _PAGE_USER | _PAGE_ACCESSED
> > > +   beq+   cr0, 5f   /* branch if access allowed */
> > > +   rlwinm   r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
> > > +   b   6f
> > > +5:   xori   r10, r10, _PAGE_RW /* invert RW bit */
> > > +6:
> > >     /* The Linux PTE won't go exactly into the MMU TLB.
> > > -    * Software indicator bits 21, 22 and 28 must be clear.
> > > +    * Software indicator bit 28 must be clear.
> > >      * Software indicator bits 24, 25, 26, and 27 must be
> > >      * set.  All other Linux PTE bits control the behavior
> > >      * of the MMU.
> > > @@ -448,6 +436,91 @@ DataStoreTLBMiss:
> > >   */
> > >     . = 0x1300
> > >  InstructionTLBError:
> > > +#ifdef CONFIG_8xx_CPU6
> > > +   stw   r3, 8(r0)
> > > +#endif
> > > +   DO_8xx_CPU6(0x3f80, r3)
> > > +   mtspr   SPRN_M_TW, r10   /* Save a couple of working registers */
> > > +   mfcr   r10
> > > +   stw   r10, 0(r0)
> > > +   stw   r11, 4(r0)
> > > +
> > > +   mfspr   r11, SPRN_SRR1
> > > +   andis.   r11, r11, 0x5000   /* no translation, guarded */
> > > +   bne   2f
> > > +
> > > +   mfspr   r10, SPRN_SRR0   /* Get effective address of fault */
> > > +#ifdef CONFIG_8xx_CPU15
> > > +   addi   r11, r10, 0x1000
> > > +   tlbie   r11
> > > +   addi   r11, r10, -0x1000
> > > +   tlbie   r11
> > > +#endif
> > > +   DO_8xx_CPU6(0x3780, r3)
> > > +   mtspr   SPRN_MD_EPN, r10   /* Have to use MD_EPN for walk, MI_EPN can't */
> > > +   mfspr   r10, SPRN_M_TWB   /* Get level 1 table entry address */
> > > +
> > > +   /* If we are faulting a kernel address, we have to use the
> > > +    * kernel page tables.
> > > +    */
> > > +   andi.   r11, r10, 0x0800   /* Address >= 0x80000000 */
> > > +   beq   3f
> > > +   lis   r11, swapper_pg_dir@h
> > > +   ori   r11, r11, swapper_pg_dir@l
> > > +   rlwimi   r10, r11, 0, 2, 19
> > > +3:
> > > +   lwz   r11, 0(r10)   /* Get the level 1 entry */
> > > +   rlwinm.   r10, r11,0,0,19   /* Extract page descriptor page address */
> > > +   beq   2f      /* If zero, don't try to find a pte */
> > > +
> > > +   /* We have a pte table, so load the MI_TWC with the attributes
> > > +    * for this "segment."
> > > +    */
> > > +   ori   r11,r11,1      /* Set valid bit */
> > > +   DO_8xx_CPU6(0x2b80, r3)
> > > +   mtspr   SPRN_MI_TWC, r11   /* Set segment attributes */
> > > +   DO_8xx_CPU6(0x3b80, r3)
> > > +   mtspr   SPRN_MD_TWC, r11   /* Load pte table base address */
> > > +
> > > +   mfspr   r11, SPRN_SRR1
> > > +   andi.   r11, r11, 0x4000 /* MSR[PR] */
> > > +   mfspr   r11, SPRN_MD_TWC   /* ....and get the pte address */
> > > +   lwz   r10, 0(r11)   /* Get the pte */
> > > +   beq   5f /* Kernel access always OK */
> > > +   andi.   r11,r10, _PAGE_USER
> > > +   beq   2f
> > > +5:   ori   r10, r10, _PAGE_ACCESSED
> > > +   mfspr   r21, SPRN_MD_TWC   /* ....and get the pte address */
> > > +   stw   r10, 0(r11)
> > > +   xori   r10, r10, _PAGE_RW /* invert RW bit */
> > > +
> > > +   /* The Linux PTE won't go exactly into the MMU TLB.
> > > +    * Software indicator bit 28 must be clear.
> > > +    * Software indicator bits 24, 25, 26, and 27 must be
> > > +    * set.  All other Linux PTE bits control the behavior
> > > +    * of the MMU.
> > > +    */
> > > +   li   r11, 0x00f0
> > > +   rlwimi   r10, r11, 0, 24, 28   /* Set 24-27, clear 28 */
> > > +   DO_8xx_CPU6(0x2d80, r3)
> > > +   mtspr   SPRN_MI_RPN, r10   /* Update TLB entry */
> > > +
> > > +   mfspr   r10, SPRN_M_TW   /* Restore registers */
> > > +   lwz   r11, 0(r0)
> > > +   mtcr   r11
> > > +   lwz   r11, 4(r0)
> > > +#ifdef CONFIG_8xx_CPU6
> > > +   lwz   r3, 8(r0)
> > > +#endif
> > > +   rfi
> > > +
> > > +2:   mfspr   r10, SPRN_M_TW   /* Restore registers */
> > > +   lwz   r11, 0(r0)
> > > +   mtcr   r11
> > > +   lwz   r11, 4(r0)
> > > +#ifdef CONFIG_8xx_CPU6
> > > +   lwz   r3, 8(r0)
> > > +#endif
> > >     b   InstructionAccess
> > >
> > >  /* This is the data TLB error on the MPC8xx.  This could be due to
> > > @@ -472,8 +545,8 @@ DataTLBError:
> > >     /* First, make sure this was a store operation.
> > >     */
> > >     mfspr   r10, SPRN_DSISR
> > > -   andis.   r11, r10, 0x4800 /* no translation, no permission. */
> > > -   bne   2f   /* branch if either is set */
> > > +   andis.   r11, r10, 0x4000 /* no translation */
> > > +   bne   2f   /* branch if set */
> > >
> > >     /* The EA of a data TLB miss is automatically stored in the MD_EPN
> > >      * register.  The EA of a data TLB error is automatically stored in
> > > @@ -522,26 +595,26 @@ DataTLBError:
> > >     mfspr   r11, SPRN_MD_TWC      /* ....and get the pte address */
> > >     lwz   r10, 0(r11)      /* Get the pte */
> > >
> > > -   andi.   r11, r10, _PAGE_RW   /* Is it writeable? */
> > > -   beq   2f         /* Bail out if not */
> > > +   mfspr   r11, SPRN_SRR1
> > > +   andi.   r11, r11, 0x4000 /* MSR[PR] */
> > > +   beq   5f /* Kernel access always OK */
> > > +   andi.   r11,r10, _PAGE_USER
> > > +   beq   2f
> > > +5:   mfspr   r11, SPRN_DSISR
> > > +   andis.   r11, r11, 0x0200   /* store */
> > > +   beq   6f
> > > +   andi.   r11, r10, _PAGE_RW   /* writeable? */
> > > +   beq   2f /* branch if not */
> > > +   ori   r10, r10, _PAGE_DIRTY | _PAGE_HWWRITE
> > > +6:   ori   r10, r10, _PAGE_ACCESSED
> > >
> > > -   /* Update 'changed', among others.
> > > -   */
> > > -#ifdef CONFIG_SWAP
> > > -   ori   r10, r10, _PAGE_DIRTY|_PAGE_HWWRITE
> > > -   /* do not set the _PAGE_ACCESSED bit of a non-present page */
> > > -   andi.   r11, r10, _PAGE_PRESENT
> > > -   beq   4f
> > > -   ori   r10, r10, _PAGE_ACCESSED
> > > -4:
> > > -#else
> > > -   ori   r10, r10, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
> > > -#endif
> > >     mfspr   r11, SPRN_MD_TWC      /* Get pte address again */
> > >     stw   r10, 0(r11)      /* and update pte in table */
> > >
> > > +   xori   r10, r10, _PAGE_RW   /* Invert RW bit */
> > > +
> > >     /* The Linux PTE won't go exactly into the MMU TLB.
> > > -    * Software indicator bits 21, 22 and 28 must be clear.
> > > +    * Software indicator bit 28 must be clear.
> > >      * Software indicator bits 24, 25, 26, and 27 must be
> > >      * set.  All other Linux PTE bits control the behavior
> > >      * of the MMU.
> >
> >
> >
> >
Joakim Tjernlund Oct. 5, 2009, 10 p.m. UTC | #4
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 23:37:23:
>
> On Mon, 2009-10-05 at 23:25 +0200, Joakim Tjernlund wrote:
> >
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 22:17:04:
> > >
> > > On Mon, 2009-10-05 at 14:16 +0200, Joakim Tjernlund wrote:
> > > > Update the TLB asm to make proper use of _PAGE_DIRTY and _PAGE_ACCESSED.
> > > > Pros:
> > > >  - I/D TLB Miss never needs to write to the linux pte.
> > > >  - _PAGE_ACCESSED is only set on I/D TLB Error fixing accounting
> > > >  - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
> > > >     when a page has been made dirty.
> > >
> > > Not sure here. You seem to still set those from asm. Is that necessary ?
> >
> > Well, yes. head_32.S also sets ACCESSED and DIRTY from asm so why not me?
>
> Don't look at the hash 32 code :-)

So what to look at then? :)

>
> > The basic method I use is:
> > TLB gets mapped with NoAccess, then the first access will trap
> > into TLB error, where ACCESSED will be set if permission to access
> > the page is OK (and RW too). On first store 8xx will trap
> > into DTLB error and permissions is OK, DIRTY will be set too.
> > Is this wrong?
>
> From your explanation it looks ok. IE. as long as !DIRTY -> not
> writeable (will fault on store) and !ACCESSED means not accessible (will
> fault on any access) then you are ok.

Yes, that is what I have (tried) to do.

>
> > Do I have to trap to C for first store?
>
> Most of the time you do anyways since the PTE isn't populated at all. At
> which point, Linux will populate a PTE with DIRTY and ACCESSED already
> set. It should be reasonably rare to actually fault because DIRTY and/or
> ACCESSED are missing.

I tried to unconditionally trap to C in DTLB error but it just hung if I did
that so the asm has to do something.

>
> > > The approach I take on BookE is to simply not set these from asm, -and-
> > > (this is important) not set write permission if dirty is not set in

Did you really mean "if dirty is not" ? I test RW for write permission.
Dirty is just set when the first write happens after the permission check.

> > > the TLB miss and set no access permission at all when accessed is not
> > > set. This is important or we'll miss dirty updates which can
> > > be fatal.
> >
> > not sure, but this seems similar to what I do. DIRTY will be set,
> > in asm, on first store.
>
> Don't set DIRTY if !VALID

I don't. !VALID trap to C

>
> > ACCESSED will only be set iff (USER && VALID)
>
> My point is that you should be able to simplify the code here, have only
> the TLB miss look at the PTE, not the data access and instruction
> access, and have the later be a boring exception going straight to C.

I do this, TLB Miss only looks at TLB and then transforms it into a HW pte.
> The HW pte does not map 1:1 so I need to do some magic to fit the linux pte
into a HW pte.

>
> > >
> > > The C code in handle_pte_fault() will fixup missing access and dirty
> > > if necessary and flush.
> > >
> > > Also look at the 440 code, I think you could simplify your
> > > implementation using similar techniques, such as andc of PTE against
> > > requested bits etc... and thus maybe save a couple of insns.
> >
> > Great, but first I want to make sure I doing it right :)
> >
> > So is there some golden rule I have to follow?
>
> Mostly only !ACCESSED -> no access permitted and !DIRTY -> no store
> permitted and don't write anything back if !VALID.

That should be !RW and not !DIRTY I hope? Then trap
first store and set DIRTY (without trapping to C)

>
> Cheers,
> Ben.
>
> >  Jocke
> >
> > >
> > > Cheers,
> > > Ben.
> > >
> > > >  - Proper RO/RW mapping of user space.
> > > >  - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
> > > > Cons:
> > > >  - 4 more instructions in I/D TLB Miss, but since the linux pte is
> > > >    not written anymore, it should still be a win.
Benjamin Herrenschmidt Oct. 5, 2009, 10:09 p.m. UTC | #5
On Tue, 2009-10-06 at 00:00 +0200, Joakim Tjernlund wrote:
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 23:37:23:
> >
> > On Mon, 2009-10-05 at 23:25 +0200, Joakim Tjernlund wrote:
> > >
> > > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 22:17:04:
> > > >
> > > > On Mon, 2009-10-05 at 14:16 +0200, Joakim Tjernlund wrote:
> > > > > > Update the TLB asm to make proper use of _PAGE_DIRTY and _PAGE_ACCESSED.
> > > > > Pros:
> > > > >  - I/D TLB Miss never needs to write to the linux pte.
> > > > >  - _PAGE_ACCESSED is only set on I/D TLB Error fixing accounting
> > > > >  - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
> > > > >     when a page has been made dirty.
> > > >
> > > > Not sure here. You seem to still set those from asm. Is that necessary ?
> > >
> > > Well, yes. head_32.S also sets ACCESSED and DIRTY from asm so why not me?
> >
> > Don't look at the hash 32 code :-)
> 
> So what to look at then? :)

head_440.S on a recent 2.6

> > Most of the time you do anyways since the PTE isn't populated at all. At
> > which point, Linux will populate a PTE with DIRTY and ACCESSED already
> > set. It should be reasonably rare to actually fault because DIRTY and/or
> > ACCESSED are missing.
> 
> I tried to unconditionally trap to C in DTLB error but it just hung if I did
> that so the asm has to do something.

Sure, the question is what and how can we get away without it ? :-)

> > > > The approach I take on BookE is to simply not set these from asm, -and-
> > > > (this is important) not set write permission if dirty is not set in
> 
> Did you really mean "if dirty is not" ? I test RW for write permission.
> Dirty is just set when the first write happens after the permission check.

Sure but if dirty is cleared by the kernel, then you also need to remove
write permission in the TLB or it will miss setting dirty on the next
store to the page.

So dirty basically acts as a filter on RW, and accessed as a filter on
valid if you want.

> > Mostly only !ACCESSED -> no access permitted and !DIRTY -> no store
> > permitted and don't write anything back if !VALID.
> 
> That should be !RW and not !DIRTY I hope? Then trap
> first store and set DIRTY (without trapping to C)

No it's really !(RW _AND_ DIRTY) -> no store permitted, and
!(PRESENT _AND_ ACCESSED) -> no access permitted.

Cheers,
Ben.

> >
> > Cheers,
> > Ben.
> >
> > >  Jocke
> > >
> > > >
> > > > Cheers,
> > > > Ben.
> > > >
> > > > >  - Proper RO/RW mapping of user space.
> > > > >  - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
> > > > > Cons:
> > > > > >  - 4 more instructions in I/D TLB Miss, but since the linux pte is
> > > > >    not written anymore, it should still be a win.
Joakim Tjernlund Oct. 5, 2009, 10:55 p.m. UTC | #6
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 06/10/2009 00:09:16:
>
> On Tue, 2009-10-06 at 00:00 +0200, Joakim Tjernlund wrote:
> > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 23:37:23:
> > >
> > > On Mon, 2009-10-05 at 23:25 +0200, Joakim Tjernlund wrote:
> > > >
> > > > Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 05/10/2009 22:17:04:
> > > > >
> > > > > On Mon, 2009-10-05 at 14:16 +0200, Joakim Tjernlund wrote:
> > > > > > > Update the TLB asm to make proper use of _PAGE_DIRTY and _PAGE_ACCESSED.
> > > > > > Pros:
> > > > > >  - I/D TLB Miss never needs to write to the linux pte.
> > > > > >  - _PAGE_ACCESSED is only set on I/D TLB Error fixing accounting
> > > > > >  - _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
> > > > > >     when a page has been made dirty.
> > > > >
> > > > > Not sure here. You seem to still set those from asm. Is that necessary ?
> > > >
> > > > Well, yes. head_32.S also sets ACCESSED and DIRTY from asm so why not me?
> > >
> > > Don't look at the hash 32 code :-)
> >
> > So what to look at then? :)
>
> head_440.S on a recent 2.6

OK, will look

>
> > > Most of the time you do anyways since the PTE isn't populated at all. At
> > > which point, Linux will populate a PTE with DIRTY and ACCESSED already
> > > set. It should be reasonably rare to actually fault because DIRTY and/or
> > > ACCESSED are missing.
> >
> > I tried to unconditionally trap to C in DTLB error but it just hung if I did
> > that so the asm has to do something.
>
> Sure, the question is what and how can we get away without it ? :-)

You tell me :)

>
> > > > > The approach I take on BookE is to simply not set these from asm, -and-
> > > > > (this is important) not set write permission if dirty is not set in
> >
> > Did you really mean "if dirty is not" ? I test RW for write permission.
> > Dirty is just set when the first write happens after the permission check.
>
> Sure but if dirty is cleared by the kernel, then you also need to remove
> write permission in the TLB or it will miss setting dirty on the next
> store to the page.

Dirty map to HW changed bit so if kernel clears it, the next
store will trap to DTLB error. There I will check RW and clear it again
without trapping to C. Is that OK? Not if I read you correcly below ..

>
> So dirty basically acts as a filter on RW, and accessed as a filter on
> valid if you want.
>
> > > Mostly only !ACCESSED -> no access permitted and !DIRTY -> no store
> > > permitted and don't write anything back if !VALID.
> >
> > That should be !RW and not !DIRTY I hope? Then trap
> > first store and set DIRTY (without trapping to C)
>
> No it's really !(RW _AND_ DIRTY) -> no store permitted, and

..  hmm, I don't do that. I should do a
    if (store && !(pte & (RW | DIRTY)))
       goto DSI
 in DTLB error?

> !(PRESENT _AND_ ACCESSED) -> no access permitted.

Yes, how about those pinned kernel ptes. Do they have ACCESSED from start?

>
> Cheers,
> Ben.
>
> > >
> > > Cheers,
> > > Ben.
> > >
> > > >  Jocke
> > > >
> > > > >
> > > > > Cheers,
> > > > > Ben.
> > > > >
> > > > > >  - Proper RO/RW mapping of user space.
> > > > > >  - Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
> > > > > > Cons:
> > > > > >  - 4 more instructions in I/D TLB Miss, but since the linux pte is
> > > > > >    not written anymore, it should still be a win.
>
>
>
>
Benjamin Herrenschmidt Oct. 5, 2009, 11:15 p.m. UTC | #7
On Tue, 2009-10-06 at 00:55 +0200, Joakim Tjernlund wrote:
> > Sure but if dirty is cleared by the kernel, then you also need to
> remove
> > write permission in the TLB or it will miss setting dirty on the
> next
> > store to the page.
> 
> Dirty map to HW changed bit so if kernel clears it, the next
> store will trap to DTLB error. There I will check RW and clear it
> again
> without trapping to C. Is that OK? Not if I read you correcly below ..

Well, if the HW has the ability to enforce trap when store with !DIRTY,
then that's fine, just map it that way, but you shouldn't have to handle
it in the DTLB error neither, the kernel will fix it up for you in
handle_pte_fault().

I think I need to get myself an 8xx manual and figure out what that damn
MMU actually does :-)

> No it's really !(RW _AND_ DIRTY) -> no store permitted, and
> 
> ..  hmm, I don't do that. I should do a
>     if ( store && !(pte & (RW | DIRTY))
>        goto DSI
>  in DTLB error?

Well, if the HW does the test of DIRTY, then it's fine, as you seem to
suggest above. _something_ needs to do it, either SW or HW, ie, we need
to catch any attempt to store to a non-dirty page to set dirty.

> > !(PRESENT _AND_ ACCESSED) -> no access permitted.
> 
> Yes, how about those pinned kernel ptes. Do they have ACCESSED from
> start?

They should.

Cheers,
Ben.
Joakim Tjernlund Oct. 5, 2009, 11:35 p.m. UTC | #8
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 06/10/2009 01:15:39:
>
> On Tue, 2009-10-06 at 00:55 +0200, Joakim Tjernlund wrote:
> > > Sure but if dirty is cleared by the kernel, then you also need to
> > remove
> > > write permission in the TLB or it will miss setting dirty on the
> > next
> > > store to the page.
> >
> > Dirty map to HW changed bit so if kernel clears it, the next
> > store will trap to DTLB error. There I will check RW and clear it
> > again
> > without trapping to C. Is that OK? Not if I read you correcly below ..
>
> Well, if the HW has the ability to enforce trap when store with !DIRTY,

Yes, provided that the kernel invalidates the TLB too so the next access
will provoke a TLB Miss, which will then provoke a TLB error. The TLB
error routine checks VALID, RW and USER(if not a kernel access), then sets
ACCESSED & DIRTY and writes the TLB(RPN reg).

Perhaps the missing invalidate is haunting us here?

> then that's fine, just map it that way, but you shouldn't have to handle
> it in the DTLB error neither, the kernel will fix it up for you in
> handle_pte_fault().

Does not all ppc have the Changed bit?

>
> I think I need to get myself an 8xx manual and figure out what that damn
> MMU actually does :-)

Please do, get the mpc862 users manual :)

>
> > No it's really !(RW _AND_ DIRTY) -> no store permitted, and
> >
> > ..  hmm, I don't do that. I should do a
> >     if ( store && !(pte & (RW | DIRTY))
> >        goto DSI
> >  in DTLB error?
>
> Well, if the HW does the test of DIRTY, then it's fine, as you seem to
> suggest above. _something_ needs to do it, either SW or HW, ie, we need
> to catch any attempt to store to a non-dirty page to set dirty.

Yes, I think I do this.

>
> > > !(PRESENT _AND_ ACCESSED) -> no access permitted.
> >
> > Yes, how about those pinned kernel ptes. Do they have ACCESSED from
> > start?
>
> They should.
>
> Cheers,
> Ben.
>
>
>
>
Benjamin Herrenschmidt Oct. 6, 2009, 12:34 a.m. UTC | #9
On Tue, 2009-10-06 at 01:35 +0200, Joakim Tjernlund wrote:
> 
> > Well, if the HW has the ability to enforce trap when store with !
> DIRTY,
> 
> Yes, provided that the kernel invalidates the TLB too so the next
> access
> will provoke a TLB Miss, which will then provoke a TLB error. The TLB
> error routine checks VALID, RW and USER(if not a kernel access), then
> sets
> ACCESSED & DIRTY and writes the TLB(RPN reg).
> 
> Perhaps the missing invalidate is haunting us here?

No, the kernel will invalidate when clearing dirty or accessed, I don't
think that's our problem.

This is still all inefficient, we end up basically with two traps.

8xx provides backup GPRs when doing TLB misses ? What does it cost to
jump out of a TLB miss back into "normal" context ?

IE. What I do on 440 is I set a mask of required bits, basically
_PAGE_PRESENT | _PAGE_ACCESSED is the base. The DTLB miss also sticks
in _PAGE_RW | _PAGE_DIRTY when it's a store fault.

Then, I andc. the PTE value off that mask, and if the result is non-0
(which means one of the required bits is missing), I get out of the TLB
miss immediately and go to the data (or instruction) access interrupt.

Once you've done that, you should be able to have data and instruction
access go straight to C. Missing _PAGE_ACCESSED and _PAGE_DIRTY are
going to be fixed up by generic code.

> > then that's fine, just map it that way, but you shouldn't have to
> handle
> > it in the DTLB error neither, the kernel will fix it up for you in
> > handle_pte_fault().
> 
> Does not all ppc have the Changed bit?

No. BookE doesn't.

> Please do, get the mpc862 users manual :)

Ok :-)

Cheers,
Ben.
Joakim Tjernlund Oct. 6, 2009, 6:15 a.m. UTC | #10
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 06/10/2009 02:34:15:
>
> On Tue, 2009-10-06 at 01:35 +0200, Joakim Tjernlund wrote:
> >
> > > Well, if the HW has the ability to enforce trap when store with !
> > DIRTY,
> >
> > Yes, provided that the kernel invalidates the TLB too so the next
> > access
> > will provoke a TLB Miss, which will then provoke a TLB error. The TLB
> > error routine checks VALID, RW and USER(if not a kernel access), then
> > sets
> > ACCESSED & DIRTY and writes the TLB(RPN reg).
> >
> > Perhaps the missing invalidate is haunting us here?
>
> No, the kernel will invalidate when clearing dirty or accessed, I don't
> think that's our problem.
>
> This is still all inefficient, we end up basically with two traps.

Yes, but once the 2 traps is over, it gets much cheaper plus I don't
get a choice, see below.

>
> 8xx provides backup GPRs when doing TLB misses ? What does it cost to
> jump out of a TLB miss back into "normal" context ?

Nope, there is just one TLB scratch register. I have been meaning to
ask you about SPRG2, it seems unused?
There is a leftover from 2.4 that inits G2 to something but the
it appears unused otherwise.

>
> IE. What I do on 440 is I set a mask of required bits, basically
> _PAGE_PRESENT | _PAGE_ACCESSED is the base. The DTLB miss also sticks
> in _PAGE_RW | _PAGE_DIRTY when it's a store fault.

Yes, I would too but TLB Miss knows nothing about load/store, protection etc.
because DSISR isn't set. So I cannot see any other way than the TLB Error way.

>
> Then, I andc. the PTE value off that mask, and if the result is non-0
> (which means one of the required bits is missing), I get out of the TLB
> miss immediately and go to the data (or instruction) access interrupt.
>
> Once you've done that, you should be able to have data and instruction
> access go straight to C. Missing _PAGE_ACCESSED and _PAGE_DIRTY are
> going to be fixed up by generic code.
>
> > > then that's fine, just map it that way, but you shouldn't have to
> > handle
> > > it in the DTLB error neither, the kernel will fix it up for you in
> > > handle_pte_fault().
> >
> > Does not all ppc have the Changed bit?
>
> No. BookE doesn't.

But I guess BookE knows if it is a load or store in TLB Miss?
Then it can emulate changed bit I guess.
Benjamin Herrenschmidt Oct. 6, 2009, 6:45 a.m. UTC | #11
On Tue, 2009-10-06 at 08:15 +0200, Joakim Tjernlund wrote:

> Yes, I would too but TLB Miss knows nothing about load/store, protection etc.
> because DSISR isn't set. So I cannot see any other way than the TLB Error way.

Hrm... that MMU really sucks more than I thought :-(

I'll go read the manual and think about that a bit more.

> But I guess BookE knows if it is a load or store in TLB Miss?
> Then it can emulate changed bit I guess.

Yes, it knows.

Cheers,
Ben.
Joakim Tjernlund Oct. 6, 2009, 7:54 a.m. UTC | #12
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 06/10/2009 08:45:47:
>
> On Tue, 2009-10-06 at 08:15 +0200, Joakim Tjernlund wrote:
>
> > Yes, I would too but TLB Miss knows nothing about load/store, protection etc.
> > because DSISR isn't set. So I cannot see any other way than the TLB Error way.
>
> Hrm... that MMU really sucks more than I thought :-(

Yeah, but at least it got a Changed bit as opposed to some other
arch mentioned in this mail thread :)

>
> I'll go read the manual and think about that a bit more.
Thanks

I wonder if not much of the problems Scott and Rex are seeing are from
dcbX insn and that my fixup isn't quite working yet. I am tempted
to get the asm version working again as then I don't have to worry about
permission and get_user() et. all :)
Joakim Tjernlund Oct. 6, 2009, 3:40 p.m. UTC | #13
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 06/10/2009 08:45:47:
>
> On Tue, 2009-10-06 at 08:15 +0200, Joakim Tjernlund wrote:
>
> > Yes, I would too but TLB Miss knows nothing about load/store, protection etc.
> > because DSISR isn't set. So I cannot see any other way than the TLB Error way.
>
> Hrm... that MMU really sucks more than I thought :-(
>
> I'll go read the manual and think about that a bit more.

I realise now that for an ITLB Miss I can work out everything I need.
an ITLB must be a read and everything else is in the linux pte.
Just check present and user, add accessed if not already there.
I blame that on too little sleep :)

I don't think I will dwell on that ATM.

    Jocke
Joakim Tjernlund Oct. 6, 2009, 5:28 p.m. UTC | #14
>
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 06/10/2009 08:45:47:
> >
> > On Tue, 2009-10-06 at 08:15 +0200, Joakim Tjernlund wrote:
> >
> > > Yes, I would too but TLB Miss knows nothing about load/store, protection etc.
> > > because DSISR isn't set. So I cannot see any other way than the TLB Error way.
> >
> > Hrm... that MMU really sucks more than I thought :-(
> >
> > I'll go read the manual and think about that a bit more.
>
> I realise now that for an ITLB Miss I can work out everything I need.
> an ITLB must be a read en everything else is in the linux pte.
> Just check present and user, add accessed if not already there.
> I blame that on too little sleep :)

bugger,  more bugs I think. Need to fix the TLB code some more
Joakim Tjernlund Oct. 6, 2009, 10:05 p.m. UTC | #15
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 06/10/2009 02:34:15:
>
> On Tue, 2009-10-06 at 01:35 +0200, Joakim Tjernlund wrote:
> >
> > > Well, if the HW has the ability to enforce trap when store with !
> > DIRTY,
> >
> > Yes, provided that the kernel invalidates the TLB too so the next
> > access
> > will provoke a TLB Miss, which will then provoke a TLB error. The TLB
> > error routine checks VALID, RW and USER(if not a kernel access), then
> > sets
> > ACCESSED & DIRTY and writes the TLB(RPN reg).
> >
> > Perhaps the missing invalidate is haunting us here?
>
> No, the kernel will invalidate when clearing dirty or accessed, I don't
> think that's our problem.
>
> This is still all inefficient, we end up basically with two traps.
>
> 8xx provides backup GPRs when doing TLB misses ? What does it cost to
> jump out of a TLB miss back into "normal" context ?
>
> IE. What I do on 440 is I set a mask of required bits, basically
> _PAGE_PRESENT | _PAGE_ACCESSED is the base. The DTLB miss also sticks
> in _PAGE_RW | _PAGE_DIRTY when it's a store fault.

After some more thinking I don't think I do TLB Miss/Error correctly yet.
The problem is ACCESSED. Since I don't know if load or store in TLB Miss
I must choose:
 - Assume load and do what you do above. That will incorrectly
   set ACCESSED on store ops when mapped as RO(plus whatever more I haven't thought about yet)

 - Trap to TLB Error and do the above. That will set ACCESSED correctly
   but won't trap kernel space so these remain what they are.
   Is anything depending on ACCESSED for kernel pages?
   if so, what if we set ACCESSED on all kernel pages when mapping them at boot?
   Will SWAP or some other service(accounting?) not like that?

Finally, why do you need to include DIRTY when a store OP?
Do you need to do COW before dirtying the page?
Seems to work for me to just set DIRTY in TLB Error if RW is set too
and not trap to C.
Benjamin Herrenschmidt Oct. 6, 2009, 11:25 p.m. UTC | #16
> After some more thinking I don't think I do TLB Miss/Error correctly yet.
> The problem is ACCESSED. Since I don't know if load or store in TLB Miss
> I must choose:
>  - Assume load and do what you do above. That will incorrectly
>    set ACCESSED on store ops when mapped as RO(plus whatever more I haven't thought about yet)
> 
>  - Trap to TLB Error and do the above. That will set ACCESSED correctly
>    but won't trap kernel space so these remain what they are.
>    Is anything depending on ACCESSED for kernel pages?
>    if so, what if we set ACCESSED on all kernel pages when mapping them at boot?
>    Will SWAP or some other service(accounting?) not like that?

Kernel pages always have ACCESSED set and never clear it.

I think the only solution here is that if -anything- doesn't look right,
the TLB miss should create one of those "unpopulated" entries and we
need to make sure they are properly invalidated in the subsequent fault
path. I'll read the 8xx doco asap to make sure I get it right.

> Finally, why do you need to include DIRTY when a store OP?
> Do you need to do COW before dirtying the page?
> Seems to work for me to just set DIRTY in TLB Error if RW is set too
> and not trap to C.

You can, but what about a load ? That shouldn't set dirty. So if you set
dirty only on stores, then a load will bring in a page without dirty,
you need to make sure you get another TLB error when writing to it so
you get a chance to set dirty.

Ben.
Benjamin Herrenschmidt Oct. 7, 2009, 1:07 a.m. UTC | #17
Allright, did a bit of reading of doco and code..

Doco isn't totally clear though. At some stage, it -hints- that in case
of a TLB "error" (match on EA/ASID but incorrect
protection/valid/changed/...) the offending TLB entry is automatically
invalidated. Do you know if that is correct ?

I would hope so since we never do anything to remove the "invalid"
entries we write to the TLB when hitting non-present PTEs but then, that
may also explain some of our problems...

Now, a few comments from what I read in the code:

 - The whole writeback could be avoided in Instruction/Data TLB miss,
but for that, you need to make sure that the TLB entry we create has
valid set only if -both- present and accessed are set. That would save
in the case of CONFIG_SWAP, a store and a read back of TWC I suppose,
but you do need to find a way to do that ANDing of ACCESSED and PRESENT.

 - I think we can get rid of HWWRITE. We can make CHANGED be the current
HWWRITE value, I agree with you, which matches the HW changed bit. We
need to be a bit careful of how we setup the PP bits tho. At this stage,
I see several approaches:

   * One is to basically "generate" the right PP bits based on a
combination of _PAGE_USER and _PAGE_RW. That's the simpler approach but
probably uses more code in the TLB miss handler. It would look like
that, with MxCTR:PPCS=0

  _PAGE_USER    _PAGE_RW      PP (bits 20..27)
      0            0               011C1111    (C is _PAGE_DIRTY)
      0            1               000C1111
      1            0               110C1111
      1            1               100C1111

One easy way to do that is to have _PAGE_USER and _PAGE_RW sit next to
each other in bit position 28 and 29 (0xc). Load a GPR with something
like 0110000011001000 and rotate it left by PTE & 0xc, then move the
resulting 3 bits into position, or something along those lines. You can
also give up on kernel read-only support and go down to 2 PP bits and
never use the extended encoding.

  * Another one is to use MxCTR:PPCS=1 a mask of 100C1U00 (U is
_PAGE_USER) and or in ^_PAGE_RW (it could actually be made
reverse-polarity in the PTE but that would mean some changes to non-8xx
specific headers, so let's avoid it for now at least).

At least that's the best options I can see from my reading of the doco,
though it's not totally clear to me what really happens when doing the
PPCS trick, will it generate a TLB error on a non-match or will it try
to TLB miss, which could be bad.

  * Last but not least, it wouldn't be hard to use either of the above
encodings, and have the PTE actually contain the right bit combination
already. You don't need to have a _PAGE_RW, you don't need to have a
_PAGE_USER  :-) Look at how I do things for book3e, where I layout the
6 BookE protection bit directly in the PTE. That's a bit harder and maybe
will need subtle changes to pte-common.h and such to accomodate it, and
so probably something to experiment with as a second step, but it's the
most efficient approach in the long run for obvious reasons.

 - I think we could have more bits pre-set to the right values in the
PTE, look how we play with defining some of the constants in
pte-common.h, might be worth having a look, but -after- we have
something that works :-)

 - Possible bug: I'm very disturbed by the fact that DataTLBError sets
HWWRITE and DIRTY on a non-present PTE. It should not. Just like
ACCESSED. That's going to cause trouble and swap corruption, even more
as we move DIRTY around.

 - Maybe we can completely remove that mucking around with dirty,
setting of accessed etc... from DataTLBError. Just make it go to C code
just like InstructionAccess, as we discussed earlier, the generic code
will fix it up and we'll speed up page faults. It should be fairly rare
to take a fault due to a missing _PAGE_ACCESSED or _PAGE_DIRTY in any
case, so all that fixup in those exceptions is just overhead.

 - Later on, if it rocks your boat, you may want to look into removing
the mucking around with swapper_pg_dir in the TLB miss. Instead, what
you can do is lazily copy the kernel PMDs into the user PMDs. IE. On the
first kernel access from a new user context, you fault, and from the
do_page_fault() code, we can detect that and fixup the user PMD to point
to the kernel page tables, thus avoiding that whole bunch of code in the
TLB miss. When creating new user page tables, we can pre-fill the kernel
PMDs too. I think x86 does that. We could do that for BookE too, though
it's less of a win since we have to fixup the TID in MMUCR/MAS but for
8xx it would save a few instructions & conditionals in the TLB miss fast
path.

 - I still have problems with the comment next to the "workaround"
tlbil_va() we have in the icache/dcache flush. It doesn't make much
sense to me since dcbst is going to be done by the kernel on a kernel
address, not a user address, so I don't see how the thing we invalidate
relates to the flush we do just below.... So that raises a few
questions:

   * If this is to avoid write faults on kernel pages, then we may just
want to have a fixup in do_page_fault() to ignore them when coming from
flush_dcache_icache_* using the exception table mechanism instead ?

   * I don't see how it works around user faults but maybe you have a
clear scenario in mind

   * It appears still reasonably obvious that we need that tlbil_va
somewhere in that fault path, but I don't know what for, I'm not sure
it's really what the comment says it's about. That raises back the whole
question of when are those "invalid" TLB entries that we create when
_PAGE_PRESENT is not set are going to be removed. The doc hints that
they go away on TLB error, but the doc really only says so for missing
changed bit... so I'm tempted to say we need to basically stick a
tlbil_va on 8xx on both set_pte_at() and ptep_set_access_flags() for any
non-kernel page. What do you think ? That would be the safest.

 - The "fake" DataAccess and InstructionAccess are useless... just a
spurious jump for nothing. We could implement those straight in
DataTLBError and InstructionTLBError, we just need to make sure that
the trap numbers we put in the exception frame are 0x300 and 0x400, but
that's just a matter of passing the right bits to EXC_XFER_EE_LITE().

 - Finally, another boat-rocking optimisation to do is preload. You
could hook in update_mmu_cache() for 8xx to go whack an entry in the TLB
as well, which would avoid a TLB miss right after a page fault. But
again, only when things work.

None of the above of course address your DAR-not-updated concern. I
think your approach of "clobbering" the DAR on every storage exception
after it's been snapshotted and then testing for the clobber and
emulating the instruction is the way to go, though your patch could use
some cleaning there.

I'll post comments on it separately.

At the end of the day, I can't help that much without HW access and/or a
simulator, as you can guess, and no, I'm not asking for people to send
me an 8xx board :-) (especially useless without a BDI imho). So I rely
on Scott and yourself to sort this out. But thanks for showing up, and
please, do port your board to 2.6 so you can test your stuff :-)

Cheers,
Ben.
Joakim Tjernlund Oct. 7, 2009, 7:47 a.m. UTC | #18
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 07/10/2009 03:07:35:
>
> Allright, did a bit of reading of doco and code..

hey, this is a super, thanks!

>
> Doco isn't totally clear though. At some stage, it -hints- that in case
> of a TLB "error" (match on EA/ASID but incorrect
> protection/valid/changed/...) the offending TLB entry is automatically
> invalidated. Do you know if that is correct ?

Nope, I don't know.

>
> I would hope so since we never do anything to remove the "invalid"
> entries we write to the TLB when hitting non-present PTEs but then, that
> may also explain some of our problems...

hmm, then a tlbil_va() in do_page_fault for "no translation" errors
would help I guess.

>
> Now, a few comments from what I read in the code:
>
>  - The whole writeback could be avoided in Instruction/Data TLB miss,
> but for that, you need to make sure that the TLB entry we create has
> valid set only if -both- present and accessed are set. That would save
> in the case of CONFIG_SWAP, a store and a read back of TWC I suppose,
> but you do need to find a way to do that ANDing of ACCESSED and PRESENT.

So far I have mapped !ACCESSED pages as No Access instead, maybe that
was wrong? I thought it was easier but I will have a look at this too.

>
>  - I think we can get rid of HWWRITE. We can make CHANGED be the current
> HWWRITE value, I agree with you, which matches the HW changed bit. We
> need to be a bit careful of how we setup the PP bits tho. At this stage,
> I see several approaches:
>
>    * One is to basically "generate" the right PP bits based on a
> combination of _PAGE_USER and _PAGE_RW. That's the simpler approach but
> probably uses more code in the TLB miss handler. It would look like
> that, with MxCTR:PPCS=0
>
>   _PAGE_USER    _PAGE_RW      PP (bits 20..27)
>       0            0               011C1111    (C is _PAGE_DIRTY)
>       0            1               000C1111
>       1            0               110C1111
>       1            1               100C1111
>
> One easy way to do that is to have _PAGE_USER and _PAGE_RW sit next to
> each other in bit position 28 and 29 (0xc). Load a GPR with something
> like 0110000011001000 and rotate it left by PTE & 0xc, then move the
> resulting 3 bits into position, or something along those lines. You can
> also give up on kernel read-only support and go down to 2 PP bits and
> never use the extended encoding.

yes, I do think the extended encoding is too much work and not worth it.
One concern I have is if a user RO mapping also can be a kernel RO
mapping? That is, does the kernel require RW access to a user page that is mapped RO?

>
>   * Another one is to use MxCTR:PPCS=1 a mask of 100C1U00 (U is
> _PAGE_USER) and or in ^_PAGE_RW (it could actually be made
> reverse-polarity in the PTE but that would mean some changes to non-8xx
> specific headers, so let's avoid it for now at least).

hmm, interesting.
Then I will also have set ACCESSED manually in TLB Miss I think.

>
> At least that's the best options I can see from my reading of the doco,
> though it's not totally clear to me what really happens when doing the
> PPCS trick, will it generate a TLB error on a non-match or will it try
> to TLB miss, which could be bad.

Yes, I don't think anybody has tested this.

>
>   * Last but not least, it wouldn't be hard to use either of the above
> encodings, and have the PTE actually contain the right bit combination
> already. You don't need to have a _PAGE_RW, you don't need to have a
> _PAGE_USER  :-) Look at how I do things for book3e, where I layout the
> 6 BookE protection bits directly in the PTE. That's a bit harder and maybe
> will need subtle changes to pte-common.h and such to accommodate it, and
> so probably something to experiment with as a second step, but it's the
> most efficient approach in the long run for obvious reasons.
>
>  - I think we could have more bits pre-set to the right values in the
> PTE, look how we play with defining some of the constants in
> pte-common.h, might be worth having a look, but -after- we have
> something that works :-)
>
>  - Possible bug: I'm very disturbed by the fact that DataTLBError sets
> HWWRITE and DIRTY on a non-present PTE. It should not. Just like
> ACCESSED. That's going to cause trouble and swap corruption, even more
> as we move DIRTY around.

Yes, this is what I fix with the first patch in my series:
   8xx: DTLB Error must check for more errors.

    DataTLBError currently does:
     if ((err & 0x02000000) == 0)
        DSI();
    This won't handle a store with no valid translation.
    Change this to
     if ((err & 0x48000000) != 0)
        DSI();
    that is, branch to DSI if either !permission or
    !translation.

>
>  - Maybe we can completely remove that mucking around with dirty,
> setting of accessed etc... from DataTLBError. Just make it go to C code
> just like InstructionAccess, as we discussed earlier, the generic code
> will fix it up and we'll speed up page faults. It should be fairly rare
> to take a fault due to a missing _PAGE_ACCESSED or _PAGE_DIRTY in any
> case, so all that fixup in those exceptions is just overhead.

That depends on if you set ACCESSED in TLB Miss or not I think.
if TLB Miss maps pages as NoAccess due to missing ACCESSED then the
first access is always going to trap to TLB Error and then to
C just to fix DIRTY and/or ACCESSED, feels a bit more expensive
fixing it in TLB error, no?

BTW, does 2.4 update ACCESSED and DIRTY in generic code the same way
as 2.6? It is still much easier for me to test thing in 2.4 and then
move it over to 2.6

>
>  - Later on, if it rocks your boat, you may want to look into removing
> the mucking around with swapper_pg_dir in the TLB miss. Instead, what
> you can do is lazily copy the kernel PMDs into the user PMDs. IE. On the
> first kernel access from a new user context, you fault, and from the
> do_page_fault() code, we can detect that and fixup the user PMD to point
> to the kernel page tables, thus avoiding that whole bunch of code in the
> TLB miss. When creating new user page tables, we can pre-fill the kernel
> PMDs too. I think x86 does that. We could do that for BookE too, though
> it's less of a win since we have to fixup the TID in MMUCR/MAS but for
> 8xx it would save a few instructions & conditionals in the TLB miss fast
> path.

That would be nice, I have had that idea but no clue as how to do it.

>
>  - I still have problems with the comment next to the "workaround"
> tlbil_va() we have in the icache/dcache flush. It doesn't make much
> sense to me since dcbst is going to be done by the kernel on a kernel
> address, not a user address, so I don't see how the thing we invalidate
> relates to the flush we do just below.... So that raises a few
> questions:

This workaround was added when we started to force TLB Errors in
TLB Miss for missing pmd entries so it may be some other bug in the
TLB code that makes this happen.
Here is a link:
 http://www.mail-archive.com/linuxppc-embedded@ozlabs.org/msg07885.html
Seems to boil down to a missing tlbie()

>
>    * If this is to avoid write faults on kernel pages, then we may just
> want to have a fixup in do_page_fault() to ignore them when coming from
> flush_dcache_icache_* using the exception table mechanism instead ?
>
>    * I don't see how it works around user faults but maybe you have a
> clear scenario in mind
>
>    * It appears still reasonably obvious that we need that tlbil_va
> somewhere in that fault path, but I don't know what for, I'm not sure
> it's really what the comment says it's about. That raises back the whole
> question of when are those "invalid" TLB entries that we create when
> _PAGE_PRESENT is not set are going to be removed. The doc hints that
> they go away on TLB error, but the doc really only says so for missing
> changed bit... so I'm tempted to say we need to basically stick a
> tlbil_va on 8xx on both set_pte_at() and ptep_set_access_flags() for any
> non-kernel page. What do you think ? That would be the safest.

Possibly just doing a tlbil_va in do_page_fault() for no translation
errors comes to mind.

>
>  - The "fake" DataAccess and InstructionAccess are useless... just a
> spurious jump for nothing. We could implement those straight in
> DataTLBError and InstructionTLBError, we just need to make sure that
> the trap numbers we put in the exception frame are 0x300 and 0x400, but
> that's just a matter of passing the right bits to EXC_XFER_EE_LITE().
>
>  - Finally, another boat-rocking optimisation to do is preload. You
> could hook in update_mmu_cache() for 8xx to go whack an entry in the TLB
> as well, which would avoid a TLB miss right after a page fault. But
> again, only when things work.
>
> None of the above of course address your DAR-not-updated concern. I
> think your approach of "clobbering" the DAR on every storage exception
> after it's been snapshotted and then testing for the clobber and
> emulating the instruction is the way to go, though your patch could use
> some cleaning there.

Yes, it is full of debug cruft now.

>
> I'll post comments on it separately.

Thanks, I still like the asm version for two reasons:
- It contains the problem to 8xx files
- fixing it up in do_page_fault() may lead to a never ending loop.
  This is mainly due to the current TLB code not using DIRTY correctly.

In any case it would be nice to keep the asm version in the file, should
we need it some day. It was rather painful to get it working so saving it
somewhere would be nice.

>
> At the end of the day, I can't help that much without HW access and/or a
> simulator, as you can guess, and no, I'm not asking for people to send
> me an 8xx board :-) (especially useless without a BDI imho). So I rely
> on Scott and yourself to sort this out. But thanks for showing up, and
> please, do port your board to 2.6 so you can test your stuff :-)

 :)
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index 8c6e312..af541a2 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -32,22 +32,21 @@ 
 #define _PAGE_FILE	0x0002	/* when !present: nonlinear file mapping */
 #define _PAGE_NO_CACHE	0x0002	/* I: cache inhibit */
 #define _PAGE_SHARED	0x0004	/* No ASID (context) compare */
+#define _PAGE_DIRTY	0x0100	/* C: page changed */
 
 /* These five software bits must be masked out when the entry is loaded
  * into the TLB.
  */
 #define _PAGE_EXEC	0x0008	/* software: i-cache coherency required */
 #define _PAGE_GUARDED	0x0010	/* software: guarded access */
-#define _PAGE_DIRTY	0x0020	/* software: page changed */
-#define _PAGE_RW	0x0040	/* software: user write access allowed */
-#define _PAGE_ACCESSED	0x0080	/* software: page referenced */
+#define _PAGE_USER	0x0020	/* software: User space access */
 
 /* Setting any bits in the nibble with the follow two controls will
  * require a TLB exception handler change.  It is assumed unused bits
  * are always zero.
  */
-#define _PAGE_HWWRITE	0x0100	/* h/w write enable: never set in Linux PTE */
-#define _PAGE_USER	0x0800	/* One of the PP bits, the other is USER&~RW */
+#define _PAGE_RW	0x0400	/* lsb PP bits */
+#define _PAGE_ACCESSED	0x0800	/* msb PP bits */
 
 #define _PMD_PRESENT	0x0001
 #define _PMD_BAD	0x0ff0
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 118bb05..b1f72d9 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -333,21 +333,15 @@  InstructionTLBMiss:
 	mfspr	r11, SPRN_MD_TWC	/* ....and get the pte address */
 	lwz	r10, 0(r11)	/* Get the pte */
 
-#ifdef CONFIG_SWAP
-	/* do not set the _PAGE_ACCESSED bit of a non-present page */
-	andi.	r11, r10, _PAGE_PRESENT
-	beq	4f
-	ori	r10, r10, _PAGE_ACCESSED
-	mfspr	r11, SPRN_MD_TWC	/* get the pte address again */
-	stw	r10, 0(r11)
-4:
-#else
-	ori	r10, r10, _PAGE_ACCESSED
-	stw	r10, 0(r11)
-#endif
-
+	andi.	r11, r10, _PAGE_USER | _PAGE_ACCESSED
+	cmpwi	cr0, r11, _PAGE_USER | _PAGE_ACCESSED
+	beq+	cr0, 5f	/* branch if access allowed */
+	rlwinm	r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
+	b	6f
+5:	xori	r10, r10, _PAGE_RW /* invert RW bit */
+6:
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bit 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
@@ -409,21 +403,15 @@  DataStoreTLBMiss:
 	DO_8xx_CPU6(0x3b80, r3)
 	mtspr	SPRN_MD_TWC, r11
 
-#ifdef CONFIG_SWAP
-	/* do not set the _PAGE_ACCESSED bit of a non-present page */
-	andi.	r11, r10, _PAGE_PRESENT
-	beq	4f
-	ori	r10, r10, _PAGE_ACCESSED
-4:
-	/* and update pte in table */
-#else
-	ori	r10, r10, _PAGE_ACCESSED
-#endif
-	mfspr	r11, SPRN_MD_TWC	/* get the pte address again */
-	stw	r10, 0(r11)
-
+	andi.	r11, r10, _PAGE_USER | _PAGE_ACCESSED
+	cmpwi	cr0, r11, _PAGE_USER | _PAGE_ACCESSED
+	beq+	cr0, 5f	/* branch if access allowed */
+	rlwinm	r10, r10, 0, 22, 19 /* r10 &= ~(_PAGE_ACCESSED | _PAGE_RW) */
+	b	6f
+5:	xori	r10, r10, _PAGE_RW /* invert RW bit */
+6:
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bit 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
@@ -448,6 +436,91 @@  DataStoreTLBMiss:
  */
 	. = 0x1300
 InstructionTLBError:
+#ifdef CONFIG_8xx_CPU6
+	stw	r3, 8(r0)
+#endif
+	DO_8xx_CPU6(0x3f80, r3)
+	mtspr	SPRN_M_TW, r10	/* Save a couple of working registers */
+	mfcr	r10
+	stw	r10, 0(r0)
+	stw	r11, 4(r0)
+
+	mfspr	r11, SPRN_SRR1
+	andis.	r11, r11, 0x5000	/* no translation, guarded */
+	bne	2f
+
+	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
+#ifdef CONFIG_8xx_CPU15
+	addi	r11, r10, 0x1000
+	tlbie	r11
+	addi	r11, r10, -0x1000
+	tlbie	r11
+#endif
+	DO_8xx_CPU6(0x3780, r3)
+	mtspr	SPRN_MD_EPN, r10	/* Have to use MD_EPN for walk, MI_EPN can't */
+	mfspr	r10, SPRN_M_TWB	/* Get level 1 table entry address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	andi.	r11, r10, 0x0800	/* Address >= 0x80000000 */
+	beq	3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+	rlwimi	r10, r11, 0, 2, 19
+3:
+	lwz	r11, 0(r10)	/* Get the level 1 entry */
+	rlwinm.	r10, r11,0,0,19	/* Extract page descriptor page address */
+	beq	2f		/* If zero, don't try to find a pte */
+
+	/* We have a pte table, so load the MI_TWC with the attributes
+	 * for this "segment."
+	 */
+	ori	r11,r11,1		/* Set valid bit */
+	DO_8xx_CPU6(0x2b80, r3)
+	mtspr	SPRN_MI_TWC, r11	/* Set segment attributes */
+	DO_8xx_CPU6(0x3b80, r3)
+	mtspr	SPRN_MD_TWC, r11	/* Load pte table base address */
+
+	mfspr	r11, SPRN_SRR1
+	andi.	r11, r11, 0x4000 /* MSR[PR]; cr0 result consumed by beq 5f below */
+	mfspr	r11, SPRN_MD_TWC	/* ....and get the pte address */
+	lwz	r10, 0(r11)	/* Get the pte */
+	beq	5f /* Kernel access always OK */
+	andi.	r11,r10, _PAGE_USER
+	beq	2f
+5:	ori	r10, r10, _PAGE_ACCESSED
+	mfspr	r11, SPRN_MD_TWC	/* reload pte address; r11 was clobbered by andi. above */
+	stw	r10, 0(r11)
+	xori	r10, r10, _PAGE_RW /* invert RW bit */
+
+	/* The Linux PTE won't go exactly into the MMU TLB.
+	 * Software indicator bit 28 must be clear.
+	 * Software indicator bits 24, 25, 26, and 27 must be
+	 * set.  All other Linux PTE bits control the behavior
+	 * of the MMU.
+	 */
+	li	r11, 0x00f0
+	rlwimi	r10, r11, 0, 24, 28	/* Set 24-27, clear 28 */
+	DO_8xx_CPU6(0x2d80, r3)
+	mtspr	SPRN_MI_RPN, r10	/* Update TLB entry */
+
+	mfspr	r10, SPRN_M_TW	/* Restore registers */
+	lwz	r11, 0(r0)
+	mtcr	r11
+	lwz	r11, 4(r0)
+#ifdef CONFIG_8xx_CPU6
+	lwz	r3, 8(r0)
+#endif
+	rfi
+
+2:	mfspr	r10, SPRN_M_TW	/* Restore registers */
+	lwz	r11, 0(r0)
+	mtcr	r11
+	lwz	r11, 4(r0)
+#ifdef CONFIG_8xx_CPU6
+	lwz	r3, 8(r0)
+#endif
 	b	InstructionAccess
 
 /* This is the data TLB error on the MPC8xx.  This could be due to
@@ -472,8 +545,8 @@  DataTLBError:
 	/* First, make sure this was a store operation.
 	*/
 	mfspr	r10, SPRN_DSISR
-	andis.	r11, r10, 0x4800 /* no translation, no permission. */
-	bne	2f	/* branch if either is set */
+	andis.	r11, r10, 0x4000 /* no translation */
+	bne	2f	/* branch if set */
 
 	/* The EA of a data TLB miss is automatically stored in the MD_EPN
 	 * register.  The EA of a data TLB error is automatically stored in
@@ -522,26 +595,26 @@  DataTLBError:
 	mfspr	r11, SPRN_MD_TWC		/* ....and get the pte address */
 	lwz	r10, 0(r11)		/* Get the pte */
 
-	andi.	r11, r10, _PAGE_RW	/* Is it writeable? */
-	beq	2f			/* Bail out if not */
+	mfspr	r11, SPRN_SRR1
+	andi.	r11, r11, 0x4000 /* MSR[PR] */
+	beq	5f /* Kernel access always OK */
+	andi.	r11,r10, _PAGE_USER
+	beq	2f
+5:	mfspr	r11, SPRN_DSISR
+	andis.	r11, r11, 0x0200	/* store */
+	beq	6f
+	andi.	r11, r10, _PAGE_RW	/* writeable? */
+	beq	2f /* branch if not */
+	ori	r10, r10, _PAGE_DIRTY	/* 0x100 is the HW changed bit; _PAGE_HWWRITE no longer exists */
+6:	ori	r10, r10, _PAGE_ACCESSED
 
-	/* Update 'changed', among others.
-	*/
-#ifdef CONFIG_SWAP
-	ori	r10, r10, _PAGE_DIRTY|_PAGE_HWWRITE
-	/* do not set the _PAGE_ACCESSED bit of a non-present page */
-	andi.	r11, r10, _PAGE_PRESENT
-	beq	4f
-	ori	r10, r10, _PAGE_ACCESSED
-4:
-#else
-	ori	r10, r10, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
-#endif
 	mfspr	r11, SPRN_MD_TWC		/* Get pte address again */
 	stw	r10, 0(r11)		/* and update pte in table */
 
+	xori	r10, r10, _PAGE_RW	/* Invert RW bit */
+
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21, 22 and 28 must be clear.
+	 * Software indicator bit 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.