Message ID | 1394168285-32275-9-git-send-email-chenhui.zhao@freescale.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Scott Wood |
Headers | show |
On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote: > From: Zhao Chenhui <chenhui.zhao@freescale.com> > > T1040 supports deep sleep feature, which can switch off most parts of > the SoC when it is in deep sleep mode. This way, it becomes more > energy-efficient. > > The DDR controller will also be powered off in deep sleep. Therefore, > the last stage (the latter part of fsl_dp_enter_low) will run without DDR > access. This piece of code and related TLBs will be prefetched. > > Due to the different initialization code between 32-bit and 64-bit, they > have seperate resume entry and precedure. > > The feature supports 32-bit and 64-bit kernel mode. > > Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com> > --- > arch/powerpc/include/asm/booke_save_regs.h | 3 + > arch/powerpc/kernel/cpu_setup_fsl_booke.S | 17 ++ > arch/powerpc/kernel/head_fsl_booke.S | 30 +++ > arch/powerpc/platforms/85xx/Makefile | 2 +- > arch/powerpc/platforms/85xx/deepsleep.c | 201 +++++++++++++++++++ > arch/powerpc/platforms/85xx/qoriq_pm.c | 38 ++++ > arch/powerpc/platforms/85xx/sleep.S | 295 ++++++++++++++++++++++++++++ > arch/powerpc/sysdev/fsl_soc.h | 7 + > 8 files changed, 592 insertions(+), 1 deletions(-) > create mode 100644 arch/powerpc/platforms/85xx/deepsleep.c > create mode 100644 arch/powerpc/platforms/85xx/sleep.S > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h b/arch/powerpc/include/asm/booke_save_regs.h > index 87c357a..37c1f6c 100644 > --- a/arch/powerpc/include/asm/booke_save_regs.h > +++ b/arch/powerpc/include/asm/booke_save_regs.h > @@ -88,6 +88,9 @@ > #define HIBERNATION_FLAG 1 > #define DEEPSLEEP_FLAG 2 > > +#define CPLD_FLAG 1 > +#define FPGA_FLAG 2 What is this? > #ifndef __ASSEMBLY__ > extern void booke_cpu_state_save(void *buf, int type); > extern void *booke_cpu_state_restore(void *buf, int type); > diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S > index e59d6de..ea9bc28 100644 > --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S > +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S > @@ -318,6 +318,23 @@ flush_backside_L2_cache: > 2: > blr > > +#define CPC_CPCCSR0 0x0 > +#define CPC_CPCCSR0_CPCFL 0x800 This is supposed to be CPU setup, not platform setup. > +/* r3 : the base address of CPC */ > +_GLOBAL(fsl_flush_cpc_cache) > + lwz r6, CPC_CPCCSR0(r3) > + ori r6, r6, CPC_CPCCSR0_CPCFL > + stw r6, CPC_CPCCSR0(r3) > + sync > + > + /* Wait until completing the flush */ > +1: lwz r6, CPC_CPCCSR0(r3) > + andi. r6, r6, CPC_CPCCSR0_CPCFL > + bne 1b > + > + blr > + > _GLOBAL(__flush_caches_e500v2) I'm not sure that this belongs here either. > mflr r0 > bl flush_dcache_L1 > diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S > index 20204fe..3285752 100644 > --- a/arch/powerpc/kernel/head_fsl_booke.S > +++ b/arch/powerpc/kernel/head_fsl_booke.S > @@ -162,6 +162,19 @@ _ENTRY(__early_start) > #include "fsl_booke_entry_mapping.S" > #undef ENTRY_MAPPING_BOOT_SETUP > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */ > + LOAD_REG_ADDR(r4, deep_sleep_flag) > + lwz r3, 0(r4) > + cmpwi r3, 0 > + beq 11f > + /* clear deep_sleep_flag */ > + li r3, 0 > + stw r3, 0(r4) > + b fsl_deepsleep_resume > +11: > +#endif Why do you come in via the normal kernel entry, versus specifying a direct entry point for deep sleep resume? How does U-Boot even know what the normal entry is when resuming? Be careful of the "beq set_ivor" in the CONFIG_RELOCATABLE section above. Also you probably don't want the relocation code to run again when resuming. > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > +_ENTRY(__entry_deep_sleep) > +/* > + * Bootloader will jump to here when resuming from deep sleep. > + * After executing the init code in fsl_booke_entry_mapping.S, > + * will jump to the real resume entry. > + */ > + li r8, 1 > + bl 12f > +12: mflr r9 > + addi r9, r9, (deep_sleep_flag - 12b) > + stw r8, 0(r9) > + b __early_start > +deep_sleep_flag: > + .long 0 > +#endif It's a bit ambiguous to say "entry_deep_sleep" when it's resuming rather than entering... So you do have a special entry point. Why do you go to __early_start only to quickly test a flag and branch away? > diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile > index 7fae817..9a4ea86 100644 > --- a/arch/powerpc/platforms/85xx/Makefile > +++ b/arch/powerpc/platforms/85xx/Makefile > @@ -3,7 +3,7 @@ > # > obj-$(CONFIG_SMP) += smp.o > ifeq ($(CONFIG_FSL_CORENET_RCPM), y) > -obj-$(CONFIG_SUSPEND) += qoriq_pm.o > +obj-$(CONFIG_SUSPEND) += qoriq_pm.o deepsleep.o sleep.o > endif > > obj-y += common.o > diff --git a/arch/powerpc/platforms/85xx/deepsleep.c b/arch/powerpc/platforms/85xx/deepsleep.c > new file mode 100644 > index 0000000..ddd7185 > --- /dev/null > +++ b/arch/powerpc/platforms/85xx/deepsleep.c > @@ -0,0 +1,201 @@ > +/* > + * Support deep sleep feature AFAICT this supports deep sleep on T1040, not on all 85xx that has it. > + * > + * Copyright 2014 Freescale Semiconductor Inc. > + * > + * Author: Chenhui Zhao <chenhui.zhao@freescale.com> > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the > + * Free Software Foundation; either version 2 of the License, or (at your > + * option) any later version. > + */ > + > +#include <linux/kernel.h> > +#include <linux/of_platform.h> > +#include <linux/of_address.h> > +#include <asm/machdep.h> > +#include <sysdev/fsl_soc.h> > +#include <asm/booke_save_regs.h> > + > +#define SIZE_1MB 0x100000 > +#define SIZE_2MB 0x200000 > + > +#define CCSR_SCFG_DPSLPCR 0xfc000 > +#define CCSR_SCFG_DPSLPCR_WDRR_EN 0x1 > +#define CCSR_SCFG_SPARECR2 0xfc504 > +#define CCSR_SCFG_SPARECR3 0xfc508 > + > +#define CCSR_GPIO1_GPDIR 0x130000 > +#define CCSR_GPIO1_GPODR 0x130004 > +#define CCSR_GPIO1_GPDAT 0x130008 > +#define CCSR_GPIO1_GPDIR_29 0x4 > + > +/* 128 bytes buffer for restoring data broke by DDR training initialization */ > +#define DDR_BUF_SIZE 128 > +static u8 ddr_buff[DDR_BUF_SIZE] __aligned(64); > + > +static void *dcsr_base, *ccsr_base, *pld_base; > +static int pld_flag; > + > +int fsl_dp_iomap(void) > +{ > + struct device_node *np; > + const u32 *prop; > + int ret = 0; > + u64 ccsr_phy_addr, dcsr_phy_addr; > + > + np = of_find_node_by_type(NULL, "soc"); > + if (!np) { > + pr_err("%s: Can't find the node of \"soc\"\n", __func__); > + ret = -EINVAL; > + goto ccsr_err; > + } > + prop = of_get_property(np, "ranges", NULL); > + if (!prop) { > + pr_err("%s: Can't find the property of \"ranges\"\n", __func__); > + of_node_put(np); > + ret = -EINVAL; > + goto ccsr_err; > + } Use get_immrbase(), or better use specific nodes in the device tree. > + ccsr_phy_addr = of_translate_address(np, prop + 1); > + ccsr_base = ioremap((phys_addr_t)ccsr_phy_addr, SIZE_2MB); > + of_node_put(np); > + if (!ccsr_base) { > + ret = -ENOMEM; > + goto ccsr_err; > + } Unnecessary cast. Why 2 MiB? > + np = of_find_compatible_node(NULL, NULL, "fsl,dcsr"); > + if (!np) { > + pr_err("%s: Can't find the node of \"fsl,dcsr\"\n", __func__); > + ret = -EINVAL; > + goto dcsr_err; > + } > + prop = of_get_property(np, "ranges", NULL); > + if (!prop) { > + pr_err("%s: Can't find the property of \"ranges\"\n", __func__); > + of_node_put(np); > + ret = -EINVAL; > + goto dcsr_err; > + } > + dcsr_phy_addr = of_translate_address(np, prop + 1); > + dcsr_base = ioremap((phys_addr_t)dcsr_phy_addr, SIZE_1MB); > + of_node_put(np); > + if (!dcsr_base) { > + ret = -ENOMEM; > + goto dcsr_err; > + } If you must do this, add a helper to get the dcsr base -- but do we not already have dcsr subnodes for what you are using? > + > + np = of_find_compatible_node(NULL, NULL, "fsl,fpga-qixis"); > + if (np) { > + pld_flag = FPGA_FLAG; > + } else { > + np = of_find_compatible_node(NULL, NULL, "fsl,p104xrdb-cpld"); > + if (np) { > + pld_flag = CPLD_FLAG; > + } else { > + pr_err("%s: Can't find the FPGA/CPLD node\n", > + __func__); > + ret = -EINVAL; > + goto pld_err; > + } > + } OK, so this file isn't even specific to T1040 -- it's specific to our reference boards? > +static void fsl_dp_ddr_save(void *ccsr_base) > +{ > + u32 ddr_buff_addr; > + > + /* > + * DDR training initialization will break 128 bytes at the beginning > + * of DDR, therefore, save them so that the bootloader will restore > + * them. Assume that DDR is mapped to the address space started with > + * CONFIG_PAGE_OFFSET. > + */ > + memcpy(ddr_buff, (void *)CONFIG_PAGE_OFFSET, DDR_BUF_SIZE); > + > + /* assume ddr_buff is in the physical address space of 4GB */ > + ddr_buff_addr = (u32)(__pa(ddr_buff) & 0xffffffff); That assumption may break with a relocatable kernel. > + > +} > + > +int fsl_enter_epu_deepsleep(void) > +{ > + > + No blank lines at begin/end of function. > diff --git a/arch/powerpc/platforms/85xx/qoriq_pm.c b/arch/powerpc/platforms/85xx/qoriq_pm.c > index 915b13b..5f2c016 100644 > --- a/arch/powerpc/platforms/85xx/qoriq_pm.c > +++ b/arch/powerpc/platforms/85xx/qoriq_pm.c > @@ -20,6 +20,8 @@ > #define FSL_SLEEP 0x1 > #define FSL_DEEP_SLEEP 0x2 > > +int (*fsl_enter_deepsleep)(void); > + > /* specify the sleep state of the present platform */ > int sleep_pm_state; > /* supported sleep modes by the present platform */ > @@ -28,6 +30,7 @@ static unsigned int sleep_modes; > static int qoriq_suspend_enter(suspend_state_t state) > { > int ret = 0; > + int cpu; > > switch (state) { > case PM_SUSPEND_STANDBY: > @@ -39,6 +42,17 @@ static int qoriq_suspend_enter(suspend_state_t state) > > break; > > + case PM_SUSPEND_MEM: > + > + cpu = smp_processor_id(); > + qoriq_pm_ops->irq_mask(cpu); > + > + ret = fsl_enter_deepsleep(); > + > + qoriq_pm_ops->irq_unmask(cpu); > + > + break; > + > default: > ret = -EINVAL; > > @@ -52,12 +66,30 @@ static int qoriq_suspend_valid(suspend_state_t state) > if (state == PM_SUSPEND_STANDBY && (sleep_modes & FSL_SLEEP)) > return 1; > > + if (state == PM_SUSPEND_MEM && (sleep_modes & FSL_DEEP_SLEEP)) > + return 1; > + > return 0; > } > > +static int qoriq_suspend_begin(suspend_state_t state) > +{ > + if (state == PM_SUSPEND_MEM) > + return fsl_dp_iomap(); > + > + return 0; > +} > + > +static void qoriq_suspend_end(void) > +{ > + fsl_dp_iounmap(); > +} > + > static const struct platform_suspend_ops qoriq_suspend_ops = { > .valid = qoriq_suspend_valid, > .enter = qoriq_suspend_enter, > + .begin = qoriq_suspend_begin, > + .end = qoriq_suspend_end, > }; > > static int __init qoriq_suspend_init(void) > @@ -71,6 +103,12 @@ static int __init qoriq_suspend_init(void) > if (np) > sleep_pm_state = PLAT_PM_LPM20; > > + np = of_find_compatible_node(NULL, NULL, "fsl,t1040-rcpm"); > + if (np) { > + fsl_enter_deepsleep = fsl_enter_epu_deepsleep; > + sleep_modes |= FSL_DEEP_SLEEP; > + } > + > suspend_set_ops(&qoriq_suspend_ops); > > return 0; > diff --git a/arch/powerpc/platforms/85xx/sleep.S b/arch/powerpc/platforms/85xx/sleep.S > new file mode 100644 > index 0000000..95a5746 > --- /dev/null > +++ b/arch/powerpc/platforms/85xx/sleep.S > @@ -0,0 +1,295 @@ > +/* > + * Implement the low level part of deep sleep > + * > + * Copyright 2014 Freescale Semiconductor Inc. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the > + * Free Software Foundation; either version 2 of the License, or (at your > + * option) any later version. > + */ > + > +#include <asm/page.h> > +#include <asm/ppc_asm.h> > +#include <asm/reg.h> > +#include <asm/asm-offsets.h> > +#include <asm/booke_save_regs.h> > +#include <asm/mmu.h> > + > +#define FSLDELAY(count) \ > + li r3, (count)@l; \ > + slwi r3, r3, 10; \ > + mtctr r3; \ > +101: nop; \ > + bdnz 101b; You don't need a namespace prefix on local macros in a non-header file. Is the timebase stopped where you're calling this from? > +#define FSL_DIS_ALL_IRQ \ > + mfmsr r8; \ > + rlwinm r8, r8, 0, ~MSR_CE; \ > + rlwinm r8, r8, 0, ~MSR_ME; \ > + rlwinm r8, r8, 0, ~MSR_EE; \ > + rlwinm r8, r8, 0, ~MSR_DE; \ > + mtmsr r8; \ > + isync > + > + > + .section .data > + .align 6 > +booke_regs_buffer: > + .space REGS_BUFFER_SIZE > + > + .section .txt > + .align 6 > + > +_GLOBAL(fsl_dp_enter_low) > +deepsleep_start: > + LOAD_REG_ADDR(r9, buf_tmp) > + PPC_STL r3, 0(r9) > + PPC_STL r4, 8(r9) > + PPC_STL r5, 16(r9) > + PPC_STL r6, 24(r9) > + > + LOAD_REG_ADDR(r3, booke_regs_buffer) > + /* save the return address */ > + mflr r5 > + PPC_STL r5, SR_LR(r3) > + mfmsr r5 > + PPC_STL r5, SR_MSR(r3) > + li r4, DEEPSLEEP_FLAG > + bl booke_cpu_state_save > + > + LOAD_REG_ADDR(r9, buf_tmp) > + PPC_LL r31, 0(r9) > + PPC_LL r30, 8(r9) > + PPC_LL r29, 16(r9) > + PPC_LL r28, 24(r9) > + > + /* flush caches */ > + LOAD_REG_ADDR(r3, cur_cpu_spec) > + PPC_LL r3, 0(r3) > + PPC_LL r3, CPU_FLUSH_CACHES(r3) > + PPC_LCMPI 0, r3, 0 > + beq 6f > +#ifdef CONFIG_PPC64 > + PPC_LL r3, 0(r3) > +#endif > + mtctr r3 > + bctrl > +6: > +#define CPC_OFFSET 0x10000 > + mr r3, r31 > + addis r3, r3, CPC_OFFSET@h > + bl fsl_flush_cpc_cache > + > + LOAD_REG_ADDR(r8, deepsleep_start) > + LOAD_REG_ADDR(r9, deepsleep_end) > + > + /* prefecth TLB */ > +#define CCSR_GPIO1_GPDAT 0x130008 > +#define CCSR_GPIO1_GPDAT_29 0x4 > + LOAD_REG_IMMEDIATE(r11, CCSR_GPIO1_GPDAT) > + add r11, r31, r11 > + lwz r10, 0(r11) > + > +#define CCSR_RCPM_PCPH15SETR 0xe20b4 > +#define CCSR_RCPM_PCPH15SETR_CORE0 0x1 > + LOAD_REG_IMMEDIATE(r12, CCSR_RCPM_PCPH15SETR) > + add r12, r31, r12 > + lwz r10, 0(r12) > + > +#define CCSR_DDR_SDRAM_CFG_2 0x8114 > +#define CCSR_DDR_SDRAM_CFG_2_FRC_SR 0x80000000 > + LOAD_REG_IMMEDIATE(r13, CCSR_DDR_SDRAM_CFG_2) > + add r13, r31, r13 > + lwz r10, 0(r13) > + > +#define DCSR_EPU_EPGCR 0x000 > +#define DCSR_EPU_EPGCR_GCE 0x80000000 > + li r14, DCSR_EPU_EPGCR > + add r14, r30, r14 > + lwz r10, 0(r14) > + > +#define DCSR_EPU_EPECR15 0x33C > +#define DCSR_EPU_EPECR15_IC0 0x80000000 > + li r15, DCSR_EPU_EPECR15 > + add r15, r30, r15 > + lwz r10, 0(r15) > + > +#define CCSR_SCFG_QMCRDTRSTCR 0xfc40c > +#define CCSR_SCFG_QMCRDTRSTCR_CRDTRST 0x80000000 > + LOAD_REG_IMMEDIATE(r16, CCSR_SCFG_QMCRDTRSTCR) > + add r16, r31, r16 > + lwz r10, 0(r16) > + > +/* > + * There are two kind of register maps, one for CPLD and the other for FPGA > + */ > +#define CPLD_MISCCSR 0x17 > +#define CPLD_MISCCSR_SLEEPEN 0x40 > +#define QIXIS_PWR_CTL2 0x21 > +#define QIXIS_PWR_CTL2_PCTL 0x2 > + PPC_LCMPI 0, r28, FPGA_FLAG > + beq 20f > + addi r29, r29, CPLD_MISCCSR > +20: > + addi r29, r29, QIXIS_PWR_CTL2 > + lbz r10, 0(r29) Again, this is not marked as a board-specific file. How do you expect customers to adapt this mechanism to their boards? > + > + /* prefecth code to cache so that executing code after disable DDR */ > +1: lwz r3, 0(r8) > + addi r8, r8, 4 > + cmpw r8, r9 > + blt 1b > + msync Instructions don't execute from dcache... I guess you're assuming it will allocate in, and stay in, L2/CPC. It'd be safer to lock those cache lines in icache, or copy the code to SRAM. > + FSL_DIS_ALL_IRQ > + > + /* > + * Place DDR controller in self refresh mode. > + * From here on, DDR can't be access any more. > + */ > + lwz r10, 0(r13) > + oris r10, r10, CCSR_DDR_SDRAM_CFG_2_FRC_SR@h > + stw r10, 0(r13) > + > + /* can't call udelay() here, so use a macro to delay */ > + FSLDELAY(50) A timebase loop doesn't require accessing DDR. You also probably want to do a "sync, readback, data dependency, isync" sequence to make sure that the store has hit CCSR before you begin your delay (or is a delay required at all if you do that?). > + /* > + * Enable deep sleep signals by write external CPLD/FPGA register. > + * The bootloader will disable them when wakeup from deep sleep. > + */ > + lbz r10, 0(r29) > + PPC_LCMPI 0, r28, FPGA_FLAG > + beq 22f > + ori r10, r10, CPLD_MISCCSR_SLEEPEN > +22: > + ori r10, r10, QIXIS_PWR_CTL2_PCTL > + stb r10, 0(r29) > + > + /* > + * Set GPIO1_29 to lock the signal MCKE down during deep sleep. > + * The bootloader will clear it when wakeup. > + */ > + lwz r10, 0(r11) > + ori r10, r10, CCSR_GPIO1_GPDAT_29 > + stw r10, 0(r11) > + > + FSLDELAY(10) > + > + /* Clear the QMan CITI Credits */ > + lwz r10, 0(r16) > + oris r10, r10, CCSR_SCFG_QMCRDTRSTCR_CRDTRST@h > + stw r10, 0(r16) > + > + /* Enable all EPU Counters */ > + li r10, 0 > + oris r10, r10, DCSR_EPU_EPGCR_GCE@h > + stw r10, 0(r14) > + > + /* Enable SCU15 to trigger on RCPM Concentrator 0 */ > + lwz r10, 0(r15) > + oris r10, r10, DCSR_EPU_EPECR15_IC0@h > + stw r10, 0(r15) > + > + /* put Core0 in PH15 mode, trigger EPU FSM */ > + lwz r10, 0(r12) > + ori r10, r10, CCSR_RCPM_PCPH15SETR_CORE0 > + stw r10, 0(r12) Shouldn't there be a sync to ensure that the previous I/O happens before the final store to enter PH15? -Scott
On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote: > > + FSL_DIS_ALL_IRQ > > + > > + /* > > + * Place DDR controller in self refresh mode. > > + * From here on, DDR can't be access any more. > > + */ > > + lwz r10, 0(r13) > > + oris r10, r10, CCSR_DDR_SDRAM_CFG_2_FRC_SR@h > > + stw r10, 0(r13) > > + > > + /* can't call udelay() here, so use a macro to delay */ > > + FSLDELAY(50) > > A timebase loop doesn't require accessing DDR. > > You also probably want to do a "sync, readback, data dependency, isync" > sequence to make sure that the store has hit CCSR before you begin your > delay (or is a delay required at all if you do that?). Shouldn't we use "readback, sync" here? The following is quoted form t4240RM: To guarantee that the results of any sequence of writes to configuration registers are in effect, the final configuration register write should be immediately followed by a read of the same register, and that should be followed by a SYNC instruction. Then accesses can safely be made to memory regions affected by the configuration register write. > > + > > + /* Enable SCU15 to trigger on RCPM Concentrator 0 */ > > + lwz r10, 0(r15) > > + oris r10, r10, DCSR_EPU_EPECR15_IC0@h > > + stw r10, 0(r15) > > + > > + /* put Core0 in PH15 mode, trigger EPU FSM */ > > + lwz r10, 0(r12) > > + ori r10, r10, CCSR_RCPM_PCPH15SETR_CORE0 > > + stw r10, 0(r12) > > Shouldn't there be a sync to ensure that the previous I/O happens before > the final store to enter PH15? Do we really need a sync here? According to the PowerISA, the above stores should be performed in program order. If two Store instructions or two Load instructions specify storage locations that are both Caching Inhibited and Guarded, the corresponding storage accesses are performed in program order with respect to any processor or mechanism. Thanks, Kevin
On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote: > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote: > > From: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > T1040 supports deep sleep feature, which can switch off most parts of > > the SoC when it is in deep sleep mode. This way, it becomes more > > energy-efficient. > > > > The DDR controller will also be powered off in deep sleep. Therefore, > > the last stage (the latter part of fsl_dp_enter_low) will run without DDR > > access. This piece of code and related TLBs will be prefetched. > > > > Due to the different initialization code between 32-bit and 64-bit, they > > have seperate resume entry and precedure. > > > > The feature supports 32-bit and 64-bit kernel mode. > > > > Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com> > > --- > > arch/powerpc/include/asm/booke_save_regs.h | 3 + > > arch/powerpc/kernel/cpu_setup_fsl_booke.S | 17 ++ > > arch/powerpc/kernel/head_fsl_booke.S | 30 +++ > > arch/powerpc/platforms/85xx/Makefile | 2 +- > > arch/powerpc/platforms/85xx/deepsleep.c | 201 +++++++++++++++++++ > > arch/powerpc/platforms/85xx/qoriq_pm.c | 38 ++++ > > arch/powerpc/platforms/85xx/sleep.S | 295 ++++++++++++++++++++++++++++ > > arch/powerpc/sysdev/fsl_soc.h | 7 + > > 8 files changed, 592 insertions(+), 1 deletions(-) > > create mode 100644 arch/powerpc/platforms/85xx/deepsleep.c > > create mode 100644 arch/powerpc/platforms/85xx/sleep.S > > > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h b/arch/powerpc/include/asm/booke_save_regs.h > > index 87c357a..37c1f6c 100644 > > --- a/arch/powerpc/include/asm/booke_save_regs.h > > +++ b/arch/powerpc/include/asm/booke_save_regs.h > > @@ -88,6 +88,9 @@ > > #define HIBERNATION_FLAG 1 > > #define DEEPSLEEP_FLAG 2 > > > > +#define CPLD_FLAG 1 > > +#define FPGA_FLAG 2 > > What is this? We have two kind of boards, QDS and RDB. They have different register map. Use the flag to indicate the current board is using which kind of register map. > > > #ifndef __ASSEMBLY__ > > extern void booke_cpu_state_save(void *buf, int type); > > extern void *booke_cpu_state_restore(void *buf, int type); > > diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S > > index e59d6de..ea9bc28 100644 > > --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S > > +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S > > @@ -318,6 +318,23 @@ flush_backside_L2_cache: > > 2: > > blr > > > > +#define CPC_CPCCSR0 0x0 > > +#define CPC_CPCCSR0_CPCFL 0x800 > > This is supposed to be CPU setup, not platform setup. > > > +/* r3 : the base address of CPC */ > > +_GLOBAL(fsl_flush_cpc_cache) > > + lwz r6, CPC_CPCCSR0(r3) > > + ori r6, r6, CPC_CPCCSR0_CPCFL > > + stw r6, CPC_CPCCSR0(r3) > > + sync > > + > > + /* Wait until completing the flush */ > > +1: lwz r6, CPC_CPCCSR0(r3) > > + andi. r6, r6, CPC_CPCCSR0_CPCFL > > + bne 1b > > + > > + blr > > + > > _GLOBAL(__flush_caches_e500v2) > > I'm not sure that this belongs here either. Will find a better place. > > > mflr r0 > > bl flush_dcache_L1 > > diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S > > index 20204fe..3285752 100644 > > --- a/arch/powerpc/kernel/head_fsl_booke.S > > +++ b/arch/powerpc/kernel/head_fsl_booke.S > > @@ -162,6 +162,19 @@ _ENTRY(__early_start) > > #include "fsl_booke_entry_mapping.S" > > #undef ENTRY_MAPPING_BOOT_SETUP > > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */ > > + LOAD_REG_ADDR(r4, deep_sleep_flag) > > + lwz r3, 0(r4) > > + cmpwi r3, 0 > > + beq 11f > > + /* clear deep_sleep_flag */ > > + li r3, 0 > > + stw r3, 0(r4) > > + b fsl_deepsleep_resume > > +11: > > +#endif > > Why do you come in via the normal kernel entry, versus specifying a > direct entry point for deep sleep resume? How does U-Boot even know > what the normal entry is when resuming? I wish to return to a specified point (like 64-bit mode), but the code in fsl_booke_entry_mapping.S only can run in the first page. Because it only setups a temp mapping of 4KB. > > Be careful of the "beq set_ivor" in the CONFIG_RELOCATABLE section > above. Also you probably don't want the relocation code to run again > when resuming. When resuming, will run from the point __early_start. Don't run the code in the CONFIG_RELOCATABLE section. > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > +_ENTRY(__entry_deep_sleep) > > +/* > > + * Bootloader will jump to here when resuming from deep sleep. > > + * After executing the init code in fsl_booke_entry_mapping.S, > > + * will jump to the real resume entry. > > + */ > > + li r8, 1 > > + bl 12f > > +12: mflr r9 > > + addi r9, r9, (deep_sleep_flag - 12b) > > + stw r8, 0(r9) > > + b __early_start > > +deep_sleep_flag: > > + .long 0 > > +#endif > > It's a bit ambiguous to say "entry_deep_sleep" when it's resuming rather > than entering... How about __fsl_entry_resume? > > So you do have a special entry point. Why do you go to __early_start > only to quickly test a flag and branch away? > > > diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile > > index 7fae817..9a4ea86 100644 > > --- a/arch/powerpc/platforms/85xx/Makefile > > +++ b/arch/powerpc/platforms/85xx/Makefile > > @@ -3,7 +3,7 @@ > > # > > obj-$(CONFIG_SMP) += smp.o > > ifeq ($(CONFIG_FSL_CORENET_RCPM), y) > > -obj-$(CONFIG_SUSPEND) += qoriq_pm.o > > +obj-$(CONFIG_SUSPEND) += qoriq_pm.o deepsleep.o sleep.o > > endif > > > > obj-y += common.o > > diff --git a/arch/powerpc/platforms/85xx/deepsleep.c b/arch/powerpc/platforms/85xx/deepsleep.c > > new file mode 100644 > > index 0000000..ddd7185 > > --- /dev/null > > +++ b/arch/powerpc/platforms/85xx/deepsleep.c > > @@ -0,0 +1,201 @@ > > +/* > > + * Support deep sleep feature > > AFAICT this supports deep sleep on T1040, not on all 85xx that has it. Yes, for now T1040 and T1042. > > > + * > > + * Copyright 2014 Freescale Semiconductor Inc. > > + * > > + * Author: Chenhui Zhao <chenhui.zhao@freescale.com> > > + * > > + * This program is free software; you can redistribute it and/or modify it > > + * under the terms of the GNU General Public License as published by the > > + * Free Software Foundation; either version 2 of the License, or (at your > > + * option) any later version. > > + */ > > + > > +#include <linux/kernel.h> > > +#include <linux/of_platform.h> > > +#include <linux/of_address.h> > > +#include <asm/machdep.h> > > +#include <sysdev/fsl_soc.h> > > +#include <asm/booke_save_regs.h> > > + > > +#define SIZE_1MB 0x100000 > > +#define SIZE_2MB 0x200000 > > + > > +#define CCSR_SCFG_DPSLPCR 0xfc000 > > +#define CCSR_SCFG_DPSLPCR_WDRR_EN 0x1 > > +#define CCSR_SCFG_SPARECR2 0xfc504 > > +#define CCSR_SCFG_SPARECR3 0xfc508 > > + > > +#define CCSR_GPIO1_GPDIR 0x130000 > > +#define CCSR_GPIO1_GPODR 0x130004 > > +#define CCSR_GPIO1_GPDAT 0x130008 > > +#define CCSR_GPIO1_GPDIR_29 0x4 > > + > > +/* 128 bytes buffer for restoring data broke by DDR training initialization */ > > +#define DDR_BUF_SIZE 128 > > +static u8 ddr_buff[DDR_BUF_SIZE] __aligned(64); > > + > > +static void *dcsr_base, *ccsr_base, *pld_base; > > +static int pld_flag; > > + > > +int fsl_dp_iomap(void) > > +{ > > + struct device_node *np; > > + const u32 *prop; > > + int ret = 0; > > + u64 ccsr_phy_addr, dcsr_phy_addr; > > + > > + np = of_find_node_by_type(NULL, "soc"); > > + if (!np) { > > + pr_err("%s: Can't find the node of \"soc\"\n", __func__); > > + ret = -EINVAL; > > + goto ccsr_err; > > + } > > + prop = of_get_property(np, "ranges", NULL); > > + if (!prop) { > > + pr_err("%s: Can't find the property of \"ranges\"\n", __func__); > > + of_node_put(np); > > + ret = -EINVAL; > > + goto ccsr_err; > > + } > > Use get_immrbase(), or better use specific nodes in the device tree. > > > + ccsr_phy_addr = of_translate_address(np, prop + 1); > > + ccsr_base = ioremap((phys_addr_t)ccsr_phy_addr, SIZE_2MB); > > + of_node_put(np); > > + if (!ccsr_base) { > > + ret = -ENOMEM; > > + goto ccsr_err; > > + } > > Unnecessary cast. > > Why 2 MiB? All registers used are inside the 2MB address space. > > > + np = of_find_compatible_node(NULL, NULL, "fsl,dcsr"); > > + if (!np) { > > + pr_err("%s: Can't find the node of \"fsl,dcsr\"\n", __func__); > > + ret = -EINVAL; > > + goto dcsr_err; > > + } > > + prop = of_get_property(np, "ranges", NULL); > > + if (!prop) { > > + pr_err("%s: Can't find the property of \"ranges\"\n", __func__); > > + of_node_put(np); > > + ret = -EINVAL; > > + goto dcsr_err; > > + } > > + dcsr_phy_addr = of_translate_address(np, prop + 1); > > + dcsr_base = ioremap((phys_addr_t)dcsr_phy_addr, SIZE_1MB); > > + of_node_put(np); > > + if (!dcsr_base) { > > + ret = -ENOMEM; > > + goto dcsr_err; > > + } > > If you must do this, add a helper to get the dcsr base -- but do we not > already have dcsr subnodes for what you are using? > > > + > > + np = of_find_compatible_node(NULL, NULL, "fsl,fpga-qixis"); > > + if (np) { > > + pld_flag = FPGA_FLAG; > > + } else { > > + np = of_find_compatible_node(NULL, NULL, "fsl,p104xrdb-cpld"); > > + if (np) { > > + pld_flag = CPLD_FLAG; > > + } else { > > + pr_err("%s: Can't find the FPGA/CPLD node\n", > > + __func__); > > + ret = -EINVAL; > > + goto pld_err; > > + } > > + } > > OK, so this file isn't even specific to T1040 -- it's specific to our > reference boards? Yes. There are some FPGA/CPLD setting which needed by deep sleep. > > > +static void fsl_dp_ddr_save(void *ccsr_base) > > +{ > > + u32 ddr_buff_addr; > > + > > + /* > > + * DDR training initialization will break 128 bytes at the beginning > > + * of DDR, therefore, save them so that the bootloader will restore > > + * them. Assume that DDR is mapped to the address space started with > > + * CONFIG_PAGE_OFFSET. > > + */ > > + memcpy(ddr_buff, (void *)CONFIG_PAGE_OFFSET, DDR_BUF_SIZE); > > + > > + /* assume ddr_buff is in the physical address space of 4GB */ > > + ddr_buff_addr = (u32)(__pa(ddr_buff) & 0xffffffff); > > That assumption may break with a relocatable kernel. > > > + > > +} > > + > > +int fsl_enter_epu_deepsleep(void) > > +{ > > + > > + > > No blank lines at begin/end of function. > > > diff --git a/arch/powerpc/platforms/85xx/qoriq_pm.c b/arch/powerpc/platforms/85xx/qoriq_pm.c > > index 915b13b..5f2c016 100644 > > --- a/arch/powerpc/platforms/85xx/qoriq_pm.c > > +++ b/arch/powerpc/platforms/85xx/qoriq_pm.c > > @@ -20,6 +20,8 @@ > > #define FSL_SLEEP 0x1 > > #define FSL_DEEP_SLEEP 0x2 > > > > +int (*fsl_enter_deepsleep)(void); > > + > > /* specify the sleep state of the present platform */ > > int sleep_pm_state; > > /* supported sleep modes by the present platform */ > > @@ -28,6 +30,7 @@ static unsigned int sleep_modes; > > static int qoriq_suspend_enter(suspend_state_t state) > > { > > int ret = 0; > > + int cpu; > > > > switch (state) { > > case PM_SUSPEND_STANDBY: > > @@ -39,6 +42,17 @@ static int qoriq_suspend_enter(suspend_state_t state) > > > > break; > > > > + case PM_SUSPEND_MEM: > > + > > + cpu = smp_processor_id(); > > + qoriq_pm_ops->irq_mask(cpu); > > + > > + ret = fsl_enter_deepsleep(); > > + > > + qoriq_pm_ops->irq_unmask(cpu); > > + > > + break; > > + > > default: > > ret = -EINVAL; > > > > @@ -52,12 +66,30 @@ static int qoriq_suspend_valid(suspend_state_t state) > > if (state == PM_SUSPEND_STANDBY && (sleep_modes & FSL_SLEEP)) > > return 1; > > > > + if (state == PM_SUSPEND_MEM && (sleep_modes & FSL_DEEP_SLEEP)) > > + return 1; > > + > > return 0; > > } > > > > +static int qoriq_suspend_begin(suspend_state_t state) > > +{ > > + if (state == PM_SUSPEND_MEM) > > + return fsl_dp_iomap(); > > + > > + return 0; > > +} > > + > > +static void qoriq_suspend_end(void) > > +{ > > + fsl_dp_iounmap(); > > +} > > + > > static const struct platform_suspend_ops qoriq_suspend_ops = { > > .valid = qoriq_suspend_valid, > > .enter = qoriq_suspend_enter, > > + .begin = qoriq_suspend_begin, > > + .end = qoriq_suspend_end, > > }; > > > > static int __init qoriq_suspend_init(void) > > @@ -71,6 +103,12 @@ static int __init qoriq_suspend_init(void) > > if (np) > > sleep_pm_state = PLAT_PM_LPM20; > > > > + np = of_find_compatible_node(NULL, NULL, "fsl,t1040-rcpm"); > > + if (np) { > > + fsl_enter_deepsleep = fsl_enter_epu_deepsleep; > > + sleep_modes |= FSL_DEEP_SLEEP; > > + } > > + > > suspend_set_ops(&qoriq_suspend_ops); > > > > return 0; > > diff --git a/arch/powerpc/platforms/85xx/sleep.S b/arch/powerpc/platforms/85xx/sleep.S > > new file mode 100644 > > index 0000000..95a5746 > > --- /dev/null > > +++ b/arch/powerpc/platforms/85xx/sleep.S > > @@ -0,0 +1,295 @@ > > +/* > > + * Implement the low level part of deep sleep > > + * > > + * Copyright 2014 Freescale Semiconductor Inc. > > + * > > + * This program is free software; you can redistribute it and/or modify it > > + * under the terms of the GNU General Public License as published by the > > + * Free Software Foundation; either version 2 of the License, or (at your > > + * option) any later version. > > + */ > > + > > +#include <asm/page.h> > > +#include <asm/ppc_asm.h> > > +#include <asm/reg.h> > > +#include <asm/asm-offsets.h> > > +#include <asm/booke_save_regs.h> > > +#include <asm/mmu.h> > > + > > +#define FSLDELAY(count) \ > > + li r3, (count)@l; \ > > + slwi r3, r3, 10; \ > > + mtctr r3; \ > > +101: nop; \ > > + bdnz 101b; > > You don't need a namespace prefix on local macros in a non-header file. > > Is the timebase stopped where you're calling this from? No. My purpose is to avoid jump in the last stage of entering deep sleep. Jump may cause problem at that time. > > > +#define FSL_DIS_ALL_IRQ \ > > + mfmsr r8; \ > > + rlwinm r8, r8, 0, ~MSR_CE; \ > > + rlwinm r8, r8, 0, ~MSR_ME; \ > > + rlwinm r8, r8, 0, ~MSR_EE; \ > > + rlwinm r8, r8, 0, ~MSR_DE; \ > > + mtmsr r8; \ > > + isync > > + > > + > > + .section .data > > + .align 6 > > +booke_regs_buffer: > > + .space REGS_BUFFER_SIZE > > + > > + .section .txt > > + .align 6 > > + > > +_GLOBAL(fsl_dp_enter_low) > > +deepsleep_start: > > + LOAD_REG_ADDR(r9, buf_tmp) > > + PPC_STL r3, 0(r9) > > + PPC_STL r4, 8(r9) > > + PPC_STL r5, 16(r9) > > + PPC_STL r6, 24(r9) > > + > > + LOAD_REG_ADDR(r3, booke_regs_buffer) > > + /* save the return address */ > > + mflr r5 > > + PPC_STL r5, SR_LR(r3) > > + mfmsr r5 > > + PPC_STL r5, SR_MSR(r3) > > + li r4, DEEPSLEEP_FLAG > > + bl booke_cpu_state_save > > + > > + LOAD_REG_ADDR(r9, buf_tmp) > > + PPC_LL r31, 0(r9) > > + PPC_LL r30, 8(r9) > > + PPC_LL r29, 16(r9) > > + PPC_LL r28, 24(r9) > > + > > + /* flush caches */ > > + LOAD_REG_ADDR(r3, cur_cpu_spec) > > + PPC_LL r3, 0(r3) > > + PPC_LL r3, CPU_FLUSH_CACHES(r3) > > + PPC_LCMPI 0, r3, 0 > > + beq 6f > > +#ifdef CONFIG_PPC64 > > + PPC_LL r3, 0(r3) > > +#endif > > + mtctr r3 > > + bctrl > > +6: > > +#define CPC_OFFSET 0x10000 > > + mr r3, r31 > > + addis r3, r3, CPC_OFFSET@h > > + bl fsl_flush_cpc_cache > > + > > + LOAD_REG_ADDR(r8, deepsleep_start) > > + LOAD_REG_ADDR(r9, deepsleep_end) > > + > > + /* prefecth TLB */ > > +#define CCSR_GPIO1_GPDAT 0x130008 > > +#define CCSR_GPIO1_GPDAT_29 0x4 > > + LOAD_REG_IMMEDIATE(r11, CCSR_GPIO1_GPDAT) > > + add r11, r31, r11 > > + lwz r10, 0(r11) > > + > > +#define CCSR_RCPM_PCPH15SETR 0xe20b4 > > +#define CCSR_RCPM_PCPH15SETR_CORE0 0x1 > > + LOAD_REG_IMMEDIATE(r12, CCSR_RCPM_PCPH15SETR) > > + add r12, r31, r12 > > + lwz r10, 0(r12) > > + > > +#define CCSR_DDR_SDRAM_CFG_2 0x8114 > > +#define CCSR_DDR_SDRAM_CFG_2_FRC_SR 0x80000000 > > + LOAD_REG_IMMEDIATE(r13, CCSR_DDR_SDRAM_CFG_2) > > + add r13, r31, r13 > > + lwz r10, 0(r13) > > + > > +#define DCSR_EPU_EPGCR 0x000 > > +#define DCSR_EPU_EPGCR_GCE 0x80000000 > > + li r14, DCSR_EPU_EPGCR > > + add r14, r30, r14 > > + lwz r10, 0(r14) > > + > > +#define DCSR_EPU_EPECR15 0x33C > > +#define DCSR_EPU_EPECR15_IC0 0x80000000 > > + li r15, DCSR_EPU_EPECR15 > > + add r15, r30, r15 > > + lwz r10, 0(r15) > > + > > +#define CCSR_SCFG_QMCRDTRSTCR 0xfc40c > > +#define CCSR_SCFG_QMCRDTRSTCR_CRDTRST 0x80000000 > > + LOAD_REG_IMMEDIATE(r16, CCSR_SCFG_QMCRDTRSTCR) > > + add r16, r31, r16 > > + lwz r10, 0(r16) > > + > > +/* > > + * There are two kind of register maps, one for CPLD and the other for FPGA > > + */ > > +#define CPLD_MISCCSR 0x17 > > +#define CPLD_MISCCSR_SLEEPEN 0x40 > > +#define QIXIS_PWR_CTL2 0x21 > > +#define QIXIS_PWR_CTL2_PCTL 0x2 > > + PPC_LCMPI 0, r28, FPGA_FLAG > > + beq 20f > > + addi r29, r29, CPLD_MISCCSR > > +20: > > + addi r29, r29, QIXIS_PWR_CTL2 > > + lbz r10, 0(r29) > > > Again, this is not marked as a board-specific file. How do you expect > customers to adapt this mechanism to their boards? Will add comment. > > > + > > + /* prefecth code to cache so that executing code after disable DDR */ > > +1: lwz r3, 0(r8) > > + addi r8, r8, 4 > > + cmpw r8, r9 > > + blt 1b > > + msync > > Instructions don't execute from dcache... I guess you're assuming it > will allocate in, and stay in, L2/CPC. It'd be safer to lock those > cache lines in icache, or copy the code to SRAM. Yes, they should be in L2/CPC. Will try to lock them in icache. > > > + FSL_DIS_ALL_IRQ > > + > > + /* > > + * Place DDR controller in self refresh mode. > > + * From here on, DDR can't be access any more. > > + */ > > + lwz r10, 0(r13) > > + oris r10, r10, CCSR_DDR_SDRAM_CFG_2_FRC_SR@h > > + stw r10, 0(r13) > > + > > + /* can't call udelay() here, so use a macro to delay */ > > + FSLDELAY(50) > > A timebase loop doesn't require accessing DDR. Don't wish to jump at this time. Maybe cause problems. > > You also probably want to do a "sync, readback, data dependency, isync" > sequence to make sure that the store has hit CCSR before you begin your > delay (or is a delay required at all if you do that?). Yes. It is safer with a sync sequence. The DDR controller need some time to signal the external DDR modules to enter self refresh mode. -Chenhui
On Wed, 2014-03-12 at 13:57 +0800, Kevin Hao wrote: > On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote: > > > + FSL_DIS_ALL_IRQ > > > + > > > + /* > > > + * Place DDR controller in self refresh mode. > > > + * From here on, DDR can't be access any more. > > > + */ > > > + lwz r10, 0(r13) > > > + oris r10, r10, CCSR_DDR_SDRAM_CFG_2_FRC_SR@h > > > + stw r10, 0(r13) > > > + > > > + /* can't call udelay() here, so use a macro to delay */ > > > + FSLDELAY(50) > > > > A timebase loop doesn't require accessing DDR. > > > > You also probably want to do a "sync, readback, data dependency, isync" > > sequence to make sure that the store has hit CCSR before you begin your > > delay (or is a delay required at all if you do that?). > > Shouldn't we use "readback, sync" here? The following is quoted form t4240RM: > To guarantee that the results of any sequence of writes to configuration > registers are in effect, the final configuration register write should be > immediately followed by a read of the same register, and that should be > followed by a SYNC instruction. Then accesses can safely be made to memory > regions affected by the configuration register write. I agree that the sync before the readback is probably not necessary, since transactions to the same address should already be ordered. A sync after the readback helps if you're trying to order the readback with subsequent memory accesses, though in that case wouldn't a sync alone (no readback) be adequate? Though maybe not always -- see the comment near the end of fsl_elbc_write_buf() in drivers/mtd/nand_fsl_elbc.c. I guess the readback does more than just make sure the device has seen the write, ensuring that the device has finished the transaction to the point of acting on another one. The data dependency plus isync sequence, which is done by the normal I/O accessors used from C code, orders the readback versus all future instructions (not just I/O). The delay loop is not I/O. > > > + /* Enable SCU15 to trigger on RCPM Concentrator 0 */ > > > + lwz r10, 0(r15) > > > + oris r10, r10, DCSR_EPU_EPECR15_IC0@h > > > + stw r10, 0(r15) > > > + > > > + /* put Core0 in PH15 mode, trigger EPU FSM */ > > > + lwz r10, 0(r12) > > > + ori r10, r10, CCSR_RCPM_PCPH15SETR_CORE0 > > > + stw r10, 0(r12) > > > > Shouldn't there be a sync to ensure that the previous I/O happens before > > the final store to enter PH15? > > Do we really need a sync here? According to the PowerISA, the above stores > should be performed in program order. > If two Store instructions or two Load instructions > specify storage locations that are both Caching > Inhibited and Guarded, the corresponding storage > accesses are performed in program order with > respect to any processor or mechanism. OK, wasn't aware of that. -Scott
On Wed, Mar 12, 2014 at 12:43:05PM -0500, Scott Wood wrote: > > Shouldn't we use "readback, sync" here? The following is quoted form t4240RM: > > To guarantee that the results of any sequence of writes to configuration > > registers are in effect, the final configuration register write should be > > immediately followed by a read of the same register, and that should be > > followed by a SYNC instruction. Then accesses can safely be made to memory > > regions affected by the configuration register write. > > I agree that the sync before the readback is probably not necessary, > since transactions to the same address should already be ordered. > > A sync after the readback helps if you're trying to order the readback > with subsequent memory accesses, though in that case wouldn't a sync > alone (no readback) be adequate? No, we don't just want to order the subsequent memory access here. The 'write, readback, sync' is the required sequence if we want to make sure that the writing to CCSR register does really take effect. > Though maybe not always -- see the > comment near the end of fsl_elbc_write_buf() in > drivers/mtd/nand_fsl_elbc.c. I guess the readback does more than just > make sure the device has seen the write, ensuring that the device has > finished the transaction to the point of acting on another one. Agree. > > The data dependency plus isync sequence, which is done by the normal I/O > accessors used from C code, orders the readback versus all future > instructions (not just I/O). The delay loop is not I/O. According to the PowerISA, the sequence 'load, date dependency, isync' only order the load accesses. So if we want to order all the storage access as well as execution synchronization, we should choose sync here. Thanks, Kevin
On Thu, 2014-03-13 at 15:46 +0800, Kevin Hao wrote: > On Wed, Mar 12, 2014 at 12:43:05PM -0500, Scott Wood wrote: > > > Shouldn't we use "readback, sync" here? The following is quoted form t4240RM: > > > To guarantee that the results of any sequence of writes to configuration > > > registers are in effect, the final configuration register write should be > > > immediately followed by a read of the same register, and that should be > > > followed by a SYNC instruction. Then accesses can safely be made to memory > > > regions affected by the configuration register write. > > > > I agree that the sync before the readback is probably not necessary, > > since transactions to the same address should already be ordered. > > > > A sync after the readback helps if you're trying to order the readback > > with subsequent memory accesses, though in that case wouldn't a sync > > alone (no readback) be adequate? > > No, we don't just want to order the subsequent memory access here. > The 'write, readback, sync' is the required sequence if we want to make > sure that the writing to CCSR register does really take effect. > > > Though maybe not always -- see the > > comment near the end of fsl_elbc_write_buf() in > > drivers/mtd/nand_fsl_elbc.c. I guess the readback does more than just > > make sure the device has seen the write, ensuring that the device has > > finished the transaction to the point of acting on another one. > > Agree. > > > > > The data dependency plus isync sequence, which is done by the normal I/O > > accessors used from C code, orders the readback versus all future > > instructions (not just I/O). The delay loop is not I/O. > > According to the PowerISA, the sequence 'load, date dependency, isync' only > order the load accesses. The point is to order the delay loop after the load, not to order storage versus storage. This is a sequence we're already using on all of our I/O loads (excluding accesses like in this patch that don't use the standard accessors). I'm confident that it works even if it's not architecturally guaranteed. I'm not sure that there exists a clear architectural way of synchronizing non-storage instructions relative to storage instructions. Given that isync is documented as preventing any execution of instructions after the isync until all previous instructions complete, it doesn't seem to make sense for the architecture to explicitly talk about loads (as opposed to any other instruction) following a load, dependent conditional branch, isync sequence. > So if we want to order all the storage access as well > as execution synchronization, we should choose sync here. Do we need execution synchronization or context synchronization? The t4240 RM section that talks about a readback and a sync is in the context of subsequent memory operations ("Then accesses can safely be made to memory regions affected..."), not arbitrary instructions. There are also a couple other places in the RM where isync is recommended instead (when setting LAWs or CCSRBAR), even though those also only involve memory accesses. In any case, this is not performance critical and thus it's better to oversynchronize than undersynchronize. -Scott
On Wed, 2014-03-12 at 18:40 +0800, Chenhui Zhao wrote: > On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote: > > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote: > > > From: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > > > T1040 supports deep sleep feature, which can switch off most parts of > > > the SoC when it is in deep sleep mode. This way, it becomes more > > > energy-efficient. > > > > > > The DDR controller will also be powered off in deep sleep. Therefore, > > > the last stage (the latter part of fsl_dp_enter_low) will run without DDR > > > access. This piece of code and related TLBs will be prefetched. > > > > > > Due to the different initialization code between 32-bit and 64-bit, they > > > have seperate resume entry and precedure. > > > > > > The feature supports 32-bit and 64-bit kernel mode. > > > > > > Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com> > > > --- > > > arch/powerpc/include/asm/booke_save_regs.h | 3 + > > > arch/powerpc/kernel/cpu_setup_fsl_booke.S | 17 ++ > > > arch/powerpc/kernel/head_fsl_booke.S | 30 +++ > > > arch/powerpc/platforms/85xx/Makefile | 2 +- > > > arch/powerpc/platforms/85xx/deepsleep.c | 201 +++++++++++++++++++ > > > arch/powerpc/platforms/85xx/qoriq_pm.c | 38 ++++ > > > arch/powerpc/platforms/85xx/sleep.S | 295 ++++++++++++++++++++++++++++ > > > arch/powerpc/sysdev/fsl_soc.h | 7 + > > > 8 files changed, 592 insertions(+), 1 deletions(-) > > > create mode 100644 arch/powerpc/platforms/85xx/deepsleep.c > > > create mode 100644 arch/powerpc/platforms/85xx/sleep.S > > > > > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h b/arch/powerpc/include/asm/booke_save_regs.h > > > index 87c357a..37c1f6c 100644 > > > --- a/arch/powerpc/include/asm/booke_save_regs.h > > > +++ b/arch/powerpc/include/asm/booke_save_regs.h > > > @@ -88,6 +88,9 @@ > > > #define HIBERNATION_FLAG 1 > > > #define DEEPSLEEP_FLAG 2 > > > > > > +#define CPLD_FLAG 1 > > > +#define FPGA_FLAG 2 > > > > What is this? > > We have two kind of boards, QDS and RDB. > They have different register map. Use the flag to indicate the current board is using which kind > of register map. CPLD versus FPGA is not a meaningful difference. We don't care what technology is used to implement programmable logic -- we care what programming interface is exposed. Customers will have their own boards that will likely not imitate either of these programming interfaces, but what they do have will still probably be implemented in a CPLD or FPGA. Likewise, Freescale may have future reference boards whose CPLD/FPGA is not compatible. So use better naming, and structure the code so it's easy to plug in implementations for new or custom boards. > > > diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S > > > index 20204fe..3285752 100644 > > > --- a/arch/powerpc/kernel/head_fsl_booke.S > > > +++ b/arch/powerpc/kernel/head_fsl_booke.S > > > @@ -162,6 +162,19 @@ _ENTRY(__early_start) > > > #include "fsl_booke_entry_mapping.S" > > > #undef ENTRY_MAPPING_BOOT_SETUP > > > > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > > + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */ > > > + LOAD_REG_ADDR(r4, deep_sleep_flag) > > > + lwz r3, 0(r4) > > > + cmpwi r3, 0 > > > + beq 11f > > > + /* clear deep_sleep_flag */ > > > + li r3, 0 > > > + stw r3, 0(r4) > > > + b fsl_deepsleep_resume > > > +11: > > > +#endif > > > > Why do you come in via the normal kernel entry, versus specifying a > > direct entry point for deep sleep resume? How does U-Boot even know > > what the normal entry is when resuming? > > I wish to return to a specified point (like 64-bit mode), but the code in > fsl_booke_entry_mapping.S only can run in the first page. Because it > only setups a temp mapping of 4KB. Why do you need the entry mapping on 32-bit but not 64-bit? > > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > > +_ENTRY(__entry_deep_sleep) > > > +/* > > > + * Bootloader will jump to here when resuming from deep sleep. > > > + * After executing the init code in fsl_booke_entry_mapping.S, > > > + * will jump to the real resume entry. > > > + */ > > > + li r8, 1 > > > + bl 12f > > > +12: mflr r9 > > > + addi r9, r9, (deep_sleep_flag - 12b) > > > + stw r8, 0(r9) > > > + b __early_start > > > +deep_sleep_flag: > > > + .long 0 > > > +#endif > > > > It's a bit ambiguous to say "entry_deep_sleep" when it's resuming rather > > than entering... > > How about __fsl_entry_resume? fsl_booke_deep_sleep_resume > > > +#define FSLDELAY(count) \ > > > + li r3, (count)@l; \ > > > + slwi r3, r3, 10; \ > > > + mtctr r3; \ > > > +101: nop; \ > > > + bdnz 101b; > > > > You don't need a namespace prefix on local macros in a non-header file. > > > > Is the timebase stopped where you're calling this from? > > No. My purpose is to avoid jump in the last stage of entering deep sleep. > Jump may cause problem at that time. "bdnz" is a jump. What problems do you think a jump will cause? > > You also probably want to do a "sync, readback, data dependency, isync" > > sequence to make sure that the store has hit CCSR before you begin your > > delay (or is a delay required at all if you do that?). > > Yes. It is safer with a sync sequence. > > The DDR controller need some time to signal the external DDR modules to > enter self refresh mode. Is it documented how much time it requires? -Scott
On Fri, Mar 14, 2014 at 05:26:27PM -0500, Scott Wood wrote: > On Thu, 2014-03-13 at 15:46 +0800, Kevin Hao wrote: > > On Wed, Mar 12, 2014 at 12:43:05PM -0500, Scott Wood wrote: > > > > Shouldn't we use "readback, sync" here? The following is quoted form t4240RM: > > > > To guarantee that the results of any sequence of writes to configuration > > > > registers are in effect, the final configuration register write should be > > > > immediately followed by a read of the same register, and that should be > > > > followed by a SYNC instruction. Then accesses can safely be made to memory > > > > regions affected by the configuration register write. > > > > > > I agree that the sync before the readback is probably not necessary, > > > since transactions to the same address should already be ordered. > > > > > > A sync after the readback helps if you're trying to order the readback > > > with subsequent memory accesses, though in that case wouldn't a sync > > > alone (no readback) be adequate? > > > > No, we don't just want to order the subsequent memory access here. > > The 'write, readback, sync' is the required sequence if we want to make > > sure that the writing to CCSR register does really take effect. > > > > > Though maybe not always -- see the > > > comment near the end of fsl_elbc_write_buf() in > > > drivers/mtd/nand_fsl_elbc.c. I guess the readback does more than just > > > make sure the device has seen the write, ensuring that the device has > > > finished the transaction to the point of acting on another one. > > > > Agree. > > > > > > > > The data dependency plus isync sequence, which is done by the normal I/O > > > accessors used from C code, orders the readback versus all future > > > instructions (not just I/O). The delay loop is not I/O. > > > > According to the PowerISA, the sequence 'load, date dependency, isync' only > > order the load accesses. > > The point is to order the delay loop after the load, not to order > storage versus storage. I think the point is to make sure that the writing of the CCSR_DDR_SDRAM_CFG_2 does really take effect before we begin to delay loop. The sequence "write, readback, sync" will guarantee this according to the manual. If we just want to order the delay loop after the load, the following sequence should be enough: store to CCSR_DDR_SDRAM_CFG_2 load from CCSR_DDR_SDRAM_CFG_2 isync or sync delay loop Why do we need the 'date dependency' here? According to the e6500 manual, the instructions can execute out of order, but complete in order. So I am really wondering why we need to order the load and the following delay loop if there is no intention to order the storage access? > > This is a sequence we're already using on all of our I/O loads > (excluding accesses like in this patch that don't use the standard > accessors). I'm confident that it works even if it's not > architecturally guaranteed. This sequence is used to order the load and the followed storage access. And this is guaranteed by the architecture. But I don't think it is suitable for a case like this. The following is quoted from PowerISA: Because stores cannot be performed “out-of-order” (see Book III), if a Store instruction depends on the value returned by a preceding Load instruction (because the value returned by the Load is used to compute either the effective address specified by the Store or the value to be stored), the corresponding stor- age accesses are performed in program order. The same applies if whether the Store instruction is exe- cuted depends on a conditional Branch instruction that in turn depends on the value returned by a preceding Load instruction. Because an isync instruction prevents the execution of instructions following the isync until instructions pre- ceding the isync have completed, if an isync follows a conditional Branch instruction that depends on the value returned by a preceding Load instruction, the load on which the Branch depends is performed before any loads caused by instructions following the isync. I think the above description would guarantee that the load will be performed before any storage access (both load and store) following the isync in the following scenario: lwz r4, 0(r3) twi 0, r4, 0 isync > I'm not sure that there exists a clear > architectural way of synchronizing non-storage instructions relative to > storage instructions. Isn't what the execution synchronization instructions such as sync, isync, mtmsr do? > > Given that isync is documented as preventing any execution of > instructions after the isync until all previous instructions complete, > it doesn't seem to make sense for the architecture to explicitly talk > about loads (as opposed to any other instruction) following a load, > dependent conditional branch, isync sequence. Sorry, I didn't get what you mean. > > > So if we want to order all the storage access as well > > as execution synchronization, we should choose sync here. > > Do we need execution synchronization or context synchronization? There is no context-altering instruction here, so I think an execution synchronizing instruction should be enough here. > > The t4240 RM section that talks about a readback and a sync is in the > context of subsequent memory operations ("Then accesses can safely be > made to memory regions affected..."), not arbitrary instructions. I assume that this sequence also guarantee that the writing does take effect. > There > are also a couple other places in the RM where isync is recommended > instead (when setting LAWs or CCSRBAR), even though those also only > involve memory accesses. No idea why isync is used here. > > In any case, this is not performance critical and thus it's better to > oversynchronize than undersynchronize. On the contrary I think that sync is oversynchronize instead of undersynchronize. It not only provide the execution synchronizing but also order all the storage accesses. That is why I prefer the sync. :-) Thanks, Kevin
On Fri, Mar 14, 2014 at 06:18:27PM -0500, Scott Wood wrote: > On Wed, 2014-03-12 at 18:40 +0800, Chenhui Zhao wrote: > > On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote: > > > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote: > > > > From: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > > > > > T1040 supports deep sleep feature, which can switch off most parts of > > > > the SoC when it is in deep sleep mode. This way, it becomes more > > > > energy-efficient. > > > > > > > > The DDR controller will also be powered off in deep sleep. Therefore, > > > > the last stage (the latter part of fsl_dp_enter_low) will run without DDR > > > > access. This piece of code and related TLBs will be prefetched. > > > > > > > > Due to the different initialization code between 32-bit and 64-bit, they > > > > have seperate resume entry and precedure. > > > > > > > > The feature supports 32-bit and 64-bit kernel mode. > > > > > > > > Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > --- > > > > arch/powerpc/include/asm/booke_save_regs.h | 3 + > > > > arch/powerpc/kernel/cpu_setup_fsl_booke.S | 17 ++ > > > > arch/powerpc/kernel/head_fsl_booke.S | 30 +++ > > > > arch/powerpc/platforms/85xx/Makefile | 2 +- > > > > arch/powerpc/platforms/85xx/deepsleep.c | 201 +++++++++++++++++++ > > > > arch/powerpc/platforms/85xx/qoriq_pm.c | 38 ++++ > > > > arch/powerpc/platforms/85xx/sleep.S | 295 ++++++++++++++++++++++++++++ > > > > arch/powerpc/sysdev/fsl_soc.h | 7 + > > > > 8 files changed, 592 insertions(+), 1 deletions(-) > > > > create mode 100644 arch/powerpc/platforms/85xx/deepsleep.c > > > > create mode 100644 arch/powerpc/platforms/85xx/sleep.S > > > > > > > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h b/arch/powerpc/include/asm/booke_save_regs.h > > > > index 87c357a..37c1f6c 100644 > > > > --- a/arch/powerpc/include/asm/booke_save_regs.h > > > > +++ b/arch/powerpc/include/asm/booke_save_regs.h > > > > @@ -88,6 +88,9 @@ > > > > #define HIBERNATION_FLAG 1 > > > > #define DEEPSLEEP_FLAG 2 > > > > > > > > +#define CPLD_FLAG 1 > > > > +#define FPGA_FLAG 2 > > > > > > What is this? > > > > We have two kind of boards, QDS and RDB. > > They have different register map. Use the flag to indicate the current board is using which kind > > of register map. > > CPLD versus FPGA is not a meaningful difference. We don't care what > technology is used to implement programmable logic -- we care what > programming interface is exposed. Customers will have their own boards > that will likely not imitate either of these programming interfaces, but > what they do have will still probably be implemented in a CPLD or FPGA. > Likewise, Freescale may have future reference boards whose CPLD/FPGA is > not compatible. Will use a better name. > > So use better naming, and structure the code so it's easy to plug in > implementations for new or custom boards. > > > > > diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S > > > > index 20204fe..3285752 100644 > > > > --- a/arch/powerpc/kernel/head_fsl_booke.S > > > > +++ b/arch/powerpc/kernel/head_fsl_booke.S > > > > @@ -162,6 +162,19 @@ _ENTRY(__early_start) > > > > #include "fsl_booke_entry_mapping.S" > > > > #undef ENTRY_MAPPING_BOOT_SETUP > > > > > > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > > > + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */ > > > > + LOAD_REG_ADDR(r4, deep_sleep_flag) > > > > + lwz r3, 0(r4) > > > > + cmpwi r3, 0 > > > > + beq 11f > > > > + /* clear deep_sleep_flag */ > > > > + li r3, 0 > > > > + stw r3, 0(r4) > > > > + b fsl_deepsleep_resume > > > > +11: > > > > +#endif > > > > > > Why do you come in via the normal kernel entry, versus specifying a > > > direct entry point for deep sleep resume? How does U-Boot even know > > > what the normal entry is when resuming? > > > > I wish to return to a specified point (like 64-bit mode), but the code in > > fsl_booke_entry_mapping.S only can run in the first page. Because it > > only setups a temp mapping of 4KB. > > Why do you need the entry mapping on 32-bit but not 64-bit? fsl_booke_entry_mapping.S is for 32-bit. 64-bit calls initial_tlb_book3e() in exceptions-64e.S. > > > > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > > > +_ENTRY(__entry_deep_sleep) > > > > +/* > > > > + * Bootloader will jump to here when resuming from deep sleep. > > > > + * After executing the init code in fsl_booke_entry_mapping.S, > > > > + * will jump to the real resume entry. > > > > + */ > > > > + li r8, 1 > > > > + bl 12f > > > > +12: mflr r9 > > > > + addi r9, r9, (deep_sleep_flag - 12b) > > > > + stw r8, 0(r9) > > > > + b __early_start > > > > +deep_sleep_flag: > > > > + .long 0 > > > > +#endif > > > > > > It's a bit ambiguous to say "entry_deep_sleep" when it's resuming rather > > > than entering... > > > > How about __fsl_entry_resume? > > fsl_booke_deep_sleep_resume > > > > > +#define FSLDELAY(count) \ > > > > + li r3, (count)@l; \ > > > > + slwi r3, r3, 10; \ > > > > + mtctr r3; \ > > > > +101: nop; \ > > > > + bdnz 101b; > > > > > > You don't need a namespace prefix on local macros in a non-header file. > > > > > > Is the timebase stopped where you're calling this from? > > > > No. My purpose is to avoid jump in the last stage of entering deep sleep. > > Jump may cause problem at that time. > > "bdnz" is a jump. > > What problems do you think a jump will cause? I mean a far jump which can jump to an address which has not been prefetched in advance. I wish the code is executed in a restricted environment (predictable code and address). > > > > You also probably want to do a "sync, readback, data dependency, isync" > > > sequence to make sure that the store has hit CCSR before you begin your > > > delay (or is a delay required at all if you do that?). > > > > Yes. It is safer with a sync sequence. > > > > The DDR controller need some time to signal the external DDR modules to > > enter self refresh mode. > > Is it documented how much time it requires? > > -Scott No. -Chenhui
On Mon, 2014-03-17 at 19:19 +0800, Chenhui Zhao wrote: > On Fri, Mar 14, 2014 at 06:18:27PM -0500, Scott Wood wrote: > > On Wed, 2014-03-12 at 18:40 +0800, Chenhui Zhao wrote: > > > On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote: > > > > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote: > > > > > From: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > > > > > > > T1040 supports deep sleep feature, which can switch off most parts of > > > > > the SoC when it is in deep sleep mode. This way, it becomes more > > > > > energy-efficient. > > > > > > > > > > The DDR controller will also be powered off in deep sleep. Therefore, > > > > > the last stage (the latter part of fsl_dp_enter_low) will run without DDR > > > > > access. This piece of code and related TLBs will be prefetched. > > > > > > > > > > Due to the different initialization code between 32-bit and 64-bit, they > > > > > have seperate resume entry and precedure. > > > > > > > > > > The feature supports 32-bit and 64-bit kernel mode. > > > > > > > > > > Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > > --- > > > > > arch/powerpc/include/asm/booke_save_regs.h | 3 + > > > > > arch/powerpc/kernel/cpu_setup_fsl_booke.S | 17 ++ > > > > > arch/powerpc/kernel/head_fsl_booke.S | 30 +++ > > > > > arch/powerpc/platforms/85xx/Makefile | 2 +- > > > > > arch/powerpc/platforms/85xx/deepsleep.c | 201 +++++++++++++++++++ > > > > > arch/powerpc/platforms/85xx/qoriq_pm.c | 38 ++++ > > > > > arch/powerpc/platforms/85xx/sleep.S | 295 ++++++++++++++++++++++++++++ > > > > > arch/powerpc/sysdev/fsl_soc.h | 7 + > > > > > 8 files changed, 592 insertions(+), 1 deletions(-) > > > > > create mode 100644 arch/powerpc/platforms/85xx/deepsleep.c > > > > > create mode 100644 arch/powerpc/platforms/85xx/sleep.S > > > > > > > > > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h b/arch/powerpc/include/asm/booke_save_regs.h > > > > > index 87c357a..37c1f6c 100644 > > > > > --- a/arch/powerpc/include/asm/booke_save_regs.h > > > > > +++ b/arch/powerpc/include/asm/booke_save_regs.h > > > > > @@ -88,6 +88,9 @@ > > > > > #define HIBERNATION_FLAG 1 > > > > > #define DEEPSLEEP_FLAG 2 > > > > > > > > > > +#define CPLD_FLAG 1 > > > > > +#define FPGA_FLAG 2 > > > > > > > > What is this? > > > > > > We have two kind of boards, QDS and RDB. > > > They have different register map. Use the flag to indicate the current board is using which kind > > > of register map. > > > > CPLD versus FPGA is not a meaningful difference. We don't care what > > technology is used to implement programmable logic -- we care what > > programming interface is exposed. Customers will have their own boards > > that will likely not imitate either of these programming interfaces, but > > what they do have will still probably be implemented in a CPLD or FPGA. > > Likewise, Freescale may have future reference boards whose CPLD/FPGA is > > not compatible. > > Will use a better name. > > > > > So use better naming, and structure the code so it's easy to plug in > > implementations for new or custom boards. > > > > > > > diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S > > > > > index 20204fe..3285752 100644 > > > > > --- a/arch/powerpc/kernel/head_fsl_booke.S > > > > > +++ b/arch/powerpc/kernel/head_fsl_booke.S > > > > > @@ -162,6 +162,19 @@ _ENTRY(__early_start) > > > > > #include "fsl_booke_entry_mapping.S" > > > > > #undef ENTRY_MAPPING_BOOT_SETUP > > > > > > > > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > > > > + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */ > > > > > + LOAD_REG_ADDR(r4, deep_sleep_flag) > > > > > + lwz r3, 0(r4) > > > > > + cmpwi r3, 0 > > > > > + beq 11f > > > > > + /* clear deep_sleep_flag */ > > > > > + li r3, 0 > > > > > + stw r3, 0(r4) > > > > > + b fsl_deepsleep_resume > > > > > +11: > > > > > +#endif > > > > > > > > Why do you come in via the normal kernel entry, versus specifying a > > > > direct entry point for deep sleep resume? How does U-Boot even know > > > > what the normal entry is when resuming? > > > > > > I wish to return to a specified point (like 64-bit mode), but the code in > > > fsl_booke_entry_mapping.S only can run in the first page. Because it > > > only setups a temp mapping of 4KB. > > > > Why do you need the entry mapping on 32-bit but not 64-bit? > > fsl_booke_entry_mapping.S is for 32-bit. 64-bit calls > initial_tlb_book3e() in exceptions-64e.S. The answer I was looking for is that __entry_deep_sleep calls start_initialization_book3e which calls the code to handle it. But why is it driven from sleep.S on 64-bit but not on 32-bit? Why can't you make it so that the 32-bit TLB setup can be called into in a similar manner? > > > > > +#define FSLDELAY(count) \ > > > > > + li r3, (count)@l; \ > > > > > + slwi r3, r3, 10; \ > > > > > + mtctr r3; \ > > > > > +101: nop; \ > > > > > + bdnz 101b; > > > > > > > > You don't need a namespace prefix on local macros in a non-header file. > > > > > > > > Is the timebase stopped where you're calling this from? > > > > > > No. My purpose is to avoid jump in the last stage of entering deep sleep. > > > Jump may cause problem at that time. > > > > "bdnz" is a jump. > > > > What problems do you think a jump will cause? > > I mean a far jump which can jump to an address which has not been prefetched in > advance. I wish the code is executed in a restricted environment (predictable code > and address). Why would a timebase loop require a "far" jump? > > > > You also probably want to do a "sync, readback, data dependency, isync" > > > > sequence to make sure that the store has hit CCSR before you begin your > > > > delay (or is a delay required at all if you do that?). > > > > > > Yes. It is safer with a sync sequence. > > > > > > The DDR controller need some time to signal the external DDR modules to > > > enter self refresh mode. > > > > Is it documented how much time it requires? > > > > -Scott > > No. How do you know the current delay is adequate in all circumstances (e.g clock speeds), much less on future chips? Is it documented that a delay is needed at all, or is this just something that appeared to make it work? If the latter, what happens if you put the synchronization in, but leave out the delay? -Scott
On Sun, 2014-03-16 at 12:58 +0800, Kevin Hao wrote: > On Fri, Mar 14, 2014 at 05:26:27PM -0500, Scott Wood wrote: > > On Thu, 2014-03-13 at 15:46 +0800, Kevin Hao wrote: > > > On Wed, Mar 12, 2014 at 12:43:05PM -0500, Scott Wood wrote: > > > > > Shouldn't we use "readback, sync" here? The following is quoted form t4240RM: > > > > > To guarantee that the results of any sequence of writes to configuration > > > > > registers are in effect, the final configuration register write should be > > > > > immediately followed by a read of the same register, and that should be > > > > > followed by a SYNC instruction. Then accesses can safely be made to memory > > > > > regions affected by the configuration register write. > > > > > > > > I agree that the sync before the readback is probably not necessary, > > > > since transactions to the same address should already be ordered. > > > > > > > > A sync after the readback helps if you're trying to order the readback > > > > with subsequent memory accesses, though in that case wouldn't a sync > > > > alone (no readback) be adequate? > > > > > > No, we don't just want to order the subsequent memory access here. > > > The 'write, readback, sync' is the required sequence if we want to make > > > sure that the writing to CCSR register does really take effect. > > > > > > > Though maybe not always -- see the > > > > comment near the end of fsl_elbc_write_buf() in > > > > drivers/mtd/nand_fsl_elbc.c. I guess the readback does more than just > > > > make sure the device has seen the write, ensuring that the device has > > > > finished the transaction to the point of acting on another one. > > > > > > Agree. > > > > > > > > > > > The data dependency plus isync sequence, which is done by the normal I/O > > > > accessors used from C code, orders the readback versus all future > > > > instructions (not just I/O). The delay loop is not I/O. > > > > > > According to the PowerISA, the sequence 'load, date dependency, isync' only > > > order the load accesses. > > > > The point is to order the delay loop after the load, not to order > > storage versus storage. > > I think the point is to make sure that the writing of the CCSR_DDR_SDRAM_CFG_2 > does really take effect before we begin to delay loop. Yes. > The sequence "write, readback, sync" will guarantee this according to the manual. If you're referring to the section you quoted above, the manual does not say that. It only talks about when accesses "to memory regions affected by the configuration register write" can be safely made. > If we just want to > order the delay loop after the load, the following sequence should be enough: > store to CCSR_DDR_SDRAM_CFG_2 > load from CCSR_DDR_SDRAM_CFG_2 > isync or sync > delay loop > > Why do we need the 'date dependency' here? According to the e6500 manual, the > instructions can execute out of order, but complete in order. So I am really > wondering why we need to order the load and the following delay loop if there > is no intention to order the storage access? The data (not date) dependency means that the twi will not execute until after the load instruction returns data (thus, after the device has responded to the load). The twi, being a flow control instruction rather than a storage instruction, should be fully ordered by isync. From the isync description in the ISA: "Except as described in the preceding sentence, the isync instruction may complete before storage accesses associated with instructions preceding the isync instruction have been performed." I don't know if that really applies to loads (as opposed to stores), and it probably doesn't apply to guarded loads, but I feel safer leaving the twi in. As for sync instead of isync, I see nothing to indicate that it would be adequate (though it may be that the only reason there needed to be a delay loop in the first place was the lack of a readback/sync before doing other I/O, in which case this is moot). > > This is a sequence we're already using on all of our I/O loads > > (excluding accesses like in this patch that don't use the standard > > accessors). I'm confident that it works even if it's not > > architecturally guaranteed. > > This sequence is used to order the load and the followed storage access. It's also used to order loads versus other things, such as enabling MSR[EE]. > And this is guaranteed by the architecture. But I don't think it is suitable > for a case like this. The following is quoted from PowerISA: > Because stores cannot be performed “out-of-order” > (see Book III), if a Store instruction depends on the > value returned by a preceding Load instruction > (because the value returned by the Load is used to > compute either the effective address specified by the > Store or the value to be stored), the corresponding stor- > age accesses are performed in program order. The > same applies if whether the Store instruction is exe- > cuted depends on a conditional Branch instruction that > in turn depends on the value returned by a preceding > Load instruction. > > Because an isync instruction prevents the execution of > instructions following the isync until instructions pre- > ceding the isync have completed, if an isync follows a > conditional Branch instruction that depends on the > value returned by a preceding Load instruction, the > load on which the Branch depends is performed before > any loads caused by instructions following the isync. > > I think the above description would guarantee that the load will be performed > before any storage access (both load and store) following the isync in the > following scenario: > lwz r4, 0(r3) > twi 0, r4, 0 > isync I think it's a poorly worded section, but yes, it guarantees both loads and stores -- unnecessarily doing so in separate places with different wording. But by the definition of isync I don't see why it does not apply to *any* instruction after the isync, not just loads and stores. > > I'm not sure that there exists a clear > > architectural way of synchronizing non-storage instructions relative to > > storage instructions. > > Isn't what the execution synchronization instructions such as sync, isync, mtmsr > do? No. Execution synchronizing just says that the previous instructions "have completed to a point at which they have reported all the exceptions they will cause" (I'm assuming this doesn't include machine check error reports), and that the previous instructions won't be affected by context changes after the execution synchronizing instruction, not that the previous instructions are fully completed. > > Given that isync is documented as preventing any execution of > > instructions after the isync until all previous instructions complete, > > it doesn't seem to make sense for the architecture to explicitly talk > > about loads (as opposed to any other instruction) following a load, > > dependent conditional branch, isync sequence. > > Sorry, I didn't get what you mean. I mean that I do not understand why it says, "the load on which the Branch depends is performed before any loads caused by instructions following the isync" rather than "the Load on which the Branch depends is performed before any instructions following the isync". > > > So if we want to order all the storage access as well > > > as execution synchronization, we should choose sync here. > > > > Do we need execution synchronization or context synchronization? > > There is no context-altering instruction here, so I think an execution > synchronizing instruction should be enough here. Is the ISA ever explicit about what constitutes "context"? In any case, just because we don't need that aspect of context synchronization doesn't mean execution synchronization is enough. The behavior we want is described in the isync instruction specifically, not in "context synchronization" or "execution synchronization". > > The t4240 RM section that talks about a readback and a sync is in the > > context of subsequent memory operations ("Then accesses can safely be > > made to memory regions affected..."), not arbitrary instructions. > > I assume that this sequence also guarantee that the writing does take effect. You may assume that, but the manual doesn't say that. Sync could (AFAIK) be implemented by emitting a barrier on the bus, or delaying future storage accesses, rather than waiting for everything to have finished before proceeding. > > In any case, this is not performance critical and thus it's better to > > oversynchronize than undersynchronize. > > On the contrary I think that sync is oversynchronize instead of > undersynchronize. It not only provide the execution synchronizing but also > order all the storage accesses. That is why I prefer the sync. :-) sync is not a superset of isync. isync is not a superset of sync. If you want to oversynchronize to be safe, do both (though even that isn't equivalent to a barrier that orders everything, which is partly why a readback is needed). -Scott
On Tue, Mar 18, 2014 at 05:42:09PM -0500, Scott Wood wrote: > On Mon, 2014-03-17 at 19:19 +0800, Chenhui Zhao wrote: > > On Fri, Mar 14, 2014 at 06:18:27PM -0500, Scott Wood wrote: > > > On Wed, 2014-03-12 at 18:40 +0800, Chenhui Zhao wrote: > > > > On Tue, Mar 11, 2014 at 08:10:24PM -0500, Scott Wood wrote: > > > > > On Fri, 2014-03-07 at 12:58 +0800, Chenhui Zhao wrote: > > > > > > From: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > > > > > > > > > T1040 supports deep sleep feature, which can switch off most parts of > > > > > > the SoC when it is in deep sleep mode. This way, it becomes more > > > > > > energy-efficient. > > > > > > > > > > > > The DDR controller will also be powered off in deep sleep. Therefore, > > > > > > the last stage (the latter part of fsl_dp_enter_low) will run without DDR > > > > > > access. This piece of code and related TLBs will be prefetched. > > > > > > > > > > > > Due to the different initialization code between 32-bit and 64-bit, they > > > > > > have seperate resume entry and precedure. > > > > > > > > > > > > The feature supports 32-bit and 64-bit kernel mode. > > > > > > > > > > > > Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com> > > > > > > --- > > > > > > arch/powerpc/include/asm/booke_save_regs.h | 3 + > > > > > > arch/powerpc/kernel/cpu_setup_fsl_booke.S | 17 ++ > > > > > > arch/powerpc/kernel/head_fsl_booke.S | 30 +++ > > > > > > arch/powerpc/platforms/85xx/Makefile | 2 +- > > > > > > arch/powerpc/platforms/85xx/deepsleep.c | 201 +++++++++++++++++++ > > > > > > arch/powerpc/platforms/85xx/qoriq_pm.c | 38 ++++ > > > > > > arch/powerpc/platforms/85xx/sleep.S | 295 ++++++++++++++++++++++++++++ > > > > > > arch/powerpc/sysdev/fsl_soc.h | 7 + > > > > > > 8 files changed, 592 insertions(+), 1 deletions(-) > > > > > > create mode 100644 arch/powerpc/platforms/85xx/deepsleep.c > > > > > > create mode 100644 arch/powerpc/platforms/85xx/sleep.S > > > > > > > > > > > > diff --git a/arch/powerpc/include/asm/booke_save_regs.h b/arch/powerpc/include/asm/booke_save_regs.h > > > > > > index 87c357a..37c1f6c 100644 > > > > > > --- a/arch/powerpc/include/asm/booke_save_regs.h > > > > > > +++ b/arch/powerpc/include/asm/booke_save_regs.h > > > > > > @@ -88,6 +88,9 @@ > > > > > > #define HIBERNATION_FLAG 1 > > > > > > #define DEEPSLEEP_FLAG 2 > > > > > > > > > > > > +#define CPLD_FLAG 1 > > > > > > +#define FPGA_FLAG 2 > > > > > > > > > > What is this? > > > > > > > > We have two kind of boards, QDS and RDB. > > > > They have different register map. Use the flag to indicate the current board is using which kind > > > > of register map. > > > > > > CPLD versus FPGA is not a meaningful difference. We don't care what > > > technology is used to implement programmable logic -- we care what > > > programming interface is exposed. Customers will have their own boards > > > that will likely not imitate either of these programming interfaces, but > > > what they do have will still probably be implemented in a CPLD or FPGA. > > > Likewise, Freescale may have future reference boards whose CPLD/FPGA is > > > not compatible. > > > > Will use a better name. > > > > > > > > So use better naming, and structure the code so it's easy to plug in > > > implementations for new or custom boards. > > > > > > > > > diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S > > > > > > index 20204fe..3285752 100644 > > > > > > --- a/arch/powerpc/kernel/head_fsl_booke.S > > > > > > +++ b/arch/powerpc/kernel/head_fsl_booke.S > > > > > > @@ -162,6 +162,19 @@ _ENTRY(__early_start) > > > > > > #include "fsl_booke_entry_mapping.S" > > > > > > #undef ENTRY_MAPPING_BOOT_SETUP > > > > > > > > > > > > +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) > > > > > > + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */ > > > > > > + LOAD_REG_ADDR(r4, deep_sleep_flag) > > > > > > + lwz r3, 0(r4) > > > > > > + cmpwi r3, 0 > > > > > > + beq 11f > > > > > > + /* clear deep_sleep_flag */ > > > > > > + li r3, 0 > > > > > > + stw r3, 0(r4) > > > > > > + b fsl_deepsleep_resume > > > > > > +11: > > > > > > +#endif > > > > > > > > > > Why do you come in via the normal kernel entry, versus specifying a > > > > > direct entry point for deep sleep resume? How does U-Boot even know > > > > > what the normal entry is when resuming? > > > > > > > > I wish to return to a specified point (like 64-bit mode), but the code in > > > > fsl_booke_entry_mapping.S only can run in the first page. Because it > > > > only setups a temp mapping of 4KB. > > > > > > Why do you need the entry mapping on 32-bit but not 64-bit? > > > > fsl_booke_entry_mapping.S is for 32-bit. 64-bit calls > > initial_tlb_book3e() in exceptions-64e.S. > > The answer I was looking for is that __entry_deep_sleep calls > start_initialization_book3e which calls the code to handle it. > > But why is it driven from sleep.S on 64-bit but not on 32-bit? Why > can't you make it so that the 32-bit TLB setup can be called into in a > similar manner? Yes. I also wish to do like this. As I mentioned, the problem in 32-bit is that the TLB setup code in fsl_booke_entry_mapping.S only setups a temp mapping of 4KB, so these code only can run in this 4KB address space. But the code in sleep.S is outside of the 4KB space. So can't put the TLB setup code in sleep.S. There are two method to solve it. 1) The current method is running the TLB setup code of fsl_booke_entry_mapping.S in the 4KB space, then jump to the code of sleep.S. 2) extend the temp mapping space in the TLB setup code to cover kernel, say 4MB or 8MB. But not sure if there are any side effects. > > > > > > > +#define FSLDELAY(count) \ > > > > > > + li r3, (count)@l; \ > > > > > > + slwi r3, r3, 10; \ > > > > > > + mtctr r3; \ > > > > > > +101: nop; \ > > > > > > + bdnz 101b; > > > > > > > > > > You don't need a namespace prefix on local macros in a non-header file. > > > > > > > > > > Is the timebase stopped where you're calling this from? > > > > > > > > No. My purpose is to avoid jump in the last stage of entering deep sleep. > > > > Jump may cause problem at that time. > > > > > > "bdnz" is a jump. > > > > > > What problems do you think a jump will cause? > > > > I mean a far jump which can jump to an address which has not been prefetched in > > advance. I wish the code is executed in a restricted environment (predictable code > > and address). > > Why would a timebase loop require a "far" jump? I mean there is far jump in udely(). Do you mean using a timebase loop in the macro FSLDELAY? If so, I agree. > > > > > > You also probably want to do a "sync, readback, data dependency, isync" > > > > > sequence to make sure that the store has hit CCSR before you begin your > > > > > delay (or is a delay required at all if you do that?). > > > > > > > > Yes. It is safer with a sync sequence. > > > > > > > > The DDR controller need some time to signal the external DDR modules to > > > > enter self refresh mode. > > > > > > Is it documented how much time it requires? > > > > > > -Scott > > > > No. > > How do you know the current delay is adequate in all circumstances (e.g > clock speeds), much less on future chips? Is it documented that a delay > is needed at all, or is this just something that appeared to make it > work? If the latter, what happens if you put the synchronization in, > but leave out the delay? > > -Scott The code controls external parts (FPGA/CPLD, DDR module) to act, and the sequent code must wait until external parts complete. We can't get an ack from external parts, so use delay to make sure the sequence and insert enough time to wait. -Chenhui
On Tue, Mar 18, 2014 at 06:18:54PM -0500, Scott Wood wrote: > > The sequence "write, readback, sync" will guarantee this according to the manual. > > If you're referring to the section you quoted above, the manual does not > say that. It only talks about when accesses "to memory regions affected > by the configuration register write" can be safely made. To guarantee that the results of any sequence of writes to configuration registers are in effect, the final configuration register write should be immediately followed by a read of the same register, and that should be followed by a SYNC instruction. Then accesses can safely be made to memory regions affected by the configuration register write. According to the above description in t4240 RM manual (2.6.1 Accessing CCSR Memory from the Local Processor), that the writing to configuration register takes effect is a prerequisite for the memory access to the affected regions. > > > If we just want to > > order the delay loop after the load, the following sequence should be enough: > > store to CCSR_DDR_SDRAM_CFG_2 > > load from CCSR_DDR_SDRAM_CFG_2 > > isync or sync > > delay loop > > > > Why do we need the 'date dependency' here? According to the e6500 manual, the > > instructions can execute out of order, but complete in order. So I am really > > wondering why we need to order the load and the following delay loop if there > > is no intention to order the storage access? > > The data (not date) dependency means that the twi will not execute until > after the load instruction returns data (thus, after the device has > responded to the load). The twi, being a flow control instruction > rather than a storage instruction, should be fully ordered by isync. > > From the isync description in the ISA: "Except as described in the > preceding sentence, the isync instruction may complete before storage > accesses associated with instructions preceding the > isync instruction have been performed." > > I don't know if that really applies to loads (as opposed to stores), and > it probably doesn't apply to guarded loads, but I feel safer leaving the > twi in. > > As for sync instead of isync, I see nothing to indicate that it would be > adequate (though it may be that the only reason there needed to be a > delay loop in the first place was the lack of a readback/sync before > doing other I/O, in which case this is moot). OK, so the intention of 'twi, isync' following the load is not to order the following storage access, but order the following delay loop instructions, right? But according to the e6500 manual, the instructions complete in order. The following is the definition of 'complete': Complete—An instruction is eligible to complete after it finishes executing and makes its results available for subsequent instructions. Instructions must complete in order from the bottom two entries of the completion queue (CQ). The completion unit coordinates how instructions (which may have executed out of order) affect architected registers to ensure the appearance of serial execution. This guarantees that the completed instruction and all previous instructions can cause no exceptions. An instruction completes when it is retired, that is, deleted from the CQ. So the following delay loop instructions should never complete before the complete of the load instruction. After the complete of load instruction, the data should already been updated to the architecture register. So we can guarantee that the load instruction get the data (the device has responded to the load) before the complete of the following delay loop instructions, why do we need the additional 'twi, isync'? Then for a case like this (which don't need order the following storage access), I think the following sequence should be enough: write to configuration register read back delay loop > I mean that I do not understand why it says, "the load on which the > Branch depends is performed before any loads caused by instructions > following the isync" rather than "the Load on which the Branch depends > is performed before any instructions following the isync". When we talk about the 'performed' here, we should only mean the effect of storage access. The following is the definition of 'performed': performed A load or instruction fetch by a processor or mech- anism (P1) is performed with respect to any pro- cessor or mechanism (P2) when the value to be returned by the load or instruction fetch can no longer be changed by a store by P2. A store by P1 is performed with respect to P2 when a load by P2 from the location accessed by the store will return the value stored (or a value stored subsequently). An instruction cache block invalidation by P1 is performed with respect to P2 when an instruction fetch by P2 will not be satisfied from the copy of the block that existed in its instruction cache when the instruction causing the invalidation was exe- cuted, and similarly for a data cache block invalida- tion. The preceding definitions apply regardless of whether P1 and P2 are the same entity. > > > > > So if we want to order all the storage access as well > > > > as execution synchronization, we should choose sync here. > > > > > > Do we need execution synchronization or context synchronization? > > > > There is no context-altering instruction here, so I think an execution > > synchronizing instruction should be enough here. > > Is the ISA ever explicit about what constitutes "context"? The following is the definition of context-altering instruction: An instruction that alters the context in which data addresses or instruction addresses are interpreted, or in which instructions are executed or data accesses are performed, is called a context-altering instruction. So the context should be: - in which data addresses or instruction addresses are interpreted - in which instructions are executed - in which data accesses are performed Thanks, Kevin
From: Kevin Hao > Sent: 20 March 2014 11:48 > To: Scott Wood > Cc: linuxppc-dev@lists.ozlabs.org; Chenhui Zhao; Jason.Jin@freescale.com; linux-kernel@vger.kernel.org > Subject: Re: [PATCH 9/9] powerpc/pm: support deep sleep feature on T1040 > > On Tue, Mar 18, 2014 at 06:18:54PM -0500, Scott Wood wrote: > > > The sequence "write, readback, sync" will guarantee this according to the manual. > > > > If you're referring to the section you quoted above, the manual does not > > say that. It only talks about when accesses "to memory regions affected > > by the configuration register write" can be safely made. > > To guarantee that the results of any sequence of writes to configuration > registers are in effect, the final configuration register write should be > immediately followed by a read of the same register, and that should be > followed by a SYNC instruction. Then accesses can safely be made to memory > regions affected by the configuration register write. That sort of sequence is need to force the operations through any external bus - after the cpu itself has issued the bus cycles. Mostly required because writes are often 'posted' (ie address and data latched, and then performed synchronously). > According to the above description in t4240 RM manual (2.6.1 Accessing CCSR > Memory from the Local Processor), that the writing to configuration register > takes effect is a prerequisite for the memory access to the affected regions. ... > OK, so the intention of 'twi, isync' following the load is not to order the > following storage access, but order the following delay loop instructions, > right I tried to work out what the 'twi, isync' instructions were for (in in_le32()). The best I could come up with was to ensure a synchronous bus-fault. But bus faults are probably only expected during device probing - not normal operation, and the instructions will have a significant cost. Additionally in_le32() and out_le32() both start with a 'sync' instruction. In many cases that isn't needed either - an explicit iosync() can be used after groups of instructions. David
On Thu, 2014-03-20 at 11:59 +0000, David Laight wrote: > I tried to work out what the 'twi, isync' instructions were for (in in_le32()). > The best I could come up with was to ensure a synchronous bus-fault. > But bus faults are probably only expected during device probing - not > normal operation, and the instructions will have a significant cost. > > Additionally in_le32() and out_le32() both start with a 'sync' instruction. > In many cases that isn't needed either - an explicit iosync() can be > used after groups of instructions. The idea is that it's better to be maximally safe by default, and let performance critical sections be optimized using raw accessors and explicit synchronization if needed, than to have hard-to-debug bugs due to missing/wrong sync. A lot of I/O is slow enough that the performance impact doesn't really matter, but the brain-time cost of getting the sync right is still there. -Scott
On Thu, 2014-03-20 at 19:47 +0800, Kevin Hao wrote: > OK, so the intention of 'twi, isync' following the load is not to order the > following storage access, but order the following delay loop instructions, > right? But according to the e6500 manual, the instructions complete in order. > The following is the definition of 'complete': > Complete—An instruction is eligible to complete after it finishes executing > and makes its results available for subsequent instructions. Instructions > must complete in order from the bottom two entries of the > completion queue (CQ). The completion unit coordinates how instructions (which > may have executed out of order) affect architected registers to ensure the > appearance of serial execution. This guarantees that the completed instruction > and all previous instructions can cause no exceptions. An instruction completes > when it is retired, that is, deleted from the CQ. > > So the following delay loop instructions should never complete before the > complete of the load instruction. We don't want the delay loop instructions to *start* until the load has completed. Completion of the loop only matters when ordering the loop versus post-loop actions (and again, there we'd want the loop to complete before subsequent actions start). > > > > > So if we want to order all the storage access as well > > > > > as execution synchronization, we should choose sync here. > > > > > > > > Do we need execution synchronization or context synchronization? > > > > > > There is no context-altering instruction here, so I think an execution > > > synchronizing instruction should be enough here. > > > > Is the ISA ever explicit about what constitutes "context"? > > The following is the definition of context-altering instruction: > An instruction that alters the context in which data > addresses or instruction addresses are interpreted, or > in which instructions are executed or data accesses are > performed, is called a context-altering instruction. > > So the context should be: > - in which data addresses or instruction addresses are interpreted > - in which instructions are executed > - in which data accesses are performed By that definition, a store to CCSR could easily change the context in which data accesses are performed, by changing a mapping. But still, nothing in the above defines "context" -- or rather, it does so circularly. While it makes intuitive sense that it would be limited to context that lives within the core, rather than the rest of the system, I don't think the ISA generally distinguishes between the two. -Scott
On Wed, 2014-03-19 at 08:56 +0800, Chenhui Zhao wrote: > On Tue, Mar 18, 2014 at 05:42:09PM -0500, Scott Wood wrote: > > On Mon, 2014-03-17 at 19:19 +0800, Chenhui Zhao wrote: > > > On Fri, Mar 14, 2014 at 06:18:27PM -0500, Scott Wood wrote: > > > > Why do you need the entry mapping on 32-bit but not 64-bit? > > > > > > fsl_booke_entry_mapping.S is for 32-bit. 64-bit calls > > > initial_tlb_book3e() in exceptions-64e.S. > > > > The answer I was looking for is that __entry_deep_sleep calls > > start_initialization_book3e which calls the code to handle it. > > > > But why is it driven from sleep.S on 64-bit but not on 32-bit? Why > > can't you make it so that the 32-bit TLB setup can be called into in a > > similar manner? > > Yes. I also wish to do like this. As I mentioned, the problem in 32-bit > is that the TLB setup code in fsl_booke_entry_mapping.S only setups a temp > mapping of 4KB, so these code only can run in this 4KB address space. But the > code in sleep.S is outside of the 4KB space. So can't put the TLB setup > code in sleep.S. There are two method to solve it. > 1) The current method is running the TLB setup code of fsl_booke_entry_mapping.S in the 4KB > space, then jump to the code of sleep.S. > 2) extend the temp mapping space in the TLB setup code to cover kernel, say 4MB or 8MB. But > not sure if there are any side effects. fsl_booke_entry_mapping.S creates a 64M entry. The 4K entry is only temporary while in AS1 -- it shouldn't matter if the address you return to when leaving fsl_booke_entry_mapping.S is outside the 4K, as long as it's within the 64M entry. Or am I missing something? > > > > > > > +#define FSLDELAY(count) \ > > > > > > > + li r3, (count)@l; \ > > > > > > > + slwi r3, r3, 10; \ > > > > > > > + mtctr r3; \ > > > > > > > +101: nop; \ > > > > > > > + bdnz 101b; > > > > > > > > > > > > You don't need a namespace prefix on local macros in a non-header file. > > > > > > > > > > > > Is the timebase stopped where you're calling this from? > > > > > > > > > > No. My purpose is to avoid jump in the last stage of entering deep sleep. > > > > > Jump may cause problem at that time. > > > > > > > > "bdnz" is a jump. > > > > > > > > What problems do you think a jump will cause? > > > > > > I mean a far jump which can jump to an address which has not been prefetched in > > > advance. I wish the code is executed in a restricted environment (predictable code > > > and address). > > > > Why would a timebase loop require a "far" jump? > > I mean there is far jump in udely(). > > Do you mean using a timebase loop in the macro FSLDELAY? If so, I agree. Yes, I meant a timebase loop, not udelay(). > > > > > > You also probably want to do a "sync, readback, data dependency, isync" > > > > > > sequence to make sure that the store has hit CCSR before you begin your > > > > > > delay (or is a delay required at all if you do that?). > > > > > > > > > > Yes. It is safer with a sync sequence. > > > > > > > > > > The DDR controller need some time to signal the external DDR modules to > > > > > enter self refresh mode. > > > > > > > > Is it documented how much time it requires? > > > > > > > > -Scott > > > > > > No. > > > > How do you know the current delay is adequate in all circumstances (e.g > > clock speeds), much less on future chips? Is it documented that a delay > > is needed at all, or is this just something that appeared to make it > > work? If the latter, what happens if you put the synchronization in, > > but leave out the delay? > > > > -Scott > > The code controls external parts (FPGA/CPLD, DDR module) to act, and > the sequent code must wait until external parts complete. We can't get > an ack from external parts, so use delay to make sure the sequence and > insert enough time to wait. It would be good if you could get the hardware designers to provide an upper bound for how long we need to wait. -Scott
From: Scott Wood [mailto:scottwood@freescale.com] > On Thu, 2014-03-20 at 11:59 +0000, David Laight wrote: > > I tried to work out what the 'twi, isync' instructions were for (in in_le32()). > > The best I could come up with was to ensure a synchronous bus-fault. > > But bus faults are probably only expected during device probing - not > > normal operation, and the instructions will have a significant cost. > > > > Additionally in_le32() and out_le32() both start with a 'sync' instruction. > > In many cases that isn't needed either - an explicit iosync() can be > > used after groups of instructions. > > The idea is that it's better to be maximally safe by default, and let > performance critical sections be optimized using raw accessors and > explicit synchronization if needed, than to have hard-to-debug bugs due > to missing/wrong sync. A lot of I/O is slow enough that the performance > impact doesn't really matter, but the brain-time cost of getting the > sync right is still there. Hmmm.... That might be an excuse for the 'sync', but not the twi and isync. I was setting up a dma request (for the ppc 83xx PCIe bridge) and was doing back to back little-endian writes into memory. I had difficulty finding and including header files containing the definitions for byteswapped accesses I needed. arch/powerpc/include/asm/swab.h contains some - but I couldn't work out how to get it included (apart from giving the full path). In any case you need to understand when synchronisation is required - otherwise you will get it wrong. Especially since non-byteswapped accesses are done by direct access. David
On Fri, 2014-03-21 at 09:21 +0000, David Laight wrote: > From: Scott Wood [mailto:scottwood@freescale.com] > > On Thu, 2014-03-20 at 11:59 +0000, David Laight wrote: > > > I tried to work out what the 'twi, isync' instructions were for (in in_le32()). > > > The best I could come up with was to ensure a synchronous bus-fault. > > > But bus faults are probably only expected during device probing - not > > > normal operation, and the instructions will have a significant cost. > > > > > > Additionally in_le32() and out_le32() both start with a 'sync' instruction. > > > In many cases that isn't needed either - an explicit iosync() can be > > > used after groups of instructions. > > > > The idea is that it's better to be maximally safe by default, and let > > performance critical sections be optimized using raw accessors and > > explicit synchronization if needed, than to have hard-to-debug bugs due > > to missing/wrong sync. A lot of I/O is slow enough that the performance > > impact doesn't really matter, but the brain-time cost of getting the > > sync right is still there. > > Hmmm.... > > That might be an excuse for the 'sync', but not the twi and isync. That might be true if I/O is always cache inhibited and guarded, in which case I think we can rely on that to ensure that the load has completed before we do things like wrtee or rfi. In any case, I'd want to hear Ben's explanation. > I was setting up a dma request (for the ppc 83xx PCIe bridge) and > was doing back to back little-endian writes into memory. > I had difficulty finding and including header files containing > the definitions for byteswapped accesses I needed. > arch/powerpc/include/asm/swab.h contains some - but I couldn't > work out how to get it included (apart from giving the full path). > > In any case you need to understand when synchronisation is > required - otherwise you will get it wrong. > Especially since non-byteswapped accesses are done by direct > access. Yes, it's bad that rawness combines the lack of byteswapping with the lack of synchronization. Ideally the raw accessors would also come in big and little endian form, plus a native endian form if it's really needed. -Scott
diff --git a/arch/powerpc/include/asm/booke_save_regs.h b/arch/powerpc/include/asm/booke_save_regs.h index 87c357a..37c1f6c 100644 --- a/arch/powerpc/include/asm/booke_save_regs.h +++ b/arch/powerpc/include/asm/booke_save_regs.h @@ -88,6 +88,9 @@ #define HIBERNATION_FLAG 1 #define DEEPSLEEP_FLAG 2 +#define CPLD_FLAG 1 +#define FPGA_FLAG 2 + #ifndef __ASSEMBLY__ extern void booke_cpu_state_save(void *buf, int type); extern void *booke_cpu_state_restore(void *buf, int type); diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S index e59d6de..ea9bc28 100644 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S @@ -318,6 +318,23 @@ flush_backside_L2_cache: 2: blr +#define CPC_CPCCSR0 0x0 +#define CPC_CPCCSR0_CPCFL 0x800 + +/* r3 : the base address of CPC */ +_GLOBAL(fsl_flush_cpc_cache) + lwz r6, CPC_CPCCSR0(r3) + ori r6, r6, CPC_CPCCSR0_CPCFL + stw r6, CPC_CPCCSR0(r3) + sync + + /* Wait until completing the flush */ +1: lwz r6, CPC_CPCCSR0(r3) + andi. r6, r6, CPC_CPCCSR0_CPCFL + bne 1b + + blr + _GLOBAL(__flush_caches_e500v2) mflr r0 bl flush_dcache_L1 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 20204fe..3285752 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -162,6 +162,19 @@ _ENTRY(__early_start) #include "fsl_booke_entry_mapping.S" #undef ENTRY_MAPPING_BOOT_SETUP +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) + /* if deep_sleep_flag != 0, jump to the deep sleep resume entry */ + LOAD_REG_ADDR(r4, deep_sleep_flag) + lwz r3, 0(r4) + cmpwi r3, 0 + beq 11f + /* clear deep_sleep_flag */ + li r3, 0 + stw r3, 0(r4) + b fsl_deepsleep_resume +11: +#endif + set_ivor: /* Establish the interrupt vector offsets */ SET_IVOR(0, CriticalInput); @@ -343,6 +356,23 @@ set_ivor: lwz r11, 0(r12); /* Get Linux PTE */ #endif +#if defined(CONFIG_SUSPEND) && defined(CONFIG_FSL_CORENET_RCPM) +_ENTRY(__entry_deep_sleep) +/* + * Bootloader will jump to here when resuming from deep sleep. + * After executing the init code in fsl_booke_entry_mapping.S, + * will jump to the real resume entry. + */ + li r8, 1 + bl 12f +12: mflr r9 + addi r9, r9, (deep_sleep_flag - 12b) + stw r8, 0(r9) + b __early_start +deep_sleep_flag: + .long 0 +#endif + /* * Interrupt vector entry code * diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile index 7fae817..9a4ea86 100644 --- a/arch/powerpc/platforms/85xx/Makefile +++ b/arch/powerpc/platforms/85xx/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_SMP) += smp.o ifeq ($(CONFIG_FSL_CORENET_RCPM), y) -obj-$(CONFIG_SUSPEND) += qoriq_pm.o +obj-$(CONFIG_SUSPEND) += qoriq_pm.o deepsleep.o sleep.o endif obj-y += common.o diff --git a/arch/powerpc/platforms/85xx/deepsleep.c b/arch/powerpc/platforms/85xx/deepsleep.c new file mode 100644 index 0000000..ddd7185 --- /dev/null +++ b/arch/powerpc/platforms/85xx/deepsleep.c @@ -0,0 +1,201 @@ +/* + * Support deep sleep feature + * + * Copyright 2014 Freescale Semiconductor Inc. + * + * Author: Chenhui Zhao <chenhui.zhao@freescale.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of_platform.h> +#include <linux/of_address.h> +#include <asm/machdep.h> +#include <sysdev/fsl_soc.h> +#include <asm/booke_save_regs.h> + +#define SIZE_1MB 0x100000 +#define SIZE_2MB 0x200000 + +#define CCSR_SCFG_DPSLPCR 0xfc000 +#define CCSR_SCFG_DPSLPCR_WDRR_EN 0x1 +#define CCSR_SCFG_SPARECR2 0xfc504 +#define CCSR_SCFG_SPARECR3 0xfc508 + +#define CCSR_GPIO1_GPDIR 0x130000 +#define CCSR_GPIO1_GPODR 0x130004 +#define CCSR_GPIO1_GPDAT 0x130008 +#define CCSR_GPIO1_GPDIR_29 0x4 + +/* 128 bytes buffer for restoring data broke by DDR training initialization */ +#define DDR_BUF_SIZE 128 +static u8 ddr_buff[DDR_BUF_SIZE] __aligned(64); + +static void *dcsr_base, *ccsr_base, *pld_base; +static int pld_flag; + +int fsl_dp_iomap(void) +{ + struct device_node *np; + const u32 *prop; + int ret = 0; + u64 ccsr_phy_addr, dcsr_phy_addr; + + np = of_find_node_by_type(NULL, "soc"); + if (!np) { + pr_err("%s: Can't find the node of \"soc\"\n", __func__); + ret = -EINVAL; + goto ccsr_err; + } + prop = of_get_property(np, "ranges", NULL); + if (!prop) { + pr_err("%s: Can't find the property of \"ranges\"\n", __func__); + of_node_put(np); + ret = -EINVAL; + goto ccsr_err; + } + ccsr_phy_addr = of_translate_address(np, prop + 1); + ccsr_base = ioremap((phys_addr_t)ccsr_phy_addr, SIZE_2MB); + of_node_put(np); + if (!ccsr_base) { + ret = -ENOMEM; + goto ccsr_err; + } + + np = of_find_compatible_node(NULL, NULL, "fsl,dcsr"); + if (!np) { + pr_err("%s: Can't find the node of \"fsl,dcsr\"\n", __func__); + ret = -EINVAL; + goto dcsr_err; + } + prop = of_get_property(np, "ranges", NULL); + if (!prop) { + pr_err("%s: Can't find the property of \"ranges\"\n", __func__); + of_node_put(np); + ret = -EINVAL; + goto dcsr_err; + } + dcsr_phy_addr = of_translate_address(np, prop + 1); + dcsr_base = ioremap((phys_addr_t)dcsr_phy_addr, SIZE_1MB); + of_node_put(np); + if (!dcsr_base) { + ret = -ENOMEM; + goto dcsr_err; + } + + np = of_find_compatible_node(NULL, NULL, "fsl,fpga-qixis"); + if (np) { + pld_flag = FPGA_FLAG; + } else { + np = of_find_compatible_node(NULL, NULL, "fsl,p104xrdb-cpld"); + if (np) { + pld_flag = CPLD_FLAG; + } else { + pr_err("%s: Can't find the FPGA/CPLD node\n", + __func__); + ret = -EINVAL; + goto pld_err; + } + } + pld_base = of_iomap(np, 0); + of_node_put(np); + + return 0; + +pld_err: + iounmap(dcsr_base); +dcsr_err: + iounmap(ccsr_base); +ccsr_err: + ccsr_base = NULL; + dcsr_base = NULL; + pld_base = NULL; + return ret; +} + +void fsl_dp_iounmap(void) +{ + if (dcsr_base) { + iounmap(dcsr_base); + dcsr_base = NULL; + } + + if (ccsr_base) { + iounmap(ccsr_base); + ccsr_base = NULL; + } + + if (pld_base) { + iounmap(pld_base); + pld_base = NULL; + } +} + +static void fsl_dp_ddr_save(void *ccsr_base) +{ + u32 ddr_buff_addr; + + /* + * DDR training initialization will break 128 bytes at the beginning + * of DDR, therefore, save them so that the bootloader will restore + * them. Assume that DDR is mapped to the address space started with + * CONFIG_PAGE_OFFSET. + */ + memcpy(ddr_buff, (void *)CONFIG_PAGE_OFFSET, DDR_BUF_SIZE); + + /* assume ddr_buff is in the physical address space of 4GB */ + ddr_buff_addr = (u32)(__pa(ddr_buff) & 0xffffffff); + + /* + * the bootloader will restore the first 128 bytes of DDR from + * the location indicated by the register SPARECR3 + */ + out_be32(ccsr_base + CCSR_SCFG_SPARECR3, ddr_buff_addr); +} + +static void fsl_dp_set_resume_pointer(void *ccsr_base) +{ + u32 resume_addr; + + /* the bootloader will finally jump to this address to return kernel */ +#ifdef CONFIG_PPC32 + resume_addr = (u32)(__pa(__entry_deep_sleep)); +#else + resume_addr = (u32)(__pa(*(u64 *)__entry_deep_sleep) & 0xffffffff); +#endif + + /* use the register SPARECR2 to save the resume address */ + out_be32(ccsr_base + CCSR_SCFG_SPARECR2, resume_addr); + +} + +int fsl_enter_epu_deepsleep(void) +{ + + fsl_dp_ddr_save(ccsr_base); + + fsl_dp_set_resume_pointer(ccsr_base); + + /* enable Warm Device Reset request. */ + setbits32(ccsr_base + CCSR_SCFG_DPSLPCR, CCSR_SCFG_DPSLPCR_WDRR_EN); + + /* set GPIO1_29 as an output pin (not open-drain), and output 0 */ + clrbits32(ccsr_base + CCSR_GPIO1_GPDAT, CCSR_GPIO1_GPDIR_29); + clrbits32(ccsr_base + CCSR_GPIO1_GPODR, CCSR_GPIO1_GPDIR_29); + setbits32(ccsr_base + CCSR_GPIO1_GPDIR, CCSR_GPIO1_GPDIR_29); + + fsl_dp_fsm_setup(dcsr_base); + + fsl_dp_enter_low(ccsr_base, dcsr_base, pld_base, pld_flag); + + /* disable Warm Device Reset request */ + clrbits32(ccsr_base + CCSR_SCFG_DPSLPCR, CCSR_SCFG_DPSLPCR_WDRR_EN); + + fsl_dp_fsm_clean(dcsr_base); + + return 0; +} diff --git a/arch/powerpc/platforms/85xx/qoriq_pm.c b/arch/powerpc/platforms/85xx/qoriq_pm.c index 915b13b..5f2c016 100644 --- a/arch/powerpc/platforms/85xx/qoriq_pm.c +++ b/arch/powerpc/platforms/85xx/qoriq_pm.c @@ -20,6 +20,8 @@ #define FSL_SLEEP 0x1 #define FSL_DEEP_SLEEP 0x2 +int (*fsl_enter_deepsleep)(void); + /* specify the sleep state of the present platform */ int sleep_pm_state; /* supported sleep modes by the present platform */ @@ -28,6 +30,7 @@ static unsigned int sleep_modes; static int qoriq_suspend_enter(suspend_state_t state) { int ret = 0; + int cpu; switch (state) { case PM_SUSPEND_STANDBY: @@ -39,6 +42,17 @@ static int qoriq_suspend_enter(suspend_state_t state) break; + case PM_SUSPEND_MEM: + + cpu = smp_processor_id(); + qoriq_pm_ops->irq_mask(cpu); + + ret = fsl_enter_deepsleep(); + + qoriq_pm_ops->irq_unmask(cpu); + + break; + default: ret = -EINVAL; @@ -52,12 +66,30 @@ static int qoriq_suspend_valid(suspend_state_t state) if (state == PM_SUSPEND_STANDBY && (sleep_modes & FSL_SLEEP)) return 1; + if (state == PM_SUSPEND_MEM && (sleep_modes & FSL_DEEP_SLEEP)) + return 1; + return 0; } +static int qoriq_suspend_begin(suspend_state_t state) +{ + if (state == PM_SUSPEND_MEM) + return fsl_dp_iomap(); + + return 0; +} + +static void qoriq_suspend_end(void) +{ + fsl_dp_iounmap(); +} + static const struct platform_suspend_ops qoriq_suspend_ops = { .valid = qoriq_suspend_valid, .enter = qoriq_suspend_enter, + .begin = qoriq_suspend_begin, + .end = qoriq_suspend_end, }; static int __init qoriq_suspend_init(void) @@ -71,6 +103,12 @@ static int __init qoriq_suspend_init(void) if (np) sleep_pm_state = PLAT_PM_LPM20; + np = of_find_compatible_node(NULL, NULL, "fsl,t1040-rcpm"); + if (np) { + fsl_enter_deepsleep = fsl_enter_epu_deepsleep; + sleep_modes |= FSL_DEEP_SLEEP; + } + suspend_set_ops(&qoriq_suspend_ops); return 0; diff --git a/arch/powerpc/platforms/85xx/sleep.S b/arch/powerpc/platforms/85xx/sleep.S new file mode 100644 index 0000000..95a5746 --- /dev/null +++ b/arch/powerpc/platforms/85xx/sleep.S @@ -0,0 +1,295 @@ +/* + * Implement the low level part of deep sleep + * + * Copyright 2014 Freescale Semiconductor Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <asm/page.h> +#include <asm/ppc_asm.h> +#include <asm/reg.h> +#include <asm/asm-offsets.h> +#include <asm/booke_save_regs.h> +#include <asm/mmu.h> + +#define FSLDELAY(count) \ + li r3, (count)@l; \ + slwi r3, r3, 10; \ + mtctr r3; \ +101: nop; \ + bdnz 101b; + +#define FSL_DIS_ALL_IRQ \ + mfmsr r8; \ + rlwinm r8, r8, 0, ~MSR_CE; \ + rlwinm r8, r8, 0, ~MSR_ME; \ + rlwinm r8, r8, 0, ~MSR_EE; \ + rlwinm r8, r8, 0, ~MSR_DE; \ + mtmsr r8; \ + isync + + + .section .data + .align 6 +booke_regs_buffer: + .space REGS_BUFFER_SIZE + + .section .txt + .align 6 + +_GLOBAL(fsl_dp_enter_low) +deepsleep_start: + LOAD_REG_ADDR(r9, buf_tmp) + PPC_STL r3, 0(r9) + PPC_STL r4, 8(r9) + PPC_STL r5, 16(r9) + PPC_STL r6, 24(r9) + + LOAD_REG_ADDR(r3, booke_regs_buffer) + /* save the return address */ + mflr r5 + PPC_STL r5, SR_LR(r3) + mfmsr r5 + PPC_STL r5, SR_MSR(r3) + li r4, DEEPSLEEP_FLAG + bl booke_cpu_state_save + + LOAD_REG_ADDR(r9, buf_tmp) + PPC_LL r31, 0(r9) + PPC_LL r30, 8(r9) + PPC_LL r29, 16(r9) + PPC_LL r28, 24(r9) + + /* flush caches */ + LOAD_REG_ADDR(r3, cur_cpu_spec) + PPC_LL r3, 0(r3) + PPC_LL r3, CPU_FLUSH_CACHES(r3) + PPC_LCMPI 0, r3, 0 + beq 6f +#ifdef CONFIG_PPC64 + PPC_LL r3, 0(r3) +#endif + mtctr r3 + bctrl +6: +#define CPC_OFFSET 0x10000 + mr r3, r31 + addis r3, r3, CPC_OFFSET@h + bl fsl_flush_cpc_cache + + LOAD_REG_ADDR(r8, deepsleep_start) + LOAD_REG_ADDR(r9, deepsleep_end) + + /* prefecth TLB */ +#define CCSR_GPIO1_GPDAT 0x130008 +#define CCSR_GPIO1_GPDAT_29 0x4 + LOAD_REG_IMMEDIATE(r11, CCSR_GPIO1_GPDAT) + add r11, r31, r11 + lwz r10, 0(r11) + +#define CCSR_RCPM_PCPH15SETR 0xe20b4 +#define CCSR_RCPM_PCPH15SETR_CORE0 0x1 + LOAD_REG_IMMEDIATE(r12, CCSR_RCPM_PCPH15SETR) + add r12, r31, r12 + lwz r10, 0(r12) + +#define CCSR_DDR_SDRAM_CFG_2 0x8114 +#define CCSR_DDR_SDRAM_CFG_2_FRC_SR 0x80000000 + LOAD_REG_IMMEDIATE(r13, CCSR_DDR_SDRAM_CFG_2) + add r13, r31, r13 + lwz r10, 0(r13) + +#define DCSR_EPU_EPGCR 0x000 +#define DCSR_EPU_EPGCR_GCE 0x80000000 + li r14, DCSR_EPU_EPGCR + add r14, r30, r14 + lwz r10, 0(r14) + +#define DCSR_EPU_EPECR15 0x33C +#define DCSR_EPU_EPECR15_IC0 0x80000000 + li r15, DCSR_EPU_EPECR15 + add r15, r30, r15 + lwz r10, 0(r15) + +#define CCSR_SCFG_QMCRDTRSTCR 0xfc40c +#define CCSR_SCFG_QMCRDTRSTCR_CRDTRST 0x80000000 + LOAD_REG_IMMEDIATE(r16, CCSR_SCFG_QMCRDTRSTCR) + add r16, r31, r16 + lwz r10, 0(r16) + +/* + * There are two kind of register maps, one for CPLD and the other for FPGA + */ +#define CPLD_MISCCSR 0x17 +#define CPLD_MISCCSR_SLEEPEN 0x40 +#define QIXIS_PWR_CTL2 0x21 +#define QIXIS_PWR_CTL2_PCTL 0x2 + PPC_LCMPI 0, r28, FPGA_FLAG + beq 20f + addi r29, r29, CPLD_MISCCSR +20: + addi r29, r29, QIXIS_PWR_CTL2 + lbz r10, 0(r29) + + /* prefecth code to cache so that executing code after disable DDR */ +1: lwz r3, 0(r8) + addi r8, r8, 4 + cmpw r8, r9 + blt 1b + msync + + FSL_DIS_ALL_IRQ + + /* + * Place DDR controller in self refresh mode. + * From here on, DDR can't be access any more. + */ + lwz r10, 0(r13) + oris r10, r10, CCSR_DDR_SDRAM_CFG_2_FRC_SR@h + stw r10, 0(r13) + + /* can't call udelay() here, so use a macro to delay */ + FSLDELAY(50) + + /* + * Enable deep sleep signals by write external CPLD/FPGA register. + * The bootloader will disable them when wakeup from deep sleep. + */ + lbz r10, 0(r29) + PPC_LCMPI 0, r28, FPGA_FLAG + beq 22f + ori r10, r10, CPLD_MISCCSR_SLEEPEN +22: + ori r10, r10, QIXIS_PWR_CTL2_PCTL + stb r10, 0(r29) + + /* + * Set GPIO1_29 to lock the signal MCKE down during deep sleep. + * The bootloader will clear it when wakeup. + */ + lwz r10, 0(r11) + ori r10, r10, CCSR_GPIO1_GPDAT_29 + stw r10, 0(r11) + + FSLDELAY(10) + + /* Clear the QMan CITI Credits */ + lwz r10, 0(r16) + oris r10, r10, CCSR_SCFG_QMCRDTRSTCR_CRDTRST@h + stw r10, 0(r16) + + /* Enable all EPU Counters */ + li r10, 0 + oris r10, r10, DCSR_EPU_EPGCR_GCE@h + stw r10, 0(r14) + + /* Enable SCU15 to trigger on RCPM Concentrator 0 */ + lwz r10, 0(r15) + oris r10, r10, DCSR_EPU_EPECR15_IC0@h + stw r10, 0(r15) + + /* put Core0 in PH15 mode, trigger EPU FSM */ + lwz r10, 0(r12) + ori r10, r10, CCSR_RCPM_PCPH15SETR_CORE0 + stw r10, 0(r12) + +2: + b 2b + + /* + * Leave some space to prevent prefeching instruction + * beyond deepsleep_end. The space also can be used as heap. + */ +buf_tmp: + .space 128 + .align 6 +deepsleep_end: + +#ifdef CONFIG_PPC32 +_GLOBAL(fsl_deepsleep_resume) + /* disable interrupts */ + FSL_DIS_ALL_IRQ + + li r3, 0 + mfspr r4, SPRN_PIR + bl call_setup_cpu + + /* Load each CAM entry */ + LOAD_REG_ADDR(r3, tlbcam_index) + lwz r3, 0(r3) + mtctr r3 + li r0, 0 +3: mr r3, r0 + bl loadcam_entry + addi r0, r0, 1 + bdnz 3b + + /* restore cpu registers */ + LOAD_REG_ADDR(r3, booke_regs_buffer) + li r4, DEEPSLEEP_FLAG + bl booke_cpu_state_restore + + LOAD_REG_ADDR(r3, booke_regs_buffer) + lwz r4, SR_MSR(r3) + mtmsr r4 + lwz r4, SR_LR(r3) + mtlr r4 + + blr + +#else /* CONFIG_PPC32 */ + +_GLOBAL(__entry_deep_sleep) + /* disable interrupts */ + FSL_DIS_ALL_IRQ + + /* switch to 64-bit mode */ + bl .enable_64b_mode + + /* set TOC pointer */ + bl .relative_toc + + /* setup initial TLBs, switch to kernel space ... */ + bl .start_initialization_book3e + + /* address space changed, set TOC pointer again */ + bl .relative_toc + + /* call a cpu state restore handler */ + LOAD_REG_ADDR(r23, cur_cpu_spec) + ld r23,0(r23) + ld r23,CPU_SPEC_RESTORE(r23) + cmpdi 0,r23,0 + beq 1f + ld r23,0(r23) + mtctr r23 + bctrl +1: + LOAD_REG_ADDR(r3, booke_regs_buffer) + li r4, DEEPSLEEP_FLAG + bl booke_cpu_state_restore + + /* Load each CAM entry */ + LOAD_REG_ADDR(r3, tlbcam_index) + lwz r3, 0(r3) + mtctr r3 + li r0, 0 +3: mr r3, r0 + bl loadcam_entry + addi r0, r0, 1 + bdnz 3b + + /* restore return address */ + LOAD_REG_ADDR(r3, booke_regs_buffer) + ld r4, SR_MSR(r3) + mtmsr r4 + ld r4, SR_LR(r3) + mtlr r4 + + blr + +#endif /* CONFIG_PPC32 */ diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h index eb83a30..7351c40 100644 --- a/arch/powerpc/sysdev/fsl_soc.h +++ b/arch/powerpc/sysdev/fsl_soc.h @@ -67,7 +67,14 @@ extern const struct fsl_pm_ops *qoriq_pm_ops; #define PLAT_PM_SLEEP 20 #define PLAT_PM_LPM20 30 +extern int fsl_dp_iomap(void); +extern void fsl_dp_iounmap(void); + extern int fsl_rcpm_init(void); +extern int fsl_enter_epu_deepsleep(void); +extern void fsl_dp_enter_low(void *ccsr_base, void *dcsr_base, + void *pld_base, int pld_flag); +extern void __entry_deep_sleep(void); extern void fsl_dp_fsm_setup(void *dcsr_base); extern void fsl_dp_fsm_clean(void *dcsr_base);