
[RFC,v6,10/14] softmmu: Simplify helper_*_st_name, wrap unaligned code

Message ID 1450082498-27109-11-git-send-email-a.rigo@virtualopensystems.com
State New

Commit Message

Alvise Rigo Dec. 14, 2015, 8:41 a.m. UTC
Attempting to simplify the helper_*_st_name functions, wrap the
do_unaligned_access code into an inline function.
Also remove the goto statement.

Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
---
 softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 36 deletions(-)

Comments

Alex Bennée Jan. 7, 2016, 2:46 p.m. UTC | #1
Alvise Rigo <a.rigo@virtualopensystems.com> writes:

> Attempting to simplify the helper_*_st_name, wrap the
> do_unaligned_access code into an inline function.
> Remove also the goto statement.

As I said in the other thread, I think these sorts of clean-ups can come
before the ll/sc implementations and potentially get merged ahead of the
rest of it.

>
> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
> ---
>  softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 60 insertions(+), 36 deletions(-)
>
> diff --git a/softmmu_template.h b/softmmu_template.h
> index d3d5902..92f92b1 100644
> --- a/softmmu_template.h
> +++ b/softmmu_template.h
> @@ -370,6 +370,32 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
>                                   iotlbentry->attrs);
>  }
>
> +static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
> +                                                           DATA_TYPE val,
> +                                                           target_ulong addr,
> +                                                           TCGMemOpIdx oi,
> +                                                           unsigned mmu_idx,
> +                                                           uintptr_t retaddr)
> +{
> +    int i;
> +
> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> +                             mmu_idx, retaddr);
> +    }
> +    /* XXX: not efficient, but simple */
> +    /* Note: relies on the fact that tlb_fill() does not remove the
> +     * previous page from the TLB cache.  */
> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
> +        /* Little-endian extract.  */
> +        uint8_t val8 = val >> (i * 8);
> +        /* Note the adjustment at the beginning of the function.
> +           Undo that for the recursion.  */
> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> +                                        oi, retaddr + GETPC_ADJ);
> +    }
> +}

There is still duplication of 99% of the code here, which is silly given
that the compiler inlines the code anyway. If we gave the helper a more
generic name and passed the endianness in via args, I would hope the
compiler would do the sensible thing and constant-fold the code. Something
like:

static inline void glue(helper_generic_st_name, _do_unl_access)
                        (CPUArchState *env,
                         bool little_endian,
                         DATA_TYPE val,
                         target_ulong addr,
                         TCGMemOpIdx oi,
                         unsigned mmu_idx,
                         uintptr_t retaddr)
{
    int i;

    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
                             mmu_idx, retaddr);
    }
    /* Note: relies on the fact that tlb_fill() does not remove the
     * previous page from the TLB cache.  */
    for (i = DATA_SIZE - 1; i >= 0; i--) {
        uint8_t val8;

        if (little_endian) {
            /* Little-endian extract.  */
            val8 = val >> (i * 8);
        } else {
            /* Big-endian extract.  */
            val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
        }
        /* Note the adjustment at the beginning of the function.
           Undo that for the recursion.  */
        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
                                        oi, retaddr + GETPC_ADJ);
    }
}
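
To see the merged extract in isolation, here is a minimal standalone sketch
(plain C, illustrative names, none of the QEMU template machinery); it just
checks that the flag-selected extraction produces the same bytes as the
patch's two open-coded loops:

#include <stdint.h>
#include <stdio.h>

/* Same shape as the generic helper above: one loop, endianness as a flag. */
static inline void store_bytes(uint8_t *buf, uint32_t val, int size,
                               int little_endian)
{
    int i;

    for (i = size - 1; i >= 0; i--) {
        uint8_t val8;

        if (little_endian) {
            /* Little-endian extract.  */
            val8 = val >> (i * 8);
        } else {
            /* Big-endian extract.  */
            val8 = val >> (((size - 1) * 8) - (i * 8));
        }
        buf[i] = val8;
    }
}

/* Thin entry points, mirroring helper_le_st_name/helper_be_st_name: each
 * inlines store_bytes() with a constant flag, so the if () should fold
 * away. */
static void store_le(uint8_t *buf, uint32_t val) { store_bytes(buf, val, 4, 1); }
static void store_be(uint8_t *buf, uint32_t val) { store_bytes(buf, val, 4, 0); }

int main(void)
{
    uint8_t le[4], be[4];

    store_le(le, 0x11223344);
    store_be(be, 0x11223344);
    /* Prints "le: 44 33 22 11" and "be: 11 22 33 44". */
    printf("le: %02x %02x %02x %02x\n", le[0], le[1], le[2], le[3]);
    printf("be: %02x %02x %02x %02x\n", be[0], be[1], be[2], be[3]);
    return 0;
}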


> +
>  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>                         TCGMemOpIdx oi, uintptr_t retaddr)
>  {
> @@ -433,7 +459,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              return;
>          } else {
>              if ((addr & (DATA_SIZE - 1)) != 0) {
> -                goto do_unaligned_access;
> +                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
> +                                                        oi, retaddr);
>              }
>              iotlbentry = &env->iotlb[mmu_idx][index];
>
> @@ -449,23 +476,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      if (DATA_SIZE > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -    do_unaligned_access:
> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> -                                 mmu_idx, retaddr);
> -        }
> -        /* XXX: not efficient, but simple */
> -        /* Note: relies on the fact that tlb_fill() does not remove the
> -         * previous page from the TLB cache.  */
> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
> -            /* Little-endian extract.  */
> -            uint8_t val8 = val >> (i * 8);
> -            /* Note the adjustment at the beginning of the function.
> -               Undo that for the recursion.  */
> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> -                                            oi, retaddr + GETPC_ADJ);
> -        }
> +        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
> +                                                retaddr);
>          return;
>      }
>
> @@ -485,6 +497,32 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>  }
>
>  #if DATA_SIZE > 1
> +static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
> +                                                           DATA_TYPE val,
> +                                                           target_ulong addr,
> +                                                           TCGMemOpIdx oi,
> +                                                           unsigned mmu_idx,
> +                                                           uintptr_t retaddr)
> +{
> +    int i;
> +
> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> +                             mmu_idx, retaddr);
> +    }
> +    /* XXX: not efficient, but simple */
> +    /* Note: relies on the fact that tlb_fill() does not remove the
> +     * previous page from the TLB cache.  */
> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
> +        /* Big-endian extract.  */
> +        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
> +        /* Note the adjustment at the beginning of the function.
> +           Undo that for the recursion.  */
> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> +                                        oi, retaddr + GETPC_ADJ);
> +    }
> +}

Not that it matters if you combine the two as suggested, because anything
not called shouldn't generate code.

>  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>                         TCGMemOpIdx oi, uintptr_t retaddr)
>  {
> @@ -548,7 +586,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              return;
>          } else {
>              if ((addr & (DATA_SIZE - 1)) != 0) {
> -                goto do_unaligned_access;
> +                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
> +                                                        oi, retaddr);
>              }
>              iotlbentry = &env->iotlb[mmu_idx][index];
>
> @@ -564,23 +603,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      if (DATA_SIZE > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -    do_unaligned_access:
> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> -                                 mmu_idx, retaddr);
> -        }
> -        /* XXX: not efficient, but simple */
> -        /* Note: relies on the fact that tlb_fill() does not remove the
> -         * previous page from the TLB cache.  */
> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
> -            /* Big-endian extract.  */
> -            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
> -            /* Note the adjustment at the beginning of the function.
> -               Undo that for the recursion.  */
> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> -                                            oi, retaddr + GETPC_ADJ);
> -        }
> +        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
> +                                                retaddr);
>          return;
>      }


--
Alex Bennée
Alvise Rigo Jan. 7, 2016, 3:09 p.m. UTC | #2
On Thu, Jan 7, 2016 at 3:46 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>
> Alvise Rigo <a.rigo@virtualopensystems.com> writes:
>
>> Attempting to simplify the helper_*_st_name, wrap the
>> do_unaligned_access code into an inline function.
>> Remove also the goto statement.
>
> As I said in the other thread I think these sort of clean-ups can come
> before the ll/sc implementations and potentially get merged ahead of the
> rest of it.
>
>>
>> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
>> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
>> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
>> ---
>>  softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
>>  1 file changed, 60 insertions(+), 36 deletions(-)
>>
>> diff --git a/softmmu_template.h b/softmmu_template.h
>> index d3d5902..92f92b1 100644
>> --- a/softmmu_template.h
>> +++ b/softmmu_template.h
>> @@ -370,6 +370,32 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
>>                                   iotlbentry->attrs);
>>  }
>>
>> +static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
>> +                                                           DATA_TYPE val,
>> +                                                           target_ulong addr,
>> +                                                           TCGMemOpIdx oi,
>> +                                                           unsigned mmu_idx,
>> +                                                           uintptr_t retaddr)
>> +{
>> +    int i;
>> +
>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>> +                             mmu_idx, retaddr);
>> +    }
>> +    /* XXX: not efficient, but simple */
>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>> +     * previous page from the TLB cache.  */
>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>> +        /* Little-endian extract.  */
>> +        uint8_t val8 = val >> (i * 8);
>> +        /* Note the adjustment at the beginning of the function.
>> +           Undo that for the recursion.  */
>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>> +                                        oi, retaddr + GETPC_ADJ);
>> +    }
>> +}
>
> There is still duplication of 99% of the code here which is silly given

Then why should we keep this template-like design in the first place?
I tried to keep the code duplication for performance reasons
(otherwise, how can we justify the two almost identical versions of the
helper?) while making the code more compact and readable.

> the compiler inlines the code anyway. If we gave the helper a more
> generic name and passed the endianess in via args I would hope the
> compiler did the sensible thing and constant fold the code. Something
> like:
>
> static inline void glue(helper_generic_st_name, _do_unl_access)
>                         (CPUArchState *env,
>                         bool little_endian,
>                         DATA_TYPE val,
>                         target_ulong addr,
>                         TCGMemOpIdx oi,
>                         unsigned mmu_idx,
>                         uintptr_t retaddr)
> {
>     int i;
>
>     if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>         cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>                              mmu_idx, retaddr);
>     }
>     /* Note: relies on the fact that tlb_fill() does not remove the
>      * previous page from the TLB cache.  */
>     for (i = DATA_SIZE - 1; i >= 0; i--) {
>         if (little_endian) {

little_endian will have the same value >99% of the time; does it make
sense to have a branch here?

Thank you,
alvise

>                 /* Little-endian extract.  */
>                 uint8_t val8 = val >> (i * 8);
>         } else {
>                 /* Big-endian extract.  */
>                 uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>         }
>         /* Note the adjustment at the beginning of the function.
>            Undo that for the recursion.  */
>         glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>                                         oi, retaddr + GETPC_ADJ);
>     }
> }
>
>
>> +
>>  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>  {
>> @@ -433,7 +459,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>              return;
>>          } else {
>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>> -                goto do_unaligned_access;
>> +                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>> +                                                        oi, retaddr);
>>              }
>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>
>> @@ -449,23 +476,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>      if (DATA_SIZE > 1
>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>                       >= TARGET_PAGE_SIZE)) {
>> -        int i;
>> -    do_unaligned_access:
>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>> -                                 mmu_idx, retaddr);
>> -        }
>> -        /* XXX: not efficient, but simple */
>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>> -         * previous page from the TLB cache.  */
>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>> -            /* Little-endian extract.  */
>> -            uint8_t val8 = val >> (i * 8);
>> -            /* Note the adjustment at the beginning of the function.
>> -               Undo that for the recursion.  */
>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>> -                                            oi, retaddr + GETPC_ADJ);
>> -        }
>> +        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>> +                                                retaddr);
>>          return;
>>      }
>>
>> @@ -485,6 +497,32 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>  }
>>
>>  #if DATA_SIZE > 1
>> +static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
>> +                                                           DATA_TYPE val,
>> +                                                           target_ulong addr,
>> +                                                           TCGMemOpIdx oi,
>> +                                                           unsigned mmu_idx,
>> +                                                           uintptr_t retaddr)
>> +{
>> +    int i;
>> +
>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>> +                             mmu_idx, retaddr);
>> +    }
>> +    /* XXX: not efficient, but simple */
>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>> +     * previous page from the TLB cache.  */
>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>> +        /* Big-endian extract.  */
>> +        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>> +        /* Note the adjustment at the beginning of the function.
>> +           Undo that for the recursion.  */
>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>> +                                        oi, retaddr + GETPC_ADJ);
>> +    }
>> +}
>
> Not that it matters if you combine to two as suggested because anything
> not called shouldn't generate the code.
>
>>  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>  {
>> @@ -548,7 +586,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>              return;
>>          } else {
>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>> -                goto do_unaligned_access;
>> +                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>> +                                                        oi, retaddr);
>>              }
>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>
>> @@ -564,23 +603,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>      if (DATA_SIZE > 1
>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>                       >= TARGET_PAGE_SIZE)) {
>> -        int i;
>> -    do_unaligned_access:
>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>> -                                 mmu_idx, retaddr);
>> -        }
>> -        /* XXX: not efficient, but simple */
>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>> -         * previous page from the TLB cache.  */
>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>> -            /* Big-endian extract.  */
>> -            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>> -            /* Note the adjustment at the beginning of the function.
>> -               Undo that for the recursion.  */
>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>> -                                            oi, retaddr + GETPC_ADJ);
>> -        }
>> +        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>> +                                                retaddr);
>>          return;
>>      }
>
>
> --
> Alex Bennée
Alex Bennée Jan. 7, 2016, 4:35 p.m. UTC | #3
alvise rigo <a.rigo@virtualopensystems.com> writes:

> On Thu, Jan 7, 2016 at 3:46 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>>
>> Alvise Rigo <a.rigo@virtualopensystems.com> writes:
>>
>>> Attempting to simplify the helper_*_st_name, wrap the
>>> do_unaligned_access code into an inline function.
>>> Remove also the goto statement.
>>
>> As I said in the other thread I think these sort of clean-ups can come
>> before the ll/sc implementations and potentially get merged ahead of the
>> rest of it.
>>
>>>
>>> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
>>> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
>>> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
>>> ---
>>>  softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
>>>  1 file changed, 60 insertions(+), 36 deletions(-)
>>>
>>> diff --git a/softmmu_template.h b/softmmu_template.h
>>> index d3d5902..92f92b1 100644
>>> --- a/softmmu_template.h
>>> +++ b/softmmu_template.h
>>> @@ -370,6 +370,32 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
>>>                                   iotlbentry->attrs);
>>>  }
>>>
>>> +static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
>>> +                                                           DATA_TYPE val,
>>> +                                                           target_ulong addr,
>>> +                                                           TCGMemOpIdx oi,
>>> +                                                           unsigned mmu_idx,
>>> +                                                           uintptr_t retaddr)
>>> +{
>>> +    int i;
>>> +
>>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>> +                             mmu_idx, retaddr);
>>> +    }
>>> +    /* XXX: not efficient, but simple */
>>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>>> +     * previous page from the TLB cache.  */
>>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>>> +        /* Little-endian extract.  */
>>> +        uint8_t val8 = val >> (i * 8);
>>> +        /* Note the adjustment at the beginning of the function.
>>> +           Undo that for the recursion.  */
>>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>> +                                        oi, retaddr + GETPC_ADJ);
>>> +    }
>>> +}
>>
>> There is still duplication of 99% of the code here which is silly given
>
> Then why should we keep this template-like design in the first place?
> I tried to keep the code duplication for performance reasons
> (otherwise how can we justify the two almost identical versions of the
> helper?), while making the code more compact and readable.

We shouldn't really - code duplication is bad for all the well-known
reasons. The main reason we need explicit helpers for the be/le cases is
that they are called directly from the TCG code, which encodes the
endianness decision in the call it makes. However, that doesn't stop us
making generic inline helpers (helpers for the helpers ;-) which the
compiler can sort out.
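
The way that decision is encoded on the TCG side is a per-backend dispatch
table keyed by the memory op, along these lines (an illustrative, condensed
excerpt; the exact contents vary by backend and QEMU version), so any
sharing has to happen behind those fixed symbols:

/* Store helpers as picked by a TCG backend when emitting the slow-path
 * call: the le/be choice is burned into the generated code. */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};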

>
>> the compiler inlines the code anyway. If we gave the helper a more
>> generic name and passed the endianess in via args I would hope the
>> compiler did the sensible thing and constant fold the code. Something
>> like:
>>
>> static inline void glue(helper_generic_st_name, _do_unl_access)
>>                         (CPUArchState *env,
>>                         bool little_endian,
>>                         DATA_TYPE val,
>>                         target_ulong addr,
>>                         TCGMemOpIdx oi,
>>                         unsigned mmu_idx,
>>                         uintptr_t retaddr)
>> {
>>     int i;
>>
>>     if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>         cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>                              mmu_idx, retaddr);
>>     }
>>     /* Note: relies on the fact that tlb_fill() does not remove the
>>      * previous page from the TLB cache.  */
>>     for (i = DATA_SIZE - 1; i >= 0; i--) {
>>         if (little_endian) {
>
> little_endian will have >99% of the time the same value, does it make
> sense to have a branch here?

The compiler should detect that little_endian is constant when it
inlines the code and not bother generating a test/branch for
something that will never happen.

Even if it did, though, I doubt a local branch would stall the processor
that much; have you counted how many instructions we execute once we are
on the slow path?
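
One way to check the claim on a given compiler (a minimal standalone
experiment, not QEMU code; compile with -O2 and diff the disassembly of the
two exported functions):

#include <stdint.h>

static inline void store4(uint8_t *buf, uint32_t val, int little_endian)
{
    int i;

    for (i = 3; i >= 0; i--) {
        buf[i] = little_endian ? (uint8_t)(val >> (i * 8))
                               : (uint8_t)(val >> ((3 - i) * 8));
    }
}

/* Generic helper called with a constant flag... */
void store_le(uint8_t *buf, uint32_t val)
{
    store4(buf, val, 1);
}

/* ...versus the open-coded little-endian loop: at -O2 both should
 * compile to the same branch-free body if the folding happens. */
void store_le_by_hand(uint8_t *buf, uint32_t val)
{
    int i;

    for (i = 3; i >= 0; i--) {
        buf[i] = (uint8_t)(val >> (i * 8));
    }
}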

>
> Thank you,
> alvise
>
>>                 /* Little-endian extract.  */
>>                 uint8_t val8 = val >> (i * 8);
>>         } else {
>>                 /* Big-endian extract.  */
>>                 uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>         }
>>         /* Note the adjustment at the beginning of the function.
>>            Undo that for the recursion.  */
>>         glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>                                         oi, retaddr + GETPC_ADJ);
>>     }
>> }
>>
>>
>>> +
>>>  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>>  {
>>> @@ -433,7 +459,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>              return;
>>>          } else {
>>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>>> -                goto do_unaligned_access;
>>> +                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>>> +                                                        oi, retaddr);
>>>              }
>>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>>
>>> @@ -449,23 +476,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>      if (DATA_SIZE > 1
>>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>>                       >= TARGET_PAGE_SIZE)) {
>>> -        int i;
>>> -    do_unaligned_access:
>>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>> -                                 mmu_idx, retaddr);
>>> -        }
>>> -        /* XXX: not efficient, but simple */
>>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>>> -         * previous page from the TLB cache.  */
>>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>>> -            /* Little-endian extract.  */
>>> -            uint8_t val8 = val >> (i * 8);
>>> -            /* Note the adjustment at the beginning of the function.
>>> -               Undo that for the recursion.  */
>>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>> -                                            oi, retaddr + GETPC_ADJ);
>>> -        }
>>> +        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>>> +                                                retaddr);
>>>          return;
>>>      }
>>>
>>> @@ -485,6 +497,32 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>  }
>>>
>>>  #if DATA_SIZE > 1
>>> +static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
>>> +                                                           DATA_TYPE val,
>>> +                                                           target_ulong addr,
>>> +                                                           TCGMemOpIdx oi,
>>> +                                                           unsigned mmu_idx,
>>> +                                                           uintptr_t retaddr)
>>> +{
>>> +    int i;
>>> +
>>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>> +                             mmu_idx, retaddr);
>>> +    }
>>> +    /* XXX: not efficient, but simple */
>>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>>> +     * previous page from the TLB cache.  */
>>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>>> +        /* Big-endian extract.  */
>>> +        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>> +        /* Note the adjustment at the beginning of the function.
>>> +           Undo that for the recursion.  */
>>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>> +                                        oi, retaddr + GETPC_ADJ);
>>> +    }
>>> +}
>>
>> Not that it matters if you combine to two as suggested because anything
>> not called shouldn't generate the code.
>>
>>>  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>>  {
>>> @@ -548,7 +586,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>              return;
>>>          } else {
>>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>>> -                goto do_unaligned_access;
>>> +                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>>> +                                                        oi, retaddr);
>>>              }
>>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>>
>>> @@ -564,23 +603,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>      if (DATA_SIZE > 1
>>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>>                       >= TARGET_PAGE_SIZE)) {
>>> -        int i;
>>> -    do_unaligned_access:
>>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>> -                                 mmu_idx, retaddr);
>>> -        }
>>> -        /* XXX: not efficient, but simple */
>>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>>> -         * previous page from the TLB cache.  */
>>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>>> -            /* Big-endian extract.  */
>>> -            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>> -            /* Note the adjustment at the beginning of the function.
>>> -               Undo that for the recursion.  */
>>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>> -                                            oi, retaddr + GETPC_ADJ);
>>> -        }
>>> +        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>>> +                                                retaddr);
>>>          return;
>>>      }
>>
>>
>> --
>> Alex Bennée


--
Alex Bennée
Alvise Rigo Jan. 7, 2016, 4:54 p.m. UTC | #4
On Thu, Jan 7, 2016 at 5:35 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>
> alvise rigo <a.rigo@virtualopensystems.com> writes:
>
>> On Thu, Jan 7, 2016 at 3:46 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>>>
>>> Alvise Rigo <a.rigo@virtualopensystems.com> writes:
>>>
>>>> Attempting to simplify the helper_*_st_name, wrap the
>>>> do_unaligned_access code into an inline function.
>>>> Remove also the goto statement.
>>>
>>> As I said in the other thread I think these sort of clean-ups can come
>>> before the ll/sc implementations and potentially get merged ahead of the
>>> rest of it.
>>>
>>>>
>>>> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
>>>> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
>>>> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
>>>> ---
>>>>  softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
>>>>  1 file changed, 60 insertions(+), 36 deletions(-)
>>>>
>>>> diff --git a/softmmu_template.h b/softmmu_template.h
>>>> index d3d5902..92f92b1 100644
>>>> --- a/softmmu_template.h
>>>> +++ b/softmmu_template.h
>>>> @@ -370,6 +370,32 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
>>>>                                   iotlbentry->attrs);
>>>>  }
>>>>
>>>> +static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
>>>> +                                                           DATA_TYPE val,
>>>> +                                                           target_ulong addr,
>>>> +                                                           TCGMemOpIdx oi,
>>>> +                                                           unsigned mmu_idx,
>>>> +                                                           uintptr_t retaddr)
>>>> +{
>>>> +    int i;
>>>> +
>>>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>> +                             mmu_idx, retaddr);
>>>> +    }
>>>> +    /* XXX: not efficient, but simple */
>>>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>>>> +     * previous page from the TLB cache.  */
>>>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>> +        /* Little-endian extract.  */
>>>> +        uint8_t val8 = val >> (i * 8);
>>>> +        /* Note the adjustment at the beginning of the function.
>>>> +           Undo that for the recursion.  */
>>>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>> +                                        oi, retaddr + GETPC_ADJ);
>>>> +    }
>>>> +}
>>>
>>> There is still duplication of 99% of the code here which is silly given
>>
>> Then why should we keep this template-like design in the first place?
>> I tried to keep the code duplication for performance reasons
>> (otherwise how can we justify the two almost identical versions of the
>> helper?), while making the code more compact and readable.
>
> We shouldn't really - code duplication is bad for all the well known
> reasons. The main reason we need explicit helpers for the be/le case are
> because they are called directly from the TCG code which encodes the
> endianess decision in the call it makes. However that doesn't stop us
> making generic inline helpers (helpers for the helpers ;-) which the
> compiler can sort out.

I thought you wanted to make all the le/be differences conditional, not
just those helpers for the helpers...
So, if we are allowed to introduce this small overhead, all of the
helper_{le,be}_st_name_do_{unl,mmio,ram}_access helpers can be squashed
into helper_generic_st_do_{unl,mmio,ram}_access. I think this is what you
proposed in the POC, right?

>
>>
>>> the compiler inlines the code anyway. If we gave the helper a more
>>> generic name and passed the endianess in via args I would hope the
>>> compiler did the sensible thing and constant fold the code. Something
>>> like:
>>>
>>> static inline void glue(helper_generic_st_name, _do_unl_access)
>>>                         (CPUArchState *env,
>>>                         bool little_endian,
>>>                         DATA_TYPE val,
>>>                         target_ulong addr,
>>>                         TCGMemOpIdx oi,
>>>                         unsigned mmu_idx,
>>>                         uintptr_t retaddr)
>>> {
>>>     int i;
>>>
>>>     if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>         cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>                              mmu_idx, retaddr);
>>>     }
>>>     /* Note: relies on the fact that tlb_fill() does not remove the
>>>      * previous page from the TLB cache.  */
>>>     for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>         if (little_endian) {
>>
>> little_endian will have >99% of the time the same value, does it make
>> sense to have a branch here?
>
> The compiler should detect that little_endian is constant when it
> inlines the code and not bother generating a test/branch case for
> something that will never happen.
>
> Even if it did though I doubt a local branch would stall the processor
> that much, have you counted how many instructions we execute once we are
> on the slow path?

Too many :)

Regards,
alvise

>
>>
>> Thank you,
>> alvise
>>
>>>                 /* Little-endian extract.  */
>>>                 uint8_t val8 = val >> (i * 8);
>>>         } else {
>>>                 /* Big-endian extract.  */
>>>                 uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>>         }
>>>         /* Note the adjustment at the beginning of the function.
>>>            Undo that for the recursion.  */
>>>         glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>                                         oi, retaddr + GETPC_ADJ);
>>>     }
>>> }
>>>
>>>
>>>> +
>>>>  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>>>  {
>>>> @@ -433,7 +459,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>              return;
>>>>          } else {
>>>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>>>> -                goto do_unaligned_access;
>>>> +                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>>>> +                                                        oi, retaddr);
>>>>              }
>>>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>>>
>>>> @@ -449,23 +476,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>      if (DATA_SIZE > 1
>>>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>>>                       >= TARGET_PAGE_SIZE)) {
>>>> -        int i;
>>>> -    do_unaligned_access:
>>>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>> -                                 mmu_idx, retaddr);
>>>> -        }
>>>> -        /* XXX: not efficient, but simple */
>>>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>>>> -         * previous page from the TLB cache.  */
>>>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>> -            /* Little-endian extract.  */
>>>> -            uint8_t val8 = val >> (i * 8);
>>>> -            /* Note the adjustment at the beginning of the function.
>>>> -               Undo that for the recursion.  */
>>>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>> -                                            oi, retaddr + GETPC_ADJ);
>>>> -        }
>>>> +        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>>>> +                                                retaddr);
>>>>          return;
>>>>      }
>>>>
>>>> @@ -485,6 +497,32 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>  }
>>>>
>>>>  #if DATA_SIZE > 1
>>>> +static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
>>>> +                                                           DATA_TYPE val,
>>>> +                                                           target_ulong addr,
>>>> +                                                           TCGMemOpIdx oi,
>>>> +                                                           unsigned mmu_idx,
>>>> +                                                           uintptr_t retaddr)
>>>> +{
>>>> +    int i;
>>>> +
>>>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>> +                             mmu_idx, retaddr);
>>>> +    }
>>>> +    /* XXX: not efficient, but simple */
>>>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>>>> +     * previous page from the TLB cache.  */
>>>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>> +        /* Big-endian extract.  */
>>>> +        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>>> +        /* Note the adjustment at the beginning of the function.
>>>> +           Undo that for the recursion.  */
>>>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>> +                                        oi, retaddr + GETPC_ADJ);
>>>> +    }
>>>> +}
>>>
>>> Not that it matters if you combine to two as suggested because anything
>>> not called shouldn't generate the code.
>>>
>>>>  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>>>  {
>>>> @@ -548,7 +586,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>              return;
>>>>          } else {
>>>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>>>> -                goto do_unaligned_access;
>>>> +                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>>>> +                                                        oi, retaddr);
>>>>              }
>>>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>>>
>>>> @@ -564,23 +603,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>      if (DATA_SIZE > 1
>>>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>>>                       >= TARGET_PAGE_SIZE)) {
>>>> -        int i;
>>>> -    do_unaligned_access:
>>>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>> -                                 mmu_idx, retaddr);
>>>> -        }
>>>> -        /* XXX: not efficient, but simple */
>>>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>>>> -         * previous page from the TLB cache.  */
>>>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>> -            /* Big-endian extract.  */
>>>> -            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>>> -            /* Note the adjustment at the beginning of the function.
>>>> -               Undo that for the recursion.  */
>>>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>> -                                            oi, retaddr + GETPC_ADJ);
>>>> -        }
>>>> +        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>>>> +                                                retaddr);
>>>>          return;
>>>>      }
>>>
>>>
>>> --
>>> Alex Bennée
>
>
> --
> Alex Bennée
Alex Bennée Jan. 7, 2016, 5:36 p.m. UTC | #5
alvise rigo <a.rigo@virtualopensystems.com> writes:

> On Thu, Jan 7, 2016 at 5:35 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>>
>> alvise rigo <a.rigo@virtualopensystems.com> writes:
>>
>>> On Thu, Jan 7, 2016 at 3:46 PM, Alex Bennée <alex.bennee@linaro.org> wrote:
>>>>
>>>> Alvise Rigo <a.rigo@virtualopensystems.com> writes:
>>>>
>>>>> Attempting to simplify the helper_*_st_name, wrap the
>>>>> do_unaligned_access code into an inline function.
>>>>> Remove also the goto statement.
>>>>
>>>> As I said in the other thread I think these sort of clean-ups can come
>>>> before the ll/sc implementations and potentially get merged ahead of the
>>>> rest of it.
>>>>
>>>>>
>>>>> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
>>>>> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
>>>>> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
>>>>> ---
>>>>>  softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
>>>>>  1 file changed, 60 insertions(+), 36 deletions(-)
>>>>>
>>>>> diff --git a/softmmu_template.h b/softmmu_template.h
>>>>> index d3d5902..92f92b1 100644
>>>>> --- a/softmmu_template.h
>>>>> +++ b/softmmu_template.h
>>>>> @@ -370,6 +370,32 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
>>>>>                                   iotlbentry->attrs);
>>>>>  }
>>>>>
>>>>> +static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
>>>>> +                                                           DATA_TYPE val,
>>>>> +                                                           target_ulong addr,
>>>>> +                                                           TCGMemOpIdx oi,
>>>>> +                                                           unsigned mmu_idx,
>>>>> +                                                           uintptr_t retaddr)
>>>>> +{
>>>>> +    int i;
>>>>> +
>>>>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>>> +                             mmu_idx, retaddr);
>>>>> +    }
>>>>> +    /* XXX: not efficient, but simple */
>>>>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>>>>> +     * previous page from the TLB cache.  */
>>>>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>>> +        /* Little-endian extract.  */
>>>>> +        uint8_t val8 = val >> (i * 8);
>>>>> +        /* Note the adjustment at the beginning of the function.
>>>>> +           Undo that for the recursion.  */
>>>>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>>> +                                        oi, retaddr + GETPC_ADJ);
>>>>> +    }
>>>>> +}
>>>>
>>>> There is still duplication of 99% of the code here which is silly given
>>>
>>> Then why should we keep this template-like design in the first place?
>>> I tried to keep the code duplication for performance reasons
>>> (otherwise how can we justify the two almost identical versions of the
>>> helper?), while making the code more compact and readable.
>>
>> We shouldn't really - code duplication is bad for all the well known
>> reasons. The main reason we need explicit helpers for the be/le case are
>> because they are called directly from the TCG code which encodes the
>> endianess decision in the call it makes. However that doesn't stop us
>> making generic inline helpers (helpers for the helpers ;-) which the
>> compiler can sort out.
>
> I thought you wanted to make conditional all the le/be differences not
> just those helpers for the helpers...

That would be nice for it all, but that involves tweaking the TCG->helper
calls themselves. However, if we are refactoring common stuff from those
helpers into inlines, then we can at least reduce the duplication there.

> So, if we are allowed to introduce this small overhead, all the
> helper_{le,be}_st_name_do_{unl,mmio,ram}_access can be squashed to
> helper_generic_st_do_{unl,mmio,ram}_access. I think this is want you
> proposed in the POC, right?

Well, in theory it shouldn't introduce any overhead. However, my proof is
currently waiting on a bug fix to GDB's disas command so I can show you
the side-by-side assembly dump ;-)

>>>> the compiler inlines the code anyway. If we gave the helper a more
>>>> generic name and passed the endianess in via args I would hope the
>>>> compiler did the sensible thing and constant fold the code. Something
>>>> like:
>>>>
>>>> static inline void glue(helper_generic_st_name, _do_unl_access)
>>>>                         (CPUArchState *env,
>>>>                         bool little_endian,
>>>>                         DATA_TYPE val,
>>>>                         target_ulong addr,
>>>>                         TCGMemOpIdx oi,
>>>>                         unsigned mmu_idx,
>>>>                         uintptr_t retaddr)
>>>> {
>>>>     int i;
>>>>
>>>>     if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>>         cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>>                              mmu_idx, retaddr);
>>>>     }
>>>>     /* Note: relies on the fact that tlb_fill() does not remove the
>>>>      * previous page from the TLB cache.  */
>>>>     for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>>         if (little_endian) {
>>>
>>> little_endian will have >99% of the time the same value, does it make
>>> sense to have a branch here?
>>
>> The compiler should detect that little_endian is constant when it
>> inlines the code and not bother generating a test/branch case for
>> something that will never happen.
>>
>> Even if it did though I doubt a local branch would stall the processor
>> that much, have you counted how many instructions we execute once we are
>> on the slow path?
>
> Too many :)

Indeed, that is why it's SLOOOOW ;-)
>
> Regards,
> alvise
>
>>
>>>
>>> Thank you,
>>> alvise
>>>
>>>>                 /* Little-endian extract.  */
>>>>                 uint8_t val8 = val >> (i * 8);
>>>>         } else {
>>>>                 /* Big-endian extract.  */
>>>>                 uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>>>         }
>>>>         /* Note the adjustment at the beginning of the function.
>>>>            Undo that for the recursion.  */
>>>>         glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>>                                         oi, retaddr + GETPC_ADJ);
>>>>     }
>>>> }
>>>>
>>>>
>>>>> +
>>>>>  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>>>>  {
>>>>> @@ -433,7 +459,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>>              return;
>>>>>          } else {
>>>>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>>>>> -                goto do_unaligned_access;
>>>>> +                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>>>>> +                                                        oi, retaddr);
>>>>>              }
>>>>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>>>>
>>>>> @@ -449,23 +476,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>>      if (DATA_SIZE > 1
>>>>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>>>>                       >= TARGET_PAGE_SIZE)) {
>>>>> -        int i;
>>>>> -    do_unaligned_access:
>>>>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>>> -                                 mmu_idx, retaddr);
>>>>> -        }
>>>>> -        /* XXX: not efficient, but simple */
>>>>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>>>>> -         * previous page from the TLB cache.  */
>>>>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>>> -            /* Little-endian extract.  */
>>>>> -            uint8_t val8 = val >> (i * 8);
>>>>> -            /* Note the adjustment at the beginning of the function.
>>>>> -               Undo that for the recursion.  */
>>>>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>>> -                                            oi, retaddr + GETPC_ADJ);
>>>>> -        }
>>>>> +        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>>>>> +                                                retaddr);
>>>>>          return;
>>>>>      }
>>>>>
>>>>> @@ -485,6 +497,32 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>>  }
>>>>>
>>>>>  #if DATA_SIZE > 1
>>>>> +static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
>>>>> +                                                           DATA_TYPE val,
>>>>> +                                                           target_ulong addr,
>>>>> +                                                           TCGMemOpIdx oi,
>>>>> +                                                           unsigned mmu_idx,
>>>>> +                                                           uintptr_t retaddr)
>>>>> +{
>>>>> +    int i;
>>>>> +
>>>>> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>>> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>>> +                             mmu_idx, retaddr);
>>>>> +    }
>>>>> +    /* XXX: not efficient, but simple */
>>>>> +    /* Note: relies on the fact that tlb_fill() does not remove the
>>>>> +     * previous page from the TLB cache.  */
>>>>> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>>> +        /* Big-endian extract.  */
>>>>> +        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>>>> +        /* Note the adjustment at the beginning of the function.
>>>>> +           Undo that for the recursion.  */
>>>>> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>>> +                                        oi, retaddr + GETPC_ADJ);
>>>>> +    }
>>>>> +}
>>>>
>>>> Not that it matters if you combine to two as suggested because anything
>>>> not called shouldn't generate the code.
>>>>
>>>>>  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>>                         TCGMemOpIdx oi, uintptr_t retaddr)
>>>>>  {
>>>>> @@ -548,7 +586,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>>              return;
>>>>>          } else {
>>>>>              if ((addr & (DATA_SIZE - 1)) != 0) {
>>>>> -                goto do_unaligned_access;
>>>>> +                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
>>>>> +                                                        oi, retaddr);
>>>>>              }
>>>>>              iotlbentry = &env->iotlb[mmu_idx][index];
>>>>>
>>>>> @@ -564,23 +603,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>>>>>      if (DATA_SIZE > 1
>>>>>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>>>>>                       >= TARGET_PAGE_SIZE)) {
>>>>> -        int i;
>>>>> -    do_unaligned_access:
>>>>> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
>>>>> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
>>>>> -                                 mmu_idx, retaddr);
>>>>> -        }
>>>>> -        /* XXX: not efficient, but simple */
>>>>> -        /* Note: relies on the fact that tlb_fill() does not remove the
>>>>> -         * previous page from the TLB cache.  */
>>>>> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
>>>>> -            /* Big-endian extract.  */
>>>>> -            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
>>>>> -            /* Note the adjustment at the beginning of the function.
>>>>> -               Undo that for the recursion.  */
>>>>> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
>>>>> -                                            oi, retaddr + GETPC_ADJ);
>>>>> -        }
>>>>> +        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
>>>>> +                                                retaddr);
>>>>>          return;
>>>>>      }
>>>>
>>>>
>>>> --
>>>> Alex Bennée
>>
>>
>> --
>> Alex Bennée


--
Alex Bennée
Alex Bennée Jan. 8, 2016, 11:19 a.m. UTC | #6
Alvise Rigo <a.rigo@virtualopensystems.com> writes:

> Attempting to simplify the helper_*_st_name, wrap the
> do_unaligned_access code into an inline function.
> Remove also the goto statement.
>
> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
> ---
>  softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 60 insertions(+), 36 deletions(-)
>
> diff --git a/softmmu_template.h b/softmmu_template.h
> index d3d5902..92f92b1 100644
> --- a/softmmu_template.h
> +++ b/softmmu_template.h
> @@ -370,6 +370,32 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
>                                   iotlbentry->attrs);
>  }
>
> +static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
> +                                                           DATA_TYPE val,
> +                                                           target_ulong addr,
> +                                                           TCGMemOpIdx oi,
> +                                                           unsigned mmu_idx,
> +                                                           uintptr_t retaddr)
> +{
> +    int i;
> +
> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> +                             mmu_idx, retaddr);
> +    }
> +    /* XXX: not efficient, but simple */
> +    /* Note: relies on the fact that tlb_fill() does not remove the
> +     * previous page from the TLB cache.  */
> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
> +        /* Little-endian extract.  */
> +        uint8_t val8 = val >> (i * 8);
> +        /* Note the adjustment at the beginning of the function.
> +           Undo that for the recursion.  */
> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> +                                        oi, retaddr + GETPC_ADJ);
> +    }
> +}
> +
>  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>                         TCGMemOpIdx oi, uintptr_t retaddr)
>  {
> @@ -433,7 +459,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              return;
>          } else {
>              if ((addr & (DATA_SIZE - 1)) != 0) {
> -                goto do_unaligned_access;
> +                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
> +                                                        oi, retaddr);

I've just noticed this drops an implicit return. However, I'm seeing if
I can put together an RFC cleanup patch set for softmmu based on these
patches plus a few other clean-ups. I'll CC you when done.
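
The old goto jumped into a block that ends in a return, so the call site
needs one too. A minimal sketch of the fix (hypothetical, assuming nothing
else in the hunk changes, with the arguments in the inline's declared
order):

            if ((addr & (DATA_SIZE - 1)) != 0) {
                glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi,
                                                        mmu_idx, retaddr);
                return;  /* the old goto path never fell through to here */
            }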

>              }
>              iotlbentry = &env->iotlb[mmu_idx][index];
>
> @@ -449,23 +476,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      if (DATA_SIZE > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -    do_unaligned_access:
> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> -                                 mmu_idx, retaddr);
> -        }
> -        /* XXX: not efficient, but simple */
> -        /* Note: relies on the fact that tlb_fill() does not remove the
> -         * previous page from the TLB cache.  */
> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
> -            /* Little-endian extract.  */
> -            uint8_t val8 = val >> (i * 8);
> -            /* Note the adjustment at the beginning of the function.
> -               Undo that for the recursion.  */
> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> -                                            oi, retaddr + GETPC_ADJ);
> -        }
> +        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
> +                                                retaddr);
>          return;
>      }
>
> @@ -485,6 +497,32 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>  }
>
>  #if DATA_SIZE > 1
> +static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
> +                                                           DATA_TYPE val,
> +                                                           target_ulong addr,
> +                                                           TCGMemOpIdx oi,
> +                                                           unsigned mmu_idx,
> +                                                           uintptr_t retaddr)
> +{
> +    int i;
> +
> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> +                             mmu_idx, retaddr);
> +    }
> +    /* XXX: not efficient, but simple */
> +    /* Note: relies on the fact that tlb_fill() does not remove the
> +     * previous page from the TLB cache.  */
> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
> +        /* Big-endian extract.  */
> +        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
> +        /* Note the adjustment at the beginning of the function.
> +           Undo that for the recursion.  */
> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> +                                        oi, retaddr + GETPC_ADJ);
> +    }
> +}
> +
>  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>                         TCGMemOpIdx oi, uintptr_t retaddr)
>  {
> @@ -548,7 +586,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              return;
>          } else {
>              if ((addr & (DATA_SIZE - 1)) != 0) {
> -                goto do_unaligned_access;
> +                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
> +                                                        oi, retaddr);
>              }
>              iotlbentry = &env->iotlb[mmu_idx][index];
>
> @@ -564,23 +603,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      if (DATA_SIZE > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -    do_unaligned_access:
> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> -                                 mmu_idx, retaddr);
> -        }
> -        /* XXX: not efficient, but simple */
> -        /* Note: relies on the fact that tlb_fill() does not remove the
> -         * previous page from the TLB cache.  */
> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
> -            /* Big-endian extract.  */
> -            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
> -            /* Note the adjustment at the beginning of the function.
> -               Undo that for the recursion.  */
> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> -                                            oi, retaddr + GETPC_ADJ);
> -        }
> +        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
> +                                                retaddr);
>          return;
>      }


--
Alex Bennée
diff mbox

Patch

diff --git a/softmmu_template.h b/softmmu_template.h
index d3d5902..92f92b1 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -370,6 +370,32 @@  static inline void glue(io_write, SUFFIX)(CPUArchState *env,
                                  iotlbentry->attrs);
 }
 
+static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
+                                                           DATA_TYPE val,
+                                                           target_ulong addr,
+                                                           TCGMemOpIdx oi,
+                                                           unsigned mmu_idx,
+                                                           uintptr_t retaddr)
+{
+    int i;
+
+    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+    /* XXX: not efficient, but simple */
+    /* Note: relies on the fact that tlb_fill() does not remove the
+     * previous page from the TLB cache.  */
+    for (i = DATA_SIZE - 1; i >= 0; i--) {
+        /* Little-endian extract.  */
+        uint8_t val8 = val >> (i * 8);
+        /* Note the adjustment at the beginning of the function.
+           Undo that for the recursion.  */
+        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
+                                        oi, retaddr + GETPC_ADJ);
+    }
+}
+
 void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
                        TCGMemOpIdx oi, uintptr_t retaddr)
 {
@@ -433,7 +459,8 @@  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
             return;
         } else {
             if ((addr & (DATA_SIZE - 1)) != 0) {
-                goto do_unaligned_access;
+                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
+                                                        oi, retaddr);
             }
             iotlbentry = &env->iotlb[mmu_idx][index];
 
@@ -449,23 +476,8 @@  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
     if (DATA_SIZE > 1
         && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
                      >= TARGET_PAGE_SIZE)) {
-        int i;
-    do_unaligned_access:
-        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                                 mmu_idx, retaddr);
-        }
-        /* XXX: not efficient, but simple */
-        /* Note: relies on the fact that tlb_fill() does not remove the
-         * previous page from the TLB cache.  */
-        for (i = DATA_SIZE - 1; i >= 0; i--) {
-            /* Little-endian extract.  */
-            uint8_t val8 = val >> (i * 8);
-            /* Note the adjustment at the beginning of the function.
-               Undo that for the recursion.  */
-            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            oi, retaddr + GETPC_ADJ);
-        }
+        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
+                                                retaddr);
         return;
     }
 
@@ -485,6 +497,32 @@  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
 }
 
 #if DATA_SIZE > 1
+static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
+                                                           DATA_TYPE val,
+                                                           target_ulong addr,
+                                                           TCGMemOpIdx oi,
+                                                           unsigned mmu_idx,
+                                                           uintptr_t retaddr)
+{
+    int i;
+
+    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
+                             mmu_idx, retaddr);
+    }
+    /* XXX: not efficient, but simple */
+    /* Note: relies on the fact that tlb_fill() does not remove the
+     * previous page from the TLB cache.  */
+    for (i = DATA_SIZE - 1; i >= 0; i--) {
+        /* Big-endian extract.  */
+        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
+        /* Note the adjustment at the beginning of the function.
+           Undo that for the recursion.  */
+        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
+                                        oi, retaddr + GETPC_ADJ);
+    }
+}
+
 void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
                        TCGMemOpIdx oi, uintptr_t retaddr)
 {
@@ -548,7 +586,8 @@  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
             return;
         } else {
             if ((addr & (DATA_SIZE - 1)) != 0) {
-                goto do_unaligned_access;
+                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
+                                                        oi, retaddr);
             }
             iotlbentry = &env->iotlb[mmu_idx][index];
 
@@ -564,23 +603,8 @@  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
     if (DATA_SIZE > 1
         && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
                      >= TARGET_PAGE_SIZE)) {
-        int i;
-    do_unaligned_access:
-        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
-            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
-                                 mmu_idx, retaddr);
-        }
-        /* XXX: not efficient, but simple */
-        /* Note: relies on the fact that tlb_fill() does not remove the
-         * previous page from the TLB cache.  */
-        for (i = DATA_SIZE - 1; i >= 0; i--) {
-            /* Big-endian extract.  */
-            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
-            /* Note the adjustment at the beginning of the function.
-               Undo that for the recursion.  */
-            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            oi, retaddr + GETPC_ADJ);
-        }
+        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
+                                                retaddr);
         return;
     }
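
For reference, a standalone sketch (an illustration, not QEMU code) of
what the two extraction expressions in the unaligned slow path produce,
assuming a hypothetical 4-byte store of 0x11223344:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int DATA_SIZE = 4;     /* hypothetical: a 4-byte store */
    uint32_t val = 0x11223344;

    for (int i = DATA_SIZE - 1; i >= 0; i--) {
        /* Little-endian extract: low byte lands at the lowest address. */
        uint8_t le = val >> (i * 8);
        /* Big-endian extract: high byte lands at the lowest address. */
        uint8_t be = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
        printf("addr+%d: LE 0x%02x, BE 0x%02x\n", i, le, be);
    }
    return 0;
}

With these values the little-endian loop stores 0x44 at addr+0 up to
0x11 at addr+3, while the big-endian loop stores 0x11 at addr+0 down to
0x44 at addr+3, matching the byte layouts the slow path reproduces one
byte-sized store at a time.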