Message ID | 20200619135150.30278-1-stli@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | S390: Optimize __memset_z196. | expand |
Just for information: if nobody objects, I'll commit this patch tomorrow. On 6/19/20 3:51 PM, Stefan Liebler wrote: > It turned out that a 256b-mvc instruction which depends on the > result of a previous 256b-mvc instruction is counterproductive. > Therefore this patch adjusts the 256b-loop by storing the > first byte with stc and setting the remaining 255b with mvc. > Now the 255b-mvc instruction depends on the stc instruction. > --- > sysdeps/s390/memset-z900.S | 19 +++++++++---------- > 1 file changed, 9 insertions(+), 10 deletions(-) > > diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S > index ca3eac0522..1e0c334156 100644 > --- a/sysdeps/s390/memset-z900.S > +++ b/sysdeps/s390/memset-z900.S > @@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196) > # if !defined __s390x__ > llgfr %r4,%r4 > # endif /* !defined __s390x__ */ > - ltgr %r4,%r4 > - je .L_Z196_4 > + clgfi %r4,1 > + jl .L_Z196_4 # n == 0 > stc %r3,0(%r2) > + je .L_Z196_4 # n == 1 > + aghi %r4,-2 > lgr %r1,%r2 > - cghi %r4,1 > - je .L_Z196_4 > - aghi %r4,-2 > - srlg %r5,%r4,8 > - ltgr %r5,%r5 > - jne .L_Z196_1 > + risbg %r5,%r4,8,128+63,56 # r5 = n / 256 > + jne .L_Z196_1 # Jump away if r5 != 0 > .L_Z196_3: > exrl %r4,.L_Z196_17 > .L_Z196_4: > br %r14 > .L_Z196_1: > cgfi %r5,1048576 > - jh __memset_mvcle # Switch to mvcle for >256MB > + jh __memset_mvcle # Switch to mvcle for >256MB > .L_Z196_2: > pfd 2,1024(%r1) > - mvc 1(256,%r1),0(%r1) > + mvc 1(255,%r1),0(%r1) > aghi %r5,-1 > la %r1,256(%r1) > + stc %r3,0(%r1) > jne .L_Z196_2 > j .L_Z196_3 > .L_Z196_17: >
Committed. On 6/25/20 10:18 AM, Stefan Liebler via Libc-alpha wrote: > Just for information: if nobody objects, I'll commit this patch tomorrow. > > On 6/19/20 3:51 PM, Stefan Liebler wrote: >> It turned out that a 256b-mvc instruction which depends on the >> result of a previous 256b-mvc instruction is counterproductive. >> Therefore this patch adjusts the 256b-loop by storing the >> first byte with stc and setting the remaining 255b with mvc. >> Now the 255b-mvc instruction depends on the stc instruction. >> --- >> sysdeps/s390/memset-z900.S | 19 +++++++++---------- >> 1 file changed, 9 insertions(+), 10 deletions(-) >> >> diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S >> index ca3eac0522..1e0c334156 100644 >> --- a/sysdeps/s390/memset-z900.S >> +++ b/sysdeps/s390/memset-z900.S >> @@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196) >> # if !defined __s390x__ >> llgfr %r4,%r4 >> # endif /* !defined __s390x__ */ >> - ltgr %r4,%r4 >> - je .L_Z196_4 >> + clgfi %r4,1 >> + jl .L_Z196_4 # n == 0 >> stc %r3,0(%r2) >> + je .L_Z196_4 # n == 1 >> + aghi %r4,-2 >> lgr %r1,%r2 >> - cghi %r4,1 >> - je .L_Z196_4 >> - aghi %r4,-2 >> - srlg %r5,%r4,8 >> - ltgr %r5,%r5 >> - jne .L_Z196_1 >> + risbg %r5,%r4,8,128+63,56 # r5 = n / 256 >> + jne .L_Z196_1 # Jump away if r5 != 0 >> .L_Z196_3: >> exrl %r4,.L_Z196_17 >> .L_Z196_4: >> br %r14 >> .L_Z196_1: >> cgfi %r5,1048576 >> - jh __memset_mvcle # Switch to mvcle for >256MB >> + jh __memset_mvcle # Switch to mvcle for >256MB >> .L_Z196_2: >> pfd 2,1024(%r1) >> - mvc 1(256,%r1),0(%r1) >> + mvc 1(255,%r1),0(%r1) >> aghi %r5,-1 >> la %r1,256(%r1) >> + stc %r3,0(%r1) >> jne .L_Z196_2 >> j .L_Z196_3 >> .L_Z196_17: >> >
diff --git a/sysdeps/s390/memset-z900.S b/sysdeps/s390/memset-z900.S index ca3eac0522..1e0c334156 100644 --- a/sysdeps/s390/memset-z900.S +++ b/sysdeps/s390/memset-z900.S @@ -157,28 +157,27 @@ ENTRY(MEMSET_Z196) # if !defined __s390x__ llgfr %r4,%r4 # endif /* !defined __s390x__ */ - ltgr %r4,%r4 - je .L_Z196_4 + clgfi %r4,1 + jl .L_Z196_4 # n == 0 stc %r3,0(%r2) + je .L_Z196_4 # n == 1 + aghi %r4,-2 lgr %r1,%r2 - cghi %r4,1 - je .L_Z196_4 - aghi %r4,-2 - srlg %r5,%r4,8 - ltgr %r5,%r5 - jne .L_Z196_1 + risbg %r5,%r4,8,128+63,56 # r5 = n / 256 + jne .L_Z196_1 # Jump away if r5 != 0 .L_Z196_3: exrl %r4,.L_Z196_17 .L_Z196_4: br %r14 .L_Z196_1: cgfi %r5,1048576 - jh __memset_mvcle # Switch to mvcle for >256MB + jh __memset_mvcle # Switch to mvcle for >256MB .L_Z196_2: pfd 2,1024(%r1) - mvc 1(256,%r1),0(%r1) + mvc 1(255,%r1),0(%r1) aghi %r5,-1 la %r1,256(%r1) + stc %r3,0(%r1) jne .L_Z196_2 j .L_Z196_3 .L_Z196_17: