[4/6] LoongArch: Add ifunc support for memset{aligned, unaligned, lsx, lasx}

Message ID 20230828072651.3085034-5-dengjianbo@loongson.cn
State New
Series LoongArch: Add ifunc support for {raw}memchr,

Commit Message

dengjianbo Aug. 28, 2023, 7:26 a.m. UTC
According to the glibc memset microbenchmark results, the LSX and LASX
versions show a performance degradation in a few cases where the length
is less than 8. Overall, the LASX version reduces the runtime by about
15%-75%, and the LSX version by about 15%-50%.

The unaligned version uses unaligned memory accesses to set data whose
length is less than 64 and to make the address 8-byte aligned. For this
part, performance is better than the aligned version. Compared with the
generic version, performance is close when the length is larger than
128. For lengths of 8-128, the unaligned version reduces the runtime by
about 30%-70%, and the aligned version by about 20%-50%.
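
The edge-store strategy can be modelled in C roughly as below. This is
a simplified sketch for illustration only, not the actual
implementation: the function name is made up, memcpy stands in for the
raw st.d stores (legal here only because SUPPORT_UAL guarantees
hardware unaligned access), and the multiply replicates the fill byte
the way the bstrins.d sequence does in the assembly.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* One unaligned 8-byte store at each end, then aligned 8-byte
       stores for the body.  Overlap between the edge stores and the
       body is harmless since every store writes the same value.
       Assumes n >= 8.  */
    static void *
    memset_unaligned_model (void *dst, int c, size_t n)
    {
      uint64_t v = 0x0101010101010101ULL * (unsigned char) c;
      unsigned char *p = dst;
      unsigned char *end = p + n - 8;

      memcpy (p, &v, 8);    /* unaligned head store */
      memcpy (end, &v, 8);  /* unaligned tail store */

      /* Round up to the next 8-byte boundary and fill the body.  */
      p = (unsigned char *) (((uintptr_t) p + 8) & ~(uintptr_t) 7);
      for (; p < end; p += 8)
        memcpy (p, &v, 8);

      return dst;
    }

The LSX/LASX versions use the same overlap trick with vector stores:
their long-length paths finish the tail with a single xvst at a4 - 32
(LASX) or vst at a4 - 16 (LSX), so no scalar tail loop is needed.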
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   4 +
 .../lp64/multiarch/dl-symbol-redir-ifunc.h    |  24 +++
 .../lp64/multiarch/ifunc-impl-list.c          |  10 +
 .../loongarch/lp64/multiarch/memset-aligned.S | 174 ++++++++++++++++++
 .../loongarch/lp64/multiarch/memset-lasx.S    | 142 ++++++++++++++
 sysdeps/loongarch/lp64/multiarch/memset-lsx.S | 135 ++++++++++++++
 .../lp64/multiarch/memset-unaligned.S         | 162 ++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/memset.c     |  37 ++++
 8 files changed, 688 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/memset.c

Patch

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index 7b87bc9055..216886c551 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -30,5 +30,9 @@  sysdep_routines += \
   memrchr-generic \
   memrchr-lsx \
   memrchr-lasx \
+  memset-aligned \
+  memset-unaligned \
+  memset-lsx \
+  memset-lasx \
 # sysdep_routines
 endif
diff --git a/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
new file mode 100644
index 0000000000..e2723873bc
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/dl-symbol-redir-ifunc.h
@@ -0,0 +1,24 @@ 
+/* Symbol redirection for loader/static initialization code.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_IFUNC_GENERIC_H
+#define _DL_IFUNC_GENERIC_H
+
+asm ("memset = __memset_aligned");
+
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 8bd5489ee2..37f60dde91 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -117,5 +117,15 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_generic)
 	      )
+
+  IFUNC_IMPL (i, name, memset,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LASX, __memset_lasx)
+	      IFUNC_IMPL_ADD (array, i, memset, SUPPORT_LSX, __memset_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, memset, SUPPORT_UAL, __memset_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
+	      )
+
   return i;
 }
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-aligned.S b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
new file mode 100644
index 0000000000..1fce95b714
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-aligned.S
@@ -0,0 +1,174 @@ 
+/* Optimized memset aligned implementation using basic LoongArch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define MEMSET_NAME __memset_aligned
+#else
+# define MEMSET_NAME memset
+#endif
+
+LEAF(MEMSET_NAME, 6)
+    move        t0, a0
+    andi        a3, a0, 0x7
+    li.w        t6, 16
+    beqz        a3, L(align)
+    bltu        a2, t6, L(short_data)
+
+L(make_align):
+    li.w        t8, 8
+    sub.d       t2, t8, a3
+    pcaddi      t1, 11
+    slli.d      t3, t2, 2
+    sub.d       t1, t1, t3
+    jr          t1
+
+L(al7):
+    st.b        a1, t0, 6
+L(al6):
+    st.b        a1, t0, 5
+L(al5):
+    st.b        a1, t0, 4
+L(al4):
+    st.b        a1, t0, 3
+L(al3):
+    st.b        a1, t0, 2
+L(al2):
+    st.b        a1, t0, 1
+L(al1):
+    st.b        a1, t0, 0
+L(al0):
+    add.d       t0, t0, t2
+    sub.d       a2, a2, t2
+
+L(align):
+    bstrins.d   a1, a1, 15, 8
+    bstrins.d   a1, a1, 31, 16
+    bstrins.d   a1, a1, 63, 32
+    bltu        a2, t6, L(less_16bytes)
+
+    andi        a4, a2, 0x3f
+    beq         a4, a2, L(less_64bytes)
+
+    sub.d       t1, a2, a4
+    move        a2, a4
+    add.d       a5, t0, t1
+
+L(loop_64bytes):
+    addi.d      t0, t0, 64
+    st.d        a1, t0, -64
+    st.d        a1, t0, -56
+    st.d        a1, t0, -48
+    st.d        a1, t0, -40
+
+    st.d        a1, t0, -32
+    st.d        a1, t0, -24
+    st.d        a1, t0, -16
+    st.d        a1, t0, -8
+    bne         t0, a5, L(loop_64bytes)
+
+L(less_64bytes):
+    srai.d      a4, a2, 5
+    beqz        a4, L(less_32bytes)
+    addi.d      a2, a2, -32
+    st.d        a1, t0, 0
+
+    st.d        a1, t0, 8
+    st.d        a1, t0, 16
+    st.d        a1, t0, 24
+    addi.d      t0, t0, 32
+
+L(less_32bytes):
+    bltu        a2, t6, L(less_16bytes)
+    addi.d      a2, a2, -16
+    st.d        a1, t0, 0
+    st.d        a1, t0, 8
+    addi.d      t0, t0, 16
+
+L(less_16bytes):
+    srai.d      a4, a2, 3
+    beqz        a4, L(less_8bytes)
+    addi.d      a2, a2, -8
+    st.d        a1, t0, 0
+    addi.d      t0, t0, 8
+
+L(less_8bytes):
+    beqz        a2, L(less_1byte)
+    srai.d      a4, a2, 2
+    beqz        a4, L(less_4bytes)
+    addi.d      a2, a2, -4
+    st.w        a1, t0, 0
+    addi.d      t0, t0, 4
+
+L(less_4bytes):
+    srai.d      a3, a2, 1
+    beqz        a3, L(less_2bytes)
+    addi.d      a2, a2, -2
+    st.h        a1, t0, 0
+    addi.d      t0, t0, 2
+
+L(less_2bytes):
+    beqz        a2, L(less_1byte)
+    st.b        a1, t0, 0
+L(less_1byte):
+    jr          ra
+
+L(short_data):
+    pcaddi      t1, 19
+    slli.d      t3, a2, 2
+    sub.d       t1, t1, t3
+    jr          t1
+L(short_15):
+    st.b        a1, a0, 14
+L(short_14):
+    st.b        a1, a0, 13
+L(short_13):
+    st.b        a1, a0, 12
+L(short_12):
+    st.b        a1, a0, 11
+L(short_11):
+    st.b        a1, a0, 10
+L(short_10):
+    st.b        a1, a0, 9
+L(short_9):
+    st.b        a1, a0, 8
+L(short_8):
+    st.b        a1, a0, 7
+L(short_7):
+    st.b        a1, a0, 6
+L(short_6):
+    st.b        a1, a0, 5
+L(short_5):
+    st.b        a1, a0, 4
+L(short_4):
+    st.b        a1, a0, 3
+L(short_3):
+    st.b        a1, a0, 2
+L(short_2):
+    st.b        a1, a0, 1
+L(short_1):
+    st.b        a1, a0, 0
+L(short_0):
+    jr          ra
+END(MEMSET_NAME)
+
+libc_hidden_builtin_def (MEMSET_NAME)
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lasx.S b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
new file mode 100644
index 0000000000..041abbac87
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lasx.S
@@ -0,0 +1,142 @@ 
+/* Optimized memset implementation using LoongArch LASX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMSET __memset_lasx
+
+LEAF(MEMSET, 6)
+    li.d            t1, 32
+    move            a3, a0
+    xvreplgr2vr.b   xr0, a1
+    add.d           a4, a0, a2
+
+    bgeu            t1, a2, L(less_32bytes)
+    li.d            t3, 128
+    li.d            t2, 64
+    blt             t3, a2, L(long_bytes)
+
+L(less_128bytes):
+    bgeu            t2, a2, L(less_64bytes)
+    xvst            xr0, a3, 0
+    xvst            xr0, a3, 32
+    xvst            xr0, a4, -32
+
+    xvst            xr0, a4, -64
+    jr              ra
+L(less_64bytes):
+    xvst            xr0, a3, 0
+    xvst            xr0, a4, -32
+
+
+    jr              ra
+L(less_32bytes):
+    srli.d          t0, a2, 4
+    beqz            t0, L(less_16bytes)
+    vst             vr0, a3, 0
+
+    vst             vr0, a4, -16
+    jr              ra
+L(less_16bytes):
+    srli.d          t0, a2, 3
+    beqz            t0, L(less_8bytes)
+
+    vstelm.d        vr0, a3, 0, 0
+    vstelm.d        vr0, a4, -8, 0
+    jr              ra
+L(less_8bytes):
+    srli.d          t0, a2, 2
+
+    beqz            t0, L(less_4bytes)
+    vstelm.w        vr0, a3, 0, 0
+    vstelm.w        vr0, a4, -4, 0
+    jr              ra
+
+
+L(less_4bytes):
+    srli.d          t0, a2, 1
+    beqz            t0, L(less_2bytes)
+    vstelm.h        vr0, a3, 0, 0
+    vstelm.h        vr0, a4, -2, 0
+
+    jr              ra
+L(less_2bytes):
+    beqz            a2, L(less_1bytes)
+    st.b            a1, a3, 0
+L(less_1bytes):
+    jr              ra
+
+L(long_bytes):
+    xvst            xr0, a3, 0
+    bstrins.d       a3, zero, 4, 0
+    addi.d          a3, a3, 32
+    sub.d           a2, a4, a3
+
+    andi            t0, a2, 0xff
+    beq             t0, a2, L(long_end)
+    move            a2, t0
+    sub.d           t0, a4, t0
+
+
+L(loop_256):
+    xvst            xr0, a3, 0
+    xvst            xr0, a3, 32
+    xvst            xr0, a3, 64
+    xvst            xr0, a3, 96
+
+    xvst            xr0, a3, 128
+    xvst            xr0, a3, 160
+    xvst            xr0, a3, 192
+    xvst            xr0, a3, 224
+
+    addi.d          a3, a3, 256
+    bne             a3, t0, L(loop_256)
+L(long_end):
+    bltu            a2, t3, L(end_less_128)
+    addi.d          a2, a2, -128
+
+    xvst            xr0, a3, 0
+    xvst            xr0, a3, 32
+    xvst            xr0, a3, 64
+    xvst            xr0, a3, 96
+
+
+    addi.d          a3, a3, 128
+L(end_less_128):
+    bltu            a2, t2, L(end_less_64)
+    addi.d          a2, a2, -64
+    xvst            xr0, a3, 0
+
+    xvst            xr0, a3, 32
+    addi.d          a3, a3, 64
+L(end_less_64):
+    bltu            a2, t1, L(end_less_32)
+    xvst            xr0, a3, 0
+
+L(end_less_32):
+    xvst            xr0, a4, -32
+    jr              ra
+END(MEMSET)
+
+libc_hidden_builtin_def (MEMSET)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-lsx.S b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
new file mode 100644
index 0000000000..3d3982aa5a
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-lsx.S
@@ -0,0 +1,135 @@ 
+/* Optimized memset implementation using LoongArch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define MEMSET __memset_lsx
+
+LEAF(MEMSET, 6)
+    li.d            t1, 16
+    move            a3, a0
+    vreplgr2vr.b    vr0, a1
+    add.d           a4, a0, a2
+
+    bgeu            t1, a2, L(less_16bytes)
+    li.d            t3, 64
+    li.d            t2, 32
+    bgeu            a2, t3, L(long_bytes)
+
+L(less_64bytes):
+    bgeu            t2, a2, L(less_32bytes)
+    vst             vr0, a3, 0
+    vst             vr0, a3, 16
+    vst             vr0, a4, -32
+
+    vst             vr0, a4, -16
+    jr              ra
+L(less_32bytes):
+    vst             vr0, a3, 0
+    vst             vr0, a4, -16
+
+
+    jr              ra
+L(less_16bytes):
+    srli.d          t0, a2, 3
+    beqz            t0, L(less_8bytes)
+    vstelm.d        vr0, a3, 0, 0
+
+    vstelm.d        vr0, a4, -8, 0
+    jr              ra
+L(less_8bytes):
+    srli.d          t0, a2, 2
+    beqz            t0, L(less_4bytes)
+
+    vstelm.w        vr0, a3, 0, 0
+    vstelm.w        vr0, a4, -4, 0
+    jr              ra
+L(less_4bytes):
+    srli.d          t0, a2, 1
+
+    beqz            t0, L(less_2bytes)
+    vstelm.h        vr0, a3, 0, 0
+    vstelm.h        vr0, a4, -2, 0
+    jr              ra
+
+
+L(less_2bytes):
+    beqz            a2, L(less_1bytes)
+    vstelm.b        vr0, a3, 0, 0
+L(less_1bytes):
+    jr              ra
+L(long_bytes):
+    vst             vr0, a3, 0
+
+    bstrins.d       a3, zero, 3, 0
+    addi.d          a3, a3, 16
+    sub.d           a2, a4, a3
+    andi            t0, a2, 0x7f
+
+    beq             t0, a2, L(long_end)
+    move            a2, t0
+    sub.d           t0, a4, t0
+
+L(loop_128):
+    vst             vr0, a3, 0
+
+    vst             vr0, a3, 16
+    vst             vr0, a3, 32
+    vst             vr0, a3, 48
+    vst             vr0, a3, 64
+
+
+    vst             vr0, a3, 80
+    vst             vr0, a3, 96
+    vst             vr0, a3, 112
+    addi.d          a3, a3, 128
+
+    bne             a3, t0, L(loop_128)
+L(long_end):
+    bltu            a2, t3, L(end_less_64)
+    addi.d          a2, a2, -64
+    vst             vr0, a3, 0
+
+    vst             vr0, a3, 16
+    vst             vr0, a3, 32
+    vst             vr0, a3, 48
+    addi.d          a3, a3, 64
+
+L(end_less_64):
+    bltu            a2, t2, L(end_less_32)
+    addi.d          a2, a2, -32
+    vst             vr0, a3, 0
+    vst             vr0, a3, 16
+
+    addi.d          a3, a3, 32
+L(end_less_32):
+    bltu            a2, t1, L(end_less_16)
+    vst             vr0, a3, 0
+
+L(end_less_16):
+    vst             vr0, a4, -16
+    jr              ra
+END(MEMSET)
+
+libc_hidden_builtin_def (MEMSET)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
new file mode 100644
index 0000000000..f7d32039df
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset-unaligned.S
@@ -0,0 +1,162 @@ 
+/* Optimized memset unaligned implementation using basic LoongArch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+
+# define MEMSET_NAME __memset_unaligned
+
+#define ST_128(n)              \
+    st.d        a1, a0, n;     \
+    st.d        a1, a0, n+8  ; \
+    st.d        a1, a0, n+16 ; \
+    st.d        a1, a0, n+24 ; \
+    st.d        a1, a0, n+32 ; \
+    st.d        a1, a0, n+40 ; \
+    st.d        a1, a0, n+48 ; \
+    st.d        a1, a0, n+56 ; \
+    st.d        a1, a0, n+64 ; \
+    st.d        a1, a0, n+72 ; \
+    st.d        a1, a0, n+80 ; \
+    st.d        a1, a0, n+88 ; \
+    st.d        a1, a0, n+96 ; \
+    st.d        a1, a0, n+104; \
+    st.d        a1, a0, n+112; \
+    st.d        a1, a0, n+120;
+
+LEAF(MEMSET_NAME, 6)
+    bstrins.d   a1, a1, 15, 8
+    add.d       t7, a0, a2
+    bstrins.d   a1, a1, 31, 16
+    move        t0, a0
+
+    bstrins.d   a1, a1, 63, 32
+    srai.d      t8, a2, 4
+    beqz        t8, L(less_16bytes)
+    srai.d      t8, a2, 6
+
+    bnez        t8, L(more_64bytes)
+    srai.d      t8, a2, 5
+    beqz        t8, L(less_32bytes)
+
+    st.d        a1, a0, 0
+    st.d        a1, a0, 8
+    st.d        a1, a0, 16
+    st.d        a1, a0, 24
+
+    st.d        a1, t7, -32
+    st.d        a1, t7, -24
+    st.d        a1, t7, -16
+    st.d        a1, t7, -8
+
+    jr          ra
+
+L(less_32bytes):
+    st.d        a1, a0, 0
+    st.d        a1, a0, 8
+    st.d        a1, t7, -16
+    st.d        a1, t7, -8
+
+    jr          ra
+
+L(less_16bytes):
+    srai.d      t8, a2, 3
+    beqz        t8, L(less_8bytes)
+    st.d        a1, a0, 0
+    st.d        a1, t7, -8
+
+    jr          ra
+
+L(less_8bytes):
+    srai.d      t8, a2, 2
+    beqz        t8, L(less_4bytes)
+    st.w        a1, a0, 0
+    st.w        a1, t7, -4
+
+    jr          ra
+
+L(less_4bytes):
+    srai.d      t8, a2, 1
+    beqz        t8, L(less_2bytes)
+    st.h        a1, a0, 0
+    st.h        a1, t7, -2
+
+    jr          ra
+
+L(less_2bytes):
+    beqz        a2, L(less_1bytes)
+    st.b        a1, a0, 0
+
+    jr          ra
+
+L(less_1bytes):
+    jr          ra
+
+L(more_64bytes):
+    srli.d      a0, a0, 3
+    slli.d      a0, a0, 3
+    addi.d      a0, a0, 0x8
+    st.d        a1, t0, 0
+
+    sub.d       t2, t0, a0
+    add.d       a2, t2, a2
+    addi.d      a2, a2, -0x80
+    blt         a2, zero, L(end_unalign_proc)
+
+L(loop_less):
+    ST_128(0)
+    addi.d      a0, a0,  0x80
+    addi.d      a2, a2, -0x80
+    bge         a2, zero, L(loop_less)
+
+L(end_unalign_proc):
+    addi.d      a2, a2, 0x80
+    pcaddi      t1, 20
+    andi        t5, a2, 0x78
+    srli.d      t5, t5, 1
+
+    sub.d       t1, t1, t5
+    jr          t1
+
+    st.d        a1, a0, 112
+    st.d        a1, a0, 104
+    st.d        a1, a0, 96
+    st.d        a1, a0, 88
+    st.d        a1, a0, 80
+    st.d        a1, a0, 72
+    st.d        a1, a0, 64
+    st.d        a1, a0, 56
+    st.d        a1, a0, 48
+    st.d        a1, a0, 40
+    st.d        a1, a0, 32
+    st.d        a1, a0, 24
+    st.d        a1, a0, 16
+    st.d        a1, a0, 8
+    st.d        a1, a0, 0
+    st.d        a1, t7, -8
+
+    move        a0, t0
+    jr          ra
+END(MEMSET_NAME)
+
+libc_hidden_builtin_def (MEMSET_NAME)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/memset.c b/sysdeps/loongarch/lp64/multiarch/memset.c
new file mode 100644
index 0000000000..3ff60d8ac7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/memset.c
@@ -0,0 +1,37 @@ 
+/* Multiple versions of memset.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define memset __redirect_memset
+# include <string.h>
+# undef memset
+
+# define SYMBOL_NAME memset
+# include "ifunc-lasx.h"
+
+libc_ifunc_redirected (__redirect_memset, memset,
+		       IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (memset, __GI_memset, __redirect_memset)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memset);
+# endif
+
+#endif
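
For context, memset.c pulls IFUNC_SELECTOR in from ifunc-lasx.h, which
was added earlier in this series and is not part of this patch. Judging
from the registration order in ifunc-impl-list.c above, the selector is
expected to look roughly like the sketch below (illustrative, not the
verbatim contents of that header):

    static inline void *
    IFUNC_SELECTOR (void)
    {
    #if !defined __loongarch_soft_float
      if (SUPPORT_LASX)
        return OPTIMIZE (lasx);
      else if (SUPPORT_LSX)
        return OPTIMIZE (lsx);
      else
    #endif
      if (SUPPORT_UAL)
        return OPTIMIZE (unaligned);
      else
        return OPTIMIZE (aligned);
    }

With SYMBOL_NAME defined as memset, OPTIMIZE (lasx) expands to
__memset_lasx, so the runtime choice matches the priority order
registered with IFUNC_IMPL_ADD: LASX, then LSX, then the unaligned
variant when hardware unaligned access is available, with the aligned
variant as the baseline.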