diff mbox series

LoongArch: Add support for dl_runtime_profile

Message ID 20230607095122.1540815-1-caiyinyu@loongson.cn
State New
Headers show
Series LoongArch: Add support for dl_runtime_profile | expand

Commit Message

caiyinyu June 7, 2023, 9:51 a.m. UTC
This commit can fix the FAIL item: elf/tst-sprof-basic.
---
 sysdeps/loongarch/Makefile        |   4 +
 sysdeps/loongarch/dl-link.sym     |  14 +++
 sysdeps/loongarch/dl-machine.h    |  30 ++++-
 sysdeps/loongarch/dl-trampoline.S | 175 +++++++++++++++++++++++++++++-
 sysdeps/loongarch/sys/asm.h       |   1 +
 5 files changed, 220 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/loongarch/dl-link.sym

Comments

Adhemerval Zanella Netto June 12, 2023, 6:59 p.m. UTC | #1
On 07/06/23 06:51, caiyinyu wrote:
> This commit can fix the FAIL item: elf/tst-sprof-basic.

Some minor style comments below; the rest looks OK.  Since you are the 
arch-maintainer I think you can commit this if you are not seeing any 
regressions in your environment.

> ---
>  sysdeps/loongarch/Makefile        |   4 +
>  sysdeps/loongarch/dl-link.sym     |  14 +++
>  sysdeps/loongarch/dl-machine.h    |  30 ++++-
>  sysdeps/loongarch/dl-trampoline.S | 175 +++++++++++++++++++++++++++++-
>  sysdeps/loongarch/sys/asm.h       |   1 +
>  5 files changed, 220 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/loongarch/dl-link.sym
> 
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 1778fd1c88..43d2f583cd 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -2,6 +2,10 @@ ifeq ($(subdir),misc)
>  sysdep_headers += sys/asm.h
>  endif
>  
> +ifeq ($(subdir),elf)
> +gen-as-const-headers += dl-link.sym
> +endif
> +
>  # LoongArch's assembler also needs to know about PIC as it changes the
>  # definition of some assembler macros.
>  ASFLAGS-.os += $(pic-ccflag)
> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
> new file mode 100644
> index 0000000000..868ab7c6eb
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-link.sym
> @@ -0,0 +1,14 @@
> +#include <stddef.h>
> +#include <sysdep.h>
> +#include <link.h>
> +
> +DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
> +DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
> +
> +DL_OFFSET_RG_A0         offsetof(struct La_loongarch_regs, lr_reg)
> +DL_OFFSET_RG_FA0        offsetof(struct La_loongarch_regs, lr_fpreg)
> +DL_OFFSET_RG_RA         offsetof(struct La_loongarch_regs, lr_ra)
> +DL_OFFSET_RG_SP         offsetof(struct La_loongarch_regs, lr_sp)
> +
> +DL_OFFSET_RV_A0         offsetof(struct La_loongarch_retval, lrv_a0)
> +DL_OFFSET_RV_FA0        offsetof(struct La_loongarch_retval, lrv_a1)
> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
> index 1e07d124f9..e356a1cd99 100644
> --- a/sysdeps/loongarch/dl-machine.h
> +++ b/sysdeps/loongarch/dl-machine.h
> @@ -270,10 +270,34 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>    /* If using PLTs, fill in the first two entries of .got.plt.  */
>    if (l->l_info[DT_JMPREL])
>      {
> -      extern void _dl_runtime_resolve (void)
> -	__attribute__ ((visibility ("hidden")));
> +      extern void _dl_runtime_resolve (void) attribute_hidden;
> +      extern void _dl_runtime_profile (void) attribute_hidden;
> +
>        ElfW (Addr) *gotplt = (ElfW (Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
> -      gotplt[0] = (ElfW (Addr)) & _dl_runtime_resolve;
> +
> +      /* The got[0] entry contains the address of a function which gets
> +	 called to get the address of a so far unresolved function and
> +	 jump to it.  The profiling extension of the dynamic linker allows
> +	 to intercept the calls to collect information.  In this case we
> +	 don't store the address in the GOT so that all future calls also
> +	 end in this function.  */
> +      if ( __glibc_unlikely (profile))

No implicit check, use 'profile != 0' since profile is an int.

> +	{
> +	   gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
> +
> +	  if (GLRO(dl_profile) != NULL
> +	      && _dl_name_match_p (GLRO(dl_profile), l))
> +	    /* Say that we really want profiling and the timers are
> +	       started.  */
> +	    GL(dl_profile_map) = l;
> +	}
> +      else
> +	{
> +	  /* This function will get called to fix up the GOT entry
> +	     indicated by the offset on the stack, and then jump to
> +	     the resolved address.  */
> +	  gotplt[0] = (ElfW (Addr)) & _dl_runtime_resolve;
> +	}
>        gotplt[1] = (ElfW (Addr)) l;
>      }
>  #endif
> diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
> index c978e2ef63..ed9ec0901c 100644
> --- a/sysdeps/loongarch/dl-trampoline.S
> +++ b/sysdeps/loongarch/dl-trampoline.S
> @@ -19,6 +19,8 @@
>  #include <sysdep.h>
>  #include <sys/asm.h>
>  
> +#include "dl-link.h"
> +
>  /* Assembler veneer called from the PLT header code for lazy loading.
>     The PLT header passes its own args in t0-t2.  */
>  #ifdef __loongarch_soft_float
> @@ -31,7 +33,6 @@ ENTRY (_dl_runtime_resolve)
>  
>  	/* Save arguments to stack. */
>  	ADDI	sp, sp, -FRAME_SIZE
> -

Spurious new line removal?

>  	REG_S	ra, sp, 9*SZREG
>  	REG_S	a0, sp, 1*SZREG
>  	REG_S	a1, sp, 2*SZREG
> @@ -88,3 +89,175 @@ ENTRY (_dl_runtime_resolve)
>  	/* Invoke the callee. */
>  	jirl		zero, t1, 0
>  END (_dl_runtime_resolve)
> +
> +
> +ENTRY (_dl_runtime_profile)
> +       /* LoongArch we get called with:
> +	t0	      linkr_map pointer
> +	t1	      the scaled offset stored in t0, which can be used
> +		      to calculate the offset of the current symbol in .rela.plt
> +	t2	      %hi(%pcrel(.got.plt)) stored in t2, no use in this function
> +	t3	      dl resolver entry point, no use in this function
> +
> +	Stack frame layout:
> +	[sp,    #96] La_loongarch_regs
> +	[sp,    #48] La_loongarch_retval
> +	[sp,    #40] frame size return from pltenter
> +	[sp,    #32] dl_profile_call saved a1
> +	[sp,    #24] dl_profile_call saved a0
> +	[sp,    #16] T1
> +	[sp,     #0] ra, fp   <- fp
> +       */
> +
> +# define OFFSET_T1              16
> +# define OFFSET_SAVED_CALL_A0   OFFSET_T1 + 8
> +# define OFFSET_FS              OFFSET_SAVED_CALL_A0 + 16
> +# define OFFSET_RV              OFFSET_FS + 8
> +# define OFFSET_RG              OFFSET_RV + DL_SIZEOF_RV
> +
> +# define SF_SIZE                (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
> +
> +	/* Save arguments to stack. */
> +	ADDI	sp, sp, -SF_SIZE
> +	REG_S	ra, sp, 0
> +	REG_S	fp, sp, 8
> +
> +	or	fp, sp, zero
> +
> +	REG_S	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> +	REG_S	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> +	REG_S	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> +	REG_S	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> +	REG_S	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> +	REG_S	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> +	REG_S	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> +	REG_S	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> +
> +#ifndef __loongarch_soft_float
> +	FREG_S	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
> +	FREG_S	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
> +	FREG_S	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
> +	FREG_S	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
> +	FREG_S	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
> +	FREG_S	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
> +	FREG_S	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
> +	FREG_S	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
> +#endif
> +
> +	/* Update .got.plt and obtain runtime address of callee.  */
> +	SLLI	a1, t1, 1
> +	or	a0, t0, zero
> +	ADD	a1, a1, t1
> +	or	a2, ra, zero		/* return addr */
> +	ADDI	a3, fp, OFFSET_RG	/* La_loongarch_regs pointer */
> +	ADDI	a4, fp, OFFSET_FS 	/* frame size return from pltenter */
> +
> +	REG_S	a0, fp, OFFSET_SAVED_CALL_A0
> +	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
> +
> +	la	t2, _dl_profile_fixup
> +	jirl	ra, t2, 0
> +
> +	REG_L	t3, fp, OFFSET_FS
> +	bge	t3, zero, 1f
> +
> +	/* Save the return.  */
> +	or	t4, v0, zero
> +
> +	/* Restore arguments from stack.  */
> +	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> +	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> +	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> +	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> +	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> +	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> +	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> +	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> +
> +#ifndef __loongarch_soft_float
> +	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
> +	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
> +	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
> +	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
> +	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
> +	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
> +	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
> +	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
> +#endif
> +
> +	REG_L   ra, fp, 0
> +	REG_L   fp, fp, SZREG
> +
> +	ADDI	sp, sp, SF_SIZE
> +	jirl	zero, t4, 0
> +
> +1:
> +	/* The new frame size is in t3.  */
> +	SUB	sp, fp, t3
> +	BSTRINS sp, zero, 3, 0
> +
> +	REG_S	a0, fp, OFFSET_T1
> +
> +	or	a0, sp, zero
> +	ADDI	a1, fp, SF_SIZE
> +	or	a2, t3,	zero
> +	la	t5, memcpy
> +	jirl	ra, t5, 0
> +
> +	REG_L	t6, fp, OFFSET_T1
> +
> +	/* Call the function.  */
> +	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> +	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> +	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> +	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> +	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> +	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> +	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> +	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> +
> +#ifndef __loongarch_soft_float
> +	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
> +	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
> +	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
> +	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
> +	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
> +	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
> +	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
> +	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
> +#endif
> +	jirl	ra, t6, 0
> +
> +	REG_S	a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
> +	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
> +
> +#ifndef __loongarch_soft_float
> +	FREG_S	fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0
> +	FREG_S	fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0 + SZFREG
> +#endif
> +
> +	/* Setup call to pltexit.  */
> +	REG_L	a0, fp, OFFSET_SAVED_CALL_A0
> +	REG_L	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
> +	ADDI	a2, fp, OFFSET_RG
> +	ADDI	a3, fp, OFFSET_RV
> +	la	t7, _dl_audit_pltexit
> +	jirl	ra, t7, 0
> +
> +	REG_L	a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
> +	REG_L	a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
> +
> +#ifndef __loongarch_soft_float
> +	FREG_L	fa0, fp, OFFSET_RV + DL_OFFSET_RV_FA0
> +	FREG_L	fa1, fp, OFFSET_RV + DL_OFFSET_RV_FA0 + SZFREG
> +#endif
> +
> +	/* RA from within La_loongarch_reg.  */
> +	REG_L   ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
> +	or	sp, fp, zero
> +	ADDI	sp, sp, SF_SIZE
> +	REG_S   fp, fp, SZREG
> +
> +	jirl	zero, ra, 0
> +
> +END (_dl_runtime_profile)
> diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
> index 3dca70a107..0bb430bb05 100644
> --- a/sysdeps/loongarch/sys/asm.h
> +++ b/sysdeps/loongarch/sys/asm.h
> @@ -31,6 +31,7 @@
>  #define SLLI slli.d
>  #define ADDI addi.d
>  #define ADD  add.d
> +#define SUB  sub.d
>  #define BSTRINS  bstrins.d
>  #define LI  li.d
>  #define FREG_L fld.d
diff mbox series

Patch

diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
index 1778fd1c88..43d2f583cd 100644
--- a/sysdeps/loongarch/Makefile
+++ b/sysdeps/loongarch/Makefile
@@ -2,6 +2,10 @@  ifeq ($(subdir),misc)
 sysdep_headers += sys/asm.h
 endif
 
+ifeq ($(subdir),elf)
+gen-as-const-headers += dl-link.sym
+endif
+
 # LoongArch's assembler also needs to know about PIC as it changes the
 # definition of some assembler macros.
 ASFLAGS-.os += $(pic-ccflag)
diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
new file mode 100644
index 0000000000..868ab7c6eb
--- /dev/null
+++ b/sysdeps/loongarch/dl-link.sym
@@ -0,0 +1,14 @@ 
+#include <stddef.h>
+#include <sysdep.h>
+#include <link.h>
+
+DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
+DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
+
+DL_OFFSET_RG_A0         offsetof(struct La_loongarch_regs, lr_reg)
+DL_OFFSET_RG_FA0        offsetof(struct La_loongarch_regs, lr_fpreg)
+DL_OFFSET_RG_RA         offsetof(struct La_loongarch_regs, lr_ra)
+DL_OFFSET_RG_SP         offsetof(struct La_loongarch_regs, lr_sp)
+
+DL_OFFSET_RV_A0         offsetof(struct La_loongarch_retval, lrv_a0)
+DL_OFFSET_RV_FA0        offsetof(struct La_loongarch_retval, lrv_a1)
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 1e07d124f9..e356a1cd99 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -270,10 +270,34 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
   /* If using PLTs, fill in the first two entries of .got.plt.  */
   if (l->l_info[DT_JMPREL])
     {
-      extern void _dl_runtime_resolve (void)
-	__attribute__ ((visibility ("hidden")));
+      extern void _dl_runtime_resolve (void) attribute_hidden;
+      extern void _dl_runtime_profile (void) attribute_hidden;
+
       ElfW (Addr) *gotplt = (ElfW (Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
-      gotplt[0] = (ElfW (Addr)) & _dl_runtime_resolve;
+
+      /* The got[0] entry contains the address of a function which gets
+	 called to get the address of a so far unresolved function and
+	 jump to it.  The profiling extension of the dynamic linker allows
+	 to intercept the calls to collect information.  In this case we
+	 don't store the address in the GOT so that all future calls also
+	 end in this function.  */
+      if ( __glibc_unlikely (profile))
+	{
+	   gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
+
+	  if (GLRO(dl_profile) != NULL
+	      && _dl_name_match_p (GLRO(dl_profile), l))
+	    /* Say that we really want profiling and the timers are
+	       started.  */
+	    GL(dl_profile_map) = l;
+	}
+      else
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  gotplt[0] = (ElfW (Addr)) & _dl_runtime_resolve;
+	}
       gotplt[1] = (ElfW (Addr)) l;
     }
 #endif
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
index c978e2ef63..ed9ec0901c 100644
--- a/sysdeps/loongarch/dl-trampoline.S
+++ b/sysdeps/loongarch/dl-trampoline.S
@@ -19,6 +19,8 @@ 
 #include <sysdep.h>
 #include <sys/asm.h>
 
+#include "dl-link.h"
+
 /* Assembler veneer called from the PLT header code for lazy loading.
    The PLT header passes its own args in t0-t2.  */
 #ifdef __loongarch_soft_float
@@ -31,7 +33,6 @@  ENTRY (_dl_runtime_resolve)
 
 	/* Save arguments to stack. */
 	ADDI	sp, sp, -FRAME_SIZE
-
 	REG_S	ra, sp, 9*SZREG
 	REG_S	a0, sp, 1*SZREG
 	REG_S	a1, sp, 2*SZREG
@@ -88,3 +89,175 @@  ENTRY (_dl_runtime_resolve)
 	/* Invoke the callee. */
 	jirl		zero, t1, 0
 END (_dl_runtime_resolve)
+
+
+ENTRY (_dl_runtime_profile)
+       /* On LoongArch we get called with:
+	t0	      link_map pointer
+	t1	      the scaled offset, which can be used to calculate
+		      the offset of the current symbol in .rela.plt
+	t2	      %hi(%pcrel(.got.plt)), not used in this function
+	t3	      dl resolver entry point, not used in this function
+
+	Stack frame layout:
+	[sp, #48 + DL_SIZEOF_RV] La_loongarch_regs
+	[sp,    #48] La_loongarch_retval
+	[sp,    #40] frame size returned from pltenter
+	[sp,    #32] dl_profile_call saved a1
+	[sp,    #24] dl_profile_call saved a0
+	[sp,    #16] resolved callee address (OFFSET_T1)
+	[sp,     #0] ra, fp   <- fp
+       */
+
+# define OFFSET_T1              16
+# define OFFSET_SAVED_CALL_A0   (OFFSET_T1 + 8)
+# define OFFSET_FS              (OFFSET_SAVED_CALL_A0 + 16)
+# define OFFSET_RV              (OFFSET_FS + 8)
+# define OFFSET_RG              (OFFSET_RV + DL_SIZEOF_RV)
+
+# define SF_SIZE                (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
+
+	/* Allocate the frame; save the caller's ra and fp at its base.  */
+	ADDI	sp, sp, -SF_SIZE
+	REG_S	ra, sp, 0
+	REG_S	fp, sp, 8
+
+	or	fp, sp, zero
+	/* Save the incoming argument registers into La_loongarch_regs.  */
+	REG_S	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+	REG_S	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+	REG_S	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+	REG_S	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+	REG_S	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+	REG_S	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+	REG_S	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+	REG_S	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifndef __loongarch_soft_float
+	FREG_S	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
+	FREG_S	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
+	FREG_S	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
+	FREG_S	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
+	FREG_S	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
+	FREG_S	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
+	FREG_S	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
+	FREG_S	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
+#endif
+	/* NOTE(review): lr_ra and lr_sp of La_loongarch_regs are not filled in.  */
+	/* Update .got.plt and obtain runtime address of callee.  */
+	SLLI	a1, t1, 1
+	or	a0, t0, zero
+	ADD	a1, a1, t1
+	or	a2, ra, zero		/* return addr */
+	ADDI	a3, fp, OFFSET_RG	/* La_loongarch_regs pointer */
+	ADDI	a4, fp, OFFSET_FS 	/* frame size return from pltenter */
+	/* Keep the first two fixup arguments for the later pltexit call.  */
+	REG_S	a0, fp, OFFSET_SAVED_CALL_A0
+	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
+
+	la	t2, _dl_profile_fixup
+	jirl	ra, t2, 0
+	/* A non-negative frame size takes the path at 1: that calls pltexit.  */
+	REG_L	t3, fp, OFFSET_FS
+	bge	t3, zero, 1f
+
+	/* Save the return.  */
+	or	t4, v0, zero
+
+	/* Restore arguments from stack.  */
+	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifndef __loongarch_soft_float
+	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
+	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
+	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
+	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
+	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
+	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
+	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
+	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
+#endif
+	/* Drop the frame and jump to the callee; it returns to our caller.  */
+	REG_L   ra, fp, 0
+	REG_L   fp, fp, SZREG
+
+	ADDI	sp, sp, SF_SIZE
+	jirl	zero, t4, 0
+
+1:
+	/* The new frame size is in t3.  */
+	SUB	sp, fp, t3
+	BSTRINS sp, zero, 3, 0
+	/* Stash the resolved address (a0 from _dl_profile_fixup) across memcpy.  */
+	REG_S	a0, fp, OFFSET_T1
+	/* Copy t3 bytes from just above our frame to the new stack area.  */
+	or	a0, sp, zero
+	ADDI	a1, fp, SF_SIZE
+	or	a2, t3,	zero
+	la	t5, memcpy
+	jirl	ra, t5, 0
+
+	REG_L	t6, fp, OFFSET_T1
+
+	/* Call the function.  */
+	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifndef __loongarch_soft_float
+	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
+	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
+	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
+	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
+	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
+	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
+	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
+	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
+#endif
+	jirl	ra, t6, 0
+	/* Record the return registers in La_loongarch_retval (OFFSET_RV).  */
+	REG_S	a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
+	REG_S	a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
+
+#ifndef __loongarch_soft_float
+	FREG_S	fa0, fp, OFFSET_RV + DL_OFFSET_RV_FA0
+	FREG_S	fa1, fp, OFFSET_RV + DL_OFFSET_RV_FA0 + SZFREG
+#endif
+
+	/* Setup call to pltexit.  */
+	REG_L	a0, fp, OFFSET_SAVED_CALL_A0
+	REG_L	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
+	ADDI	a2, fp, OFFSET_RG
+	ADDI	a3, fp, OFFSET_RV
+	la	t7, _dl_audit_pltexit
+	jirl	ra, t7, 0
+	/* Reload the return registers from La_loongarch_retval.  */
+	REG_L	a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
+	REG_L	a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
+
+#ifndef __loongarch_soft_float
+	FREG_L	fa0, fp, OFFSET_RV + DL_OFFSET_RV_FA0
+	FREG_L	fa1, fp, OFFSET_RV + DL_OFFSET_RV_FA0 + SZFREG
+#endif
+
+	/* Reload ra and fp from the slots saved on entry, then pop the frame.  */
+	REG_L   ra, fp, 0
+	or	sp, fp, zero
+	REG_L   fp, sp, SZREG
+	ADDI	sp, sp, SF_SIZE
+
+	jirl	zero, ra, 0
+
+END (_dl_runtime_profile)
diff --git a/sysdeps/loongarch/sys/asm.h b/sysdeps/loongarch/sys/asm.h
index 3dca70a107..0bb430bb05 100644
--- a/sysdeps/loongarch/sys/asm.h
+++ b/sysdeps/loongarch/sys/asm.h
@@ -31,6 +31,7 @@ 
 #define SLLI slli.d
 #define ADDI addi.d
 #define ADD  add.d
+#define SUB  sub.d
 #define BSTRINS  bstrins.d
 #define LI  li.d
 #define FREG_L fld.d