===================================================================
@@ -527,3 +527,99 @@
}
[(set_attr "type" "idiv3")
(set_attr "mode" "<MODE>")])
+
+;; Loongson 128-bits load/store peephole optimization. Try to combine two
+;; 64 bits load/store insns into a 128 bits load/store insn.
+
+(define_peephole2
+ [(set (mem:DI (plus (match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")))
+ (match_operand:DI 2 "register_operand"))
+ (set (mem:DI (plus (match_dup 0)
+ (match_operand:SI 3 "const_int_operand")))
+ (match_operand:DI 4 "register_operand"))]
+ "loongson_128bits_load_store_p (operands[0], operands[1], operands[3],
+ operands[2], operands[4], 0)"
+ [(parallel
+ [(set (mem:DI (plus (match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")))
+ (match_operand:DI 2 "register_operand"))
+ (set (mem:DI (plus (match_dup 0)
+ (match_operand:SI 3 "const_int_operand")))
+ (match_operand:DI 4 "register_operand"))])])
+
+(define_insn "loongson_128bits_store"
+ [(parallel
+ [(set (mem:DI (plus (match_operand:SI 0 "register_operand" "d,d")
+ (match_operand:SI 1 "const_int_operand" "")))
+ (match_operand:DI 2 "register_operand" "d,f"))
+ (set (mem:DI (plus (match_dup 0)
+ (match_operand:SI 3 "const_int_operand" "")))
+ (match_operand:DI 4 "register_operand" "d,f"))])]
+ "loongson_128bits_load_store_p (operands[0], operands[1], operands[3],
+ operands[2], operands[4], 0)"
+{
+ if (INTVAL (operands[1]) < INTVAL (operands[3]))
+ {
+ if (which_alternative == 0)
+ return "gssq\t%4,%2,%1>>4(%0)";
+ else
+ return "gssqc1\t%4,%2,%1>>4(%0)";
+ }
+ else
+ {
+ if (which_alternative == 0)
+ return "gssq\t%2,%4,%3>>4(%0)";
+ else
+ return "gssqc1\t%2,%4,%3>>4(%0)";
+ }
+}
+ [(set_attr "type" "store")
+ (set_attr "mode" "DI")])
+
+(define_peephole2
+ [(set (match_operand:DI 0 "register_operand")
+ (mem:DI (plus (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand"))))
+ (set (match_operand:DI 3 "register_operand")
+ (mem:DI (plus (match_dup 1)
+ (match_operand:SI 4 "const_int_operand"))))]
+ "loongson_128bits_load_store_p (operands[1], operands[2], operands[4],
+ operands[0], operands[3], 1)"
+ [(parallel
+ [(set (match_operand:DI 0 "register_operand")
+ (mem:DI (plus (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand"))))
+ (set (match_operand:DI 3 "register_operand")
+ (mem:DI (plus (match_dup 1)
+ (match_operand:SI 4 "const_int_operand"))))])])
+
+(define_insn "loongson_128bits_load"
+ [(parallel
+ [(set (match_operand:DI 0 "register_operand" "=d,=f")
+ (mem:DI (plus (match_operand:SI 1 "register_operand" "d,d")
+ (match_operand:SI 2 "const_int_operand" ""))))
+ (set (match_operand:DI 3 "register_operand" "=d,=f")
+ (mem:DI (plus (match_dup 1)
+ (match_operand:SI 4 "const_int_operand" ""))))])]
+ "loongson_128bits_load_store_p (operands[1], operands[2], operands[4],
+ operands[0], operands[3], 1)"
+{
+ if (INTVAL (operands[2]) < INTVAL (operands[4]))
+ {
+ if (which_alternative == 0)
+ return "gslq\t%3,%0,%2>>4(%1)";
+ else
+ return "gslqc1\t%3,%0,%2>>4(%1)";
+ }
+ else
+ {
+ if (which_alternative == 0)
+ return "gslq\t%0,%3,%4>>4(%1)";
+ else
+ return "gslqc1\t%0,%3,%4>>4(%1)";
+ }
+}
+ [(set_attr "type" "load")
+ (set_attr "mode" "DI")])
+
===================================================================
@@ -339,4 +339,6 @@ typedef rtx (*mulsidi3_gen_fn) (rtx, rtx
extern mulsidi3_gen_fn mips_mulsidi3_gen_fn (enum rtx_code);
#endif
+extern int loongson_128bits_load_store_p (rtx, rtx, rtx, rtx, rtx, int);
+
#endif /* ! GCC_MIPS_PROTOS_H */
===================================================================
@@ -16397,6 +16397,52 @@ mips_shift_truncation_mask (enum machine
return GET_MODE_BITSIZE (mode) - 1;
}
+/* Return 1, if it's valid to do 128-bits load/store peephole optimization,
+ otherwise return 0.
+
+ Loongson 128-bits load/store requires the address (BASE + OFFSET) should
+ be 128 bits aligned.
+
+ We currently just conside the case that BASE_REG is SP register and the
+ STACK_BOUNDARY is 128, which can guarantee that the BASE is 128 bits
+ aligned. Thus the OFFSET, which is the minimun of OFFSET1 and OFFSET2,
+ should also be 128 bits aligned. */
+
+int
+loongson_128bits_load_store_p (rtx base_reg, rtx offset1, rtx offset2,
+ rtx reg1, rtx reg2, int is_load)
+{
+ HOST_WIDE_INT value1, value2, min_value;
+ unsigned int regno1, regno2;
+
+ if (!TARGET_LOONGSON_3A
+ || REGNO (base_reg) != STACK_POINTER_REGNUM
+ || STACK_BOUNDARY != 128)
+ return 0;
+
+ value1 = INTVAL (offset1);
+ value2 = INTVAL (offset2);
+ min_value = value1 < value2 ? value1 : value2;
+
+ regno1 = REGNO (reg1);
+ regno2 = REGNO (reg2);
+
+ /* Also make sure the destination registers are not same as base register,
+ i.e. SP register. */
+ if (is_load)
+ if (regno1 == STACK_POINTER_REGNUM || regno2 == STACK_POINTER_REGNUM)
+ return 0;
+
+ /* The gap between two offsets should be 8. The value range of the offset
+ after right shift 4 bits (which takes 9 bits in the binary opcode) should
+ be [-256, 255]. */
+ return ((GP_REG_P (regno1) && GP_REG_P (regno2))
+ || (FP_REG_P (regno1) && FP_REG_P (regno2)))
+ && ((value1 - value2) == 8 || (value2 - value1) == 8)
+ && ((min_value & 0xf) == 0)
+ && ((min_value >> 4) >= -256 && (min_value >> 4) < 256);
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP