===================================================================
@@ -393,3 +393,54 @@ AC_DEFUN([LIBGFOR_CHECK_STRERROR_R], [
[Define if strerror_r takes two arguments and is available in <string.h>.]),)
CFLAGS="$ac_save_CFLAGS"
])
+
+dnl Check for AVX
+
+AC_DEFUN([LIBGFOR_CHECK_AVX], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ void _mm256_zeroall (void)
+ {
+ __builtin_ia32_vzeroall ();
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX, 1,
+ [Define if AVX instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX2
+
+AC_DEFUN([LIBGFOR_CHECK_AVX2], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx2"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+ __v4di
+ mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
+ {
+ return __builtin_ia32_andnotsi256 (__X, __Y);
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX2, 1,
+ [Define if AVX2 instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX512f
+
+AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx512f"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ typedef double __m512d __attribute__ ((__vector_size__ (64)));
+ __m512d _mm512_add (__m512d a)
+ {
+ return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX512F, 1,
+ [Define if AVX512f instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
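
A note on the three macros above: each one simply tries to compile a
one-function kernel with the matching -m flag and defines HAVE_AVX,
HAVE_AVX2 or HAVE_AVX512F on success.  Outside of configure, the AVX
check amounts to roughly the following shell commands (conftest.c is a
scratch file name used here for illustration; the test only proves the
compiler accepts the flag and builtin, not that the host CPU supports
AVX):

    $ cat > conftest.c <<'EOF'
    void _mm256_zeroall (void)
    {
      __builtin_ia32_vzeroall ();
    }
    EOF
    $ gcc -O2 -mavx -c conftest.c && echo "would define HAVE_AVX"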
===================================================================
@@ -78,6 +78,15 @@
/* Define to 1 if the target supports __attribute__((visibility(...))). */
#undef HAVE_ATTRIBUTE_VISIBILITY
+/* Define if AVX instructions can be compiled. */
+#undef HAVE_AVX
+
+/* Define if AVX2 instructions can be compiled. */
+#undef HAVE_AVX2
+
+/* Define if AVX512f instructions can be compiled. */
+#undef HAVE_AVX512F
+
/* Define to 1 if you have the `cabs' function. */
#undef HAVE_CABS
===================================================================
@@ -26174,6 +26174,93 @@ $as_echo "#define HAVE_CRLF 1" >>confdefs.h
fi
+# Check whether we support AVX extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ void _mm256_zeroall (void)
+ {
+ __builtin_ia32_vzeroall ();
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX2 extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx2"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+ __v4di
+ mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
+ {
+ return __builtin_ia32_andnotsi256 (__X, __Y);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX2 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX512f extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx512f"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ typedef double __m512d __attribute__ ((__vector_size__ (64)));
+ __m512d _mm512_add (__m512d a)
+ {
+ return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX512F 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
# tests run on this system so they can be shared between configure
===================================================================
@@ -609,6 +609,15 @@ LIBGFOR_CHECK_UNLINK_OPEN_FILE
# Check whether line terminator is LF or CRLF
LIBGFOR_CHECK_CRLF
+# Check whether we support AVX extensions
+LIBGFOR_CHECK_AVX
+
+# Check whether we support AVX2 extensions
+LIBGFOR_CHECK_AVX2
+
+# Check whether we support AVX512f extensions
+LIBGFOR_CHECK_AVX512F
+
AC_CACHE_SAVE
if test ${multilib} = yes; then
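
(The large configure hunk earlier in this patch should be the output of
regenerating configure from these acinclude.m4/configure.ac changes; a
typical regeneration step, assuming the autoconf/automake versions the
tree requires are installed, would be something like:

    $ cd libgfortran && autoreconf

the exact invocation depends on the maintainer-mode setup.)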
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c10 (gfc_array_c10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c10);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_10 * restrict abase;
const GFC_COMPLEX_10 * restrict bbase;
GFC_COMPLEX_10 * restrict dest;
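
The same aux_matmul + target_clones pattern repeats in every matmul
file below.  As a reference, here is a minimal self-contained sketch
(not part of the patch; names are invented for illustration) of what
the attribute does: GCC emits one clone of the function per listed
target plus a resolver, so callers transparently get the variant
matching the running CPU.

    /* clones.c -- compile on x86_64 with: gcc -O2 -c clones.c */
    __attribute__ ((target_clones ("avx", "avx2", "default")))
    static void
    aux_work (double *restrict c, const double *restrict a,
              const double *restrict b, int n)
    {
      /* Identical body in every clone; only the generated
         instructions differ per target.  */
      for (int i = 0; i < n; i++)
        c[i] = a[i] * b[i];
    }

    void
    work (double *restrict c, const double *restrict a,
          const double *restrict b, int n)
    {
      /* The call dispatches through the resolver GCC creates
         for aux_work.  */
      aux_work (c, a, b, n);
    }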
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c16 (gfc_array_c16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_16 * restrict abase;
const GFC_COMPLEX_16 * restrict bbase;
GFC_COMPLEX_16 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c4 (gfc_array_c4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_4 * restrict abase;
const GFC_COMPLEX_4 * restrict bbase;
GFC_COMPLEX_4 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c8 (gfc_array_c8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_8 * restrict abase;
const GFC_COMPLEX_8 * restrict bbase;
GFC_COMPLEX_8 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i1 (gfc_array_i1 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i1);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i1 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i1 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_1 * restrict abase;
const GFC_INTEGER_1 * restrict bbase;
GFC_INTEGER_1 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i16 (gfc_array_i16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_i16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i16 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_16 * restrict abase;
const GFC_INTEGER_16 * restrict bbase;
GFC_INTEGER_16 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i2 (gfc_array_i2 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i2);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i2 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i2 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_2 * restrict abase;
const GFC_INTEGER_2 * restrict bbase;
GFC_INTEGER_2 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i4 (gfc_array_i4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_4 * restrict abase;
const GFC_INTEGER_4 * restrict bbase;
GFC_INTEGER_4 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i8 (gfc_array_i8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i8 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_8 * restrict abase;
const GFC_INTEGER_8 * restrict bbase;
GFC_INTEGER_8 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r10 (gfc_array_r10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r10);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_10 * restrict abase;
const GFC_REAL_10 * restrict bbase;
GFC_REAL_10 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r16 (gfc_array_r16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_16 * restrict abase;
const GFC_REAL_16 * restrict bbase;
GFC_REAL_16 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r4 (gfc_array_r4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_4 * restrict abase;
const GFC_REAL_4 * restrict bbase;
GFC_REAL_4 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r8 (gfc_array_r8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_8 * restrict abase;
const GFC_REAL_8 * restrict bbase;
GFC_REAL_8 * restrict dest;
===================================================================
@@ -76,11 +76,47 @@ extern void matmul_'rtype_code` ('rtype` * const r
int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_'rtype_code` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_'rtype_code` (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const 'rtype_name` * restrict abase;
const 'rtype_name` * restrict bbase;
'rtype_name` * restrict dest;