[libfortran] Add AVX-specific matmul

Message ID 0b19ecac-9ac5-5fa8-dc10-1bc2fdded6b5@netcologne.de
State New

Commit Message

Thomas Koenig Nov. 17, 2016, 2:47 p.m. UTC
Well, here is a newer version of the patch.

I wrote a few configure tests to check for AVX.
This version has the advantage that, if anybody
uses 32-bit programs with AVX, they would also benefit.

Jakub, would you be OK with this patch?

I do not yet want to commit this because it needs more
testing on different platforms to see if it actually
performs better.

Regarding putting the blocked part into something separate:
Quite doable, but I would rather do this in a follow-up
patch, if we decide to do it.
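
For those unfamiliar with the mechanism: the exported matmul_* entry
point stays a plain wrapper around a static worker that carries the
target_clones attribute, so GCC emits one clone per listed target plus
an ifunc resolver that picks the right one at runtime.  A minimal,
self-contained sketch of the same pattern (illustrative names, not the
actual libgfortran code; assumes a GCC with target_clones and ifunc
support, e.g. on x86_64 GNU/Linux):

  #include <stdio.h>

  /* Worker: GCC clones it once per listed target and adds a
     resolver that dispatches to the best clone.  */
  static double dot_worker (const double *a, const double *b, int n)
    __attribute__ ((target_clones ("avx", "default")));

  static double
  dot_worker (const double *a, const double *b, int n)
  {
    double s = 0.0;
    for (int i = 0; i < n; i++)
      s += a[i] * b[i];
    return s;
  }

  /* Exported wrapper: stable ABI, no dispatch logic of its own.  */
  double
  dot (const double *a, const double *b, int n)
  {
    return dot_worker (a, b, n);
  }

  int
  main (void)
  {
    double x[4] = { 1.0, 2.0, 3.0, 4.0 };
    printf ("%g\n", dot (x, x, 4));  /* prints 30 */
    return 0;
  }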

Regards

	Thomas

2016-11-17  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/78379
         * acinclude.m4 (LIBGFOR_CHECK_AVX): New test.
         (LIBGFOR_CHECK_AVX2): New test.
         (LIBGFOR_CHECK_AVX512F): New test.
         * configure.ac: Call LIBGFOR_CHECK_AVX, LIBGFOR_CHECK_AVX2
         and LIBGFOR_CHECK_AVX512F.
         * config.h.in: Regenerated.
         * configure: Regenerated.
         * m4/matmul.m4: For AVX, AVX2 and AVX512F, make the work
         function for matmul static, with target_clones for the
         available AVX variants and default, and create a wrapper
         function to call it.
         * generated/matmul_c10.c: Regenerated.
         * generated/matmul_c16.c: Regenerated.
         * generated/matmul_c4.c: Regenerated.
         * generated/matmul_c8.c: Regenerated.
         * generated/matmul_i1.c: Regenerated.
         * generated/matmul_i16.c: Regenerated.
         * generated/matmul_i2.c: Regenerated.
         * generated/matmul_i4.c: Regenerated.
         * generated/matmul_i8.c: Regenerated.
         * generated/matmul_r10.c: Regenerated.
         * generated/matmul_r16.c: Regenerated.
         * generated/matmul_r4.c: Regenerated.
         * generated/matmul_r8.c: Regenerated.

Patch

Index: acinclude.m4
===================================================================
--- acinclude.m4	(Revision 242477)
+++ acinclude.m4	(Arbeitskopie)
@@ -393,3 +393,54 @@  AC_DEFUN([LIBGFOR_CHECK_STRERROR_R], [
 		  [Define if strerror_r takes two arguments and is available in <string.h>.]),)
   CFLAGS="$ac_save_CFLAGS"
 ])
+
+dnl Check for AVX
+
+AC_DEFUN([LIBGFOR_CHECK_AVX], [
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  void _mm256_zeroall (void)
+        {
+           __builtin_ia32_vzeroall ();
+        }]], [[]])],
+	AC_DEFINE(HAVE_AVX, 1,
+	[Define if AVX instructions can be compiled.]),
+	[])
+  CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX2
+
+AC_DEFUN([LIBGFOR_CHECK_AVX2], [
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx2"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+	__v4di
+	mm256_is32_andnotsi256  (__v4di __X, __v4di __Y)
+        {
+	   return __builtin_ia32_andnotsi256 (__X, __Y);
+        }]], [[]])],
+	AC_DEFINE(HAVE_AVX2, 1,
+	[Define if AVX2 instructions can be compiled.]),
+	[])
+  CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX512f
+
+AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx512f"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+	typedef double __m512d __attribute__ ((__vector_size__ (64)));
+	__m512d _mm512_add (__m512d a)
+	{
+	  return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+        }]], [[]])],
+	AC_DEFINE(HAVE_AVX512F, 1,
+	[Define if AVX512f instructions can be compiled.]),
+	[])
+  CFLAGS="$ac_save_CFLAGS"
+])
Index: config.h.in
===================================================================
--- config.h.in	(Revision 242477)
+++ config.h.in	(Arbeitskopie)
@@ -78,6 +78,15 @@ 
 /* Define to 1 if the target supports __attribute__((visibility(...))). */
 #undef HAVE_ATTRIBUTE_VISIBILITY
 
+/* Define if AVX instructions can be compiled. */
+#undef HAVE_AVX
+
+/* Define if AVX2 instructions can be compiled. */
+#undef HAVE_AVX2
+
+/* Define if AVX512f instructions can be compiled. */
+#undef HAVE_AVX512F
+
 /* Define to 1 if you have the `cabs' function. */
 #undef HAVE_CABS
 
Index: configure
===================================================================
--- configure	(Revision 242477)
+++ configure	(Arbeitskopie)
@@ -26174,6 +26174,93 @@  $as_echo "#define HAVE_CRLF 1" >>confdefs.h
 
 fi
 
+# Check whether we support AVX extensions
+
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+  void _mm256_zeroall (void)
+        {
+           __builtin_ia32_vzeroall ();
+        }
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX2 extensions
+
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx2"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+  typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+	__v4di
+	mm256_is32_andnotsi256  (__v4di __X, __v4di __Y)
+        {
+	   return __builtin_ia32_andnotsi256 (__X, __Y);
+        }
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX2 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX512f extensions
+
+  ac_save_CFLAGS="$CFLAGS"
+  CFLAGS="-O2 -mavx512f"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+	typedef double __m512d __attribute__ ((__vector_size__ (64)));
+	__m512d _mm512_add (__m512d a)
+	{
+	  return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+        }
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX512F 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS="$ac_save_CFLAGS"
+
+
 cat >confcache <<\_ACEOF
 # This file is a shell script that caches the results of configure
 # tests run on this system so they can be shared between configure
Index: configure.ac
===================================================================
--- configure.ac	(Revision 242477)
+++ configure.ac	(Arbeitskopie)
@@ -609,6 +609,15 @@  LIBGFOR_CHECK_UNLINK_OPEN_FILE
 # Check whether line terminator is LF or CRLF
 LIBGFOR_CHECK_CRLF
 
+# Check whether we support AVX extensions
+LIBGFOR_CHECK_AVX
+
+# Check whether we support AVX2 extensions
+LIBGFOR_CHECK_AVX2
+
+# Check whether we support AVX512f extensions
+LIBGFOR_CHECK_AVX512F
+
 AC_CACHE_SAVE
 
 if test ${multilib} = yes; then
Index: generated/matmul_c10.c
===================================================================
--- generated/matmul_c10.c	(Revision 242477)
+++ generated/matmul_c10.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_c10 (gfc_array_c10 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c10);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c10 (gfc_array_c10 * const restrict retarray, 
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_c10 (gfc_array_c10 * const restrict retarray, 
 	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c10 (gfc_array_c10 * const restrict retarray, 
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_c10 (gfc_array_c10 * const restrict retarray, 
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_10 * restrict abase;
   const GFC_COMPLEX_10 * restrict bbase;
   GFC_COMPLEX_10 * restrict dest;
Index: generated/matmul_c16.c
===================================================================
--- generated/matmul_c16.c	(Revision 242477)
+++ generated/matmul_c16.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_c16 (gfc_array_c16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c16);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c16 (gfc_array_c16 * const restrict retarray, 
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_c16 (gfc_array_c16 * const restrict retarray, 
 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c16 (gfc_array_c16 * const restrict retarray, 
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_c16 (gfc_array_c16 * const restrict retarray, 
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_16 * restrict abase;
   const GFC_COMPLEX_16 * restrict bbase;
   GFC_COMPLEX_16 * restrict dest;
Index: generated/matmul_c4.c
===================================================================
--- generated/matmul_c4.c	(Revision 242477)
+++ generated/matmul_c4.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_c4 (gfc_array_c4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c4);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c4 (gfc_array_c4 * const restrict retarray, 
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_c4 (gfc_array_c4 * const restrict retarray, 
 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c4 (gfc_array_c4 * const restrict retarray, 
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_c4 (gfc_array_c4 * const restrict retarray, 
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_4 * restrict abase;
   const GFC_COMPLEX_4 * restrict bbase;
   GFC_COMPLEX_4 * restrict dest;
Index: generated/matmul_c8.c
===================================================================
--- generated/matmul_c8.c	(Revision 242477)
+++ generated/matmul_c8.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_c8 (gfc_array_c8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c8);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c8 (gfc_array_c8 * const restrict retarray, 
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_c8 (gfc_array_c8 * const restrict retarray, 
 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_c8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c8 (gfc_array_c8 * const restrict retarray, 
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_c8 (gfc_array_c8 * const restrict retarray, 
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_COMPLEX_8 * restrict abase;
   const GFC_COMPLEX_8 * restrict bbase;
   GFC_COMPLEX_8 * restrict dest;
Index: generated/matmul_i1.c
===================================================================
--- generated/matmul_i1.c	(Revision 242477)
+++ generated/matmul_i1.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_i1 (gfc_array_i1 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i1);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i1 (gfc_array_i1 * const restrict retarray, 
+	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_i1 (gfc_array_i1 * const restrict retarray, 
 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i1 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i1 (gfc_array_i1 * const restrict retarray, 
+	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_i1 (gfc_array_i1 * const restrict retarray, 
+	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_1 * restrict abase;
   const GFC_INTEGER_1 * restrict bbase;
   GFC_INTEGER_1 * restrict dest;
Index: generated/matmul_i16.c
===================================================================
--- generated/matmul_i16.c	(Revision 242477)
+++ generated/matmul_i16.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_i16 (gfc_array_i16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i16);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i16 (gfc_array_i16 * const restrict retarray, 
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_i16 (gfc_array_i16 * const restrict retarray, 
 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i16 (gfc_array_i16 * const restrict retarray, 
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_i16 (gfc_array_i16 * const restrict retarray, 
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_16 * restrict abase;
   const GFC_INTEGER_16 * restrict bbase;
   GFC_INTEGER_16 * restrict dest;
Index: generated/matmul_i2.c
===================================================================
--- generated/matmul_i2.c	(Revision 242477)
+++ generated/matmul_i2.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_i2 (gfc_array_i2 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i2);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i2 (gfc_array_i2 * const restrict retarray, 
+	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_i2 (gfc_array_i2 * const restrict retarray, 
 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i2 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i2 (gfc_array_i2 * const restrict retarray, 
+	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_i2 (gfc_array_i2 * const restrict retarray, 
+	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_2 * restrict abase;
   const GFC_INTEGER_2 * restrict bbase;
   GFC_INTEGER_2 * restrict dest;
Index: generated/matmul_i4.c
===================================================================
--- generated/matmul_i4.c	(Revision 242477)
+++ generated/matmul_i4.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_i4 (gfc_array_i4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i4);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i4 (gfc_array_i4 * const restrict retarray, 
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_i4 (gfc_array_i4 * const restrict retarray, 
 	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i4 (gfc_array_i4 * const restrict retarray, 
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_i4 (gfc_array_i4 * const restrict retarray, 
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_4 * restrict abase;
   const GFC_INTEGER_4 * restrict bbase;
   GFC_INTEGER_4 * restrict dest;
Index: generated/matmul_i8.c
===================================================================
--- generated/matmul_i8.c	(Revision 242477)
+++ generated/matmul_i8.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_i8 (gfc_array_i8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i8);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i8 (gfc_array_i8 * const restrict retarray, 
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_i8 (gfc_array_i8 * const restrict retarray, 
 	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_i8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i8 (gfc_array_i8 * const restrict retarray, 
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_i8 (gfc_array_i8 * const restrict retarray, 
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_INTEGER_8 * restrict abase;
   const GFC_INTEGER_8 * restrict bbase;
   GFC_INTEGER_8 * restrict dest;
Index: generated/matmul_r10.c
===================================================================
--- generated/matmul_r10.c	(Revision 242477)
+++ generated/matmul_r10.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_r10 (gfc_array_r10 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r10);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r10 (gfc_array_r10 * const restrict retarray, 
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_r10 (gfc_array_r10 * const restrict retarray, 
 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r10 (gfc_array_r10 * const restrict retarray, 
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_r10 (gfc_array_r10 * const restrict retarray, 
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_10 * restrict abase;
   const GFC_REAL_10 * restrict bbase;
   GFC_REAL_10 * restrict dest;
Index: generated/matmul_r16.c
===================================================================
--- generated/matmul_r16.c	(Revision 242477)
+++ generated/matmul_r16.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_r16 (gfc_array_r16 * const rest
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r16);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r16 (gfc_array_r16 * const restrict retarray, 
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_r16 (gfc_array_r16 * const restrict retarray, 
 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r16 (gfc_array_r16 * const restrict retarray, 
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_r16 (gfc_array_r16 * const restrict retarray, 
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_16 * restrict abase;
   const GFC_REAL_16 * restrict bbase;
   GFC_REAL_16 * restrict dest;
Index: generated/matmul_r4.c
===================================================================
--- generated/matmul_r4.c	(Revision 242477)
+++ generated/matmul_r4.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_r4 (gfc_array_r4 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r4);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r4 (gfc_array_r4 * const restrict retarray, 
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_r4 (gfc_array_r4 * const restrict retarray, 
 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r4 (gfc_array_r4 * const restrict retarray, 
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_r4 (gfc_array_r4 * const restrict retarray, 
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_4 * restrict abase;
   const GFC_REAL_4 * restrict bbase;
   GFC_REAL_4 * restrict dest;
Index: generated/matmul_r8.c
===================================================================
--- generated/matmul_r8.c	(Revision 242477)
+++ generated/matmul_r8.c	(Arbeitskopie)
@@ -75,11 +75,48 @@  extern void matmul_r8 (gfc_array_r8 * const restri
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r8);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r8 (gfc_array_r8 * const restrict retarray, 
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_r8 (gfc_array_r8 * const restrict retarray, 
 	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_r8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r8 (gfc_array_r8 * const restrict retarray, 
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_r8 (gfc_array_r8 * const restrict retarray, 
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const GFC_REAL_8 * restrict abase;
   const GFC_REAL_8 * restrict bbase;
   GFC_REAL_8 * restrict dest;
Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4	(Revision 242477)
+++ m4/matmul.m4	(Arbeitskopie)
@@ -76,11 +76,48 @@  extern void matmul_'rtype_code` ('rtype` * const r
 	int blas_limit, blas_call gemm);
 export_proto(matmul_'rtype_code`);
 
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automagically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_'rtype_code` ('rtype` * const restrict retarray, 
+	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+	__attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
 void
 matmul_'rtype_code` ('rtype` * const restrict retarray, 
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
+  aux_matmul_'rtype_code` (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_'rtype_code` ('rtype` * const restrict retarray, 
+	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#else
+void
+matmul_'rtype_code` ('rtype` * const restrict retarray, 
+	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm)
+#endif
+{
   const 'rtype_name` * restrict abase;
   const 'rtype_name` * restrict bbase;
   'rtype_name` * restrict dest;