===================================================================
@@ -393,3 +393,54 @@ AC_DEFUN([LIBGFOR_CHECK_STRERROR_R], [
[Define if strerror_r takes two arguments and is available in <string.h>.]),)
CFLAGS="$ac_save_CFLAGS"
])
+
+dnl Check for AVX
+
+AC_DEFUN([LIBGFOR_CHECK_AVX], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ void _mm256_zeroall (void)
+ {
+ __builtin_ia32_vzeroall ();
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX, 1,
+ [Define if AVX instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX2
+
+AC_DEFUN([LIBGFOR_CHECK_AVX2], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx2"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+ __v4di
+ mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
+ {
+ return __builtin_ia32_andnotsi256 (__X, __Y);
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX2, 1,
+ [Define if AVX2 instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX512f
+
+AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx512f"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ typedef double __m512d __attribute__ ((__vector_size__ (64)));
+ __m512d _mm512_add (__m512d a)
+ {
+ return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX512F, 1,
+ [Define if AVX512f instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
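
A note on the three macros above: each one simply tries to compile a
one-function kernel with the matching -m flag and defines HAVE_AVX,
HAVE_AVX2 or HAVE_AVX512F on success.  Outside of configure, the AVX
check amounts to roughly the following shell commands (conftest.c is a
scratch file name used here for illustration; the test only proves the
compiler accepts the flag and builtin, not that the host CPU supports
AVX):

    $ cat > conftest.c <<'EOF'
    void _mm256_zeroall (void)
    {
      __builtin_ia32_vzeroall ();
    }
    EOF
    $ gcc -O2 -mavx -c conftest.c && echo "would define HAVE_AVX"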
===================================================================
@@ -78,6 +78,15 @@
/* Define to 1 if the target supports __attribute__((visibility(...))). */
#undef HAVE_ATTRIBUTE_VISIBILITY
+/* Define if AVX instructions can be compiled. */
+#undef HAVE_AVX
+
+/* Define if AVX2 instructions can be compiled. */
+#undef HAVE_AVX2
+
+/* Define if AVX512f instructions can be compiled. */
+#undef HAVE_AVX512F
+
/* Define to 1 if you have the `cabs' function. */
#undef HAVE_CABS
===================================================================
@@ -26174,6 +26174,93 @@ $as_echo "#define HAVE_CRLF 1" >>confdefs.h
fi
+# Check whether we support AVX extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ void _mm256_zeroall (void)
+ {
+ __builtin_ia32_vzeroall ();
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX2 extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx2"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+ __v4di
+ mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
+ {
+ return __builtin_ia32_andnotsi256 (__X, __Y);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX2 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX512f extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx512f"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ typedef double __m512d __attribute__ ((__vector_size__ (64)));
+ __m512d _mm512_add (__m512d a)
+ {
+ return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX512F 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
# tests run on this system so they can be shared between configure
===================================================================
@@ -609,6 +609,15 @@ LIBGFOR_CHECK_UNLINK_OPEN_FILE
# Check whether line terminator is LF or CRLF
LIBGFOR_CHECK_CRLF
+# Check whether we support AVX extensions
+LIBGFOR_CHECK_AVX
+
+# Check whether we support AVX2 extensions
+LIBGFOR_CHECK_AVX2
+
+# Check whether we support AVX512f extensions
+LIBGFOR_CHECK_AVX512F
+
AC_CACHE_SAVE
if test ${multilib} = yes; then
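
(The large configure hunk earlier in this patch should be the output of
regenerating configure from these acinclude.m4/configure.ac changes; a
typical regeneration step, assuming the autoconf/automake versions the
tree requires are installed, would be something like:

    $ cd libgfortran && autoreconf

the exact invocation depends on the maintainer-mode setup.)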
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c10 (gfc_array_c10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c10);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_10 * restrict abase;
const GFC_COMPLEX_10 * restrict bbase;
GFC_COMPLEX_10 * restrict dest;
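
The same aux_matmul + target_clones pattern repeats in every matmul
file below.  As a reference, here is a minimal self-contained sketch
(not part of the patch; names are invented for illustration) of what
the attribute does: GCC emits one clone of the function per listed
target plus a resolver, so callers transparently get the variant
matching the running CPU.

    /* clones.c -- compile on x86_64 with: gcc -O2 -c clones.c */
    __attribute__ ((target_clones ("avx", "avx2", "default")))
    static void
    aux_work (double *restrict c, const double *restrict a,
              const double *restrict b, int n)
    {
      /* Identical body in every clone; only the generated
         instructions differ per target.  */
      for (int i = 0; i < n; i++)
        c[i] = a[i] * b[i];
    }

    void
    work (double *restrict c, const double *restrict a,
          const double *restrict b, int n)
    {
      /* The call dispatches through the resolver GCC creates
         for aux_work.  */
      aux_work (c, a, b, n);
    }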
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c16 (gfc_array_c16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_16 * restrict abase;
const GFC_COMPLEX_16 * restrict bbase;
GFC_COMPLEX_16 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c4 (gfc_array_c4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_4 * restrict abase;
const GFC_COMPLEX_4 * restrict bbase;
GFC_COMPLEX_4 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_c8 (gfc_array_c8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_8 * restrict abase;
const GFC_COMPLEX_8 * restrict bbase;
GFC_COMPLEX_8 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i1 (gfc_array_i1 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i1);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i1 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i1 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_1 * restrict abase;
const GFC_INTEGER_1 * restrict bbase;
GFC_INTEGER_1 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i16 (gfc_array_i16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_i16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i16 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_16 * restrict abase;
const GFC_INTEGER_16 * restrict bbase;
GFC_INTEGER_16 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i2 (gfc_array_i2 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i2);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i2 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i2 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_2 * restrict abase;
const GFC_INTEGER_2 * restrict bbase;
GFC_INTEGER_2 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i4 (gfc_array_i4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_4 * restrict abase;
const GFC_INTEGER_4 * restrict bbase;
GFC_INTEGER_4 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_i8 (gfc_array_i8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i8 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_8 * restrict abase;
const GFC_INTEGER_8 * restrict bbase;
GFC_INTEGER_8 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r10 (gfc_array_r10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r10);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_10 * restrict abase;
const GFC_REAL_10 * restrict bbase;
GFC_REAL_10 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r16 (gfc_array_r16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_16 * restrict abase;
const GFC_REAL_16 * restrict bbase;
GFC_REAL_16 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r4 (gfc_array_r4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_4 * restrict abase;
const GFC_REAL_4 * restrict bbase;
GFC_REAL_4 * restrict dest;
===================================================================
@@ -75,11 +75,47 @@ extern void matmul_r8 (gfc_array_r8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_8 * restrict abase;
const GFC_REAL_8 * restrict bbase;
GFC_REAL_8 * restrict dest;
===================================================================
@@ -76,11 +76,47 @@ extern void matmul_'rtype_code` ('rtype` * const r
int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available.  For this, we
+   let the actual work be done by the static aux_matmul function.
+   The user-callable function will then automatically contain the
+   selection code for the right architecture.  This is done to avoid
+   knowledge of architecture details in the front end.  */
+
+static void aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_'rtype_code` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_'rtype_code` (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const 'rtype_name` * restrict abase;
const 'rtype_name` * restrict bbase;
'rtype_name` * restrict dest;