Message ID | 20211112191800.790574-2-skpgkp2@gmail.com |
---|---|
State | New |
Headers | show |
Series | Implement microbenchmark for libmvec | expand |
On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Add python script to generate libmvec microbenchmark from the input > values for each libmvec function using skeleton benchmark template. > > Creates double and float benchmarks with vector length 1, 2, 4, 8, > and 16 for each libmvec function. Vector length 1 corresponds to > scalar version of function and is included for vector function perf > comparison. > --- > sysdeps/x86_64/fpu/Makeconfig | 35 ++ > sysdeps/x86_64/fpu/Makefile | 40 ++ > sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++ > sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 ++++++++++++++++++++ > 4 files changed, 643 insertions(+) > create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig > index 24aaee1a43..503e9b5ffa 100644 > --- a/sysdeps/x86_64/fpu/Makeconfig > +++ b/sysdeps/x86_64/fpu/Makeconfig > @@ -29,6 +29,23 @@ libmvec-funcs = \ > sin \ > sincos \ > > +# Define libmvec function for benchtests directory. > +libmvec-bench-funcs = \ > + > +bench-libmvec-double = \ > + $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \ > + $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \ > + $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \ > + $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \ > + $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \ > + > +bench-libmvec-float = \ > + $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \ > + $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \ > + $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \ > + $(addsuffix f, $(addprefix float-vlen8-avx2-, $(libmvec-bench-funcs))) \ > + $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \ > + > # The base libmvec ABI tests. 
> libmvec-abi-func-tests = \ > $(addprefix test-double-libmvec-,$(libmvec-funcs)) \ > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: $(common-objpfx)config.make > echo " \$$(float-vlen16-arch-ext-cflags)"; \ > echo; \ > done; \ > + echo "endif"; \ > + echo "ifeq (\$$(subdir),benchtests)"; \ > + for t in $(libmvec-bench-funcs); do \ > + echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \ > + echo " \$$(double-vlen4-arch-ext-cflags)"; \ > + echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \ > + echo " \$$(double-vlen4-arch-ext2-cflags)"; \ > + echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \ > + echo " \$$(double-vlen8-arch-ext-cflags)"; \ > + echo; \ > + echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \ > + echo " \$$(float-vlen8-arch-ext-cflags)"; \ > + echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \ > + echo " \$$(float-vlen8-arch-ext2-cflags)"; \ > + echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \ > + echo " \$$(float-vlen16-arch-ext-cflags)"; \ > + echo; \ > + done; \ > echo "endif") > $@T > mv -f $@T $@ > diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile > index d172ae815d..9fb587cf8f 100644 > --- a/sysdeps/x86_64/fpu/Makefile > +++ b/sysdeps/x86_64/fpu/Makefile > @@ -72,3 +72,43 @@ ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes) > # performance of sin and cos by more than 40% on Skylake. 
> CFLAGS-branred.c = -mprefer-vector-width=128 > endif > + > +ifeq ($(subdir),benchtests) > +double-vlen4-arch-ext-cflags = -mavx > +double-vlen4-arch-ext2-cflags = -mavx2 > +double-vlen8-arch-ext-cflags = -mavx512f > + > +float-vlen8-arch-ext-cflags = -mavx > +float-vlen8-arch-ext2-cflags = -mavx2 > +float-vlen16-arch-ext-cflags = -mavx512f > + > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float) > + > +ifeq (${BENCHSET},) > +bench += $(bench-libmvec) > +endif > + > +ifeq (${STATIC-BENCHTESTS},yes) > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a $(common-objpfx)math/libm.a > +else > +libmvec-benchtests = $(libmvec) $(libm) > +endif > + > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): $(libmvec-benchtests) > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): $(libmvec-benchtests) > +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c bench-timing.h Makefile > + > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps) > + { if [ -n "$($*-INCLUDE)" ]; then \ > + cat $($*-INCLUDE); \ > + fi; \ > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp > + mv -f $@-tmp $@ > + > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps) > + { if [ -n "$($*-INCLUDE)" ]; then \ > + cat $($*-INCLUDE); \ > + fi; \ > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp > + mv -f $@-tmp $@ > +endif > diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > new file mode 100644 > index 0000000000..d56a0c4462 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > @@ -0,0 +1,104 @@ > +/* Skeleton for libmvec benchmark programs. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <string.h> > +#include <stdint.h> > +#include <stdbool.h> > +#include <stdio.h> > +#include <time.h> > +#include <inttypes.h> > +#include <bench-timing.h> > +#include <json-lib.h> > +#include <bench-util.h> > + > +#include <bench-util.c> > +#include <math-tests-arch.h> > +#define D_ITERS 10000 > + > +int > +main (int argc, char **argv) > +{ > + unsigned long i, k; > + timing_t start, end; > + json_ctx_t json_ctx; > + > +#if defined REQUIRE_AVX > + if (!CPU_FEATURE_ACTIVE (AVX)) > + { > + printf ("AVX not supported.\n"); > + return 0; > + } > +#elif defined REQUIRE_AVX2 > + if (!CPU_FEATURE_ACTIVE (AVX2)) > + { > + printf ("AVX2 not supported.\n"); > + return 0; > + } > +#elif defined REQUIRE_AVX512F > + if (!CPU_FEATURE_ACTIVE (AVX512F)) > + { > + printf ("AVX512F not supported.\n"); > + return 0; > + } > +#endif > + > + bench_start (); > + > +#ifdef BENCH_INIT > + BENCH_INIT (); > +#endif > + > + json_init (&json_ctx, 2, stdout); > + > + /* Begin function. 
*/ > + json_attr_object_begin (&json_ctx, FUNCNAME); > + > + for (int v = 0; v < NUM_VARIANTS; v++) > + { > + double d_total_time = 0; > + uint64_t cur; Think these should also be type `timing_t` > + for (k = 0; k < D_ITERS; k++) > + { > + TIMING_NOW (start); > + for (i = 0; i < NUM_SAMPLES (v); i++) What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`? > + BENCH_FUNC (v, i); > + TIMING_NOW (end); > + > + TIMING_DIFF (cur, start, end); > + > + d_total_time += cur; Think this should be `TIMING_ACCUM(d_total_time, cur)`. > + > + } > + double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE; > + > + /* Begin variant. */ > + json_attr_object_begin (&json_ctx, VARIANT (v)); > + > + json_attr_double (&json_ctx, "duration", d_total_time); > + json_attr_double (&json_ctx, "iterations", d_total_data_set); > + json_attr_double (&json_ctx, "mean", d_total_time / d_total_data_set); > + > + /* End variant. */ > + json_attr_object_end (&json_ctx); > + } > + > + /* End function. */ > + json_attr_object_end (&json_ctx); > + > + return 0; > +} > diff --git a/sysdeps/x86_64/fpu/scripts/bench_libmvec.py b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py > new file mode 100755 > index 0000000000..762865de8f > --- /dev/null > +++ b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py > @@ -0,0 +1,464 @@ > +#!/usr/bin/python3 > +# Copyright (C) 2021 Free Software Foundation, Inc. > +# This file is part of the GNU C Library. > +# > +# The GNU C Library is free software; you can redistribute it and/or > +# modify it under the terms of the GNU Lesser General Public > +# License as published by the Free Software Foundation; either > +# version 2.1 of the License, or (at your option) any later version. > +# > +# The GNU C Library is distributed in the hope that it will be useful, > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > +# Lesser General Public License for more details. > +# > +# You should have received a copy of the GNU Lesser General Public > +# License along with the GNU C Library; if not, see > +# <https://www.gnu.org/licenses/>. > + > +"""Benchmark program generator script > + > +This script takes a function name as input and generates a program using > +an libmvec input file located in the sysdeps/x86_64/fpu directory. The > +name of the input file should be of the form libmvec-foo-inputs where > +'foo' is the name of the function. > +""" > + > +from __future__ import print_function > +import sys > +import os > +import itertools > +import re > + > +# Macro definitions for functions that take no arguments. For functions > +# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and > +# VARIANTS_TEMPLATE are used instead. > +DEFINES_TEMPLATE = ''' > +#define CALL_BENCH_FUNC(v, i) %(func)s () > +#define NUM_VARIANTS (1) > +#define NUM_SAMPLES(v) (1) > +#define VARIANT(v) FUNCNAME "()" > +''' > + > +# Structures to store arguments for the function call. A function may > +# have its inputs partitioned to represent distinct performance > +# characteristics or distinct flavors of the function. Each such > +# variant is represented by the _VARIANT structure. The ARGS structure > +# represents a single set of arguments. > +BENCH_VEC_TEMPLATE = ''' > +#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\ > + %(defs)s mx0 = %(func)s (%(func_args)s); \\ > + mx0; })) > +''' > + > +BENCH_SCALAR_TEMPLATE = ''' > +#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s) > +''' > + > +STRUCT_TEMPLATE = '''struct args > +{ > +%(args)s > + double timing; > +}; > + > +struct _variants > +{ > + const char *name; > + int count; > + struct args *in; > +}; > +''' > + > +# The actual input arguments. > +ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = { > +%(args)s > +}; > +''' > + > +# The actual variants, along with macros defined to access the variants. 
> +VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = { > +%(variants)s > +}; > + > +#define NUM_VARIANTS %(num_variants)d > +#define NUM_SAMPLES(i) (variants[i].count) > +#define VARIANT(i) (variants[i].name) > +''' > + > +# Epilogue for the generated source file. > +EPILOGUE = ''' > +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);}) > +#define FUNCNAME "%(func)s" > +#include <bench-libmvec-skeleton.c>''' > + > + > +def gen_source(func_types, directives, all_vals): > + """Generate source for the function > + > + Generate the C source for the function from the values and > + directives. > + > + Args: > + func: The function name > + directives: A dictionary of directives applicable to this function > + all_vals: A dictionary input values > + """ > + # The includes go in first. > + for header in directives['includes']: > + print('#include <%s>' % header) > + > + for header in directives['include-sources']: > + print('#include "%s"' % header) > + > + argtype_vtable = { > + 2: '128', > + 4: '256', > + 8: '512' > + } > + prefix_vtable = { > + 2: 'b', > + 4: 'c', > + 8: 'e' > + } > + > + # Get all the function properties > + funcname_argtype = '' > + float_flag = False > + if func_types[1] == 'float': > + float_flag = True > + avx_flag = False > + if func_types[3] == 'avx2': > + avx_flag = True > + funcname_stride = int(func_types[2][4:]) > + funcname_origin = func_types[-1] > + if float_flag: > + funcname_origin = funcname_origin[:-1] > + > + if funcname_stride == 1: > + # Prepare for scalar functions file generation > + funcname_prefix = '' > + funcname_prefix_1 = '' > + funcname_argtype = 'double' > + if float_flag: > + funcname_argtype = 'float' > + else: > + # Prepare for libmvec functions file generation > + funcname_prefix_1 = len(directives['args']) * 'v' + '_' > + aligned_stride = funcname_stride > + if float_flag: > + aligned_stride /= 2 > + funcname_prefix = '_ZGV' > + if (avx_flag and (aligned_stride == 4)): > + funcname_prefix 
+= 'd' > + else: > + funcname_prefix += prefix_vtable[aligned_stride] > + funcname_prefix = funcname_prefix + 'N' + func_types[2][4:] > + funcname_argtype = '__m' + argtype_vtable[aligned_stride] > + if not float_flag: > + funcname_argtype += 'd' > + > + # Include x86intrin.h for vector functions > + if not funcname_stride == 1: > + print('#include <x86intrin.h>') > + if (avx_flag and (aligned_stride == 4)): > + # For bench-float-vlen8-avx2* and bench-double-vlen4-avx2* > + print('#define REQUIRE_AVX2') > + elif aligned_stride == 8: > + # For bench-float-vlen16* and bench-double-vlen8* > + print('#define REQUIRE_AVX512F') > + elif aligned_stride == 4: > + # For bench-float-vlen8* and bench-double-vlen4* without avx2 > + print('#define REQUIRE_AVX') > + else: > + print('#define FUNCTYPE %s' % funcname_argtype) > + > + print('#define STRIDE %d ' % funcname_stride) > + > + funcname = funcname_prefix + funcname_prefix_1 + funcname_origin > + if float_flag: > + funcname += 'f' > + > + funcname_rettype = funcname_argtype > + if directives['ret'] == '': > + funcname_rettype = 'void' > + > + funcname_inputtype = [] > + for arg, i in zip(directives['args'], itertools.count()): > + funcname_inputtype.append(funcname_argtype) > + if arg[0] == '<' and arg[-1] == '>': > + pos = arg.rfind('*') > + if pos == -1: > + die('Output argument must be a pointer type') > + funcname_inputtype[i] += ' *' > + > + if not funcname_stride == 1: > + if len(directives['args']) == 2: > + print('extern %s %s (%s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1])) > + elif len(directives['args']) == 3: > + print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2])) > + else: > + print('extern %s %s (%s);' % (funcname_rettype, funcname, funcname_inputtype[0])) > + > + # Print macros. This branches out to a separate routine if > + # the function takes arguments. 
> + if not directives['args']: > + print(DEFINES_TEMPLATE % {'funcname': funcname}) > + outargs = [] > + else: > + outargs = _print_arg_data(funcname, float_flag, funcname_argtype, funcname_stride, directives, all_vals) > + > + # Print the output variable definitions if necessary. > + for out in outargs: > + print(out) > + > + # If we have a return value from the function, make sure it is > + # assigned to prevent the compiler from optimizing out the > + # call. > + getret = '' > + > + if directives['ret']: > + if funcname_argtype != '': > + print('static %s volatile ret;' % funcname_argtype) > + getret = 'ret =' > + else: > + print('static %s volatile ret;' % directives['ret']) > + getret = 'ret =' > + > + # Test initialization. > + if directives['init']: > + print('#define BENCH_INIT %s' % directives['init']) > + > + print(EPILOGUE % {'getret': getret, 'func': funcname}) > + > + > +def _print_arg_data(func, float_flag, funcname_argtype, funcname_stride, directives, all_vals): > + """Print argument data > + > + This is a helper function for gen_source that prints structure and > + values for arguments and their variants and returns output arguments > + if any are found. > + > + Args: > + func: Function name > + float_flag: True if function is float type > + funcname_argtype: Type for vector variants > + funcname_stride: Vector Length > + directives: A dictionary of directives applicable to this function > + all_vals: A dictionary input values > + > + Returns: > + Returns a list of definitions for function arguments that act as > + output parameters. > + """ > + # First, all of the definitions. We process writing of > + # CALL_BENCH_FUNC, struct args and also the output arguments > + # together in a single traversal of the arguments list. 
> + func_args = [] > + _func_args = [] > + arg_struct = [] > + outargs = [] > + # Conversion function for each type > + vtable = { > + '__m128d': '_mm_loadu_pd', > + '__m256d': '_mm256_loadu_pd', > + '__m512d': '_mm512_loadu_pd', > + '__m128': '_mm_loadu_ps', > + '__m256': '_mm256_loadu_ps', > + '__m512': '_mm512_loadu_ps', > + 'double': '', > + 'float': '' > + } > + > + # For double max_vlen=8, for float max_vlen=16. > + if float_flag == True: > + max_vlen = 16 > + else: > + max_vlen = 8 > + > + for arg, i in zip(directives['args'], itertools.count()): > + if arg[0] == '<' and arg[-1] == '>': > + outargs.append('static %s out%d __attribute__((used));' % (funcname_argtype, i)) > + func_args.append('&out%d' % i) > + _func_args.append('&out%d' % i) > + else: > + arg_struct.append(' %s arg%d[STRIDE];' % (arg, i)) > + func_args.append('%s (variants[v].in[i].arg%d)' % > + (vtable[funcname_argtype], i)) > + _func_args.append('variants[v].in[i].arg%d[0]' % i) > + > + if funcname_stride == 1: > + print(BENCH_SCALAR_TEMPLATE % {'func': func, > + 'func_args': ', '.join(_func_args)}) > + elif directives['ret'] == '': > + print(BENCH_SCALAR_TEMPLATE % {'func': func, > + 'func_args': ', '.join(func_args)}) > + else: > + print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ', '.join(func_args), > + 'defs': funcname_argtype}) > + print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)}) > + > + # Now print the values. 
> + variants = [] > + for (k, _vals), i in zip(all_vals.items(), itertools.count()): > + vals = [] > + temp_vals = [] > + j = 0 > + temp_j = 0 > + result_v = ['', '', ''] > + for _v in _vals: > + nums = _v.split(',') > + for l in range(0, len(nums)): > + result_v[l] = result_v[l] + nums[l].strip() + ',' > + j += 1 > + temp_j += 1 > + > + if temp_j == funcname_stride: > + final_result = '' > + for l in range(0, len(nums)): > + final_result = final_result + '{' + result_v[l][:-1] + '},' > + temp_vals.append(final_result[:-1]) > + temp_j = 0 > + result_v = ['', '', ''] > + > + # Make sure amount of test data is multiple of max_vlen > + # to keep data size same for all vector length. > + if j == max_vlen: > + vals.extend(temp_vals) > + temp_vals = [] > + j = 0 > + > + out = [' {%s, 0},' % v for v in vals] > + > + # Members for the variants structure list that we will > + # print later. > + variants.append(' {"%s", %d, in%d},' % (k, len(vals), i)) > + print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals), > + 'args': '\n'.join(out)}) > + > + # Print the variants and the last set of macros. > + print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals), > + 'variants': '\n'.join(variants)}) > + return outargs > + > + > +def _process_directive(d_name, d_val, func_args): > + """Process a directive. > + > + Evaluate the directive name and value passed and return the > + processed value. This is a helper function for parse_file. > + > + Args: > + d_name: Name of the directive > + d_val: The string value to process > + > + Returns: > + The processed value, which may be the string as it is or an object > + that describes the directive. > + """ > + # Process the directive values if necessary. name and ret don't > + # need any processing. 
> + if d_name.startswith('include'): > + d_val = d_val.split(',') > + elif d_name == 'args': > + d_val = d_val.split(':') > + # Check if args type match > + if not d_val[0] == func_args: > + die("Args mismatch, should be %s, but get %s" % (d_val[0], func_args)) > + > + # Return the values. > + return d_val > + > + > +def parse_file(func_types): > + """Parse an input file > + > + Given a function name, open and parse an input file for the function > + and get the necessary parameters for the generated code and the list > + of inputs. > + > + Args: > + func: The function name > + > + Returns: > + A tuple of two elements, one a dictionary of directives and the > + other a dictionary of all input values. > + """ > + all_vals = {} > + # Valid directives. > + directives = { > + 'name': '', > + 'args': [], > + 'includes': [], > + 'include-sources': [], > + 'ret': '', > + 'init': '' > + } > + > + func = func_types[-1] > + try: > + with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f: > + for line in f: > + # Look for directives and parse it if found. > + if line.startswith('##'): > + try: > + d_name, d_val = line[2:].split(':', 1) > + d_name = d_name.strip() > + d_val = d_val.strip() > + directives[d_name] = _process_directive(d_name, d_val, func_types[1]) > + except (IndexError, KeyError): > + die('Invalid directive: %s' % line[2:]) > + > + # Skip blank lines and comments. > + line = line.split('#', 1)[0].rstrip() > + if not line: > + continue > + > + # Otherwise, we're an input. Add to the appropriate > + # input set. > + cur_name = directives['name'] > + all_vals.setdefault(cur_name, []) > + all_vals[cur_name].append(line) > + except IOError as ex: > + die("Failed to open input file (%s): %s" % (ex.filename, ex.strerror)) > + > + return directives, all_vals > + > + > +def die(msg): > + """Exit with an error > + > + Prints an error message to the standard error stream and exits with > + a non-zero status. 
> + > + Args: > + msg: The error message to print to standard error > + """ > + print('%s\n' % msg, file=sys.stderr) > + sys.exit(os.EX_DATAERR) > + > + > +def main(args): > + """Main function > + > + Use the first command line argument as function name and parse its > + input file to generate C source that calls the function repeatedly > + for the input. > + > + Args: > + args: The command line arguments with the program name dropped > + > + Returns: > + os.EX_USAGE on error and os.EX_OK on success. > + """ > + if len(args) != 1: > + print('Usage: %s <function>' % sys.argv[0]) > + return os.EX_USAGE > + > + func_types = args[0].split('-') > + directives, all_vals = parse_file(func_types) > + gen_source(func_types, directives, all_vals) > + return os.EX_OK > + > + > +if __name__ == '__main__': > + sys.exit(main(sys.argv[1:])) > -- > 2.31.1 >
On Fri, Nov 12, 2021 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > Add python script to generate libmvec microbenchmark from the input > > values for each libmvec function using skeleton benchmark template. > > > > Creates double and float benchmarks with vector length 1, 2, 4, 8, > > and 16 for each libmvec function. Vector length 1 corresponds to > > scalar version of function and is included for vector function perf > > comparison. > > --- > > sysdeps/x86_64/fpu/Makeconfig | 35 ++ > > sysdeps/x86_64/fpu/Makefile | 40 ++ > > sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++ > > sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 ++++++++++++++++++++ > > 4 files changed, 643 insertions(+) > > create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > > > diff --git a/sysdeps/x86_64/fpu/Makeconfig > b/sysdeps/x86_64/fpu/Makeconfig > > index 24aaee1a43..503e9b5ffa 100644 > > --- a/sysdeps/x86_64/fpu/Makeconfig > > +++ b/sysdeps/x86_64/fpu/Makeconfig > > @@ -29,6 +29,23 @@ libmvec-funcs = \ > > sin \ > > sincos \ > > > > +# Define libmvec function for benchtests directory. 
> > +libmvec-bench-funcs = \ > > + > > +bench-libmvec-double = \ > > + $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \ > > + $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \ > > + $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \ > > + $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \ > > + $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \ > > + > > +bench-libmvec-float = \ > > + $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \ > > + $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \ > > + $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \ > > + $(addsuffix f, $(addprefix float-vlen8-avx2-, > $(libmvec-bench-funcs))) \ > > + $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \ > > + > > # The base libmvec ABI tests. > > libmvec-abi-func-tests = \ > > $(addprefix test-double-libmvec-,$(libmvec-funcs)) \ > > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: > $(common-objpfx)config.make > > echo " \$$(float-vlen16-arch-ext-cflags)"; \ > > echo; \ > > done; \ > > + echo "endif"; \ > > + echo "ifeq (\$$(subdir),benchtests)"; \ > > + for t in $(libmvec-bench-funcs); do \ > > + echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \ > > + echo " \$$(double-vlen4-arch-ext-cflags)"; \ > > + echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \ > > + echo " \$$(double-vlen4-arch-ext2-cflags)"; \ > > + echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \ > > + echo " \$$(double-vlen8-arch-ext-cflags)"; \ > > + echo; \ > > + echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \ > > + echo " \$$(float-vlen8-arch-ext-cflags)"; \ > > + echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \ > > + echo " \$$(float-vlen8-arch-ext2-cflags)"; \ > > + echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \ > > + echo " \$$(float-vlen16-arch-ext-cflags)"; \ > > + echo; \ > > + done; \ > > echo "endif") > $@T > > mv -f $@T $@ > > diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile > > index 
d172ae815d..9fb587cf8f 100644 > > --- a/sysdeps/x86_64/fpu/Makefile > > +++ b/sysdeps/x86_64/fpu/Makefile > > @@ -72,3 +72,43 @@ ifeq > ($(subdir)$(config-cflags-mprefer-vector-width),mathyes) > > # performance of sin and cos by more than 40% on Skylake. > > CFLAGS-branred.c = -mprefer-vector-width=128 > > endif > > + > > +ifeq ($(subdir),benchtests) > > +double-vlen4-arch-ext-cflags = -mavx > > +double-vlen4-arch-ext2-cflags = -mavx2 > > +double-vlen8-arch-ext-cflags = -mavx512f > > + > > +float-vlen8-arch-ext-cflags = -mavx > > +float-vlen8-arch-ext2-cflags = -mavx2 > > +float-vlen16-arch-ext-cflags = -mavx512f > > + > > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float) > > + > > +ifeq (${BENCHSET},) > > +bench += $(bench-libmvec) > > +endif > > + > > +ifeq (${STATIC-BENCHTESTS},yes) > > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a > $(common-objpfx)math/libm.a > > +else > > +libmvec-benchtests = $(libmvec) $(libm) > > +endif > > + > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): > $(libmvec-benchtests) > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): > $(libmvec-benchtests) > > +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > bench-timing.h Makefile > > + > > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps) > > + { if [ -n "$($*-INCLUDE)" ]; then \ > > + cat $($*-INCLUDE); \ > > + fi; \ > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py > $(basename $(@F)); } > $@-tmp > > + mv -f $@-tmp $@ > > + > > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps) > > + { if [ -n "$($*-INCLUDE)" ]; then \ > > + cat $($*-INCLUDE); \ > > + fi; \ > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py > $(basename $(@F)); } > $@-tmp > > + mv -f $@-tmp $@ > > +endif > > diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > new file mode 100644 > > index 0000000000..d56a0c4462 > > --- /dev/null > > +++ 
b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > @@ -0,0 +1,104 @@ > > +/* Skeleton for libmvec benchmark programs. > > + Copyright (C) 2021 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <string.h> > > +#include <stdint.h> > > +#include <stdbool.h> > > +#include <stdio.h> > > +#include <time.h> > > +#include <inttypes.h> > > +#include <bench-timing.h> > > +#include <json-lib.h> > > +#include <bench-util.h> > > + > > +#include <bench-util.c> > > +#include <math-tests-arch.h> > > +#define D_ITERS 10000 > > + > > +int > > +main (int argc, char **argv) > > +{ > > + unsigned long i, k; > > + timing_t start, end; > > + json_ctx_t json_ctx; > > + > > +#if defined REQUIRE_AVX > > + if (!CPU_FEATURE_ACTIVE (AVX)) > > + { > > + printf ("AVX not supported.\n"); > > + return 0; > > + } > > +#elif defined REQUIRE_AVX2 > > + if (!CPU_FEATURE_ACTIVE (AVX2)) > > + { > > + printf ("AVX2 not supported.\n"); > > + return 0; > > + } > > +#elif defined REQUIRE_AVX512F > > + if (!CPU_FEATURE_ACTIVE (AVX512F)) > > + { > > + printf ("AVX512F not supported.\n"); > > + return 0; > > + } > > +#endif > > + > > + bench_start (); > > + > > +#ifdef BENCH_INIT > > + BENCH_INIT (); > > +#endif > > + > > + json_init (&json_ctx, 
2, stdout); > > + > > + /* Begin function. */ > > + json_attr_object_begin (&json_ctx, FUNCNAME); > > + > > + for (int v = 0; v < NUM_VARIANTS; v++) > > + { > > + double d_total_time = 0; > > + uint64_t cur; > > Think these should also be type `timing_t` > I do not see a difference whether I use timing_t or uint64_t. In any case, the variable cur stores the difference between the start and end times, not a time value. > > > + for (k = 0; k < D_ITERS; k++) > > + { > > + TIMING_NOW (start); > > + for (i = 0; i < NUM_SAMPLES (v); i++) > > What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not > one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`? > D_ITERS defines how many times each variant's full data set is run. NUM_SAMPLES (v) is the number of data sets in variant v. Indices v and i select the i'th data set of variant v, on which the vector function is then called. Having two loops simplifies the logic. > > + BENCH_FUNC (v, i); > > + TIMING_NOW (end); > > + > > + TIMING_DIFF (cur, start, end); > > + > > + d_total_time += cur; > > Think this should be `TIMING_ACCUM(d_total_time, cur)`. > There is not much difference whether I use TIMING_ACCUM or simply add cur to d_total_time. > > + > > + } > > + double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE; > > + > > + /* Begin variant. */ > > + json_attr_object_begin (&json_ctx, VARIANT (v)); > > + > > + json_attr_double (&json_ctx, "duration", d_total_time); > > + json_attr_double (&json_ctx, "iterations", d_total_data_set); > > + json_attr_double (&json_ctx, "mean", d_total_time / > d_total_data_set); > > + > > + /* End variant. */ > > + json_attr_object_end (&json_ctx); > > + } > > + > > + /* End function. 
*/ > > + json_attr_object_end (&json_ctx); > > + > > + return 0; > > +} > > diff --git a/sysdeps/x86_64/fpu/scripts/bench_libmvec.py > b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > new file mode 100755 > > index 0000000000..762865de8f > > --- /dev/null > > +++ b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > @@ -0,0 +1,464 @@ > > +#!/usr/bin/python3 > > +# Copyright (C) 2021 Free Software Foundation, Inc. > > +# This file is part of the GNU C Library. > > +# > > +# The GNU C Library is free software; you can redistribute it and/or > > +# modify it under the terms of the GNU Lesser General Public > > +# License as published by the Free Software Foundation; either > > +# version 2.1 of the License, or (at your option) any later version. > > +# > > +# The GNU C Library is distributed in the hope that it will be useful, > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > +# Lesser General Public License for more details. > > +# > > +# You should have received a copy of the GNU Lesser General Public > > +# License along with the GNU C Library; if not, see > > +# <https://www.gnu.org/licenses/>. > > + > > +"""Benchmark program generator script > > + > > +This script takes a function name as input and generates a program using > > +an libmvec input file located in the sysdeps/x86_64/fpu directory. The > > +name of the input file should be of the form libmvec-foo-inputs where > > +'foo' is the name of the function. > > +""" > > + > > +from __future__ import print_function > > +import sys > > +import os > > +import itertools > > +import re > > + > > +# Macro definitions for functions that take no arguments. For functions > > +# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and > > +# VARIANTS_TEMPLATE are used instead. 
> > +DEFINES_TEMPLATE = ''' > > +#define CALL_BENCH_FUNC(v, i) %(func)s () > > +#define NUM_VARIANTS (1) > > +#define NUM_SAMPLES(v) (1) > > +#define VARIANT(v) FUNCNAME "()" > > +''' > > + > > +# Structures to store arguments for the function call. A function may > > +# have its inputs partitioned to represent distinct performance > > +# characteristics or distinct flavors of the function. Each such > > +# variant is represented by the _VARIANT structure. The ARGS structure > > +# represents a single set of arguments. > > +BENCH_VEC_TEMPLATE = ''' > > +#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\ > > + %(defs)s mx0 = %(func)s (%(func_args)s); \\ > > + mx0; })) > > +''' > > + > > +BENCH_SCALAR_TEMPLATE = ''' > > +#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s) > > +''' > > + > > +STRUCT_TEMPLATE = '''struct args > > +{ > > +%(args)s > > + double timing; > > +}; > > + > > +struct _variants > > +{ > > + const char *name; > > + int count; > > + struct args *in; > > +}; > > +''' > > + > > +# The actual input arguments. > > +ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = { > > +%(args)s > > +}; > > +''' > > + > > +# The actual variants, along with macros defined to access the variants. > > +VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = { > > +%(variants)s > > +}; > > + > > +#define NUM_VARIANTS %(num_variants)d > > +#define NUM_SAMPLES(i) (variants[i].count) > > +#define VARIANT(i) (variants[i].name) > > +''' > > + > > +# Epilogue for the generated source file. > > +EPILOGUE = ''' > > +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);}) > > +#define FUNCNAME "%(func)s" > > +#include <bench-libmvec-skeleton.c>''' > > + > > + > > +def gen_source(func_types, directives, all_vals): > > + """Generate source for the function > > + > > + Generate the C source for the function from the values and > > + directives. 
> > + > > + Args: > > + func: The function name > > + directives: A dictionary of directives applicable to this function > > + all_vals: A dictionary input values > > + """ > > + # The includes go in first. > > + for header in directives['includes']: > > + print('#include <%s>' % header) > > + > > + for header in directives['include-sources']: > > + print('#include "%s"' % header) > > + > > + argtype_vtable = { > > + 2: '128', > > + 4: '256', > > + 8: '512' > > + } > > + prefix_vtable = { > > + 2: 'b', > > + 4: 'c', > > + 8: 'e' > > + } > > + > > + # Get all the function properties > > + funcname_argtype = '' > > + float_flag = False > > + if func_types[1] == 'float': > > + float_flag = True > > + avx_flag = False > > + if func_types[3] == 'avx2': > > + avx_flag = True > > + funcname_stride = int(func_types[2][4:]) > > + funcname_origin = func_types[-1] > > + if float_flag: > > + funcname_origin = funcname_origin[:-1] > > + > > + if funcname_stride == 1: > > + # Prepare for scalar functions file generation > > + funcname_prefix = '' > > + funcname_prefix_1 = '' > > + funcname_argtype = 'double' > > + if float_flag: > > + funcname_argtype = 'float' > > + else: > > + # Prepare for libmvec functions file generation > > + funcname_prefix_1 = len(directives['args']) * 'v' + '_' > > + aligned_stride = funcname_stride > > + if float_flag: > > + aligned_stride /= 2 > > + funcname_prefix = '_ZGV' > > + if (avx_flag and (aligned_stride == 4)): > > + funcname_prefix += 'd' > > + else: > > + funcname_prefix += prefix_vtable[aligned_stride] > > + funcname_prefix = funcname_prefix + 'N' + func_types[2][4:] > > + funcname_argtype = '__m' + argtype_vtable[aligned_stride] > > + if not float_flag: > > + funcname_argtype += 'd' > > + > > + # Include x86intrin.h for vector functions > > + if not funcname_stride == 1: > > + print('#include <x86intrin.h>') > > + if (avx_flag and (aligned_stride == 4)): > > + # For bench-float-vlen8-avx2* and bench-double-vlen4-avx2* > > + print('#define 
REQUIRE_AVX2') > > + elif aligned_stride == 8: > > + # For bench-float-vlen16* and bench-double-vlen8* > > + print('#define REQUIRE_AVX512F') > > + elif aligned_stride == 4: > > + # For bench-float-vlen8* and bench-double-vlen4* without avx2 > > + print('#define REQUIRE_AVX') > > + else: > > + print('#define FUNCTYPE %s' % funcname_argtype) > > + > > + print('#define STRIDE %d ' % funcname_stride) > > + > > + funcname = funcname_prefix + funcname_prefix_1 + funcname_origin > > + if float_flag: > > + funcname += 'f' > > + > > + funcname_rettype = funcname_argtype > > + if directives['ret'] == '': > > + funcname_rettype = 'void' > > + > > + funcname_inputtype = [] > > + for arg, i in zip(directives['args'], itertools.count()): > > + funcname_inputtype.append(funcname_argtype) > > + if arg[0] == '<' and arg[-1] == '>': > > + pos = arg.rfind('*') > > + if pos == -1: > > + die('Output argument must be a pointer type') > > + funcname_inputtype[i] += ' *' > > + > > + if not funcname_stride == 1: > > + if len(directives['args']) == 2: > > + print('extern %s %s (%s, %s);' % (funcname_rettype, funcname, > funcname_inputtype[0], funcname_inputtype[1])) > > + elif len(directives['args']) == 3: > > + print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname, > funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2])) > > + else: > > + print('extern %s %s (%s);' % (funcname_rettype, funcname, > funcname_inputtype[0])) > > + > > + # Print macros. This branches out to a separate routine if > > + # the function takes arguments. > > + if not directives['args']: > > + print(DEFINES_TEMPLATE % {'funcname': funcname}) > > + outargs = [] > > + else: > > + outargs = _print_arg_data(funcname, float_flag, funcname_argtype, > funcname_stride, directives, all_vals) > > + > > + # Print the output variable definitions if necessary. 
> > + for out in outargs: > > + print(out) > > + > > + # If we have a return value from the function, make sure it is > > + # assigned to prevent the compiler from optimizing out the > > + # call. > > + getret = '' > > + > > + if directives['ret']: > > + if funcname_argtype != '': > > + print('static %s volatile ret;' % funcname_argtype) > > + getret = 'ret =' > > + else: > > + print('static %s volatile ret;' % directives['ret']) > > + getret = 'ret =' > > + > > + # Test initialization. > > + if directives['init']: > > + print('#define BENCH_INIT %s' % directives['init']) > > + > > + print(EPILOGUE % {'getret': getret, 'func': funcname}) > > + > > + > > +def _print_arg_data(func, float_flag, funcname_argtype, > funcname_stride, directives, all_vals): > > + """Print argument data > > + > > + This is a helper function for gen_source that prints structure and > > + values for arguments and their variants and returns output arguments > > + if any are found. > > + > > + Args: > > + func: Function name > > + float_flag: True if function is float type > > + funcname_argtype: Type for vector variants > > + funcname_stride: Vector Length > > + directives: A dictionary of directives applicable to this function > > + all_vals: A dictionary input values > > + > > + Returns: > > + Returns a list of definitions for function arguments that act as > > + output parameters. > > + """ > > + # First, all of the definitions. We process writing of > > + # CALL_BENCH_FUNC, struct args and also the output arguments > > + # together in a single traversal of the arguments list. 
> > + func_args = [] > > + _func_args = [] > > + arg_struct = [] > > + outargs = [] > > + # Conversion function for each type > > + vtable = { > > + '__m128d': '_mm_loadu_pd', > > + '__m256d': '_mm256_loadu_pd', > > + '__m512d': '_mm512_loadu_pd', > > + '__m128': '_mm_loadu_ps', > > + '__m256': '_mm256_loadu_ps', > > + '__m512': '_mm512_loadu_ps', > > + 'double': '', > > + 'float': '' > > + } > > + > > + # For double max_vlen=8, for float max_vlen=16. > > + if float_flag == True: > > + max_vlen = 16 > > + else: > > + max_vlen = 8 > > + > > + for arg, i in zip(directives['args'], itertools.count()): > > + if arg[0] == '<' and arg[-1] == '>': > > + outargs.append('static %s out%d __attribute__((used));' % > (funcname_argtype, i)) > > + func_args.append('&out%d' % i) > > + _func_args.append('&out%d' % i) > > + else: > > + arg_struct.append(' %s arg%d[STRIDE];' % (arg, i)) > > + func_args.append('%s (variants[v].in[i].arg%d)' % > > + (vtable[funcname_argtype], i)) > > + _func_args.append('variants[v].in[i].arg%d[0]' % i) > > + > > + if funcname_stride == 1: > > + print(BENCH_SCALAR_TEMPLATE % {'func': func, > > + 'func_args': ', '.join(_func_args)}) > > + elif directives['ret'] == '': > > + print(BENCH_SCALAR_TEMPLATE % {'func': func, > > + 'func_args': ', '.join(func_args)}) > > + else: > > + print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ', > '.join(func_args), > > + 'defs': funcname_argtype}) > > + print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)}) > > + > > + # Now print the values. 
> > + variants = [] > > + for (k, _vals), i in zip(all_vals.items(), itertools.count()): > > + vals = [] > > + temp_vals = [] > > + j = 0 > > + temp_j = 0 > > + result_v = ['', '', ''] > > + for _v in _vals: > > + nums = _v.split(',') > > + for l in range(0, len(nums)): > > + result_v[l] = result_v[l] + nums[l].strip() + ',' > > + j += 1 > > + temp_j += 1 > > + > > + if temp_j == funcname_stride: > > + final_result = '' > > + for l in range(0, len(nums)): > > + final_result = final_result + '{' + result_v[l][:-1] + '},' > > + temp_vals.append(final_result[:-1]) > > + temp_j = 0 > > + result_v = ['', '', ''] > > + > > + # Make sure amount of test data is multiple of max_vlen > > + # to keep data size same for all vector length. > > + if j == max_vlen: > > + vals.extend(temp_vals) > > + temp_vals = [] > > + j = 0 > > + > > + out = [' {%s, 0},' % v for v in vals] > > + > > + # Members for the variants structure list that we will > > + # print later. > > + variants.append(' {"%s", %d, in%d},' % (k, len(vals), i)) > > + print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals), > > + 'args': '\n'.join(out)}) > > + > > + # Print the variants and the last set of macros. > > + print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals), > > + 'variants': '\n'.join(variants)}) > > + return outargs > > + > > + > > +def _process_directive(d_name, d_val, func_args): > > + """Process a directive. > > + > > + Evaluate the directive name and value passed and return the > > + processed value. This is a helper function for parse_file. > > + > > + Args: > > + d_name: Name of the directive > > + d_val: The string value to process > > + > > + Returns: > > + The processed value, which may be the string as it is or an object > > + that describes the directive. > > + """ > > + # Process the directive values if necessary. name and ret don't > > + # need any processing. 
> > + if d_name.startswith('include'): > > + d_val = d_val.split(',') > > + elif d_name == 'args': > > + d_val = d_val.split(':') > > + # Check if args type match > > + if not d_val[0] == func_args: > > + die("Args mismatch, should be %s, but get %s" % (d_val[0], > func_args)) > > + > > + # Return the values. > > + return d_val > > + > > + > > +def parse_file(func_types): > > + """Parse an input file > > + > > + Given a function name, open and parse an input file for the function > > + and get the necessary parameters for the generated code and the list > > + of inputs. > > + > > + Args: > > + func: The function name > > + > > + Returns: > > + A tuple of two elements, one a dictionary of directives and the > > + other a dictionary of all input values. > > + """ > > + all_vals = {} > > + # Valid directives. > > + directives = { > > + 'name': '', > > + 'args': [], > > + 'includes': [], > > + 'include-sources': [], > > + 'ret': '', > > + 'init': '' > > + } > > + > > + func = func_types[-1] > > + try: > > + with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f: > > + for line in f: > > + # Look for directives and parse it if found. > > + if line.startswith('##'): > > + try: > > + d_name, d_val = line[2:].split(':', 1) > > + d_name = d_name.strip() > > + d_val = d_val.strip() > > + directives[d_name] = _process_directive(d_name, d_val, > func_types[1]) > > + except (IndexError, KeyError): > > + die('Invalid directive: %s' % line[2:]) > > + > > + # Skip blank lines and comments. > > + line = line.split('#', 1)[0].rstrip() > > + if not line: > > + continue > > + > > + # Otherwise, we're an input. Add to the appropriate > > + # input set. 
> > + cur_name = directives['name'] > > + all_vals.setdefault(cur_name, []) > > + all_vals[cur_name].append(line) > > + except IOError as ex: > > + die("Failed to open input file (%s): %s" % (ex.filename, > ex.strerror)) > > + > > + return directives, all_vals > > + > > + > > +def die(msg): > > + """Exit with an error > > + > > + Prints an error message to the standard error stream and exits with > > + a non-zero status. > > + > > + Args: > > + msg: The error message to print to standard error > > + """ > > + print('%s\n' % msg, file=sys.stderr) > > + sys.exit(os.EX_DATAERR) > > + > > + > > +def main(args): > > + """Main function > > + > > + Use the first command line argument as function name and parse its > > + input file to generate C source that calls the function repeatedly > > + for the input. > > + > > + Args: > > + args: The command line arguments with the program name dropped > > + > > + Returns: > > + os.EX_USAGE on error and os.EX_OK on success. > > + """ > > + if len(args) != 1: > > + print('Usage: %s <function>' % sys.argv[0]) > > + return os.EX_USAGE > > + > > + func_types = args[0].split('-') > > + directives, all_vals = parse_file(func_types) > > + gen_source(func_types, directives, all_vals) > > + return os.EX_OK > > + > > + > > +if __name__ == '__main__': > > + sys.exit(main(sys.argv[1:])) > > -- > > 2.31.1 > > >
On Fri, Nov 12, 2021 at 2:51 PM Sunil Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Fri, Nov 12, 2021 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > > > On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > Add python script to generate libmvec microbenchmark from the input > > > values for each libmvec function using skeleton benchmark template. > > > > > > Creates double and float benchmarks with vector length 1, 2, 4, 8, > > > and 16 for each libmvec function. Vector length 1 corresponds to > > > scalar version of function and is included for vector function perf > > > comparison. > > > --- > > > sysdeps/x86_64/fpu/Makeconfig | 35 ++ > > > sysdeps/x86_64/fpu/Makefile | 40 ++ > > > sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++ > > > sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 ++++++++++++++++++++ > > > 4 files changed, 643 insertions(+) > > > create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > > > > > diff --git a/sysdeps/x86_64/fpu/Makeconfig > > b/sysdeps/x86_64/fpu/Makeconfig > > > index 24aaee1a43..503e9b5ffa 100644 > > > --- a/sysdeps/x86_64/fpu/Makeconfig > > > +++ b/sysdeps/x86_64/fpu/Makeconfig > > > @@ -29,6 +29,23 @@ libmvec-funcs = \ > > > sin \ > > > sincos \ > > > > > > +# Define libmvec function for benchtests directory. 
> > > +libmvec-bench-funcs = \ > > > + > > > +bench-libmvec-double = \ > > > + $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \ > > > + $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \ > > > + $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \ > > > + $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \ > > > + $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \ > > > + > > > +bench-libmvec-float = \ > > > + $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \ > > > + $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \ > > > + $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \ > > > + $(addsuffix f, $(addprefix float-vlen8-avx2-, > > $(libmvec-bench-funcs))) \ > > > + $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \ > > > + > > > # The base libmvec ABI tests. > > > libmvec-abi-func-tests = \ > > > $(addprefix test-double-libmvec-,$(libmvec-funcs)) \ > > > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: > > $(common-objpfx)config.make > > > echo " \$$(float-vlen16-arch-ext-cflags)"; \ > > > echo; \ > > > done; \ > > > + echo "endif"; \ > > > + echo "ifeq (\$$(subdir),benchtests)"; \ > > > + for t in $(libmvec-bench-funcs); do \ > > > + echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \ > > > + echo " \$$(double-vlen4-arch-ext-cflags)"; \ > > > + echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \ > > > + echo " \$$(double-vlen4-arch-ext2-cflags)"; \ > > > + echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \ > > > + echo " \$$(double-vlen8-arch-ext-cflags)"; \ > > > + echo; \ > > > + echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \ > > > + echo " \$$(float-vlen8-arch-ext-cflags)"; \ > > > + echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \ > > > + echo " \$$(float-vlen8-arch-ext2-cflags)"; \ > > > + echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \ > > > + echo " \$$(float-vlen16-arch-ext-cflags)"; \ > > > + echo; \ > > > + done; \ > > > echo "endif") > $@T > > > mv -f $@T $@ > > > diff 
--git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile > > > index d172ae815d..9fb587cf8f 100644 > > > --- a/sysdeps/x86_64/fpu/Makefile > > > +++ b/sysdeps/x86_64/fpu/Makefile > > > @@ -72,3 +72,43 @@ ifeq > > ($(subdir)$(config-cflags-mprefer-vector-width),mathyes) > > > # performance of sin and cos by more than 40% on Skylake. > > > CFLAGS-branred.c = -mprefer-vector-width=128 > > > endif > > > + > > > +ifeq ($(subdir),benchtests) > > > +double-vlen4-arch-ext-cflags = -mavx > > > +double-vlen4-arch-ext2-cflags = -mavx2 > > > +double-vlen8-arch-ext-cflags = -mavx512f > > > + > > > +float-vlen8-arch-ext-cflags = -mavx > > > +float-vlen8-arch-ext2-cflags = -mavx2 > > > +float-vlen16-arch-ext-cflags = -mavx512f > > > + > > > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float) > > > + > > > +ifeq (${BENCHSET},) > > > +bench += $(bench-libmvec) > > > +endif > > > + > > > +ifeq (${STATIC-BENCHTESTS},yes) > > > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a > > $(common-objpfx)math/libm.a > > > +else > > > +libmvec-benchtests = $(libmvec) $(libm) > > > +endif > > > + > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): > > $(libmvec-benchtests) > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): > > $(libmvec-benchtests) > > > +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > bench-timing.h Makefile > > > + > > > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps) > > > + { if [ -n "$($*-INCLUDE)" ]; then \ > > > + cat $($*-INCLUDE); \ > > > + fi; \ > > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > $(basename $(@F)); } > $@-tmp > > > + mv -f $@-tmp $@ > > > + > > > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps) > > > + { if [ -n "$($*-INCLUDE)" ]; then \ > > > + cat $($*-INCLUDE); \ > > > + fi; \ > > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > $(basename $(@F)); } > $@-tmp > > > + mv -f $@-tmp $@ > > > +endif > > > diff --git 
a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > new file mode 100644 > > > index 0000000000..d56a0c4462 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > @@ -0,0 +1,104 @@ > > > +/* Skeleton for libmvec benchmark programs. > > > + Copyright (C) 2021 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. 
*/ > > > + > > > +#include <string.h> > > > +#include <stdint.h> > > > +#include <stdbool.h> > > > +#include <stdio.h> > > > +#include <time.h> > > > +#include <inttypes.h> > > > +#include <bench-timing.h> > > > +#include <json-lib.h> > > > +#include <bench-util.h> > > > + > > > +#include <bench-util.c> > > > +#include <math-tests-arch.h> > > > +#define D_ITERS 10000 > > > + > > > +int > > > +main (int argc, char **argv) > > > +{ > > > + unsigned long i, k; > > > + timing_t start, end; > > > + json_ctx_t json_ctx; > > > + > > > +#if defined REQUIRE_AVX > > > + if (!CPU_FEATURE_ACTIVE (AVX)) > > > + { > > > + printf ("AVX not supported.\n"); > > > + return 0; > > > + } > > > +#elif defined REQUIRE_AVX2 > > > + if (!CPU_FEATURE_ACTIVE (AVX2)) > > > + { > > > + printf ("AVX2 not supported.\n"); > > > + return 0; > > > + } > > > +#elif defined REQUIRE_AVX512F > > > + if (!CPU_FEATURE_ACTIVE (AVX512F)) > > > + { > > > + printf ("AVX512F not supported.\n"); > > > + return 0; > > > + } > > > +#endif > > > + > > > + bench_start (); > > > + > > > +#ifdef BENCH_INIT > > > + BENCH_INIT (); > > > +#endif > > > + > > > + json_init (&json_ctx, 2, stdout); > > > + > > > + /* Begin function. */ > > > + json_attr_object_begin (&json_ctx, FUNCNAME); > > > + > > > + for (int v = 0; v < NUM_VARIANTS; v++) > > > + { > > > + double d_total_time = 0; > > > + uint64_t cur; > > > > Think these should also be type `timing_t` > > > > I do not see a difference if I use timing_t or uint64_t. In any case > variable cur stores the > difference between start and end time, not time. > > > > > > > + for (k = 0; k < D_ITERS; k++) > > > + { > > > + TIMING_NOW (start); > > > + for (i = 0; i < NUM_SAMPLES (v); i++) > > > > What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not > > one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`? > > > > D_ITERS define how many times each variant full data set will run. > NUM_SAMPLES(v) > represent the number of data sets in variant v. 
Index v and i select, i'th > data set from > variant v and call vector function. Having two loops simplifies logic. > > > > > + BENCH_FUNC (v, i); > > > + TIMING_NOW (end); > > > + > > > + TIMING_DIFF (cur, start, end); > > > + > > > + d_total_time += cur; > > > > Think this should be `TIMING_ACCUM(d_total_time, cur)`. > > > > Not much difference, if I use TIMING_ACCUM or simply add cur to > d_total_time. > Please use TIMING_ACCUM (d_total_time, cur) to be consistent with TIMING_DIFF (cur, start, end). Thanks.
On Sat, Nov 13, 2021 at 11:48 AM H.J. Lu <hjl.tools@gmail.com> wrote: > On Fri, Nov 12, 2021 at 2:51 PM Sunil Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > On Fri, Nov 12, 2021 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com> > > wrote: > > > > > On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > Add python script to generate libmvec microbenchmark from the input > > > > values for each libmvec function using skeleton benchmark template. > > > > > > > > Creates double and float benchmarks with vector length 1, 2, 4, 8, > > > > and 16 for each libmvec function. Vector length 1 corresponds to > > > > scalar version of function and is included for vector function perf > > > > comparison. > > > > --- > > > > sysdeps/x86_64/fpu/Makeconfig | 35 ++ > > > > sysdeps/x86_64/fpu/Makefile | 40 ++ > > > > sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++ > > > > sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 > ++++++++++++++++++++ > > > > 4 files changed, 643 insertions(+) > > > > create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > > create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > > > > > > > diff --git a/sysdeps/x86_64/fpu/Makeconfig > > > b/sysdeps/x86_64/fpu/Makeconfig > > > > index 24aaee1a43..503e9b5ffa 100644 > > > > --- a/sysdeps/x86_64/fpu/Makeconfig > > > > +++ b/sysdeps/x86_64/fpu/Makeconfig > > > > @@ -29,6 +29,23 @@ libmvec-funcs = \ > > > > sin \ > > > > sincos \ > > > > > > > > +# Define libmvec function for benchtests directory. 
> > > > +libmvec-bench-funcs = \ > > > > + > > > > +bench-libmvec-double = \ > > > > + $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \ > > > > + $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \ > > > > + $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \ > > > > + $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \ > > > > + $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \ > > > > + > > > > +bench-libmvec-float = \ > > > > + $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) > \ > > > > + $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) > \ > > > > + $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) > \ > > > > + $(addsuffix f, $(addprefix float-vlen8-avx2-, > > > $(libmvec-bench-funcs))) \ > > > > + $(addsuffix f, $(addprefix float-vlen16-, > $(libmvec-bench-funcs))) \ > > > > + > > > > # The base libmvec ABI tests. > > > > libmvec-abi-func-tests = \ > > > > $(addprefix test-double-libmvec-,$(libmvec-funcs)) \ > > > > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: > > > $(common-objpfx)config.make > > > > echo " \$$(float-vlen16-arch-ext-cflags)"; \ > > > > echo; \ > > > > done; \ > > > > + echo "endif"; \ > > > > + echo "ifeq (\$$(subdir),benchtests)"; \ > > > > + for t in $(libmvec-bench-funcs); do \ > > > > + echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \ > > > > + echo " \$$(double-vlen4-arch-ext-cflags)"; \ > > > > + echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \ > > > > + echo " \$$(double-vlen4-arch-ext2-cflags)"; \ > > > > + echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \ > > > > + echo " \$$(double-vlen8-arch-ext-cflags)"; \ > > > > + echo; \ > > > > + echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \ > > > > + echo " \$$(float-vlen8-arch-ext-cflags)"; \ > > > > + echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \ > > > > + echo " \$$(float-vlen8-arch-ext2-cflags)"; \ > > > > + echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \ > > > > + echo " \$$(float-vlen16-arch-ext-cflags)"; \ 
> > > > + echo; \ > > > > + done; \ > > > > echo "endif") > $@T > > > > mv -f $@T $@ > > > > diff --git a/sysdeps/x86_64/fpu/Makefile > b/sysdeps/x86_64/fpu/Makefile > > > > index d172ae815d..9fb587cf8f 100644 > > > > --- a/sysdeps/x86_64/fpu/Makefile > > > > +++ b/sysdeps/x86_64/fpu/Makefile > > > > @@ -72,3 +72,43 @@ ifeq > > > ($(subdir)$(config-cflags-mprefer-vector-width),mathyes) > > > > # performance of sin and cos by more than 40% on Skylake. > > > > CFLAGS-branred.c = -mprefer-vector-width=128 > > > > endif > > > > + > > > > +ifeq ($(subdir),benchtests) > > > > +double-vlen4-arch-ext-cflags = -mavx > > > > +double-vlen4-arch-ext2-cflags = -mavx2 > > > > +double-vlen8-arch-ext-cflags = -mavx512f > > > > + > > > > +float-vlen8-arch-ext-cflags = -mavx > > > > +float-vlen8-arch-ext2-cflags = -mavx2 > > > > +float-vlen16-arch-ext-cflags = -mavx512f > > > > + > > > > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float) > > > > + > > > > +ifeq (${BENCHSET},) > > > > +bench += $(bench-libmvec) > > > > +endif > > > > + > > > > +ifeq (${STATIC-BENCHTESTS},yes) > > > > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a > > > $(common-objpfx)math/libm.a > > > > +else > > > > +libmvec-benchtests = $(libmvec) $(libm) > > > > +endif > > > > + > > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): > > > $(libmvec-benchtests) > > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): > > > $(libmvec-benchtests) > > > > +bench-libmvec-deps = > $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > bench-timing.h Makefile > > > > + > > > > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps) > > > > + { if [ -n "$($*-INCLUDE)" ]; then \ > > > > + cat $($*-INCLUDE); \ > > > > + fi; \ > > > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > > $(basename $(@F)); } > $@-tmp > > > > + mv -f $@-tmp $@ > > > > + > > > > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps) > > > > + { if [ -n "$($*-INCLUDE)" ]; then \ > > > > + cat $($*-INCLUDE); 
\ > > > > + fi; \ > > > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py > > > $(basename $(@F)); } > $@-tmp > > > > + mv -f $@-tmp $@ > > > > +endif > > > > diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > > new file mode 100644 > > > > index 0000000000..d56a0c4462 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c > > > > @@ -0,0 +1,104 @@ > > > > +/* Skeleton for libmvec benchmark programs. > > > > + Copyright (C) 2021 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it > and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later > version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be > useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. 
*/ > > > > + > > > > +#include <string.h> > > > > +#include <stdint.h> > > > > +#include <stdbool.h> > > > > +#include <stdio.h> > > > > +#include <time.h> > > > > +#include <inttypes.h> > > > > +#include <bench-timing.h> > > > > +#include <json-lib.h> > > > > +#include <bench-util.h> > > > > + > > > > +#include <bench-util.c> > > > > +#include <math-tests-arch.h> > > > > +#define D_ITERS 10000 > > > > + > > > > +int > > > > +main (int argc, char **argv) > > > > +{ > > > > + unsigned long i, k; > > > > + timing_t start, end; > > > > + json_ctx_t json_ctx; > > > > + > > > > +#if defined REQUIRE_AVX > > > > + if (!CPU_FEATURE_ACTIVE (AVX)) > > > > + { > > > > + printf ("AVX not supported.\n"); > > > > + return 0; > > > > + } > > > > +#elif defined REQUIRE_AVX2 > > > > + if (!CPU_FEATURE_ACTIVE (AVX2)) > > > > + { > > > > + printf ("AVX2 not supported.\n"); > > > > + return 0; > > > > + } > > > > +#elif defined REQUIRE_AVX512F > > > > + if (!CPU_FEATURE_ACTIVE (AVX512F)) > > > > + { > > > > + printf ("AVX512F not supported.\n"); > > > > + return 0; > > > > + } > > > > +#endif > > > > + > > > > + bench_start (); > > > > + > > > > +#ifdef BENCH_INIT > > > > + BENCH_INIT (); > > > > +#endif > > > > + > > > > + json_init (&json_ctx, 2, stdout); > > > > + > > > > + /* Begin function. */ > > > > + json_attr_object_begin (&json_ctx, FUNCNAME); > > > > + > > > > + for (int v = 0; v < NUM_VARIANTS; v++) > > > > + { > > > > + double d_total_time = 0; > > > > + uint64_t cur; > > > > > > Think these should also be type `timing_t` > > > > > > > I do not see a difference if I use timing_t or uint64_t. In any case > > variable cur stores the > > difference between start and end time, not time. > > > > > > > > > > > + for (k = 0; k < D_ITERS; k++) > > > > + { > > > > + TIMING_NOW (start); > > > > + for (i = 0; i < NUM_SAMPLES (v); i++) > > > > > > What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not > > > one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`? 
> > > > > > > D_ITERS define how many times each variant full data set will run. > > NUM_SAMPLES(v) > > represent the number of data sets in variant v. Index v and i select, > i'th > > data set from > > variant v and call vector function. Having two loops simplifies logic. > > > > > > > > + BENCH_FUNC (v, i); > > > > + TIMING_NOW (end); > > > > + > > > > + TIMING_DIFF (cur, start, end); > > > > + > > > > + d_total_time += cur; > > > > > Think this should be `TIMING_ACCUM(d_total_time, cur)`. > > > > > > > Not much difference, if I use TIMING_ACCUM or simply add cur to > > d_total_time. > > > > Please use TIMING_ACCUM (d_total_time, cur) to be consistent with > TIMING_DIFF (cur, start, end). > Sure, I will fix it in the next version. > > Thanks. > > > -- > H.J. >
diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig index 24aaee1a43..503e9b5ffa 100644 --- a/sysdeps/x86_64/fpu/Makeconfig +++ b/sysdeps/x86_64/fpu/Makeconfig @@ -29,6 +29,23 @@ libmvec-funcs = \ sin \ sincos \ +# Define libmvec function for benchtests directory. +libmvec-bench-funcs = \ + +bench-libmvec-double = \ + $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \ + $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \ + +bench-libmvec-float = \ + $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen8-avx2-, $(libmvec-bench-funcs))) \ + $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \ + # The base libmvec ABI tests. 
libmvec-abi-func-tests = \ $(addprefix test-double-libmvec-,$(libmvec-funcs)) \ @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: $(common-objpfx)config.make echo " \$$(float-vlen16-arch-ext-cflags)"; \ echo; \ done; \ + echo "endif"; \ + echo "ifeq (\$$(subdir),benchtests)"; \ + for t in $(libmvec-bench-funcs); do \ + echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \ + echo " \$$(double-vlen4-arch-ext-cflags)"; \ + echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \ + echo " \$$(double-vlen4-arch-ext2-cflags)"; \ + echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \ + echo " \$$(double-vlen8-arch-ext-cflags)"; \ + echo; \ + echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \ + echo " \$$(float-vlen8-arch-ext-cflags)"; \ + echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \ + echo " \$$(float-vlen8-arch-ext2-cflags)"; \ + echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \ + echo " \$$(float-vlen16-arch-ext-cflags)"; \ + echo; \ + done; \ echo "endif") > $@T mv -f $@T $@ diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile index d172ae815d..9fb587cf8f 100644 --- a/sysdeps/x86_64/fpu/Makefile +++ b/sysdeps/x86_64/fpu/Makefile @@ -72,3 +72,43 @@ ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes) # performance of sin and cos by more than 40% on Skylake. 
CFLAGS-branred.c = -mprefer-vector-width=128 endif + +ifeq ($(subdir),benchtests) +double-vlen4-arch-ext-cflags = -mavx +double-vlen4-arch-ext2-cflags = -mavx2 +double-vlen8-arch-ext-cflags = -mavx512f + +float-vlen8-arch-ext-cflags = -mavx +float-vlen8-arch-ext2-cflags = -mavx2 +float-vlen16-arch-ext-cflags = -mavx512f + +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float) + +ifeq (${BENCHSET},) +bench += $(bench-libmvec) +endif + +ifeq (${STATIC-BENCHTESTS},yes) +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a $(common-objpfx)math/libm.a +else +libmvec-benchtests = $(libmvec) $(libm) +endif + +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): $(libmvec-benchtests) +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): $(libmvec-benchtests) +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c bench-timing.h Makefile + +$(objpfx)bench-float-%.c: $(bench-libmvec-deps) + { if [ -n "$($*-INCLUDE)" ]; then \ + cat $($*-INCLUDE); \ + fi; \ + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp + mv -f $@-tmp $@ + +$(objpfx)bench-double-%.c: $(bench-libmvec-deps) + { if [ -n "$($*-INCLUDE)" ]; then \ + cat $($*-INCLUDE); \ + fi; \ + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp + mv -f $@-tmp $@ +endif diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c new file mode 100644 index 0000000000..d56a0c4462 --- /dev/null +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c @@ -0,0 +1,104 @@ +/* Skeleton for libmvec benchmark programs. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <string.h> +#include <stdint.h> +#include <stdbool.h> +#include <stdio.h> +#include <time.h> +#include <inttypes.h> +#include <bench-timing.h> +#include <json-lib.h> +#include <bench-util.h> + +#include <bench-util.c> +#include <math-tests-arch.h> +#define D_ITERS 10000 + +int +main (int argc, char **argv) +{ + unsigned long i, k; + timing_t start, end; + json_ctx_t json_ctx; + +#if defined REQUIRE_AVX + if (!CPU_FEATURE_ACTIVE (AVX)) + { + printf ("AVX not supported.\n"); + return 0; + } +#elif defined REQUIRE_AVX2 + if (!CPU_FEATURE_ACTIVE (AVX2)) + { + printf ("AVX2 not supported.\n"); + return 0; + } +#elif defined REQUIRE_AVX512F + if (!CPU_FEATURE_ACTIVE (AVX512F)) + { + printf ("AVX512F not supported.\n"); + return 0; + } +#endif + + bench_start (); + +#ifdef BENCH_INIT + BENCH_INIT (); +#endif + + json_init (&json_ctx, 2, stdout); + + /* Begin function. */ + json_attr_object_begin (&json_ctx, FUNCNAME); + + for (int v = 0; v < NUM_VARIANTS; v++) + { + double d_total_time = 0; + uint64_t cur; + for (k = 0; k < D_ITERS; k++) + { + TIMING_NOW (start); + for (i = 0; i < NUM_SAMPLES (v); i++) + BENCH_FUNC (v, i); + TIMING_NOW (end); + + TIMING_DIFF (cur, start, end); + + d_total_time += cur; + + } + double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE; + + /* Begin variant. 
*/ + json_attr_object_begin (&json_ctx, VARIANT (v)); + + json_attr_double (&json_ctx, "duration", d_total_time); + json_attr_double (&json_ctx, "iterations", d_total_data_set); + json_attr_double (&json_ctx, "mean", d_total_time / d_total_data_set); + + /* End variant. */ + json_attr_object_end (&json_ctx); + } + + /* End function. */ + json_attr_object_end (&json_ctx); + + return 0; +} diff --git a/sysdeps/x86_64/fpu/scripts/bench_libmvec.py b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py new file mode 100755 index 0000000000..762865de8f --- /dev/null +++ b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py @@ -0,0 +1,464 @@ +#!/usr/bin/python3 +# Copyright (C) 2021 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <https://www.gnu.org/licenses/>. + +"""Benchmark program generator script + +This script takes a function name as input and generates a program using +an libmvec input file located in the sysdeps/x86_64/fpu directory. The +name of the input file should be of the form libmvec-foo-inputs where +'foo' is the name of the function. +""" + +from __future__ import print_function +import sys +import os +import itertools +import re + +# Macro definitions for functions that take no arguments. 
For functions +# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and +# VARIANTS_TEMPLATE are used instead. +DEFINES_TEMPLATE = ''' +#define CALL_BENCH_FUNC(v, i) %(func)s () +#define NUM_VARIANTS (1) +#define NUM_SAMPLES(v) (1) +#define VARIANT(v) FUNCNAME "()" +''' + +# Structures to store arguments for the function call. A function may +# have its inputs partitioned to represent distinct performance +# characteristics or distinct flavors of the function. Each such +# variant is represented by the _VARIANT structure. The ARGS structure +# represents a single set of arguments. +BENCH_VEC_TEMPLATE = ''' +#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\ + %(defs)s mx0 = %(func)s (%(func_args)s); \\ + mx0; })) +''' + +BENCH_SCALAR_TEMPLATE = ''' +#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s) +''' + +STRUCT_TEMPLATE = '''struct args +{ +%(args)s + double timing; +}; + +struct _variants +{ + const char *name; + int count; + struct args *in; +}; +''' + +# The actual input arguments. +ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = { +%(args)s +}; +''' + +# The actual variants, along with macros defined to access the variants. +VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = { +%(variants)s +}; + +#define NUM_VARIANTS %(num_variants)d +#define NUM_SAMPLES(i) (variants[i].count) +#define VARIANT(i) (variants[i].name) +''' + +# Epilogue for the generated source file. +EPILOGUE = ''' +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);}) +#define FUNCNAME "%(func)s" +#include <bench-libmvec-skeleton.c>''' + + +def gen_source(func_types, directives, all_vals): + """Generate source for the function + + Generate the C source for the function from the values and + directives. + + Args: + func: The function name + directives: A dictionary of directives applicable to this function + all_vals: A dictionary input values + """ + # The includes go in first. 
+ for header in directives['includes']: + print('#include <%s>' % header) + + for header in directives['include-sources']: + print('#include "%s"' % header) + + argtype_vtable = { + 2: '128', + 4: '256', + 8: '512' + } + prefix_vtable = { + 2: 'b', + 4: 'c', + 8: 'e' + } + + # Get all the function properties + funcname_argtype = '' + float_flag = False + if func_types[1] == 'float': + float_flag = True + avx_flag = False + if func_types[3] == 'avx2': + avx_flag = True + funcname_stride = int(func_types[2][4:]) + funcname_origin = func_types[-1] + if float_flag: + funcname_origin = funcname_origin[:-1] + + if funcname_stride == 1: + # Prepare for scalar functions file generation + funcname_prefix = '' + funcname_prefix_1 = '' + funcname_argtype = 'double' + if float_flag: + funcname_argtype = 'float' + else: + # Prepare for libmvec functions file generation + funcname_prefix_1 = len(directives['args']) * 'v' + '_' + aligned_stride = funcname_stride + if float_flag: + aligned_stride /= 2 + funcname_prefix = '_ZGV' + if (avx_flag and (aligned_stride == 4)): + funcname_prefix += 'd' + else: + funcname_prefix += prefix_vtable[aligned_stride] + funcname_prefix = funcname_prefix + 'N' + func_types[2][4:] + funcname_argtype = '__m' + argtype_vtable[aligned_stride] + if not float_flag: + funcname_argtype += 'd' + + # Include x86intrin.h for vector functions + if not funcname_stride == 1: + print('#include <x86intrin.h>') + if (avx_flag and (aligned_stride == 4)): + # For bench-float-vlen8-avx2* and bench-double-vlen4-avx2* + print('#define REQUIRE_AVX2') + elif aligned_stride == 8: + # For bench-float-vlen16* and bench-double-vlen8* + print('#define REQUIRE_AVX512F') + elif aligned_stride == 4: + # For bench-float-vlen8* and bench-double-vlen4* without avx2 + print('#define REQUIRE_AVX') + else: + print('#define FUNCTYPE %s' % funcname_argtype) + + print('#define STRIDE %d ' % funcname_stride) + + funcname = funcname_prefix + funcname_prefix_1 + funcname_origin + if 
float_flag: + funcname += 'f' + + funcname_rettype = funcname_argtype + if directives['ret'] == '': + funcname_rettype = 'void' + + funcname_inputtype = [] + for arg, i in zip(directives['args'], itertools.count()): + funcname_inputtype.append(funcname_argtype) + if arg[0] == '<' and arg[-1] == '>': + pos = arg.rfind('*') + if pos == -1: + die('Output argument must be a pointer type') + funcname_inputtype[i] += ' *' + + if not funcname_stride == 1: + if len(directives['args']) == 2: + print('extern %s %s (%s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1])) + elif len(directives['args']) == 3: + print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2])) + else: + print('extern %s %s (%s);' % (funcname_rettype, funcname, funcname_inputtype[0])) + + # Print macros. This branches out to a separate routine if + # the function takes arguments. + if not directives['args']: + print(DEFINES_TEMPLATE % {'funcname': funcname}) + outargs = [] + else: + outargs = _print_arg_data(funcname, float_flag, funcname_argtype, funcname_stride, directives, all_vals) + + # Print the output variable definitions if necessary. + for out in outargs: + print(out) + + # If we have a return value from the function, make sure it is + # assigned to prevent the compiler from optimizing out the + # call. + getret = '' + + if directives['ret']: + if funcname_argtype != '': + print('static %s volatile ret;' % funcname_argtype) + getret = 'ret =' + else: + print('static %s volatile ret;' % directives['ret']) + getret = 'ret =' + + # Test initialization. 
+ if directives['init']: + print('#define BENCH_INIT %s' % directives['init']) + + print(EPILOGUE % {'getret': getret, 'func': funcname}) + + +def _print_arg_data(func, float_flag, funcname_argtype, funcname_stride, directives, all_vals): + """Print argument data + + This is a helper function for gen_source that prints structure and + values for arguments and their variants and returns output arguments + if any are found. + + Args: + func: Function name + float_flag: True if function is float type + funcname_argtype: Type for vector variants + funcname_stride: Vector Length + directives: A dictionary of directives applicable to this function + all_vals: A dictionary input values + + Returns: + Returns a list of definitions for function arguments that act as + output parameters. + """ + # First, all of the definitions. We process writing of + # CALL_BENCH_FUNC, struct args and also the output arguments + # together in a single traversal of the arguments list. + func_args = [] + _func_args = [] + arg_struct = [] + outargs = [] + # Conversion function for each type + vtable = { + '__m128d': '_mm_loadu_pd', + '__m256d': '_mm256_loadu_pd', + '__m512d': '_mm512_loadu_pd', + '__m128': '_mm_loadu_ps', + '__m256': '_mm256_loadu_ps', + '__m512': '_mm512_loadu_ps', + 'double': '', + 'float': '' + } + + # For double max_vlen=8, for float max_vlen=16. 
+ if float_flag == True: + max_vlen = 16 + else: + max_vlen = 8 + + for arg, i in zip(directives['args'], itertools.count()): + if arg[0] == '<' and arg[-1] == '>': + outargs.append('static %s out%d __attribute__((used));' % (funcname_argtype, i)) + func_args.append('&out%d' % i) + _func_args.append('&out%d' % i) + else: + arg_struct.append(' %s arg%d[STRIDE];' % (arg, i)) + func_args.append('%s (variants[v].in[i].arg%d)' % + (vtable[funcname_argtype], i)) + _func_args.append('variants[v].in[i].arg%d[0]' % i) + + if funcname_stride == 1: + print(BENCH_SCALAR_TEMPLATE % {'func': func, + 'func_args': ', '.join(_func_args)}) + elif directives['ret'] == '': + print(BENCH_SCALAR_TEMPLATE % {'func': func, + 'func_args': ', '.join(func_args)}) + else: + print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ', '.join(func_args), + 'defs': funcname_argtype}) + print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)}) + + # Now print the values. + variants = [] + for (k, _vals), i in zip(all_vals.items(), itertools.count()): + vals = [] + temp_vals = [] + j = 0 + temp_j = 0 + result_v = ['', '', ''] + for _v in _vals: + nums = _v.split(',') + for l in range(0, len(nums)): + result_v[l] = result_v[l] + nums[l].strip() + ',' + j += 1 + temp_j += 1 + + if temp_j == funcname_stride: + final_result = '' + for l in range(0, len(nums)): + final_result = final_result + '{' + result_v[l][:-1] + '},' + temp_vals.append(final_result[:-1]) + temp_j = 0 + result_v = ['', '', ''] + + # Make sure amount of test data is multiple of max_vlen + # to keep data size same for all vector length. + if j == max_vlen: + vals.extend(temp_vals) + temp_vals = [] + j = 0 + + out = [' {%s, 0},' % v for v in vals] + + # Members for the variants structure list that we will + # print later. + variants.append(' {"%s", %d, in%d},' % (k, len(vals), i)) + print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals), + 'args': '\n'.join(out)}) + + # Print the variants and the last set of macros. 
+ print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals), + 'variants': '\n'.join(variants)}) + return outargs + + +def _process_directive(d_name, d_val, func_args): + """Process a directive. + + Evaluate the directive name and value passed and return the + processed value. This is a helper function for parse_file. + + Args: + d_name: Name of the directive + d_val: The string value to process + + Returns: + The processed value, which may be the string as it is or an object + that describes the directive. + """ + # Process the directive values if necessary. name and ret don't + # need any processing. + if d_name.startswith('include'): + d_val = d_val.split(',') + elif d_name == 'args': + d_val = d_val.split(':') + # Check if args type match + if not d_val[0] == func_args: + die("Args mismatch, should be %s, but get %s" % (d_val[0], func_args)) + + # Return the values. + return d_val + + +def parse_file(func_types): + """Parse an input file + + Given a function name, open and parse an input file for the function + and get the necessary parameters for the generated code and the list + of inputs. + + Args: + func: The function name + + Returns: + A tuple of two elements, one a dictionary of directives and the + other a dictionary of all input values. + """ + all_vals = {} + # Valid directives. + directives = { + 'name': '', + 'args': [], + 'includes': [], + 'include-sources': [], + 'ret': '', + 'init': '' + } + + func = func_types[-1] + try: + with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f: + for line in f: + # Look for directives and parse it if found. + if line.startswith('##'): + try: + d_name, d_val = line[2:].split(':', 1) + d_name = d_name.strip() + d_val = d_val.strip() + directives[d_name] = _process_directive(d_name, d_val, func_types[1]) + except (IndexError, KeyError): + die('Invalid directive: %s' % line[2:]) + + # Skip blank lines and comments. 
+ line = line.split('#', 1)[0].rstrip() + if not line: + continue + + # Otherwise, we're an input. Add to the appropriate + # input set. + cur_name = directives['name'] + all_vals.setdefault(cur_name, []) + all_vals[cur_name].append(line) + except IOError as ex: + die("Failed to open input file (%s): %s" % (ex.filename, ex.strerror)) + + return directives, all_vals + + +def die(msg): + """Exit with an error + + Prints an error message to the standard error stream and exits with + a non-zero status. + + Args: + msg: The error message to print to standard error + """ + print('%s\n' % msg, file=sys.stderr) + sys.exit(os.EX_DATAERR) + + +def main(args): + """Main function + + Use the first command line argument as function name and parse its + input file to generate C source that calls the function repeatedly + for the input. + + Args: + args: The command line arguments with the program name dropped + + Returns: + os.EX_USAGE on error and os.EX_OK on success. + """ + if len(args) != 1: + print('Usage: %s <function>' % sys.argv[0]) + return os.EX_USAGE + + func_types = args[0].split('-') + directives, all_vals = parse_file(func_types) + gen_source(func_types, directives, all_vals) + return os.EX_OK + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:]))