Message ID | DB6PR0801MB20530E94B6016F1E3F7CEDEC83820@DB6PR0801MB2053.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
On Wednesday 16 August 2017 04:55 PM, Wilco Dijkstra wrote: > This patch further improves math function benchmarking by adding a latency > test in addition to throughput. This enables more accurate comparisons of the > math functions. The latency test works by creating a dependency on the previous > iteration: func_res = F (func_res * zero + input[i]). The multiply by zero avoids > changing the input. > > The powf test now shows: > > "workload-spec2006.wrf": { > "throughput": 200, > "latency": 100 > } > > OK for commit? > ChangeLog: > 2017-08-16 Wilco Dijkstra <wdijkstr@arm.com> > > * benchtests/bench-skeleton.c (main): Add support for > latency benchmarking. > * benchtests/scripts/bench.py: Add support for latency benchmarking. > > -- > diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c > index 3c6dad705594ac0a53edcb4e09686252c13127cf..60753ede1aa3cc05cc0e9eccc74dd12a609a1a67 100644 > --- a/benchtests/bench-skeleton.c > +++ b/benchtests/bench-skeleton.c > @@ -71,8 +71,10 @@ main (int argc, char **argv) > bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0; > double d_total_i = 0; > timing_t total = 0, max = 0, min = 0x7fffffffffffffff; > + timing_t throughput = 0, latency = 0; > int64_t c = 0; > uint64_t cur; > + BENCH_VARS; > while (1) > { > if (is_bench) > @@ -86,7 +88,16 @@ main (int argc, char **argv) > BENCH_FUNC (v, i); > TIMING_NOW (end); > TIMING_DIFF (cur, start, end); > - TIMING_ACCUM (total, cur); > + TIMING_ACCUM (throughput, cur); > + > + TIMING_NOW (start); > + for (k = 0; k < iters; k++) > + for (i = 0; i < NUM_SAMPLES (v); i++) > + BENCH_FUNC_LAT (v, i); > + TIMING_NOW (end); > + TIMING_DIFF (cur, start, end); > + TIMING_ACCUM (latency, cur); > + > d_total_i += iters * NUM_SAMPLES (v); > } > else > @@ -131,12 +142,15 @@ main (int argc, char **argv) > /* Begin variant. 
*/ > json_attr_object_begin (&json_ctx, VARIANT (v)); > > - json_attr_double (&json_ctx, "duration", d_total_s); > - json_attr_double (&json_ctx, "iterations", d_total_i); > if (is_bench) > - json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i); > + { > + json_attr_double (&json_ctx, "throughput", throughput / d_total_i); I didn't notice this earlier, but shouldn't throughput be iterations/cycle and not the other way around? That is, throughput should be the inverse of latency. > + json_attr_double (&json_ctx, "latency", latency / d_total_i); > + } > else > { > + json_attr_double (&json_ctx, "duration", d_total_s); > + json_attr_double (&json_ctx, "iterations", d_total_i); > json_attr_double (&json_ctx, "max", max / d_iters); > json_attr_double (&json_ctx, "min", min / d_iters); > json_attr_double (&json_ctx, "mean", d_total_s / d_total_i); > diff --git a/benchtests/scripts/bench.py b/benchtests/scripts/bench.py > index 8c1c9eeb2bc67a16cb8a8e010fd2b8a2ef8ab6df..b7ccb7c8c2bf1822202a2377dfb0675516115cc5 100755 > --- a/benchtests/scripts/bench.py > +++ b/benchtests/scripts/bench.py > @@ -45,7 +45,7 @@ DEFINES_TEMPLATE = ''' > # variant is represented by the _VARIANT structure. The ARGS structure > # represents a single set of arguments. 
> STRUCT_TEMPLATE = ''' > -#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s) > +#define CALL_BENCH_FUNC(v, i, x) %(func)s (x %(func_args)s) > > struct args > { > @@ -84,7 +84,9 @@ EPILOGUE = ''' > #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing) > #define RESULT_ACCUM(r, v, i, old, new) \\ > ((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1)) > -#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);}) > +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, );}) > +#define BENCH_FUNC_LAT(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, %(latarg)s);}) > +#define BENCH_VARS %(defvar)s > #define FUNCNAME "%(func)s" > #include "bench-skeleton.c"''' > > @@ -122,17 +124,22 @@ def gen_source(func, directives, all_vals): > # If we have a return value from the function, make sure it is > # assigned to prevent the compiler from optimizing out the > # call. > + getret = '' > + latarg = '' > + defvar = '' > + > if directives['ret']: > print('static %s volatile ret;' % directives['ret']) > - getret = 'ret = ' > - else: > - getret = '' > + print('static %s zero __attribute__((used)) = 0;' % directives['ret']) > + getret = 'ret = func_res = ' > + latarg = 'func_res * zero +' > + defvar = '%s func_res = 0;' % directives['ret'] > > # Test initialization. > if directives['init']: > print('#define BENCH_INIT %s' % directives['init']) > > - print(EPILOGUE % {'getret': getret, 'func': func}) > + print(EPILOGUE % {'getret': getret, 'func': func, 'latarg': latarg, 'defvar': defvar }) > > > def _print_arg_data(func, directives, all_vals): >
On 16/08/17 14:07, Siddhesh Poyarekar wrote: > On Wednesday 16 August 2017 04:55 PM, Wilco Dijkstra wrote: >> This patch further improves math function benchmarking by adding a latency >> test in addition to throughput. This enables more accurate comparisons of the >> math functions. The latency test works by creating a dependency on the previous >> iteration: func_res = F (func_res * zero + input[i]). The multiply by zero avoids >> changing the input. >> ... >> - json_attr_double (&json_ctx, "duration", d_total_s); >> - json_attr_double (&json_ctx, "iterations", d_total_i); >> if (is_bench) >> - json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i); >> + { >> + json_attr_double (&json_ctx, "throughput", throughput / d_total_i); > > I didn't notice this earlier, but shouldn't throughput be > iterations/cycle and not the other way around? That is, throughput > should be the inverse of latency. yes in principle the throughput is number of calls/unit time, but the inverse of that is just as useful, it's the rate at which calls can be issued and then it can be easily compared to the latency number, but i guess printing it as calls/sec would work too.
On Wednesday 16 August 2017 07:13 PM, Szabolcs Nagy wrote: > yes in principle the throughput is number of calls/unit time, > but the inverse of that is just as useful, it's the rate at > which calls can be issued and then it can be easily compared > to the latency number, but i guess printing it as calls/sec > would work too. I don't dispute its utility, I'd just like it to match what it is called, so calls/unit time for throughput and time/call for latency. BTW, this will only work for functions that have the same first argument and return types. This is OK for now, but it might be something to fix in future. A comment to that effect in the code would be very useful because I'm pretty sure I'll forget about it in the next 15 minutes or so and Wilco will too, maybe later than that :) Siddhesh
On 8/16/2017 6:07 AM, Siddhesh Poyarekar wrote: > I didn't notice this earlier, but shouldn't throughput be > iterations/cycle and not the other way around? That is, throughput > should be the inverse of latency. well not really... I've been working on making expf() faster for x86 (see HJ's email earlier), and with a massive out of order/pipelined cpu, latency and throughput are very distinct things. expf() can run at a throughput of somewhere in the 10 to 11 cycles range, while the latency can be in the 45 to 55 cycles range. (not trying to do benchmarking here, just wanting to show an order of magnitude) the latency is then the number of cycles it takes to get a result (on an empty cpu) through from end to end, e.g. printf("%e", expf(f1)) while throughput is the cost if you put multiple consecutive through the cpu, like printf("%e", expf(f1) + expf(f2) + expf(f3) + expf(f4)) (using "printf" as a proxy for 'make externally visible' sync point; of course in reality it could be many other things) the out of order cpu will start execution of the second, third and fourth expf() in parallel to the first, which will hide the latency (so the result time is not 4x45 + time of 4 adds, but much less, closer to 45 + 3x11 + time of 4 adds) I picked 4 expf()s here but theoretically throughput would be measured with the asymptote of 4...
On Wednesday 16 August 2017 07:53 PM, Arjan van de Ven wrote: > well not really... > > I've been working on making expf() faster for x86 (see HJ's email > earlier), and > with a massive out of order/pipelined cpu, latency and throughput are > very distinct things. > expf() can run at a throughput of somewhere in the 10 to 11 cycles > range, while the latency > can be in the 45 to 55 cycles range. > (not trying to do benchmarking here, just wanting to show an order of > magnitude) Ah sorry, I should have been clearer - inverse in terms of units, not value, i.e. throughput is typically expressed in items/unit time while latency is expressed in time/unit item. Siddhesh
On 8/16/2017 7:26 AM, Siddhesh Poyarekar wrote: > On Wednesday 16 August 2017 07:53 PM, Arjan van de Ven wrote: >> well not really... >> >> I've been working on making expf() faster for x86 (see HJ's email >> earlier), and >> with a massive out of order/pipelined cpu, latency and throughput are >> very distinct things. >> expf() can run at a throughput of somewhere in the 10 to 11 cycles >> range, while the latency >> can be in the 45 to 55 cycles range. >> (not trying to do benchmarking here, just wanting to show an order of >> magnitude) > > Ah sorry, I should have been clearer - inverse in terms of units, not > value, i.e. throughput is typically expressed in items/unit time while > latency is expressed in time/unit item. maybe "throughput-time" ?
On Wednesday 16 August 2017 08:03 PM, Arjan van de Ven wrote: >> Ah sorry, I should have been clearer - inverse in terms of units, not >> value, i.e. throughput is typically expressed in items/unit time while >> latency is expressed in time/unit item. > > maybe "throughput-time" ? Sure, but throughput proper (i.e. comparisons) will give bigger numbers and happier marketing teams :) Siddhesh
On Wed, 16 Aug 2017, Arjan van de Ven wrote: > > Ah sorry, I should have been clearer - inverse in terms of units, not > > value, i.e. throughput is typically expressed in items/unit time while > > latency is expressed in time/unit item. > > maybe "throughput-time" ? I suggest using "reciprocal throughput" if you're looking for a short term for 'independent executions per unit time'. It's easier to recognize and already used in practice (e.g. in docs by Agner Fog). Alexander
Siddhesh Poyarekar wrote: > > BTW, this will only work for functions that have the same first argument > and return types. This is OK for now, but it might be something to fix > in future. A comment to that effect in the code would be very useful > because I'm pretty sure I'll forget about it in the next 15 minutes or > so and Wilco will too, maybe later than that :) I've added a comment. It would fail on say pointer + float but works fine if the types support addition between them. The latency test doesn't work for sincos (somehow the madd is left out), but I'll fix that when I add a workload for it. Wilco
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c index 3c6dad705594ac0a53edcb4e09686252c13127cf..60753ede1aa3cc05cc0e9eccc74dd12a609a1a67 100644 --- a/benchtests/bench-skeleton.c +++ b/benchtests/bench-skeleton.c @@ -71,8 +71,10 @@ main (int argc, char **argv) bool is_bench = strncmp (VARIANT (v), "workload-", 9) == 0; double d_total_i = 0; timing_t total = 0, max = 0, min = 0x7fffffffffffffff; + timing_t throughput = 0, latency = 0; int64_t c = 0; uint64_t cur; + BENCH_VARS; while (1) { if (is_bench) @@ -86,7 +88,16 @@ main (int argc, char **argv) BENCH_FUNC (v, i); TIMING_NOW (end); TIMING_DIFF (cur, start, end); - TIMING_ACCUM (total, cur); + TIMING_ACCUM (throughput, cur); + + TIMING_NOW (start); + for (k = 0; k < iters; k++) + for (i = 0; i < NUM_SAMPLES (v); i++) + BENCH_FUNC_LAT (v, i); + TIMING_NOW (end); + TIMING_DIFF (cur, start, end); + TIMING_ACCUM (latency, cur); + d_total_i += iters * NUM_SAMPLES (v); } else @@ -131,12 +142,15 @@ main (int argc, char **argv) /* Begin variant. 
*/ json_attr_object_begin (&json_ctx, VARIANT (v)); - json_attr_double (&json_ctx, "duration", d_total_s); - json_attr_double (&json_ctx, "iterations", d_total_i); if (is_bench) - json_attr_double (&json_ctx, "throughput", d_total_s / d_total_i); + { + json_attr_double (&json_ctx, "throughput", throughput / d_total_i); + json_attr_double (&json_ctx, "latency", latency / d_total_i); + } else { + json_attr_double (&json_ctx, "duration", d_total_s); + json_attr_double (&json_ctx, "iterations", d_total_i); json_attr_double (&json_ctx, "max", max / d_iters); json_attr_double (&json_ctx, "min", min / d_iters); json_attr_double (&json_ctx, "mean", d_total_s / d_total_i); diff --git a/benchtests/scripts/bench.py b/benchtests/scripts/bench.py index 8c1c9eeb2bc67a16cb8a8e010fd2b8a2ef8ab6df..b7ccb7c8c2bf1822202a2377dfb0675516115cc5 100755 --- a/benchtests/scripts/bench.py +++ b/benchtests/scripts/bench.py @@ -45,7 +45,7 @@ DEFINES_TEMPLATE = ''' # variant is represented by the _VARIANT structure. The ARGS structure # represents a single set of arguments. STRUCT_TEMPLATE = ''' -#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s) +#define CALL_BENCH_FUNC(v, i, x) %(func)s (x %(func_args)s) struct args { @@ -84,7 +84,9 @@ EPILOGUE = ''' #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing) #define RESULT_ACCUM(r, v, i, old, new) \\ ((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1)) -#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);}) +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, );}) +#define BENCH_FUNC_LAT(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j, %(latarg)s);}) +#define BENCH_VARS %(defvar)s #define FUNCNAME "%(func)s" #include "bench-skeleton.c"''' @@ -122,17 +124,22 @@ def gen_source(func, directives, all_vals): # If we have a return value from the function, make sure it is # assigned to prevent the compiler from optimizing out the # call. 
+ getret = '' + latarg = '' + defvar = '' + if directives['ret']: print('static %s volatile ret;' % directives['ret']) - getret = 'ret = ' - else: - getret = '' + print('static %s zero __attribute__((used)) = 0;' % directives['ret']) + getret = 'ret = func_res = ' + latarg = 'func_res * zero +' + defvar = '%s func_res = 0;' % directives['ret'] # Test initialization. if directives['init']: print('#define BENCH_INIT %s' % directives['init']) - print(EPILOGUE % {'getret': getret, 'func': func}) + print(EPILOGUE % {'getret': getret, 'func': func, 'latarg': latarg, 'defvar': defvar }) def _print_arg_data(func, directives, all_vals):