From patchwork Tue Nov 10 03:31:36 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Li, Liang Z" X-Patchwork-Id: 542118 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 1F68E140180 for ; Tue, 10 Nov 2015 14:38:06 +1100 (AEDT) Received: from localhost ([::1]:57199 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Zvzky-0005zS-4G for incoming@patchwork.ozlabs.org; Mon, 09 Nov 2015 22:38:04 -0500 Received: from eggs.gnu.org ([2001:4830:134:3::10]:39736) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZvzkT-00050o-N0 for qemu-devel@nongnu.org; Mon, 09 Nov 2015 22:37:35 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1ZvzkQ-0008Ev-Tu for qemu-devel@nongnu.org; Mon, 09 Nov 2015 22:37:33 -0500 Received: from mga03.intel.com ([134.134.136.65]:26655) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZvzkQ-0008Ei-JL for qemu-devel@nongnu.org; Mon, 09 Nov 2015 22:37:30 -0500 Received: from orsmga001.jf.intel.com ([10.7.209.18]) by orsmga103.jf.intel.com with ESMTP; 09 Nov 2015 19:37:28 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.20,268,1444719600"; d="scan'208";a="815945045" Received: from ll.sh.intel.com (HELO localhost) ([10.239.13.27]) by orsmga001.jf.intel.com with ESMTP; 09 Nov 2015 19:37:26 -0800 From: Liang Li To: qemu-devel@nongnu.org Date: Tue, 10 Nov 2015 11:31:36 +0800 Message-Id: <1447126297-27239-2-git-send-email-liang.z.li@intel.com> X-Mailer: git-send-email 1.9.1 In-Reply-To: <1447126297-27239-1-git-send-email-liang.z.li@intel.com> References: <1447126297-27239-1-git-send-email-liang.z.li@intel.com> X-detected-operating-system: by eggs.gnu.org: Genre and OS details not recognized. X-Received-From: 134.134.136.65 Cc: quintela@redhat.com, Liang Li , mst@redhat.com, amit.shah@redhat.com, pbonzini@redhat.com Subject: [Qemu-devel] [v2 RESEND 1/2] cutils: add avx2 instruction optimization X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 intructions for optimization. For platform supports AVX2 instructions, use the AVX2 instructions for optimization can help to improve the performance about 30% comparing to SSE2. Zero page check can be faster with this optimization, the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, excute the AVX2 instructions, else, excute the original code. Signed-off-by: Liang Li --- include/qemu-common.h | 28 +++++++++++++++------ util/Makefile.objs | 2 ++ util/avx2.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++ util/cutils.c | 47 +++++++++++++++++++++++++++++++++-- 4 files changed, 136 insertions(+), 9 deletions(-) create mode 100644 util/avx2.c diff --git a/include/qemu-common.h b/include/qemu-common.h index 2f74540..9fa7501 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) -{ - return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR - * sizeof(VECTYPE)) == 0 - && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); -} +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); + size_t buffer_find_nonzero_offset(const void *buf, size_t len); +extern bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len); + +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len); + +extern bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len); + +extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len); + +__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function"); +__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function"); + + +void *can_use_buffer_find_nonzero_offset_ifunc(void) \ + __asm__("can_use_buffer_find_nonzero_offset"); + +void *buffer_find_nonzero_offset_ifunc(void) \ + __asm__("buffer_find_nonzero_offset"); /* * helper to parse debug environment variables */ diff --git a/util/Makefile.objs b/util/Makefile.objs index d7cc399..6aacad7 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -1,4 +1,5 @@ util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o +util-obj-y += avx2.o util-obj-$(CONFIG_POSIX) += compatfd.o util-obj-$(CONFIG_POSIX) += event_notifier-posix.o util-obj-$(CONFIG_POSIX) += mmap-alloc.o @@ -29,3 +30,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o util-obj-y += qemu-coroutine-sleep.o util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o util-obj-y += buffer.o +avx2.o-cflags := $(AVX2_CFLAGS) diff --git a/util/avx2.c b/util/avx2.c new file mode 100644 index 0000000..d90289b --- /dev/null +++ b/util/avx2.c @@ -0,0 +1,68 @@ +#include "qemu-common.h" + +#ifdef __AVX2__ +#include +#define AVX2_VECTYPE __m256i +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) +#define AVX2_ALL_EQ(v1, v2) \ + (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF) +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) + +inline bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(AVX2_VECTYPE)) == 0 + && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); +} + +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ + const AVX2_VECTYPE *p = buf; + const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; + size_t i; + + assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); + + if (!len) { + return 0; + } + + for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { + if (!AVX2_ALL_EQ(p[i], zero)) { + return i * sizeof(AVX2_VECTYPE); + } + } + + for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; + i < len / sizeof(AVX2_VECTYPE); + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { + AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); + AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); + AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); + AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); + AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); + AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); + if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { + break; + } + } + + return i * sizeof(AVX2_VECTYPE); +} + +#else +/* use the original functions if avx2 is not enabled when buiding*/ + +inline bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ + return can_use_buffer_find_nonzero_offset_inner(buf, len); +} + +inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ + return buffer_find_nonzero_offset_inner(buf, len); +} + +#endif diff --git a/util/cutils.c b/util/cutils.c index cfeb848..5a9763a 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "qemu/sockets.h" #include "qemu/iov.h" @@ -161,6 +162,48 @@ int qemu_fdatasync(int fd) #endif } +/* old compiler maynot define bit_AVX2 */ +#ifndef bit_AVX2 +#define bit_AVX2 (1 << 5) +#endif + +static inline bool avx2_support(void) +{ + int a, b, c, d; + + if (__get_cpuid_max(0, NULL) < 7) { + return false; + } + + __cpuid_count(7, 0, a, b, c, d); + return b & bit_AVX2; +} + +void *buffer_find_nonzero_offset_ifunc(void) +{ + typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ? + buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner; + + return func; +} + +void *can_use_buffer_find_nonzero_offset_ifunc(void) +{ + typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ? + can_use_buffer_find_nonzero_offset_avx2 : + can_use_buffer_find_nonzero_offset_inner; + + return func; +} + +inline bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) +{ + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(VECTYPE)) == 0 + && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); +} + /* * Searches for an area with non-zero content in a buffer * @@ -181,13 +224,13 @@ int qemu_fdatasync(int fd) * If the buffer is all zero the return value is equal to len. */ -size_t buffer_find_nonzero_offset(const void *buf, size_t len) +size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) { const VECTYPE *p = buf; const VECTYPE zero = (VECTYPE){0}; size_t i; - assert(can_use_buffer_find_nonzero_offset(buf, len)); + assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); if (!len) { return 0;