@@ -75,6 +75,7 @@ extern struct nft_set_type nft_set_hash_fast_type;
extern struct nft_set_type nft_set_rbtree_type;
extern struct nft_set_type nft_set_bitmap_type;
extern struct nft_set_type nft_set_pipapo_type;
+extern struct nft_set_type nft_set_pipapo_avx2_type;
struct nft_expr;
struct nft_regs;
@@ -83,6 +83,9 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_set-objs := nf_tables_set_core.o \
nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
nft_set_pipapo.o
+ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS)))
+nf_tables_set-objs += nft_set_pipapo_avx2.o
+endif
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
@@ -9,6 +9,9 @@ static int __init nf_tables_set_module_init(void)
nft_register_set(&nft_set_rhash_type);
nft_register_set(&nft_set_bitmap_type);
nft_register_set(&nft_set_rbtree_type);
+#ifdef CONFIG_AS_AVX2
+ nft_register_set(&nft_set_pipapo_avx2_type);
+#endif
nft_register_set(&nft_set_pipapo_type);
return 0;
@@ -17,6 +20,9 @@ static int __init nf_tables_set_module_init(void)
static void __exit nf_tables_set_module_exit(void)
{
nft_unregister_set(&nft_set_pipapo_type);
+#ifdef CONFIG_AS_AVX2
+ nft_unregister_set(&nft_set_pipapo_avx2_type);
+#endif
nft_unregister_set(&nft_set_rbtree_type);
nft_unregister_set(&nft_set_bitmap_type);
nft_unregister_set(&nft_set_rhash_type);
@@ -339,6 +339,7 @@
#include <linux/bitmap.h>
#include <linux/bitops.h>
+#include "nft_set_pipapo_avx2.h"
#include "nft_set_pipapo.h"
/* Current working bitmap index, toggled between field matches */
@@ -2138,3 +2139,27 @@ struct nft_set_type nft_set_pipapo_type __read_mostly = {
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
+
+#ifdef CONFIG_AS_AVX2
+struct nft_set_type nft_set_pipapo_avx2_type __read_mostly = {
+ .owner = THIS_MODULE,
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT | NFT_SET_SUBKEY,
+ .ops = {
+ .lookup = nft_pipapo_avx2_lookup,
+ .insert = nft_pipapo_insert,
+ .activate = nft_pipapo_activate,
+ .deactivate = nft_pipapo_deactivate,
+ .flush = nft_pipapo_flush,
+ .remove = nft_pipapo_remove,
+ .walk = nft_pipapo_walk,
+ .get = nft_pipapo_get,
+ .privsize = nft_pipapo_privsize,
+ .estimate = nft_pipapo_avx2_estimate,
+ .init = nft_pipapo_init,
+ .destroy = nft_pipapo_destroy,
+ .gc_init = nft_pipapo_gc_init,
+ .elemsize = offsetof(struct nft_pipapo_elem, ext),
+ },
+};
+#endif
new file mode 100644
@@ -0,0 +1,838 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
+ *
+ * Copyright (c) 2019 Red Hat GmbH
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include <linux/compiler.h>
+#include <asm/fpu/api.h>
+
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
+
+#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG)
+
+/* Load from memory into YMM register with non-temporal hint ("stream load"),
+ * that is, don't fetch lines from memory into the cache. This avoids pushing
+ * precious packet data out of the cache hierarchy, and is appropriate when:
+ *
+ * - loading buckets from lookup tables, as they are not going to be used
+ * again before packets are entirely classified
+ *
+ * - loading the result bitmap from the previous field, as it's never used
+ * again
+ */
+#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
+ asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
+
+/* Stream a single lookup table bucket into YMM register given lookup table,
+ * group index, value of packet bits, bucket size.
+ */
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS + (v)) * (bsize)])
+
+/* Bitwise AND: the staple operation of this algorithm */
+#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
+ asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
+
+/* Jump to label if @reg is zero */
+#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
+ asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
+ "je %l[" #label "]" : : : : label)
+
+/* Store 256 bits from YMM register into memory. Contrary to bucket load
+ * operation, we don't bypass the cache here, as stored matching results
+ * are always used shortly after.
+ */
+#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
+
+/* Zero out a complete YMM register, @reg */
+#define NFT_PIPAPO_AVX2_ZERO(reg) \
+ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
+
+/* Current working bitmap index, toggled between field matches */
+static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
+
+/**
+ * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
+ *
+ * This zeroes out ymm15, which is later used whenever we need to clear a
+ * memory location, by storing its content into memory.
+ */
+static void nft_pipapo_avx2_prepare(void)
+{
+ NFT_PIPAPO_AVX2_ZERO(15);
+}
+
+/**
+ * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
+ * @data: Base memory area
+ * @start: First bit to set
+ * @len: Count of bits to fill
+ *
+ * This is nothing else than a version of bitmap_set(), as used e.g. by
+ * pipapo_refill(), tailored for the microarchitectures using it and better
+ * suited for the specific usage: it's very likely that we'll set a small number
+ * of bits, not crossing a word boundary, and correct branch prediction is
+ * critical here.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ */
+static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
+{
+ int offset = start % BITS_PER_LONG;
+ unsigned long mask;
+
+ data += start / BITS_PER_LONG;
+
+ if (likely(len == 1)) {
+ *data |= BIT(offset);
+ return;
+ }
+
+ if (likely(len < BITS_PER_LONG || offset)) {
+ if (likely(len + offset <= BITS_PER_LONG)) {
+ *data |= GENMASK(len - 1 + offset, offset);
+ return;
+ }
+
+ *data |= ~0UL << offset;
+ len -= BITS_PER_LONG - offset;
+ data++;
+
+ if (len <= BITS_PER_LONG) {
+ mask = ~0UL >> (BITS_PER_LONG - len);
+ *data |= mask;
+ return;
+ }
+ }
+
+ memset(data, 0xff, len / BITS_PER_BYTE);
+ data += len / BITS_PER_LONG;
+
+ len %= BITS_PER_LONG;
+ if (len)
+ *data |= ~0UL >> (BITS_PER_LONG - len);
+}
+
+/**
+ * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
+ * @offset: Start from given bitmap (equivalent to bucket) offset, in longs
+ * @map: Bitmap to be scanned for set bits
+ * @dst: Destination bitmap
+ * @mt: Mapping table containing bit set specifiers
+ * @len: Length of bitmap in longs
+ * @last: Return index of first set bit, if this is the last field
+ *
+ * This is an alternative implementation of pipapo_refill() suitable for usage
+ * with AVX2 lookup routines: we know there are four words to be scanned, at
+ * a given offset inside the map, for each matching iteration.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ *
+ * Return: first set bit index if @last, index of first filled word otherwise.
+ */
+static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
+ unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool last)
+{
+ int ret = -1;
+
+#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
+ do { \
+ while (map[(x)]) { \
+ int r = __builtin_ctzl(map[(x)]); \
+ int i = (offset + (x)) * BITS_PER_LONG + r; \
+ \
+ if (last) \
+ return i; \
+ \
+ nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
+ \
+ if (ret == -1) \
+ ret = mt[i].to; \
+ \
+ map[(x)] &= ~(1UL << r); \
+ } \
+ } while (0)
+
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
+#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup2() - AVX2-based lookup for 2 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @lt: Lookup table for this field
+ * @mt: Mapping table for this field
+ * @bsize: Bucket size for this lookup table, in longs
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ *
+ * Load buckets from lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap), use the provided starting
+ * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
+ * working bitmap, @fill.
+ *
+ * This is used for 8-bit fields (i.e. protocol numbers).
+ *
+ * Out-of-order (and superscalar) execution is vital here, so it's critical to
+ * avoid false data dependencies. CPU and compiler could (mostly) take care of
+ * this on their own, but the operation ordering is explicitly given here with
+ * a likely execution order in mind, to highlight possible stalls. That's why
+ * a number of logically distinct operations (i.e. loading buckets, intersecting
+ * buckets) are interleaved.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup2(unsigned long *map, unsigned long *fill,
+ unsigned long *lt,
+ union nft_pipapo_map_bucket *mt,
+ unsigned long bsize, const u8 *pkt,
+ bool first, bool last, int offset)
+{
+ int i, ret = -1, m256_size = bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup4() - AVX2-based lookup for 4 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @lt: Lookup table for this field
+ * @mt: Mapping table for this field
+ * @bsize: Bucket size for this lookup table, in longs
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ *
+ * See nft_pipapo_avx2_lookup2().
+ *
+ * This is used for 16-bit fields (i.e. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup4(unsigned long *map, unsigned long *fill,
+ unsigned long *lt,
+ union nft_pipapo_map_bucket *mt,
+ unsigned long bsize, const u8 *pkt,
+ bool first, bool last, int offset)
+{
+ int i, ret = -1, m256_size = bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 0, pg[0], bsize);
+
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 6, 7);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup8() - AVX2-based lookup for 8 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @lt: Lookup table for this field
+ * @mt: Mapping table for this field
+ * @bsize: Bucket size for this lookup table, in longs
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ *
+ * See nft_pipapo_avx2_lookup2().
+ *
+ * This is used for 32-bit fields (i.e. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup8(unsigned long *map, unsigned long *fill,
+ unsigned long *lt,
+ union nft_pipapo_map_bucket *mt,
+ unsigned long bsize, const u8 *pkt,
+ bool first, bool last, int offset)
+{
+ int i, ret = -1, m256_size = bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf };
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(4, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(6, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(7, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_AND(9, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 6, 7);
+ NFT_PIPAPO_AVX2_AND(12, 8, 9);
+ NFT_PIPAPO_AVX2_AND(13, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(4, lt, 3, pg[3], bsize);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 6, 7);
+ NFT_PIPAPO_AVX2_AND(13, 8, 9);
+ NFT_PIPAPO_AVX2_AND(14, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ NFT_PIPAPO_AVX2_AND(1, 1, 14);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup12() - AVX2-based lookup for 12 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @lt: Lookup table for this field
+ * @mt: Mapping table for this field
+ * @bsize: Bucket size for this lookup table, in longs
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ *
+ * See nft_pipapo_avx2_lookup2().
+ *
+ * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup12(unsigned long *map, unsigned long *fill,
+ unsigned long *lt,
+ union nft_pipapo_map_bucket *mt,
+ unsigned long bsize, const u8 *pkt,
+ bool first, bool last, int offset)
+{
+ int i, ret = -1, m256_size = bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf };
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 2, pg[2], bsize);
+
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(5, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(8, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(12, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(14, lt, 9, pg[9], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 9, 10);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(8, 6, 7);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup32() - AVX2-based lookup for 32 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @lt: Lookup table for this field
+ * @mt: Mapping table for this field
+ * @bsize: Bucket size for this lookup table, in longs
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ *
+ * See nft_pipapo_avx2_lookup2().
+ *
+ * This is used for 128-bit fields (i.e. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup32(unsigned long *map, unsigned long *fill,
+ unsigned long *lt,
+ union nft_pipapo_map_bucket *mt,
+ unsigned long bsize, const u8 *pkt,
+ bool first, bool last, int offset)
+{
+ int i, ret = -1, m256_size = bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
+ pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
+ pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
+ pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
+ pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
+ };
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(4, lt, 3, pg[3], bsize);
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(13, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(14, 9, 10);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 9, pg[9], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(2, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(5, lt, 12, pg[12], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(6, lt, 13, pg[13], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(8, lt, 14, pg[14], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(10, lt, 15, pg[15], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(12, lt, 16, pg[16], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(14, lt, 17, pg[17], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 8, 9);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(1, lt, 18, pg[18], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 19, pg[19], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 12, 13);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(5, lt, 20, pg[20], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 14, 0);
+ NFT_PIPAPO_AVX2_AND(7, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(8, lt, 21, pg[21], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(10, lt, 22, pg[22], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(12, lt, 23, pg[23], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(14, lt, 24, pg[24], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(0, lt, 25, pg[25], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 9, 10);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(3, lt, 26, pg[26], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(5, lt, 27, pg[27], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(7, lt, 28, pg[28], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(8, lt, 29, pg[29], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(10, lt, 30, pg[30], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD(12, lt, 31, pg[31], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ NFT_PIPAPO_AVX2_AND(1, 8, 9);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_AND(3, 12, 0);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(4, 1, 2);
+ NFT_PIPAPO_AVX2_AND(5, 3, 4);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_noavx2() - Fallback function for uncommon field sizes
+ * @f: Field to be matched
+ * @res: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @lt: Lookup table for this field
+ * @mt: Mapping table for this field
+ * @bsize: Bucket size for this lookup table, in longs
+ * @rp: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ *
+ * This function should never be called, but is provided for the case the field
+ * size doesn't match any of the known data types. Matching rate is
+ * substantially lower than AVX2 routines.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_noavx2(struct nft_pipapo_field *f,
+ unsigned long *res,
+ unsigned long *fill, unsigned long *lt,
+ union nft_pipapo_map_bucket *mt,
+ unsigned long bsize, const u8 *rp,
+ bool first, bool last, int offset)
+{
+ int i, ret = -1, b;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first)
+ memset(res, 0xff, bsize * sizeof(*res));
+
+ for (i = offset; i < bsize; i++) {
+ pipapo_and_field_buckets(f, res, rp);
+
+ b = pipapo_refill(res, bsize, f->rules, fill, mt, last);
+
+ if (last)
+ return b;
+
+ if (ret == -1)
+ ret = b / XSAVE_YMM_SIZE;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
+ * @desc: Set description, initial element count used here
+ * @features: Flags: NFT_SET_SUBKEY needs to be there
+ * @est: Storage for estimation data
+ *
+ * Return: true if @features match and AVX2 is available, false otherwise.
+ */
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ if (!(features & NFT_SET_SUBKEY))
+ return false;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+ return false;
+
+ est->size = pipapo_estimate_size(desc->size);
+ est->lookup = NFT_SET_CLASS_O_LOG_N;
+ est->space = NFT_SET_CLASS_O_N;
+
+ return true;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @ext: nftables API extension pointer, filled with matching reference
+ *
+ * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
+ *
+ * This implementation exploits the repetitive characteristic of the algorithm
+ * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
+ *
+ * Return: true on match, false otherwise.
+ */
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ unsigned long *res, *fill, *scratch;
+ u8 genmask = nft_genmask_cur(net);
+ const u8 *rp = (const u8 *)key;
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ bool map_index;
+ int i, ret = 0;
+
+ m = rcu_dereference(priv->match);
+
+ /* This also protects access to all data related to scratch maps */
+ kernel_fpu_begin();
+
+ if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) {
+ kernel_fpu_end();
+ return false;
+ }
+
+ scratch = *raw_cpu_ptr(m->scratch_aligned);
+ map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
+
+ res = scratch + (map_index ? m->bsize_max : 0);
+ fill = scratch + (map_index ? 0 : m->bsize_max);
+
+ /* Starting map doesn't need to be set for this implementation */
+
+ nft_pipapo_avx2_prepare();
+
+next_match:
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1, first = !i;
+
+#define NFT_SET_PIPAPO_AVX2_LOOKUP(n) \
+ (ret = nft_pipapo_avx2_lookup ##n(res, fill, \
+ f->lt_aligned, f->mt, \
+ f->bsize, rp, \
+ first, last, ret))
+
+ if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4);
+ } else if (f->groups == 8) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8);
+ } else if (f->groups == 12) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(12);
+ } else if (f->groups == 32) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(32);
+ } else {
+ ret = nft_pipapo_avx2_lookup_noavx2(f, res, fill,
+ f->lt_aligned,
+ f->mt, f->bsize,
+ rp,
+ first, last, ret);
+ }
+#undef NFT_SET_PIPAPO_AVX2_LOOKUP
+
+ if (ret < 0)
+ goto out;
+
+ if (last) {
+ *ext = &f->mt[ret].e->ext;
+ if (unlikely(nft_set_elem_expired(*ext) ||
+ !nft_set_elem_active(*ext, genmask))) {
+ ret = 0;
+ goto next_match;
+ }
+
+ goto out;
+ }
+
+ map_index = !map_index;
+ swap(res, fill);
+ rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ }
+
+out:
+ raw_cpu_write(nft_pipapo_avx2_scratch_index, map_index);
+ kernel_fpu_end();
+
+ return ret >= 0;
+}
new file mode 100644
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NFT_SET_PIPAPO_AVX2_H
+
+#ifdef CONFIG_AS_AVX2
+#include <asm/fpu/xstate.h>
+#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext);
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est);
+#endif /* CONFIG_AS_AVX2 */
+
+#endif /* _NFT_SET_PIPAPO_AVX2_H */
If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports: - for one AMD Epyc 7351 thread (2.9GHz, 512 KiB L1D$, 8 MiB L2$): TEST: performance net,port [ OK ] baseline (drop from netdev hook): 10195687pps baseline hash (non-ranged entries): 6166047pps baseline rbtree (match on first field only): 2648166pps set with 1000 full, ranged entries: 5013920pps port,net [ OK ] baseline (drop from netdev hook): 10146446pps baseline hash (non-ranged entries): 5958857pps baseline rbtree (match on first field only): 3972543pps set with 100 full, ranged entries: 5032332pps net6,port [ OK ] baseline (drop from netdev hook): 9621089pps baseline hash (non-ranged entries): 4784304pps baseline rbtree (match on first field only): 1349369pps set with 1000 full, ranged entries: 2413250pps port,proto [ OK ] baseline (drop from netdev hook): 10821583pps baseline hash (non-ranged entries): 6809399pps baseline rbtree (match on first field only): 2799538pps set with 30000 full, ranged entries: 1921039pps net6,port,mac [ OK ] baseline (drop from netdev hook): 9460996pps baseline hash (non-ranged entries): 3893325pps baseline rbtree (match on first field only): 2919418pps set with 10 full, ranged entries: 2898623pps net6,port,mac,proto [ OK ] baseline (drop from netdev hook): 9578663pps baseline hash (non-ranged entries): 3705263pps baseline rbtree (match on first field only): 1342876pps set with 1000 full, ranged entries: 2005030pps net,mac [ OK ] baseline (drop from netdev hook): 10153020pps baseline hash (non-ranged entries): 5145586pps baseline rbtree (match on first field only): 2664821pps set with 1000 full, ranged entries: 3922839pps - for one Intel Core i7-6600U thread (3.4GHz, 64 KiB L1D$, 512 KiB L2$): TEST: performance net,port [ OK ] baseline (drop from netdev hook): 10229859pps baseline hash (non-ranged entries): 6160930pps baseline rbtree (match on first field only): 2926966pps set with 1000 full, ranged entries: 4812717pps port,net [ OK ] baseline (drop from netdev hook): 10234013pps baseline hash (non-ranged entries): 6164457pps baseline rbtree (match on first field only): 4019270pps set with 100 full, ranged entries: 5072830pps net6,port [ OK ] baseline (drop from netdev hook): 9603512pps baseline hash (non-ranged entries): 4771150pps baseline rbtree (match on first field only): 1610077pps set with 1000 full, ranged entries: 2942229pps port,proto [ OK ] baseline (drop from netdev hook): 10912230pps baseline hash (non-ranged entries): 6906587pps baseline rbtree (match on first field only): 3156167pps set with 30000 full, ranged entries: 2440219pps net6,port,mac [ OK ] baseline (drop from netdev hook): 10020213pps baseline hash (non-ranged entries): 3415258pps baseline rbtree (match on first field only): 3167192pps set with 10 full, ranged entries: 2422204pps net6,port,mac,proto [ OK ] baseline (drop from netdev hook): 9860087pps baseline hash (non-ranged entries): 3883861pps baseline rbtree (match on first field only): 1626784pps set with 1000 full, ranged entries: 2318861pps net,mac [ OK ] baseline (drop from netdev hook): 10313285pps baseline hash (non-ranged entries): 5324213pps baseline rbtree (match on first field only): 2970661pps set with 1000 full, ranged entries: 4128105pps A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. The vectorised implementation is automatically selected whenever the AVX2 feature is available, and this can be detected with the following check: [ $(uname -m) = "x86_64" ] && grep -q avx2 /proc/cpuinfo In order to make set selection more explicit and visible, we might at a later time export a different name, by introducing a new attribute, e.g. NFTA_SET_OPS, as suggested by Phil Sutter on netfilter-devel in <20180403211540.23700-3-phil@nwl.cc>. v2: - extend scope of kernel_fpu_begin/end() to protect all accesses to scratch maps (Florian Westphal) - drop rcu_read_lock/unlock() from nft_pipapo_avx2_lookup(), it's already implied (Florian Westphal) - mention in commit message how to check if this set is used Signed-off-by: Stefano Brivio <sbrivio@redhat.com> --- include/net/netfilter/nf_tables_core.h | 1 + net/netfilter/Makefile | 3 + net/netfilter/nf_tables_set_core.c | 6 + net/netfilter/nft_set_pipapo.c | 25 + net/netfilter/nft_set_pipapo_avx2.c | 838 +++++++++++++++++++++++++ net/netfilter/nft_set_pipapo_avx2.h | 14 + 6 files changed, 887 insertions(+) create mode 100644 net/netfilter/nft_set_pipapo_avx2.c create mode 100644 net/netfilter/nft_set_pipapo_avx2.h