@@ -312,16 +312,6 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct)
#define NF_CT_DAY (86400 * HZ)
-/* Set an arbitrary timeout large enough not to ever expire, this save
- * us a check for the IPS_OFFLOAD_BIT from the packet path via
- * nf_ct_is_expired().
- */
-static inline void nf_ct_offload_timeout(struct nf_conn *ct)
-{
- if (nf_ct_expires(ct) < NF_CT_DAY / 2)
- WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY);
-}
-
struct kernel_param;
int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp);
@@ -1498,12 +1498,6 @@ static void gc_worker(struct work_struct *work)
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
- nf_ct_offload_timeout(tmp);
- if (!nf_conntrack_max95)
- continue;
- }
-
if (expired_count > GC_SCAN_EXPIRED_MAX) {
rcu_read_unlock();
@@ -424,6 +424,103 @@ static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
return flow_table->type->gc && flow_table->type->gc(flow);
}
+/**
+ * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
+ * @ct: Flowtable offloaded tcp ct
+ *
+ * Return number of seconds when ct entry should expire.
+ */
+static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
+{
+ u8 state = READ_ONCE(ct->proto.tcp.state);
+
+ switch (state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ case TCP_CONNTRACK_SYN_RECV:
+ return 0;
+ case TCP_CONNTRACK_ESTABLISHED:
+ return NF_CT_DAY;
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ return 5 * 60 * HZ;
+ case TCP_CONNTRACK_CLOSE:
+ return 0;
+ }
+
+ return 0;
+}
+
+/**
+ * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
+ * @ct: Flowtable offloaded ct
+ *
+ * Datapath lookups in the conntrack table will evict nf_conn entries
+ * if they have expired.
+ *
+ * Once nf_conn entries have been offloaded, nf_conntrack might not see any
+ * packets anymore. Thus ct->timeout is no longer refreshed and ct can
+ * be evicted.
+ *
+ * To avoid the need for an additional check on the offload bit for every
+ * packet processed via nf_conntrack_in(), set an arbitrary timeout large
+ * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT
+ * from the packet path via nf_ct_is_expired().
+ */
+static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
+{
+ static const u32 min_timeout = 5 * 60 * HZ;
+ u32 expires = nf_ct_expires(ct);
+
+ /* normal case: large enough timeout, nothing to do. */
+ if (likely(expires >= min_timeout))
+ return;
+
+ /* must check offload bit after this, we do not hold any locks.
+ * flowtable and ct entries could have been removed on another CPU.
+ */
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return;
+
+ /* load ct->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
+ if (nf_ct_is_confirmed(ct) &&
+ test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ u8 l4proto = nf_ct_protonum(ct);
+ u32 new_timeout = true;
+
+ switch (l4proto) {
+ case IPPROTO_UDP:
+ new_timeout = NF_CT_DAY;
+ break;
+ case IPPROTO_TCP:
+ new_timeout = nf_flow_table_tcp_timeout(ct);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /* Update to ct->timeout from nf_conntrack happens
+ * without holding ct->lock.
+ *
+ * Use cmpxchg to ensure timeout extension doesn't
+ * happen when we race with conntrack datapath.
+ *
+ * The inverse -- datapath updating ->timeout right
+ * after this -- is fine, datapath is authoritative.
+ */
+ if (new_timeout) {
+ new_timeout += nfct_time_stamp;
+ cmpxchg(&ct->timeout, expires, new_timeout);
+ }
+ }
+
+ nf_ct_put(ct);
+}
+
static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
struct flow_offload *flow, void *data)
{
@@ -431,6 +528,8 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
nf_ct_is_dying(flow->ct) ||
nf_flow_custom_gc(flow_table, flow))
flow_offload_teardown(flow);
+ else
+ nf_flow_table_extend_ct_timeout(flow->ct);
if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
if (test_bit(NF_FLOW_HW, &flow->flags)) {
Offload nf_conn entries may not see traffic for a very long time. To prevent incorrect 'ct is stale' checks during nf_conntrack table lookup, the gc worker extends the timeout nf_conn entries marked for offload to a large value. The existing logic suffers from a few problems. Garbage collection runs without locks, its unlikely but possible that @ct is removed right after the 'offload' bit test. In that case, the timeout of a new/reallocated nf_conn entry will be increased. Prevent this by obtaining a reference count on the ct object and re-check of the confirmed and offload bits. If those are not set, the ct is being removed, skip the timeout extension in this case. Parallel teardown is also problematic: cpu1 cpu2 gc_worker calls flow_offload_teardown() tests OFFLOAD bit, set clear OFFLOAD bit ct->timeout is repaired (e.g. set to timeout[UDP_CT_REPLIED]) nf_ct_offload_timeout() called expire value is fetched <INTERRUPT> -> NF_CT_DAY timeout for flow that isn't offloaded (and might not see any further packets). Use cmpxchg: if ct->timeout was repaired after the 2nd 'offload bit' test passed, then ct->timeout will only be updated of ct->timeout was not altered in between. As we already have a gc worker for flowtable entries, ct->timeout repair can be handled from the flowtable gc worker. This avoids having flowtable specific logic in the conntrack core and avoids checking entries that were never offloaded. Signed-off-by: Florian Westphal <fw@strlen.de> --- include/net/netfilter/nf_conntrack.h | 10 --- net/netfilter/nf_conntrack_core.c | 6 -- net/netfilter/nf_flow_table_core.c | 99 ++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 16 deletions(-)