diff mbox series

[v7,bpf-next,01/16] bpf: verifier: mark verified-insn with sub-register zext flag

Message ID 1558551312-17081-2-git-send-email-jiong.wang@netronome.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: eliminate zero extensions for sub-register writes | expand

Commit Message

Jiong Wang May 22, 2019, 6:54 p.m. UTC
eBPF ISA specification requires high 32-bit cleared when low 32-bit
sub-register is written. This applies to destination register of ALU32 etc.
JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and
AArch64 ISA has the same semantics, so the corresponding JIT back-end
doesn't need to do extra work.

However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
(PowerPC, SPARC etc) need to do explicit zero extension to meet this
requirement, otherwise code like the following will fail.

  u64_value = (u64) u32_value
  ... other uses of u64_value

This is because compiler could exploit the semantic described above and
save those zero extensions for extending u32_value to u64_value, these JIT
back-ends are expected to guarantee this through inserting extra zero
extensions which however could be a significant increase on the code size.
Some benchmarks show there could be ~40% sub-register writes out of total
insns, meaning at least ~40% extra code-gen.

One observation is these extra zero extensions are not always necessary.
Take above code snippet for example, it is possible u32_value will never be
casted into a u64, the value of high 32-bit of u32_value then could be
ignored and extra zero extension could be eliminated.

This patch implements this idea, insns defining sub-registers will be
marked when the high 32-bit of the defined sub-register matters. For
those unmarked insns, it is safe to eliminate high 32-bit clearnace for
them.

Algo:
 - Split read flags into READ32 and READ64.

 - Record index of insn that does sub-register write. Keep the index inside
   reg state and update it during verifier insn walking.

 - A full register read on a sub-register marks its definition insn as
   needing zero extension on dst register.

   A new sub-register write overrides the old one.

 - When propagating read64 during path pruning, also mark any insn defining
   a sub-register that is read in the pruned path as full-register.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 include/linux/bpf_verifier.h |  14 +++-
 kernel/bpf/verifier.c        | 175 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 173 insertions(+), 16 deletions(-)

Comments

Alexei Starovoitov May 23, 2019, 2:07 a.m. UTC | #1
On Wed, May 22, 2019 at 07:54:57PM +0100, Jiong Wang wrote:
> eBPF ISA specification requires high 32-bit cleared when low 32-bit
> sub-register is written. This applies to destination register of ALU32 etc.
> JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and
> AArch64 ISA has the same semantics, so the corresponding JIT back-end
> doesn't need to do extra work.
> 
> However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
> (PowerPC, SPARC etc) need to do explicit zero extension to meet this
> requirement, otherwise code like the following will fail.
> 
>   u64_value = (u64) u32_value
>   ... other uses of u64_value
> 
> This is because compiler could exploit the semantic described above and
> save those zero extensions for extending u32_value to u64_value, these JIT
> back-ends are expected to guarantee this through inserting extra zero
> extensions which however could be a significant increase on the code size.
> Some benchmarks show there could be ~40% sub-register writes out of total
> insns, meaning at least ~40% extra code-gen.
> 
> One observation is these extra zero extensions are not always necessary.
> Take above code snippet for example, it is possible u32_value will never be
> casted into a u64, the value of high 32-bit of u32_value then could be
> ignored and extra zero extension could be eliminated.
> 
> This patch implements this idea, insns defining sub-registers will be
> marked when the high 32-bit of the defined sub-register matters. For
> those unmarked insns, it is safe to eliminate high 32-bit clearnace for
> them.
> 
> Algo:
>  - Split read flags into READ32 and READ64.
> 
>  - Record index of insn that does sub-register write. Keep the index inside
>    reg state and update it during verifier insn walking.
> 
>  - A full register read on a sub-register marks its definition insn as
>    needing zero extension on dst register.
> 
>    A new sub-register write overrides the old one.
> 
>  - When propagating read64 during path pruning, also mark any insn defining
>    a sub-register that is read in the pruned path as full-register.
> 
> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
> ---
>  include/linux/bpf_verifier.h |  14 +++-
>  kernel/bpf/verifier.c        | 175 +++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 173 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 1305ccb..60fb54e 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -36,9 +36,11 @@
>   */
>  enum bpf_reg_liveness {
>  	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
> -	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
> -	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
> -	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
> +	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
> +	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
> +	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
> +	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
> +	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
>  };
>  
>  struct bpf_reg_state {
> @@ -131,6 +133,11 @@ struct bpf_reg_state {
>  	 * pointing to bpf_func_state.
>  	 */
>  	u32 frameno;
> +	/* Tracks subreg definition. The stored value is the insn_idx of the
> +	 * writing insn. This is safe because subreg_def is used before any insn
> +	 * patching which only happens after main verification finished.
> +	 */
> +	s32 subreg_def;
>  	enum bpf_reg_liveness live;
>  };
>  
> @@ -232,6 +239,7 @@ struct bpf_insn_aux_data {
>  	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
>  	int sanitize_stack_off; /* stack slot to be cleared */
>  	bool seen; /* this insn was processed by the verifier */
> +	bool zext_dst; /* this insn zero extends dst reg */
>  	u8 alu_state; /* used in combination with alu_limit */
>  	unsigned int orig_idx; /* original instruction index */
>  };
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 95f93544..0efccf8 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
>  	__mark_reg_not_init(regs + regno);
>  }
>  
> +#define DEF_NOT_SUBREG	(-1)
>  static void init_reg_state(struct bpf_verifier_env *env,
>  			   struct bpf_func_state *state)
>  {
> @@ -991,6 +992,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
>  		mark_reg_not_init(env, regs, i);
>  		regs[i].live = REG_LIVE_NONE;
>  		regs[i].parent = NULL;
> +		regs[i].subreg_def = DEF_NOT_SUBREG;

shouldn't it be moved into __mark_reg_unknown ?
Also what about my old suggestion to use DEF_NOT_SUBREG==0
to be on safer side when we zero things out?

>  	}
>  
>  	/* frame pointer */
> @@ -1136,7 +1138,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
>   */
>  static int mark_reg_read(struct bpf_verifier_env *env,
>  			 const struct bpf_reg_state *state,
> -			 struct bpf_reg_state *parent)
> +			 struct bpf_reg_state *parent, u8 flag)
>  {
>  	bool writes = parent == state->parent; /* Observe write marks */
>  	int cnt = 0;
> @@ -1151,17 +1153,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>  				parent->var_off.value, parent->off);
>  			return -EFAULT;
>  		}
> -		if (parent->live & REG_LIVE_READ)
> +		/* The first condition is more likely to be true than the
> +		 * second, checked it first.
> +		 */
> +		if ((parent->live & REG_LIVE_READ) == flag ||
> +		    parent->live & REG_LIVE_READ64)
>  			/* The parentage chain never changes and
>  			 * this parent was already marked as LIVE_READ.
>  			 * There is no need to keep walking the chain again and
>  			 * keep re-marking all parents as LIVE_READ.
>  			 * This case happens when the same register is read
>  			 * multiple times without writes into it in-between.
> +			 * Also, if parent has the stronger REG_LIVE_READ64 set,
> +			 * then no need to set the weak REG_LIVE_READ32.
>  			 */
>  			break;
>  		/* ... then we depend on parent's value */
> -		parent->live |= REG_LIVE_READ;
> +		parent->live |= flag;
> +		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
> +		if (flag == REG_LIVE_READ64)
> +			parent->live &= ~REG_LIVE_READ32;
>  		state = parent;
>  		parent = state->parent;
>  		writes = true;
> @@ -1173,12 +1184,111 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>  	return 0;
>  }
>  
> +/* This function is supposed to be used by the following 32-bit optimization
> + * code only. It returns TRUE if the source or destination register operates
> + * on 64-bit, otherwise return FALSE.
> + */
> +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
> +		     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
> +{
> +	u8 code, class, op;
> +
> +	code = insn->code;
> +	class = BPF_CLASS(code);
> +	op = BPF_OP(code);
> +	if (class == BPF_JMP) {
> +		/* BPF_EXIT for "main" will reach here. Return TRUE
> +		 * conservatively.
> +		 */
> +		if (op == BPF_EXIT)
> +			return true;
> +		if (op == BPF_CALL) {
> +			/* BPF to BPF call will reach here because of marking
> +			 * caller saved clobber with DST_OP_NO_MARK for which we
> +			 * don't care the register def because they are anyway
> +			 * marked as NOT_INIT already.
> +			 */
> +			if (insn->src_reg == BPF_PSEUDO_CALL)
> +				return false;
> +			/* Helper call will reach here because of arg type
> +			 * check, conservatively return TRUE.
> +			 */
> +			if (t == SRC_OP)
> +				return true;
> +
> +			return false;
> +		}
> +	}
> +
> +	if (class == BPF_ALU64 || class == BPF_JMP ||
> +	    /* BPF_END always use BPF_ALU class. */
> +	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
> +		return true;
> +
> +	if (class == BPF_ALU || class == BPF_JMP32)
> +		return false;
> +
> +	if (class == BPF_LDX) {
> +		if (t != SRC_OP)
> +			return BPF_SIZE(code) == BPF_DW;
> +		/* LDX source must be ptr. */
> +		return true;
> +	}
> +
> +	if (class == BPF_STX) {
> +		if (reg->type != SCALAR_VALUE)
> +			return true;
> +		return BPF_SIZE(code) == BPF_DW;
> +	}
> +
> +	if (class == BPF_LD) {
> +		u8 mode = BPF_MODE(code);
> +
> +		/* LD_IMM64 */
> +		if (mode == BPF_IMM)
> +			return true;
> +
> +		/* Both LD_IND and LD_ABS return 32-bit data. */
> +		if (t != SRC_OP)
> +			return  false;
> +
> +		/* Implicit ctx ptr. */
> +		if (regno == BPF_REG_6)
> +			return true;

compiler will optimize above 'if' away.
What's the point of the above stmt?

> +
> +		/* Explicit source could be any width. */
> +		return true;
> +	}
> +
> +	if (class == BPF_ST)
> +		/* The only source register for BPF_ST is a ptr. */
> +		return true;

ditto

> +
> +	/* Conservatively return true at default. */
> +	return true;
> +}
> +
> +static void mark_insn_zext(struct bpf_verifier_env *env,
> +			   struct bpf_reg_state *reg)
> +{
> +	s32 def_idx = reg->subreg_def;
> +
> +	if (def_idx == DEF_NOT_SUBREG)
> +		return;
> +
> +	env->insn_aux_data[def_idx].zext_dst = true;
> +	/* The dst will be zero extended, so won't be sub-register anymore. */
> +	reg->subreg_def = DEF_NOT_SUBREG;
> +}
> +
>  static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>  			 enum reg_arg_type t)
>  {
>  	struct bpf_verifier_state *vstate = env->cur_state;
>  	struct bpf_func_state *state = vstate->frame[vstate->curframe];
> +	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
>  	struct bpf_reg_state *reg, *regs = state->regs;
> +	bool rw64;
>  
>  	if (regno >= MAX_BPF_REG) {
>  		verbose(env, "R%d is invalid\n", regno);
> @@ -1186,6 +1296,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>  	}
>  
>  	reg = &regs[regno];
> +	rw64 = is_reg64(env, insn, regno, reg, t);
>  	if (t == SRC_OP) {
>  		/* check whether register used as source operand can be read */
>  		if (reg->type == NOT_INIT) {
> @@ -1196,7 +1307,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>  		if (regno == BPF_REG_FP)
>  			return 0;
>  
> -		return mark_reg_read(env, reg, reg->parent);
> +		if (rw64)
> +			mark_insn_zext(env, reg);
> +
> +		return mark_reg_read(env, reg, reg->parent,
> +				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
>  	} else {
>  		/* check whether register used as dest operand can be written to */
>  		if (regno == BPF_REG_FP) {
> @@ -1204,6 +1319,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>  			return -EACCES;
>  		}
>  		reg->live |= REG_LIVE_WRITTEN;
> +		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx;
>  		if (t == DST_OP)
>  			mark_reg_unknown(env, regs, regno);
>  	}
> @@ -1383,7 +1499,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
>  			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
>  		}
>  		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
> -			      reg_state->stack[spi].spilled_ptr.parent);
> +			      reg_state->stack[spi].spilled_ptr.parent,
> +			      REG_LIVE_READ64);
>  		return 0;
>  	} else {
>  		int zeros = 0;
> @@ -1400,7 +1517,9 @@ static int check_stack_read(struct bpf_verifier_env *env,
>  			return -EACCES;
>  		}
>  		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
> -			      reg_state->stack[spi].spilled_ptr.parent);
> +			      reg_state->stack[spi].spilled_ptr.parent,
> +			      size == BPF_REG_SIZE
> +			      ? REG_LIVE_READ64 : REG_LIVE_READ32);

I don't think that's correct.
It's a parial read from a stack slot that doesn't contain a pointer.
It can be of any size and any offset.
Hence 4-byte read can read upper bits too.

>  		if (value_regno >= 0) {
>  			if (zeros == size) {
>  				/* any size read into register is zero extended,
> @@ -2109,6 +2228,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>  						    value_regno);
>  				if (reg_type_may_be_null(reg_type))
>  					regs[value_regno].id = ++env->id_gen;
> +				/* A load of ctx field could have different
> +				 * actual load size with the one encoded in the
> +				 * insn. When the dst is PTR, it is for sure not
> +				 * a sub-register.
> +				 */
> +				regs[value_regno].subreg_def = DEF_NOT_SUBREG;

because of cases like above I think that DEF_NOT_SUBREG==0 is a safer choice.
To make any mark_reg_* to set it DEF_NOT_SUBREG and only two places
would have to do dst_reg->subreg_def = env->insn_idx + 1;

>  			}
>  			regs[value_regno].type = reg_type;
>  		}
> @@ -2368,7 +2493,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
>  		 * the whole slot to be marked as 'read'
>  		 */
>  		mark_reg_read(env, &state->stack[spi].spilled_ptr,
> -			      state->stack[spi].spilled_ptr.parent);
> +			      state->stack[spi].spilled_ptr.parent,
> +			      access_size == BPF_REG_SIZE
> +			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
>  	}
>  	return update_stack_depth(env, state, min_off);
>  }
> @@ -3332,6 +3459,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
>  		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
>  	}
>  
> +	/* helper call returns 64-bit value. */
> +	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
> +
>  	/* update return register (already marked as written above) */
>  	if (fn->ret_type == RET_INTEGER) {
>  		/* sets type to SCALAR_VALUE */
> @@ -4263,6 +4393,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
>  				 */
>  				*dst_reg = *src_reg;
>  				dst_reg->live |= REG_LIVE_WRITTEN;
> +				dst_reg->subreg_def = DEF_NOT_SUBREG;
>  			} else {
>  				/* R1 = (u32) R2 */
>  				if (is_pointer_value(env, insn->src_reg)) {
> @@ -4273,6 +4404,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
>  				} else if (src_reg->type == SCALAR_VALUE) {
>  					*dst_reg = *src_reg;
>  					dst_reg->live |= REG_LIVE_WRITTEN;
> +					dst_reg->subreg_def = env->insn_idx;
>  				} else {
>  					mark_reg_unknown(env, regs,
>  							 insn->dst_reg);
> @@ -5352,6 +5484,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
>  	 * Already marked as written above.
>  	 */
>  	mark_reg_unknown(env, regs, BPF_REG_0);
> +	/* ld_abs load up to 32-bit skb data. */
> +	regs[BPF_REG_0].subreg_def = env->insn_idx;
>  	return 0;
>  }
>  
> @@ -6292,20 +6426,33 @@ static bool states_equal(struct bpf_verifier_env *env,
>  	return true;
>  }
>  
> +/* Return 0 if no propagation happened. Return negative error code if error
> + * happened. Otherwise, return the propagated bit.
> + */
>  static int propagate_liveness_reg(struct bpf_verifier_env *env,
>  				  struct bpf_reg_state *reg,
>  				  struct bpf_reg_state *parent_reg)
>  {
> +	u8 parent_flag = parent_reg->live & REG_LIVE_READ;
> +	u8 flag = reg->live & REG_LIVE_READ;
>  	int err;
>  
> -	if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ))
> +	/* When comes here, read flags of PARENT_REG or REG could be any of
> +	 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
> +	 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
> +	 */
> +	if (parent_flag == REG_LIVE_READ64 ||
> +	    /* Or if there is no read flag from REG. */
> +	    !flag ||
> +	    /* Or if the read flag from REG is the same as PARENT_REG. */
> +	    parent_flag == flag)
>  		return 0;
>  
> -	err = mark_reg_read(env, reg, parent_reg);
> +	err = mark_reg_read(env, reg, parent_reg, flag);
>  	if (err)
>  		return err;
>  
> -	return 0;
> +	return flag;
>  }
>  
>  /* A write screens off any subsequent reads; but write marks come from the
> @@ -6339,8 +6486,10 @@ static int propagate_liveness(struct bpf_verifier_env *env,
>  		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
>  			err = propagate_liveness_reg(env, &state_reg[i],
>  						     &parent_reg[i]);
> -			if (err)
> +			if (err < 0)
>  				return err;
> +			if (err == REG_LIVE_READ64)
> +				mark_insn_zext(env, &parent_reg[i]);
>  		}
>  
>  		/* Propagate stack slots. */
> @@ -6350,11 +6499,11 @@ static int propagate_liveness(struct bpf_verifier_env *env,
>  			state_reg = &state->stack[i].spilled_ptr;
>  			err = propagate_liveness_reg(env, state_reg,
>  						     parent_reg);
> -			if (err)
> +			if (err < 0)
>  				return err;
>  		}
>  	}
> -	return err;
> +	return 0;
>  }
>  
>  static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
> -- 
> 2.7.4
>
Jiong Wang May 23, 2019, 2:28 p.m. UTC | #2
> On 23 May 2019, at 03:07, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Wed, May 22, 2019 at 07:54:57PM +0100, Jiong Wang wrote:
>> eBPF ISA specification requires high 32-bit cleared when low 32-bit
>> sub-register is written. This applies to destination register of ALU32 etc.
>> JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and
>> AArch64 ISA has the same semantics, so the corresponding JIT back-end
>> doesn't need to do extra work.
>> 
>> However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
>> (PowerPC, SPARC etc) need to do explicit zero extension to meet this
>> requirement, otherwise code like the following will fail.
>> 
>>  u64_value = (u64) u32_value
>>  ... other uses of u64_value
>> 
>> This is because compiler could exploit the semantic described above and
>> save those zero extensions for extending u32_value to u64_value, these JIT
>> back-ends are expected to guarantee this through inserting extra zero
>> extensions which however could be a significant increase on the code size.
>> Some benchmarks show there could be ~40% sub-register writes out of total
>> insns, meaning at least ~40% extra code-gen.
>> 
>> One observation is these extra zero extensions are not always necessary.
>> Take above code snippet for example, it is possible u32_value will never be
>> casted into a u64, the value of high 32-bit of u32_value then could be
>> ignored and extra zero extension could be eliminated.
>> 
>> This patch implements this idea, insns defining sub-registers will be
>> marked when the high 32-bit of the defined sub-register matters. For
>> those unmarked insns, it is safe to eliminate high 32-bit clearnace for
>> them.
>> 
>> Algo:
>> - Split read flags into READ32 and READ64.
>> 
>> - Record index of insn that does sub-register write. Keep the index inside
>>   reg state and update it during verifier insn walking.
>> 
>> - A full register read on a sub-register marks its definition insn as
>>   needing zero extension on dst register.
>> 
>>   A new sub-register write overrides the old one.
>> 
>> - When propagating read64 during path pruning, also mark any insn defining
>>   a sub-register that is read in the pruned path as full-register.
>> 
>> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
>> ---
>> include/linux/bpf_verifier.h |  14 +++-
>> kernel/bpf/verifier.c        | 175 +++++++++++++++++++++++++++++++++++++++----
>> 2 files changed, 173 insertions(+), 16 deletions(-)
>> 
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index 1305ccb..60fb54e 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -36,9 +36,11 @@
>>  */
>> enum bpf_reg_liveness {
>> 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
>> -	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
>> -	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
>> -	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
>> +	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
>> +	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
>> +	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
>> +	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
>> +	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
>> };
>> 
>> struct bpf_reg_state {
>> @@ -131,6 +133,11 @@ struct bpf_reg_state {
>> 	 * pointing to bpf_func_state.
>> 	 */
>> 	u32 frameno;
>> +	/* Tracks subreg definition. The stored value is the insn_idx of the
>> +	 * writing insn. This is safe because subreg_def is used before any insn
>> +	 * patching which only happens after main verification finished.
>> +	 */
>> +	s32 subreg_def;
>> 	enum bpf_reg_liveness live;
>> };
>> 
>> @@ -232,6 +239,7 @@ struct bpf_insn_aux_data {
>> 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
>> 	int sanitize_stack_off; /* stack slot to be cleared */
>> 	bool seen; /* this insn was processed by the verifier */
>> +	bool zext_dst; /* this insn zero extends dst reg */
>> 	u8 alu_state; /* used in combination with alu_limit */
>> 	unsigned int orig_idx; /* original instruction index */
>> };
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 95f93544..0efccf8 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
>> 	__mark_reg_not_init(regs + regno);
>> }
>> 
>> +#define DEF_NOT_SUBREG	(-1)
>> static void init_reg_state(struct bpf_verifier_env *env,
>> 			   struct bpf_func_state *state)
>> {
>> @@ -991,6 +992,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
>> 		mark_reg_not_init(env, regs, i);
>> 		regs[i].live = REG_LIVE_NONE;
>> 		regs[i].parent = NULL;
>> +		regs[i].subreg_def = DEF_NOT_SUBREG;
> 
> shouldn't it be moved into __mark_reg_unknown ?

I had pondered this in early version, but thought it is not correct.
Like “live”, “subreg_def" is initialised inside init_reg_state, they are
not touched by __mark_reg_unknown, instead, they are updated mostly
updated by check_reg_arg which is strictly called at every reg read/write
sites which are places where register def and use happened and are the
places we want to update “subreg_def”.

From my understanding, __mark_reg_unknown could be called when verifier
is not sure about the value of a register, but the call site doesn’t care
about width at the moment. For example, for a narrow loads from stack,
__mark_reg_unknown could be triggered, the dst_reg then will be marked as
DEF_NOT_SUBREG, but it is a subreg define. 

Make sense?


> Also what about my old suggestion to use DEF_NOT_SUBREG==0
> to be on safer side when we zero things out?

Will change DEF_NOT_SUBREG to 0 if no problem shown up.

TBH, I thought about this in initial version, but later used -1
to avoid index + 1 and some other code change issues IIRC.

if we use DEF_NOT_SUBREG == 0, then when we zero things out, we will
mark one register def as a full register, it then won’t be zero extended
in any case.

if we use DEF_NOT_SUBREG == -1, then when we zero things out, one register
will be marked as sub-register def and defined by insn 0, which normally
is a context register move.
  

> 
>> 	}
>> 
>> 	/* frame pointer */
>> @@ -1136,7 +1138,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
>>  */
>> static int mark_reg_read(struct bpf_verifier_env *env,
>> 			 const struct bpf_reg_state *state,
>> -			 struct bpf_reg_state *parent)
>> +			 struct bpf_reg_state *parent, u8 flag)
>> {
>> 	bool writes = parent == state->parent; /* Observe write marks */
>> 	int cnt = 0;
>> @@ -1151,17 +1153,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>> 				parent->var_off.value, parent->off);
>> 			return -EFAULT;
>> 		}
>> -		if (parent->live & REG_LIVE_READ)
>> +		/* The first condition is more likely to be true than the
>> +		 * second, checked it first.
>> +		 */
>> +		if ((parent->live & REG_LIVE_READ) == flag ||
>> +		    parent->live & REG_LIVE_READ64)
>> 			/* The parentage chain never changes and
>> 			 * this parent was already marked as LIVE_READ.
>> 			 * There is no need to keep walking the chain again and
>> 			 * keep re-marking all parents as LIVE_READ.
>> 			 * This case happens when the same register is read
>> 			 * multiple times without writes into it in-between.
>> +			 * Also, if parent has the stronger REG_LIVE_READ64 set,
>> +			 * then no need to set the weak REG_LIVE_READ32.
>> 			 */
>> 			break;
>> 		/* ... then we depend on parent's value */
>> -		parent->live |= REG_LIVE_READ;
>> +		parent->live |= flag;
>> +		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
>> +		if (flag == REG_LIVE_READ64)
>> +			parent->live &= ~REG_LIVE_READ32;
>> 		state = parent;
>> 		parent = state->parent;
>> 		writes = true;
>> @@ -1173,12 +1184,111 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>> 	return 0;
>> }
>> 
>> +/* This function is supposed to be used by the following 32-bit optimization
>> + * code only. It returns TRUE if the source or destination register operates
>> + * on 64-bit, otherwise return FALSE.
>> + */
>> +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
>> +		     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
>> +{
>> +	u8 code, class, op;
>> +
>> +	code = insn->code;
>> +	class = BPF_CLASS(code);
>> +	op = BPF_OP(code);
>> +	if (class == BPF_JMP) {
>> +		/* BPF_EXIT for "main" will reach here. Return TRUE
>> +		 * conservatively.
>> +		 */
>> +		if (op == BPF_EXIT)
>> +			return true;
>> +		if (op == BPF_CALL) {
>> +			/* BPF to BPF call will reach here because of marking
>> +			 * caller saved clobber with DST_OP_NO_MARK for which we
>> +			 * don't care the register def because they are anyway
>> +			 * marked as NOT_INIT already.
>> +			 */
>> +			if (insn->src_reg == BPF_PSEUDO_CALL)
>> +				return false;
>> +			/* Helper call will reach here because of arg type
>> +			 * check, conservatively return TRUE.
>> +			 */
>> +			if (t == SRC_OP)
>> +				return true;
>> +
>> +			return false;
>> +		}
>> +	}
>> +
>> +	if (class == BPF_ALU64 || class == BPF_JMP ||
>> +	    /* BPF_END always use BPF_ALU class. */
>> +	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
>> +		return true;
>> +
>> +	if (class == BPF_ALU || class == BPF_JMP32)
>> +		return false;
>> +
>> +	if (class == BPF_LDX) {
>> +		if (t != SRC_OP)
>> +			return BPF_SIZE(code) == BPF_DW;
>> +		/* LDX source must be ptr. */
>> +		return true;
>> +	}
>> +
>> +	if (class == BPF_STX) {
>> +		if (reg->type != SCALAR_VALUE)
>> +			return true;
>> +		return BPF_SIZE(code) == BPF_DW;
>> +	}
>> +
>> +	if (class == BPF_LD) {
>> +		u8 mode = BPF_MODE(code);
>> +
>> +		/* LD_IMM64 */
>> +		if (mode == BPF_IMM)
>> +			return true;
>> +
>> +		/* Both LD_IND and LD_ABS return 32-bit data. */
>> +		if (t != SRC_OP)
>> +			return  false;
>> +
>> +		/* Implicit ctx ptr. */
>> +		if (regno == BPF_REG_6)
>> +			return true;
> 
> compiler will optimize above 'if' away.
> What's the point of the above stmt?

I implemented this function insn class by class to make sure I had thorough
checks for each insn class and won’t miss some corner case. I think the code
is more clear for maintain. And was making each insn class do full return
after finishing their own checks, tried to avoid fall through, this is why
these “redundant” checks are here.       
                                                                                 
And for this BPF_REG_6 check, BPF_LD insn looked quite special to me, so was     
implementing it following the comments above check_ld_abs case by case, this     
check was for the following comment:                                             
                                                                                 
 * Implicit input:                                                               
 *   ctx == skb == R6 == CTX                                                     
                                                                                 
It is true these checks could be merged, checks inside BPF_LD could be             
even further compressed into single line along with the class == BPF_LD
check but I am thinking that would be a little bit over compressed,
will just drop the following BPF_ST check and this BPF_REG_6 check, I
am thinking avoid fall through might make the code more clear, so will
stop at letting BPF_LD fall through to default return, new code then
will look like the following:

        if (class == BPF_LD) {
                u8 mode = BPF_MODE(code);

                /* LD_IMM64 */
                if (mode == BPF_IMM)
                        return true;

                /* Both LD_IND and LD_ABS return 32-bit data. */
                if (t != SRC_OP)
                        return  false;
              
                /* SRC is implicit ctx ptr which is 64-bit, or explicit source
                 * which could be any width.
                 */
                return true;
        }

        /* Conservatively return true for all the other cases. */
        return true;

Does this looks good?

> 
>> +
>> +		/* Explicit source could be any width. */
>> +		return true;
>> +	}
>> +
>> +	if (class == BPF_ST)
>> +		/* The only source register for BPF_ST is a ptr. */
>> +		return true;
> 
> ditto

> 
>> +
>> +	/* Conservatively return true at default. */
>> +	return true;
>> +}
>> +
>> +static void mark_insn_zext(struct bpf_verifier_env *env,
>> +			   struct bpf_reg_state *reg)
>> +{
>> +	s32 def_idx = reg->subreg_def;
>> +
>> +	if (def_idx == DEF_NOT_SUBREG)
>> +		return;
>> +
>> +	env->insn_aux_data[def_idx].zext_dst = true;
>> +	/* The dst will be zero extended, so won't be sub-register anymore. */
>> +	reg->subreg_def = DEF_NOT_SUBREG;
>> +}
>> +
>> static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>> 			 enum reg_arg_type t)
>> {
>> 	struct bpf_verifier_state *vstate = env->cur_state;
>> 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
>> +	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
>> 	struct bpf_reg_state *reg, *regs = state->regs;
>> +	bool rw64;
>> 
>> 	if (regno >= MAX_BPF_REG) {
>> 		verbose(env, "R%d is invalid\n", regno);
>> @@ -1186,6 +1296,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>> 	}
>> 
>> 	reg = &regs[regno];
>> +	rw64 = is_reg64(env, insn, regno, reg, t);
>> 	if (t == SRC_OP) {
>> 		/* check whether register used as source operand can be read */
>> 		if (reg->type == NOT_INIT) {
>> @@ -1196,7 +1307,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>> 		if (regno == BPF_REG_FP)
>> 			return 0;
>> 
>> -		return mark_reg_read(env, reg, reg->parent);
>> +		if (rw64)
>> +			mark_insn_zext(env, reg);
>> +
>> +		return mark_reg_read(env, reg, reg->parent,
>> +				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
>> 	} else {
>> 		/* check whether register used as dest operand can be written to */
>> 		if (regno == BPF_REG_FP) {
>> @@ -1204,6 +1319,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
>> 			return -EACCES;
>> 		}
>> 		reg->live |= REG_LIVE_WRITTEN;
>> +		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx;
>> 		if (t == DST_OP)
>> 			mark_reg_unknown(env, regs, regno);
>> 	}
>> @@ -1383,7 +1499,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
>> 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
>> 		}
>> 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
>> -			      reg_state->stack[spi].spilled_ptr.parent);
>> +			      reg_state->stack[spi].spilled_ptr.parent,
>> +			      REG_LIVE_READ64);
>> 		return 0;
>> 	} else {
>> 		int zeros = 0;
>> @@ -1400,7 +1517,9 @@ static int check_stack_read(struct bpf_verifier_env *env,
>> 			return -EACCES;
>> 		}
>> 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
>> -			      reg_state->stack[spi].spilled_ptr.parent);
>> +			      reg_state->stack[spi].spilled_ptr.parent,
>> +			      size == BPF_REG_SIZE
>> +			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
> 
> I don't think that's correct.
> It's a parial read from a stack slot that doesn't contain a pointer.
> It can be of any size and any offset.
> Hence 4-byte read can read upper bits too.

That’s true. But I think this won’t cause correctness issue. Because
for stack read, we call mark_reg_read mostly to set the read bit in
“live”, and it is used by stacksafe to know whether one stack slot
is ever used or not. Either REG_LIVE_READ64 or REG_LIVE_READ32 will
let the slot marked as “read”, and let stacksafe know it is used.

For stack content, if it comes from eBPF register A, then it must have
been stored by STX to stack which should have triggered zero extension
on the original A if it is a sub-register. So the content is correct.

Here, the issue is stack slot read is reusing mark_reg_read, the last
parameter for stack slot read just need to let mark_reg_read know this
slot has been read, perhaps just pass REG_LIVE_READ64 for all stack
read?
    
> 
>> 		if (value_regno >= 0) {
>> 			if (zeros == size) {
>> 				/* any size read into register is zero extended,
>> @@ -2109,6 +2228,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>> 						    value_regno);
>> 				if (reg_type_may_be_null(reg_type))
>> 					regs[value_regno].id = ++env->id_gen;
>> +				/* A load of ctx field could have different
>> +				 * actual load size with the one encoded in the
>> +				 * insn. When the dst is PTR, it is for sure not
>> +				 * a sub-register.
>> +				 */
>> +				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
> 
> because of cases like above I think that DEF_NOT_SUBREG==0 is a safer choice.
> To make any mark_reg_* to set it DEF_NOT_SUBREG and only two places
> would have to do dst_reg->subreg_def = env->insn_idx + 1;

> 
>> 			}
>> 			regs[value_regno].type = reg_type;
>> 		}
>> @@ -2368,7 +2493,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
>> 		 * the whole slot to be marked as 'read'
>> 		 */
>> 		mark_reg_read(env, &state->stack[spi].spilled_ptr,
>> -			      state->stack[spi].spilled_ptr.parent);
>> +			      state->stack[spi].spilled_ptr.parent,
>> +			      access_size == BPF_REG_SIZE
>> +			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
>> 	}
>> 	return update_stack_depth(env, state, min_off);
>> }
>> @@ -3332,6 +3459,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
>> 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
>> 	}
>> 
>> +	/* helper call returns 64-bit value. */
>> +	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
>> +
>> 	/* update return register (already marked as written above) */
>> 	if (fn->ret_type == RET_INTEGER) {
>> 		/* sets type to SCALAR_VALUE */
>> @@ -4263,6 +4393,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
>> 				 */
>> 				*dst_reg = *src_reg;
>> 				dst_reg->live |= REG_LIVE_WRITTEN;
>> +				dst_reg->subreg_def = DEF_NOT_SUBREG;
>> 			} else {
>> 				/* R1 = (u32) R2 */
>> 				if (is_pointer_value(env, insn->src_reg)) {
>> @@ -4273,6 +4404,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
>> 				} else if (src_reg->type == SCALAR_VALUE) {
>> 					*dst_reg = *src_reg;
>> 					dst_reg->live |= REG_LIVE_WRITTEN;
>> +					dst_reg->subreg_def = env->insn_idx;
>> 				} else {
>> 					mark_reg_unknown(env, regs,
>> 							 insn->dst_reg);
>> @@ -5352,6 +5484,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
>> 	 * Already marked as written above.
>> 	 */
>> 	mark_reg_unknown(env, regs, BPF_REG_0);
>> +	/* ld_abs load up to 32-bit skb data. */
>> +	regs[BPF_REG_0].subreg_def = env->insn_idx;
>> 	return 0;
>> }
>> 
>> @@ -6292,20 +6426,33 @@ static bool states_equal(struct bpf_verifier_env *env,
>> 	return true;
>> }
>> 
>> +/* Return 0 if no propagation happened. Return negative error code if error
>> + * happened. Otherwise, return the propagated bit.
>> + */
>> static int propagate_liveness_reg(struct bpf_verifier_env *env,
>> 				  struct bpf_reg_state *reg,
>> 				  struct bpf_reg_state *parent_reg)
>> {
>> +	u8 parent_flag = parent_reg->live & REG_LIVE_READ;
>> +	u8 flag = reg->live & REG_LIVE_READ;
>> 	int err;
>> 
>> -	if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ))
>> +	/* When comes here, read flags of PARENT_REG or REG could be any of
>> +	 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
>> +	 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
>> +	 */
>> +	if (parent_flag == REG_LIVE_READ64 ||
>> +	    /* Or if there is no read flag from REG. */
>> +	    !flag ||
>> +	    /* Or if the read flag from REG is the same as PARENT_REG. */
>> +	    parent_flag == flag)
>> 		return 0;
>> 
>> -	err = mark_reg_read(env, reg, parent_reg);
>> +	err = mark_reg_read(env, reg, parent_reg, flag);
>> 	if (err)
>> 		return err;
>> 
>> -	return 0;
>> +	return flag;
>> }
>> 
>> /* A write screens off any subsequent reads; but write marks come from the
>> @@ -6339,8 +6486,10 @@ static int propagate_liveness(struct bpf_verifier_env *env,
>> 		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
>> 			err = propagate_liveness_reg(env, &state_reg[i],
>> 						     &parent_reg[i]);
>> -			if (err)
>> +			if (err < 0)
>> 				return err;
>> +			if (err == REG_LIVE_READ64)
>> +				mark_insn_zext(env, &parent_reg[i]);
>> 		}
>> 
>> 		/* Propagate stack slots. */
>> @@ -6350,11 +6499,11 @@ static int propagate_liveness(struct bpf_verifier_env *env,
>> 			state_reg = &state->stack[i].spilled_ptr;
>> 			err = propagate_liveness_reg(env, state_reg,
>> 						     parent_reg);
>> -			if (err)
>> +			if (err < 0)
>> 				return err;
>> 		}
>> 	}
>> -	return err;
>> +	return 0;
>> }
>> 
>> static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
>> -- 
>> 2.7.4
Alexei Starovoitov May 23, 2019, 4:16 p.m. UTC | #3
On Thu, May 23, 2019 at 03:28:15PM +0100, Jiong Wang wrote:
> 
> > On 23 May 2019, at 03:07, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > 
> > On Wed, May 22, 2019 at 07:54:57PM +0100, Jiong Wang wrote:
> >> eBPF ISA specification requires high 32-bit cleared when low 32-bit
> >> sub-register is written. This applies to destination register of ALU32 etc.
> >> JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and
> >> AArch64 ISA has the same semantics, so the corresponding JIT back-end
> >> doesn't need to do extra work.
> >> 
> >> However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
> >> (PowerPC, SPARC etc) need to do explicit zero extension to meet this
> >> requirement, otherwise code like the following will fail.
> >> 
> >>  u64_value = (u64) u32_value
> >>  ... other uses of u64_value
> >> 
> >> This is because compiler could exploit the semantic described above and
> >> save those zero extensions for extending u32_value to u64_value, these JIT
> >> back-ends are expected to guarantee this through inserting extra zero
> >> extensions which however could be a significant increase on the code size.
> >> Some benchmarks show there could be ~40% sub-register writes out of total
> >> insns, meaning at least ~40% extra code-gen.
> >> 
> >> One observation is these extra zero extensions are not always necessary.
> >> Take above code snippet for example, it is possible u32_value will never be
> >> casted into a u64, the value of high 32-bit of u32_value then could be
> >> ignored and extra zero extension could be eliminated.
> >> 
> >> This patch implements this idea, insns defining sub-registers will be
> >> marked when the high 32-bit of the defined sub-register matters. For
> >> those unmarked insns, it is safe to eliminate high 32-bit clearnace for
> >> them.
> >> 
> >> Algo:
> >> - Split read flags into READ32 and READ64.
> >> 
> >> - Record index of insn that does sub-register write. Keep the index inside
> >>   reg state and update it during verifier insn walking.
> >> 
> >> - A full register read on a sub-register marks its definition insn as
> >>   needing zero extension on dst register.
> >> 
> >>   A new sub-register write overrides the old one.
> >> 
> >> - When propagating read64 during path pruning, also mark any insn defining
> >>   a sub-register that is read in the pruned path as full-register.
> >> 
> >> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> >> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
> >> ---
> >> include/linux/bpf_verifier.h |  14 +++-
> >> kernel/bpf/verifier.c        | 175 +++++++++++++++++++++++++++++++++++++++----
> >> 2 files changed, 173 insertions(+), 16 deletions(-)
> >> 
> >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> >> index 1305ccb..60fb54e 100644
> >> --- a/include/linux/bpf_verifier.h
> >> +++ b/include/linux/bpf_verifier.h
> >> @@ -36,9 +36,11 @@
> >>  */
> >> enum bpf_reg_liveness {
> >> 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
> >> -	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
> >> -	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
> >> -	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
> >> +	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
> >> +	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
> >> +	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
> >> +	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
> >> +	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
> >> };
> >> 
> >> struct bpf_reg_state {
> >> @@ -131,6 +133,11 @@ struct bpf_reg_state {
> >> 	 * pointing to bpf_func_state.
> >> 	 */
> >> 	u32 frameno;
> >> +	/* Tracks subreg definition. The stored value is the insn_idx of the
> >> +	 * writing insn. This is safe because subreg_def is used before any insn
> >> +	 * patching which only happens after main verification finished.
> >> +	 */
> >> +	s32 subreg_def;
> >> 	enum bpf_reg_liveness live;
> >> };
> >> 
> >> @@ -232,6 +239,7 @@ struct bpf_insn_aux_data {
> >> 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
> >> 	int sanitize_stack_off; /* stack slot to be cleared */
> >> 	bool seen; /* this insn was processed by the verifier */
> >> +	bool zext_dst; /* this insn zero extends dst reg */
> >> 	u8 alu_state; /* used in combination with alu_limit */
> >> 	unsigned int orig_idx; /* original instruction index */
> >> };
> >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >> index 95f93544..0efccf8 100644
> >> --- a/kernel/bpf/verifier.c
> >> +++ b/kernel/bpf/verifier.c
> >> @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
> >> 	__mark_reg_not_init(regs + regno);
> >> }
> >> 
> >> +#define DEF_NOT_SUBREG	(-1)
> >> static void init_reg_state(struct bpf_verifier_env *env,
> >> 			   struct bpf_func_state *state)
> >> {
> >> @@ -991,6 +992,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
> >> 		mark_reg_not_init(env, regs, i);
> >> 		regs[i].live = REG_LIVE_NONE;
> >> 		regs[i].parent = NULL;
> >> +		regs[i].subreg_def = DEF_NOT_SUBREG;
> > 
> > shouldn't it be moved into __mark_reg_unknown ?
> 
> I had pondered this in early version, but thought it is not correct.
> Like “live”, “subreg_def" is initialised inside init_reg_state, they are
> not touched by __mark_reg_unknown, instead, they are updated mostly
> updated by check_reg_arg which is strictly called at every reg read/write
> sites which are places where register def and use happened and are the
> places we want to update “subreg_def”.
> 
> From my understanding, __mark_reg_unknown could be called when verifier
> is not sure about the value of a register, but the call site doesn’t care
> about width at the moment. For example, for a narrow loads from stack,
> __mark_reg_unknown could be triggered, the dst_reg then will be marked as
> DEF_NOT_SUBREG, but it is a subreg define. 
> 
> Make sense?

yes.

> > Also what about my old suggestion to use DEF_NOT_SUBREG==0
> > to be on safer side when we zero things out?
> 
> Will change DEF_NOT_SUBREG to 0 if no problem shown up.
> 
> TBH, I thought about this in initial version, but later used -1
> to avoid index + 1 and some other code change issues IIRC.
> 
> if we use DEF_NOT_SUBREG == 0, then when we zero things out, we will
> mark one register def as a full register, it then won’t be zero extended
> in any case.
> 
> if we use DEF_NOT_SUBREG == -1, then when we zero things out, one register
> will be marked as sub-register def and defined by insn 0, which normally
> is a context register move.

'normally is a context register' is a dangerous assumption to make.
that's exactly the reason I prefer DEF_NOT_SUBREG == 0

> 
> > 
> >> 	}
> >> 
> >> 	/* frame pointer */
> >> @@ -1136,7 +1138,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
> >>  */
> >> static int mark_reg_read(struct bpf_verifier_env *env,
> >> 			 const struct bpf_reg_state *state,
> >> -			 struct bpf_reg_state *parent)
> >> +			 struct bpf_reg_state *parent, u8 flag)
> >> {
> >> 	bool writes = parent == state->parent; /* Observe write marks */
> >> 	int cnt = 0;
> >> @@ -1151,17 +1153,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
> >> 				parent->var_off.value, parent->off);
> >> 			return -EFAULT;
> >> 		}
> >> -		if (parent->live & REG_LIVE_READ)
> >> +		/* The first condition is more likely to be true than the
> >> +		 * second, checked it first.
> >> +		 */
> >> +		if ((parent->live & REG_LIVE_READ) == flag ||
> >> +		    parent->live & REG_LIVE_READ64)
> >> 			/* The parentage chain never changes and
> >> 			 * this parent was already marked as LIVE_READ.
> >> 			 * There is no need to keep walking the chain again and
> >> 			 * keep re-marking all parents as LIVE_READ.
> >> 			 * This case happens when the same register is read
> >> 			 * multiple times without writes into it in-between.
> >> +			 * Also, if parent has the stronger REG_LIVE_READ64 set,
> >> +			 * then no need to set the weak REG_LIVE_READ32.
> >> 			 */
> >> 			break;
> >> 		/* ... then we depend on parent's value */
> >> -		parent->live |= REG_LIVE_READ;
> >> +		parent->live |= flag;
> >> +		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
> >> +		if (flag == REG_LIVE_READ64)
> >> +			parent->live &= ~REG_LIVE_READ32;
> >> 		state = parent;
> >> 		parent = state->parent;
> >> 		writes = true;
> >> @@ -1173,12 +1184,111 @@ static int mark_reg_read(struct bpf_verifier_env *env,
> >> 	return 0;
> >> }
> >> 
> >> +/* This function is supposed to be used by the following 32-bit optimization
> >> + * code only. It returns TRUE if the source or destination register operates
> >> + * on 64-bit, otherwise return FALSE.
> >> + */
> >> +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
> >> +		     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
> >> +{
> >> +	u8 code, class, op;
> >> +
> >> +	code = insn->code;
> >> +	class = BPF_CLASS(code);
> >> +	op = BPF_OP(code);
> >> +	if (class == BPF_JMP) {
> >> +		/* BPF_EXIT for "main" will reach here. Return TRUE
> >> +		 * conservatively.
> >> +		 */
> >> +		if (op == BPF_EXIT)
> >> +			return true;
> >> +		if (op == BPF_CALL) {
> >> +			/* BPF to BPF call will reach here because of marking
> >> +			 * caller saved clobber with DST_OP_NO_MARK for which we
> >> +			 * don't care the register def because they are anyway
> >> +			 * marked as NOT_INIT already.
> >> +			 */
> >> +			if (insn->src_reg == BPF_PSEUDO_CALL)
> >> +				return false;
> >> +			/* Helper call will reach here because of arg type
> >> +			 * check, conservatively return TRUE.
> >> +			 */
> >> +			if (t == SRC_OP)
> >> +				return true;
> >> +
> >> +			return false;
> >> +		}
> >> +	}
> >> +
> >> +	if (class == BPF_ALU64 || class == BPF_JMP ||
> >> +	    /* BPF_END always use BPF_ALU class. */
> >> +	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
> >> +		return true;
> >> +
> >> +	if (class == BPF_ALU || class == BPF_JMP32)
> >> +		return false;
> >> +
> >> +	if (class == BPF_LDX) {
> >> +		if (t != SRC_OP)
> >> +			return BPF_SIZE(code) == BPF_DW;
> >> +		/* LDX source must be ptr. */
> >> +		return true;
> >> +	}
> >> +
> >> +	if (class == BPF_STX) {
> >> +		if (reg->type != SCALAR_VALUE)
> >> +			return true;
> >> +		return BPF_SIZE(code) == BPF_DW;
> >> +	}
> >> +
> >> +	if (class == BPF_LD) {
> >> +		u8 mode = BPF_MODE(code);
> >> +
> >> +		/* LD_IMM64 */
> >> +		if (mode == BPF_IMM)
> >> +			return true;
> >> +
> >> +		/* Both LD_IND and LD_ABS return 32-bit data. */
> >> +		if (t != SRC_OP)
> >> +			return  false;
> >> +
> >> +		/* Implicit ctx ptr. */
> >> +		if (regno == BPF_REG_6)
> >> +			return true;
> > 
> > compiler will optimize above 'if' away.
> > What's the point of the above stmt?
> 
> I implemented this function insn class by class to make sure I had thorough
> checks for each insn class and won’t miss some corner case. I think the code
> is more clear for maintain. And was making each insn class do full return
> after finishing their own checks, tried to avoid fall through, this is why
> these “redundant” checks are here.       
>                                                                                  
> And for this BPF_REG_6 check, BPF_LD insn looked quite special to me, so was     
> implementing it following the comments above check_ld_abs case by case, this     
> check was for the following comment:                                             
>                                                                                  
>  * Implicit input:                                                               
>  *   ctx == skb == R6 == CTX                                                     
>                                                                                  
> It is true these checks could be merged, checks inside BPF_LD could be             
> even further compressed into single line along with the class == BPF_LD
> check but I am thinking that would be a little bit over compressed,
> will just drop the following BPF_ST check and this BPF_REG_6 check, I
> am thinking avoid fall through might make the code more clear, so will
> stop at letting BPF_LD fall through to default return, new code then
> will look like the following:
> 
>         if (class == BPF_LD) {
>                 u8 mode = BPF_MODE(code);
> 
>                 /* LD_IMM64 */
>                 if (mode == BPF_IMM)
>                         return true;
> 
>                 /* Both LD_IND and LD_ABS return 32-bit data. */
>                 if (t != SRC_OP)
>                         return  false;
>               
>                 /* SRC is implicit ctx ptr which is 64-bit, or explicit source
>                  * which could be any width.
>                  */
>                 return true;
>         }
> 
>         /* Conservatively return true for all the other cases. */
>         return true;
> 
> Does this looks good?

well, it made me realize that we're probably doing it wrong,
since after calling check_reg_arg() we need to re-parse insn encoding.
How about we change check_reg_arg()'s enum reg_arg_type instead?
The caller has much more context and no need to parse insn opcode again.
Something like:
enum reg_arg_type {
        SRC_OP64,        
        DST_OP64,       
        DST_OP_NO_MARK, // probably no need to split this one ?
        SRC_OP32,      
        DST_OP32,      
};

> > 
> >> +
> >> +		/* Explicit source could be any width. */
> >> +		return true;
> >> +	}
> >> +
> >> +	if (class == BPF_ST)
> >> +		/* The only source register for BPF_ST is a ptr. */
> >> +		return true;
> > 
> > ditto
> 
> > 
> >> +
> >> +	/* Conservatively return true at default. */
> >> +	return true;
> >> +}
> >> +
> >> +static void mark_insn_zext(struct bpf_verifier_env *env,
> >> +			   struct bpf_reg_state *reg)
> >> +{
> >> +	s32 def_idx = reg->subreg_def;
> >> +
> >> +	if (def_idx == DEF_NOT_SUBREG)
> >> +		return;
> >> +
> >> +	env->insn_aux_data[def_idx].zext_dst = true;
> >> +	/* The dst will be zero extended, so won't be sub-register anymore. */
> >> +	reg->subreg_def = DEF_NOT_SUBREG;
> >> +}
> >> +
> >> static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
> >> 			 enum reg_arg_type t)
> >> {
> >> 	struct bpf_verifier_state *vstate = env->cur_state;
> >> 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
> >> +	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
> >> 	struct bpf_reg_state *reg, *regs = state->regs;
> >> +	bool rw64;
> >> 
> >> 	if (regno >= MAX_BPF_REG) {
> >> 		verbose(env, "R%d is invalid\n", regno);
> >> @@ -1186,6 +1296,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
> >> 	}
> >> 
> >> 	reg = &regs[regno];
> >> +	rw64 = is_reg64(env, insn, regno, reg, t);
> >> 	if (t == SRC_OP) {
> >> 		/* check whether register used as source operand can be read */
> >> 		if (reg->type == NOT_INIT) {
> >> @@ -1196,7 +1307,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
> >> 		if (regno == BPF_REG_FP)
> >> 			return 0;
> >> 
> >> -		return mark_reg_read(env, reg, reg->parent);
> >> +		if (rw64)
> >> +			mark_insn_zext(env, reg);
> >> +
> >> +		return mark_reg_read(env, reg, reg->parent,
> >> +				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
> >> 	} else {
> >> 		/* check whether register used as dest operand can be written to */
> >> 		if (regno == BPF_REG_FP) {
> >> @@ -1204,6 +1319,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
> >> 			return -EACCES;
> >> 		}
> >> 		reg->live |= REG_LIVE_WRITTEN;
> >> +		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx;
> >> 		if (t == DST_OP)
> >> 			mark_reg_unknown(env, regs, regno);
> >> 	}
> >> @@ -1383,7 +1499,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
> >> 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
> >> 		}
> >> 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
> >> -			      reg_state->stack[spi].spilled_ptr.parent);
> >> +			      reg_state->stack[spi].spilled_ptr.parent,
> >> +			      REG_LIVE_READ64);
> >> 		return 0;
> >> 	} else {
> >> 		int zeros = 0;
> >> @@ -1400,7 +1517,9 @@ static int check_stack_read(struct bpf_verifier_env *env,
> >> 			return -EACCES;
> >> 		}
> >> 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
> >> -			      reg_state->stack[spi].spilled_ptr.parent);
> >> +			      reg_state->stack[spi].spilled_ptr.parent,
> >> +			      size == BPF_REG_SIZE
> >> +			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
> > 
> > I don't think that's correct.
> > It's a parial read from a stack slot that doesn't contain a pointer.
> > It can be of any size and any offset.
> > Hence 4-byte read can read upper bits too.
> 
> That’s true. But I think this won’t cause correctness issue. Because
> for stack read, we call mark_reg_read mostly to set the read bit in
> “live”, and it is used by stacksafe to know whether one stack slot
> is ever used or not. Either REG_LIVE_READ64 or REG_LIVE_READ32 will
> let the slot marked as “read”, and let stacksafe know it is used.

yes. that's technically correct, but very fragile.

> For stack content, if it comes from eBPF register A, then it must have
> been stored by STX to stack which should have triggered zero extension
> on the original A if it is a sub-register. So the content is correct.
> 
> Here, the issue is stack slot read is reusing mark_reg_read, the last
> parameter for stack slot read just need to let mark_reg_read know this
> slot has been read, perhaps just pass REG_LIVE_READ64 for all stack
> read?

yes. please.
Jiong Wang May 23, 2019, 8:20 p.m. UTC | #4
Alexei Starovoitov writes:

<snip>

> well, it made me realize that we're probably doing it wrong,
> since after calling check_reg_arg() we need to re-parse insn encoding.
> How about we change check_reg_arg()'s enum reg_arg_type instead?

This is exactly what I had implemented in my initial internal version.

> The caller has much more context and no need to parse insn opcode again.

And I had exactly the same thoughts, is_reg64 is duplicating what has been
done.

The code evolved into the current shape mostly because I agreed if we
re-centre all checks inside check_reg_arg, then we don't need to touch
quite a few call sites of check_reg_arg, the change/patch looks simpler,
and I was thinking is_reg64 is a quick check, so the overhead is not big.

> Something like:
> enum reg_arg_type {
>         SRC_OP64,        
>         DST_OP64,       
>         DST_OP_NO_MARK, // probably no need to split this one ?
>         SRC_OP32,      
>         DST_OP32,      
> };
>

Yeah, no need to split DST_OP_NO_MARK, my split was

enum reg_arg_type {
   SRC_OP,
+  SRC32_OP,
   DST_OP,
+  DST32_OP,
   DST_OP_NO_MARK 
}

No renaming on existing SRC/DST_OP, they mean 64-bit, the changes are
smaller, looks better?

But, we also need to know whether one patch-insn define sub-register, if it
is, we then conservatively mark it as needing zero extension. patch-insn
doesn't go through check_reg_arg analysis, so still need separate insn
parsing.

And because we also introduced hi32 rand which should only happen if
insn_aux.zext_dst is false. But when zext_dst is false, there are two
situations, one is this insn define a sub-register whose hi32 is not used
later, the other is this insn define a full 64-bit register. hi32 rand
should only happen on the prior situation. So is_reg64 also called to rule
out the latter. The use of is_64 could be removed by changing aux.zext_dst
from bool to enum, so we rename it to aux.reg_def, its value could be:

  REG_DEF_NONE (some insn doesn't define value, this is default)
  REG_DEF64
  REG_DEF32
  REG_DEF32_ZEXT

When calling check_reg_arg, and DST/DST_32 will initialize aux.reg_def into
REG_DEF64 and REG_DEF32 accordingly. Then, a later 64-bit use on sub-register
could promote REG_DEF32 into REG_DEF32_ZEXT.

In all, my propose changes are:
  1. split enum reg_arg_type, adding new SRC32_OP and DST32_OP, during insn
     walking, let call sites of check_reg_arg passing correct type
     directly, remove insn re-parsing inside check_reg_arg.
  2. keep "is_reg64", it will be used by parsing patched-insn.
  3. change aux.zext_dst to aux.reg_def, and change the type from bool to
     the enum listed above. When promoting one reg to REG_DEF32_ZEXT, also
     do sanity check, the promotion should only happen on REG_DEF32.

Does this looks good?

Regards,
Jiong
Alexei Starovoitov May 23, 2019, 8:42 p.m. UTC | #5
On Thu, May 23, 2019 at 09:20:15PM +0100, Jiong Wang wrote:
> 
> Alexei Starovoitov writes:
> 
> <snip>
> 
> > well, it made me realize that we're probably doing it wrong,
> > since after calling check_reg_arg() we need to re-parse insn encoding.
> > How about we change check_reg_arg()'s enum reg_arg_type instead?
> 
> This is exactly what I had implemented in my initial internal version.

it was long ago :) shrinkers purged it.

> > The caller has much more context and no need to parse insn opcode again.
> 
> And I had exactly the same thoughts, is_reg64 is duplicating what has been
> done.
> 
> The code evolved into the current shape mostly because I agreed if we
> re-centre all checks inside check_reg_arg, then we don't need to touch
> quite a few call sites of check_reg_arg, the change/patch looks simpler,
> and I was thinking is_reg64 is a quick check, so the overhead is not big.
> 
> > Something like:
> > enum reg_arg_type {
> >         SRC_OP64,        
> >         DST_OP64,       
> >         DST_OP_NO_MARK, // probably no need to split this one ?
> >         SRC_OP32,      
> >         DST_OP32,      
> > };
> >
> 
> Yeah, no need to split DST_OP_NO_MARK, my split was
> 
> enum reg_arg_type {
>    SRC_OP,
> +  SRC32_OP,
>    DST_OP,
> +  DST32_OP,
>    DST_OP_NO_MARK 
> }
> 
> No renaming on existing SRC/DST_OP, they mean 64-bit, the changes are
> smaller, looks better?
> 
> But, we also need to know whether one patch-insn define sub-register, if it
> is, we then conservatively mark it as needing zero extension. patch-insn
> doesn't go through check_reg_arg analysis, so still need separate insn
> parsing.

Good point.
Then let's stick with your last is_reg64().
Re-parsing is annoying, but looks like there is already use case for it
and more can appear in the future.
diff mbox series

Patch

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1305ccb..60fb54e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -36,9 +36,11 @@ 
  */
 enum bpf_reg_liveness {
 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
-	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
-	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
-	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
+	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
+	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
+	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
+	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
+	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
 };
 
 struct bpf_reg_state {
@@ -131,6 +133,11 @@  struct bpf_reg_state {
 	 * pointing to bpf_func_state.
 	 */
 	u32 frameno;
+	/* Tracks subreg definition. The stored value is the insn_idx of the
+	 * writing insn. This is safe because subreg_def is used before any insn
+	 * patching which only happens after main verification finished.
+	 */
+	s32 subreg_def;
 	enum bpf_reg_liveness live;
 };
 
@@ -232,6 +239,7 @@  struct bpf_insn_aux_data {
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
+	bool zext_dst; /* this insn zero extends dst reg */
 	u8 alu_state; /* used in combination with alu_limit */
 	unsigned int orig_idx; /* original instruction index */
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 95f93544..0efccf8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -981,6 +981,7 @@  static void mark_reg_not_init(struct bpf_verifier_env *env,
 	__mark_reg_not_init(regs + regno);
 }
 
+#define DEF_NOT_SUBREG	(-1)
 static void init_reg_state(struct bpf_verifier_env *env,
 			   struct bpf_func_state *state)
 {
@@ -991,6 +992,7 @@  static void init_reg_state(struct bpf_verifier_env *env,
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 		regs[i].parent = NULL;
+		regs[i].subreg_def = DEF_NOT_SUBREG;
 	}
 
 	/* frame pointer */
@@ -1136,7 +1138,7 @@  static int check_subprogs(struct bpf_verifier_env *env)
  */
 static int mark_reg_read(struct bpf_verifier_env *env,
 			 const struct bpf_reg_state *state,
-			 struct bpf_reg_state *parent)
+			 struct bpf_reg_state *parent, u8 flag)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 	int cnt = 0;
@@ -1151,17 +1153,26 @@  static int mark_reg_read(struct bpf_verifier_env *env,
 				parent->var_off.value, parent->off);
 			return -EFAULT;
 		}
-		if (parent->live & REG_LIVE_READ)
+		/* The first condition is more likely to be true than the
+		 * second, checked it first.
+		 */
+		if ((parent->live & REG_LIVE_READ) == flag ||
+		    parent->live & REG_LIVE_READ64)
 			/* The parentage chain never changes and
 			 * this parent was already marked as LIVE_READ.
 			 * There is no need to keep walking the chain again and
 			 * keep re-marking all parents as LIVE_READ.
 			 * This case happens when the same register is read
 			 * multiple times without writes into it in-between.
+			 * Also, if parent has the stronger REG_LIVE_READ64 set,
+			 * then no need to set the weak REG_LIVE_READ32.
 			 */
 			break;
 		/* ... then we depend on parent's value */
-		parent->live |= REG_LIVE_READ;
+		parent->live |= flag;
+		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
+		if (flag == REG_LIVE_READ64)
+			parent->live &= ~REG_LIVE_READ32;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -1173,12 +1184,111 @@  static int mark_reg_read(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* This function is supposed to be used by the following 32-bit optimization
+ * code only. It returns TRUE if the source or destination register operates
+ * on 64-bit, otherwise return FALSE.
+ */
+static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+		     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+{
+	u8 code, class, op;
+
+	code = insn->code;
+	class = BPF_CLASS(code);
+	op = BPF_OP(code);
+	if (class == BPF_JMP) {
+		/* BPF_EXIT for "main" will reach here. Return TRUE
+		 * conservatively.
+		 */
+		if (op == BPF_EXIT)
+			return true;
+		if (op == BPF_CALL) {
+			/* BPF to BPF call will reach here because of marking
+			 * caller saved clobber with DST_OP_NO_MARK for which we
+			 * don't care the register def because they are anyway
+			 * marked as NOT_INIT already.
+			 */
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				return false;
+			/* Helper call will reach here because of arg type
+			 * check, conservatively return TRUE.
+			 */
+			if (t == SRC_OP)
+				return true;
+
+			return false;
+		}
+	}
+
+	if (class == BPF_ALU64 || class == BPF_JMP ||
+	    /* BPF_END always use BPF_ALU class. */
+	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
+		return true;
+
+	if (class == BPF_ALU || class == BPF_JMP32)
+		return false;
+
+	if (class == BPF_LDX) {
+		if (t != SRC_OP)
+			return BPF_SIZE(code) == BPF_DW;
+		/* LDX source must be ptr. */
+		return true;
+	}
+
+	if (class == BPF_STX) {
+		if (reg->type != SCALAR_VALUE)
+			return true;
+		return BPF_SIZE(code) == BPF_DW;
+	}
+
+	if (class == BPF_LD) {
+		u8 mode = BPF_MODE(code);
+
+		/* LD_IMM64 */
+		if (mode == BPF_IMM)
+			return true;
+
+		/* Both LD_IND and LD_ABS return 32-bit data. */
+		if (t != SRC_OP)
+			return  false;
+
+		/* Implicit ctx ptr. */
+		if (regno == BPF_REG_6)
+			return true;
+
+		/* Explicit source could be any width. */
+		return true;
+	}
+
+	if (class == BPF_ST)
+		/* The only source register for BPF_ST is a ptr. */
+		return true;
+
+	/* Conservatively return true at default. */
+	return true;
+}
+
+static void mark_insn_zext(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *reg)
+{
+	s32 def_idx = reg->subreg_def;
+
+	if (def_idx == DEF_NOT_SUBREG)
+		return;
+
+	env->insn_aux_data[def_idx].zext_dst = true;
+	/* The dst will be zero extended, so won't be sub-register anymore. */
+	reg->subreg_def = DEF_NOT_SUBREG;
+}
+
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
 	struct bpf_reg_state *reg, *regs = state->regs;
+	bool rw64;
 
 	if (regno >= MAX_BPF_REG) {
 		verbose(env, "R%d is invalid\n", regno);
@@ -1186,6 +1296,7 @@  static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	}
 
 	reg = &regs[regno];
+	rw64 = is_reg64(env, insn, regno, reg, t);
 	if (t == SRC_OP) {
 		/* check whether register used as source operand can be read */
 		if (reg->type == NOT_INIT) {
@@ -1196,7 +1307,11 @@  static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 		if (regno == BPF_REG_FP)
 			return 0;
 
-		return mark_reg_read(env, reg, reg->parent);
+		if (rw64)
+			mark_insn_zext(env, reg);
+
+		return mark_reg_read(env, reg, reg->parent,
+				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -1204,6 +1319,7 @@  static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		reg->live |= REG_LIVE_WRITTEN;
+		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx;
 		if (t == DST_OP)
 			mark_reg_unknown(env, regs, regno);
 	}
@@ -1383,7 +1499,8 @@  static int check_stack_read(struct bpf_verifier_env *env,
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 		return 0;
 	} else {
 		int zeros = 0;
@@ -1400,7 +1517,9 @@  static int check_stack_read(struct bpf_verifier_env *env,
 			return -EACCES;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      size == BPF_REG_SIZE
+			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
 		if (value_regno >= 0) {
 			if (zeros == size) {
 				/* any size read into register is zero extended,
@@ -2109,6 +2228,12 @@  static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 						    value_regno);
 				if (reg_type_may_be_null(reg_type))
 					regs[value_regno].id = ++env->id_gen;
+				/* A load of ctx field could have different
+				 * actual load size with the one encoded in the
+				 * insn. When the dst is PTR, it is for sure not
+				 * a sub-register.
+				 */
+				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
 			}
 			regs[value_regno].type = reg_type;
 		}
@@ -2368,7 +2493,9 @@  static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		 * the whole slot to be marked as 'read'
 		 */
 		mark_reg_read(env, &state->stack[spi].spilled_ptr,
-			      state->stack[spi].spilled_ptr.parent);
+			      state->stack[spi].spilled_ptr.parent,
+			      access_size == BPF_REG_SIZE
+			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
 	}
 	return update_stack_depth(env, state, min_off);
 }
@@ -3332,6 +3459,9 @@  static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
+	/* helper call returns 64-bit value. */
+	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
 	/* update return register (already marked as written above) */
 	if (fn->ret_type == RET_INTEGER) {
 		/* sets type to SCALAR_VALUE */
@@ -4263,6 +4393,7 @@  static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				 */
 				*dst_reg = *src_reg;
 				dst_reg->live |= REG_LIVE_WRITTEN;
+				dst_reg->subreg_def = DEF_NOT_SUBREG;
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
@@ -4273,6 +4404,7 @@  static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				} else if (src_reg->type == SCALAR_VALUE) {
 					*dst_reg = *src_reg;
 					dst_reg->live |= REG_LIVE_WRITTEN;
+					dst_reg->subreg_def = env->insn_idx;
 				} else {
 					mark_reg_unknown(env, regs,
 							 insn->dst_reg);
@@ -5352,6 +5484,8 @@  static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * Already marked as written above.
 	 */
 	mark_reg_unknown(env, regs, BPF_REG_0);
+	/* ld_abs load up to 32-bit skb data. */
+	regs[BPF_REG_0].subreg_def = env->insn_idx;
 	return 0;
 }
 
@@ -6292,20 +6426,33 @@  static bool states_equal(struct bpf_verifier_env *env,
 	return true;
 }
 
+/* Return 0 if no propagation happened. Return negative error code if error
+ * happened. Otherwise, return the propagated bit.
+ */
 static int propagate_liveness_reg(struct bpf_verifier_env *env,
 				  struct bpf_reg_state *reg,
 				  struct bpf_reg_state *parent_reg)
 {
+	u8 parent_flag = parent_reg->live & REG_LIVE_READ;
+	u8 flag = reg->live & REG_LIVE_READ;
 	int err;
 
-	if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ))
+	/* When comes here, read flags of PARENT_REG or REG could be any of
+	 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
+	 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
+	 */
+	if (parent_flag == REG_LIVE_READ64 ||
+	    /* Or if there is no read flag from REG. */
+	    !flag ||
+	    /* Or if the read flag from REG is the same as PARENT_REG. */
+	    parent_flag == flag)
 		return 0;
 
-	err = mark_reg_read(env, reg, parent_reg);
+	err = mark_reg_read(env, reg, parent_reg, flag);
 	if (err)
 		return err;
 
-	return 0;
+	return flag;
 }
 
 /* A write screens off any subsequent reads; but write marks come from the
@@ -6339,8 +6486,10 @@  static int propagate_liveness(struct bpf_verifier_env *env,
 		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
 			err = propagate_liveness_reg(env, &state_reg[i],
 						     &parent_reg[i]);
-			if (err)
+			if (err < 0)
 				return err;
+			if (err == REG_LIVE_READ64)
+				mark_insn_zext(env, &parent_reg[i]);
 		}
 
 		/* Propagate stack slots. */
@@ -6350,11 +6499,11 @@  static int propagate_liveness(struct bpf_verifier_env *env,
 			state_reg = &state->stack[i].spilled_ptr;
 			err = propagate_liveness_reg(env, state_reg,
 						     parent_reg);
-			if (err)
+			if (err < 0)
 				return err;
 		}
 	}
-	return err;
+	return 0;
 }
 
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)