
[v2,2/5] perf/x86/intel: Record branch type

Message ID 1491562066-7472-3-git-send-email-yao.jin@linux.intel.com (mailing list archive)
State Not Applicable

Commit Message

Jin, Yao April 7, 2017, 10:47 a.m. UTC
Perf already has support for disassembling the branch instruction
and using the branch type for filtering. This patch records the
branch type in perf_branch_entry.

Before recording, the patch converts the x86 branch classification
to the common branch classification and checks whether the branch
crosses a 4K or 2MB area. The 4K/2MB page-crossing check is
approximate.

Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
---
 arch/x86/events/intel/lbr.c | 106 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 105 insertions(+), 1 deletion(-)
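
For context, a minimal user-space sketch of how a profiler might request
this new information (illustrative only; it assumes the
PERF_SAMPLE_BRANCH_TYPE_SAVE bit and the perf_branch_entry type/cross
fields introduced earlier in this series):

#include <string.h>
#include <linux/perf_event.h>

/*
 * Sketch only: request branch-stack sampling plus the branch type
 * information added by this series.
 */
static void setup_branch_type_sampling(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	attr->sample_period = 100000;
	attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
	attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
				   PERF_SAMPLE_BRANCH_TYPE_SAVE;
	/* each sampled perf_branch_entry then carries ->type and ->cross */
}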

Comments

Peter Zijlstra April 7, 2017, 3:20 p.m. UTC | #1
On Fri, Apr 07, 2017 at 06:47:43PM +0800, Jin Yao wrote:
> Perf already has support for disassembling the branch instruction
> and using the branch type for filtering. This patch records the
> branch type in perf_branch_entry.
>
> Before recording, the patch converts the x86 branch classification
> to the common branch classification and checks whether the branch
> crosses a 4K or 2MB area. The 4K/2MB page-crossing check is
> approximate.

The changelog is completely empty of rationale. Why do we care?

Not having the binary is a very bad reason; you can't do much of
anything if that's missing.


> @@ -923,6 +933,84 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
>  	return ret;
>  }
>  
> +static int
> +common_branch_type(int type, u64 from, u64 to)
> +{
> +	int ret;
> +
> +	type = type & (~(X86_BR_KERNEL | X86_BR_USER));
> +
> +	switch (type) {
> +	case X86_BR_CALL:
> +	case X86_BR_ZERO_CALL:
> +		ret = PERF_BR_CALL;
> +		break;
> +
> +	case X86_BR_RET:
> +		ret = PERF_BR_RET;
> +		break;
> +
> +	case X86_BR_SYSCALL:
> +		ret = PERF_BR_SYSCALL;
> +		break;
> +
> +	case X86_BR_SYSRET:
> +		ret = PERF_BR_SYSRET;
> +		break;
> +
> +	case X86_BR_INT:
> +		ret = PERF_BR_INT;
> +		break;
> +
> +	case X86_BR_IRET:
> +		ret = PERF_BR_IRET;
> +		break;
> +
> +	case X86_BR_IRQ:
> +		ret = PERF_BR_IRQ;
> +		break;
> +
> +	case X86_BR_ABORT:
> +		ret = PERF_BR_FAR_BRANCH;
> +		break;
> +
> +	case X86_BR_JCC:
> +		if (to > from)
> +			ret = PERF_BR_JCC_FWD;
> +		else
> +			ret = PERF_BR_JCC_BWD;
> +		break;

This seems like superfluous information; we already get to and from, so
this comparison is pointless.

The rest looks like something you could implement more simply with a
lookup table.
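
A minimal sketch of that lookup-table variant (not part of the posted
patch; it reuses the X86_BR_*/PERF_BR_* definitions visible in lbr.c,
assumes the X86_BR_* values stay single-bit flags in the order of the
existing enum, and keeps the JCC case separate only because the
direction split has no single PERF_BR_* value in this series):

/*
 * Sketch only: table indexed by bit position, after dropping the
 * X86_BR_USER/X86_BR_KERNEL bits.
 */
static const int branch_map[] = {
	PERF_BR_CALL,		/* X86_BR_CALL */
	PERF_BR_RET,		/* X86_BR_RET */
	PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
	PERF_BR_SYSRET,		/* X86_BR_SYSRET */
	PERF_BR_INT,		/* X86_BR_INT */
	PERF_BR_IRET,		/* X86_BR_IRET */
	PERF_BR_NONE,		/* X86_BR_JCC, handled separately below */
	PERF_BR_JMP,		/* X86_BR_JMP */
	PERF_BR_IRQ,		/* X86_BR_IRQ */
	PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
	PERF_BR_FAR_BRANCH,	/* X86_BR_ABORT */
	PERF_BR_NONE,		/* X86_BR_IN_TX */
	PERF_BR_NONE,		/* X86_BR_NO_TX */
	PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
	PERF_BR_NONE,		/* X86_BR_CALL_STACK */
	PERF_BR_IND_JMP,	/* X86_BR_IND_JMP */
};

static int common_branch_type(int type, u64 from, u64 to)
{
	int i;

	type &= ~(X86_BR_KERNEL | X86_BR_USER);

	if (type == X86_BR_JCC)
		return (to > from) ? PERF_BR_JCC_FWD : PERF_BR_JCC_BWD;

	type >>= 2;			/* skip the two privilege bits */
	if (type) {
		i = __ffs(type);
		if (i < ARRAY_SIZE(branch_map))
			return branch_map[i];
	}

	return PERF_BR_NONE;
}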

> +
> +	case X86_BR_JMP:
> +		ret = PERF_BR_JMP;
> +		break;
> +
> +	case X86_BR_IND_CALL:
> +		ret = PERF_BR_IND_CALL;
> +		break;
> +
> +	case X86_BR_IND_JMP:
> +		ret = PERF_BR_IND_JMP;
> +		break;
> +
> +	default:
> +		ret = PERF_BR_NONE;
> +	}
> +
> +	return ret;
> +}
> +
> +static bool
> +cross_area(u64 addr1, u64 addr2, int size)
> +{
> +	u64 align1, align2;
> +
> +	align1 = addr1 & ~(size - 1);
> +	align2 = addr2 & ~(size - 1);
> +
> +	return (align1 != align2) ? true : false;
> +}
> +
>  /*
>   * implement actual branch filter based on user demand.
>   * Hardware may not exactly satisfy that request, thus
> @@ -939,7 +1027,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>  	bool compress = false;
>  
>  	/* if sampling all branches, then nothing to filter */
> -	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
> +	if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
> +	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
>  		return;
>  
>  	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
> @@ -960,6 +1049,21 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
>  			cpuc->lbr_entries[i].from = 0;
>  			compress = true;
>  		}
> +
> +		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) {
> +			cpuc->lbr_entries[i].type = common_branch_type(type,
> +								       from,
> +								       to);
> +			if (cross_area(from, to, AREA_2M))
> +				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_2M;
> +			else if (cross_area(from, to, AREA_4K))
> +				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_4K;
> +			else
> +				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_NONE;

This again is superfluous information; it is already fully contained in
to and from, which we have.

> +		} else {
> +			cpuc->lbr_entries[i].type = PERF_BR_NONE;
> +			cpuc->lbr_entries[i].cross = PERF_BR_CROSS_NONE;
> +		}
>  	}
>  
>  	if (!compress)
> -- 
> 2.7.4
>
Andi Kleen April 7, 2017, 4:48 p.m. UTC | #2
On Fri, Apr 07, 2017 at 05:20:31PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 07, 2017 at 06:47:43PM +0800, Jin Yao wrote:
> > Perf already has support for disassembling the branch instruction
> > and using the branch type for filtering. This patch records the
> > branch type in perf_branch_entry.
> >
> > Before recording, the patch converts the x86 branch classification
> > to the common branch classification and checks whether the branch
> > crosses a 4K or 2MB area. The 4K/2MB page-crossing check is
> > approximate.
> 
> The changelog is completely empty of rationale. Why do we care?
> 
> Not having the binary is a very bad reason; you can't do much of
> anything if that's missing.

It's a somewhat common situation with partially JITed code, if you
don't have an agent. You can still do a lot of useful things.

We found it useful to have this extra information during workload
analysis. Forward conditionals and page crossing jumps
are indications of frontend problems.

-Andi
Peter Zijlstra April 7, 2017, 5:20 p.m. UTC | #3
On Fri, Apr 07, 2017 at 09:48:34AM -0700, Andi Kleen wrote:
> On Fri, Apr 07, 2017 at 05:20:31PM +0200, Peter Zijlstra wrote:
> > On Fri, Apr 07, 2017 at 06:47:43PM +0800, Jin Yao wrote:
> > > Perf already has support for disassembling the branch instruction
> > > and using the branch type for filtering. This patch records the
> > > branch type in perf_branch_entry.
> > >
> > > Before recording, the patch converts the x86 branch classification
> > > to the common branch classification and checks whether the branch
> > > crosses a 4K or 2MB area. The 4K/2MB page-crossing check is
> > > approximate.
> > 
> > The changelog is completely empty of rationale. Why do we care?
> > 
> > Not having the binary is a very bad reason; you can't do much of
> > anything if that's missing.
> 
> It's a somewhat common situation with partially JITed code, if you
> don't have an agent. You can still do a lot of useful things.

Like what? How can you say anything about code you don't have?

> We found it useful to have this extra information during workload
> analysis. Forward conditionals and page crossing jumps
> are indications of frontend problems.

But you already have the exact same information in {to,from}; why would
you need to repackage information that is already contained there?
Andi Kleen April 7, 2017, 5:50 p.m. UTC | #4
> > It's a somewhat common situation with partially JITed code, if you
> > don't have an agent. You can still do a lot of useful things.
> 
> Like what? How can you say anything about code you don't have?

For example, if you combine this with the PMU topdown measurement and see
that the workload is frontend bound, and it also has lots of forward
conditionals, then dynamic basic-block reordering will help. If it has
lots of cross-page jumps, then function reordering will help, etc.

> > We found it useful to have this extra information during workload
> > analysis. Forward conditionals and page crossing jumps
> > are indications of frontend problems.
> 
> But you already have the exact same information in {to,from}; why would
> you need to repackage information that is already contained there?

Without this patch, we don't know if the branch is conditional or something
else. The kernel already knows this for its filtering, so it might as well
report it.

Right, the CROSS_* and forward/backward information could be computed
later.

-Andi
Jin, Yao April 8, 2017, 8:46 a.m. UTC | #5
> Without this patch, we don't know if the branch is conditional or something
> else. The kernel already knows this for its filtering, so it might as well
> report it.
>
> Right, the CROSS_* and forward/backward information could be computed
> later.
>
> -Andi
>
>
OK, I will move the CROSS_* and JCC forward/backward computation to user space, though it makes the user-space code more complicated.

Thanks
Jin Yao
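
For reference, a minimal user-space sketch of recovering those two
attributes later from the {from,to} addresses already present in each
branch entry (illustrative only, not part of this series):

#include <stdbool.h>
#include <stdint.h>

#define AREA_4K		4096ULL
#define AREA_2M		(2ULL * 1024 * 1024)

/* The two aligned blocks differ => the branch crossed a 'size' boundary. */
static bool branch_crosses(uint64_t from, uint64_t to, uint64_t size)
{
	return (from & ~(size - 1)) != (to & ~(size - 1));
}

/* Forward conditional: the target lies above the branch address. */
static bool branch_is_forward(uint64_t from, uint64_t to)
{
	return to > from;
}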

Patch

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 81b321a..635a0fb 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@  enum {
 	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
 	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
 	X86_BR_IND_JMP		= 1 << 17,/* indirect jump */
+
+	X86_BR_TYPE_SAVE	= 1 << 18,/* indicate to save branch type */
+
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -139,6 +142,9 @@  enum {
 	 X86_BR_IRQ		|\
 	 X86_BR_INT)
 
+#define AREA_4K		4096
+#define AREA_2M		(2 * 1024 * 1024)
+
 static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 
 /*
@@ -670,6 +676,10 @@  static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 
 	if (br_type & PERF_SAMPLE_BRANCH_CALL)
 		mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+	if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+		mask |= X86_BR_TYPE_SAVE;
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
@@ -923,6 +933,84 @@  static int branch_type(unsigned long from, unsigned long to, int abort)
 	return ret;
 }
 
+static int
+common_branch_type(int type, u64 from, u64 to)
+{
+	int ret;
+
+	type = type & (~(X86_BR_KERNEL | X86_BR_USER));
+
+	switch (type) {
+	case X86_BR_CALL:
+	case X86_BR_ZERO_CALL:
+		ret = PERF_BR_CALL;
+		break;
+
+	case X86_BR_RET:
+		ret = PERF_BR_RET;
+		break;
+
+	case X86_BR_SYSCALL:
+		ret = PERF_BR_SYSCALL;
+		break;
+
+	case X86_BR_SYSRET:
+		ret = PERF_BR_SYSRET;
+		break;
+
+	case X86_BR_INT:
+		ret = PERF_BR_INT;
+		break;
+
+	case X86_BR_IRET:
+		ret = PERF_BR_IRET;
+		break;
+
+	case X86_BR_IRQ:
+		ret = PERF_BR_IRQ;
+		break;
+
+	case X86_BR_ABORT:
+		ret = PERF_BR_FAR_BRANCH;
+		break;
+
+	case X86_BR_JCC:
+		if (to > from)
+			ret = PERF_BR_JCC_FWD;
+		else
+			ret = PERF_BR_JCC_BWD;
+		break;
+
+	case X86_BR_JMP:
+		ret = PERF_BR_JMP;
+		break;
+
+	case X86_BR_IND_CALL:
+		ret = PERF_BR_IND_CALL;
+		break;
+
+	case X86_BR_IND_JMP:
+		ret = PERF_BR_IND_JMP;
+		break;
+
+	default:
+		ret = PERF_BR_NONE;
+	}
+
+	return ret;
+}
+
+static bool
+cross_area(u64 addr1, u64 addr2, int size)
+{
+	u64 align1, align2;
+
+	align1 = addr1 & ~(size - 1);
+	align2 = addr2 & ~(size - 1);
+
+	return (align1 != align2) ? true : false;
+}
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -939,7 +1027,8 @@  intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	bool compress = false;
 
 	/* if sampling all branches, then nothing to filter */
-	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+	if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
 		return;
 
 	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -960,6 +1049,21 @@  intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 			cpuc->lbr_entries[i].from = 0;
 			compress = true;
 		}
+
+		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) {
+			cpuc->lbr_entries[i].type = common_branch_type(type,
+								       from,
+								       to);
+			if (cross_area(from, to, AREA_2M))
+				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_2M;
+			else if (cross_area(from, to, AREA_4K))
+				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_4K;
+			else
+				cpuc->lbr_entries[i].cross = PERF_BR_CROSS_NONE;
+		} else {
+			cpuc->lbr_entries[i].type = PERF_BR_NONE;
+			cpuc->lbr_entries[i].cross = PERF_BR_CROSS_NONE;
+		}
 	}
 
 	if (!compress)