diff mbox

[v6,2/7] perf/x86/intel: Record branch type

Message ID 1492690075-17243-3-git-send-email-yao.jin@linux.intel.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Jin, Yao April 20, 2017, 12:07 p.m. UTC
Perf already has support for disassembling the branch instruction
and using the branch type for filtering. The patch just records
the branch type in perf_branch_entry.

Before recording, the patch converts the x86 branch type to
common branch type.

Change log
----------

v6: Not changed.

v5: Just fix the merge error. No other update.

v4: Comparing to previous version, the major changes are:

1. Uses a lookup table to convert x86 branch type to common branch
   type.

2. Move the JCC forward/JCC backward and cross page computing to
   user space.

3. Initialize branch type to 0 in intel_pmu_lbr_read_32 and
   intel_pmu_lbr_read_64

Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
---
 arch/x86/events/intel/lbr.c | 53 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

Comments

Jiri Olsa April 23, 2017, 1:55 p.m. UTC | #1
On Thu, Apr 20, 2017 at 08:07:50PM +0800, Jin Yao wrote:

SNIP

>  
> +#define X86_BR_TYPE_MAP_MAX	16
> +
> +static int
> +common_branch_type(int type)
> +{
> +	int i, mask;
> +	const int branch_map[X86_BR_TYPE_MAP_MAX] = {
> +		PERF_BR_CALL,		/* X86_BR_CALL */
> +		PERF_BR_RET,		/* X86_BR_RET */
> +		PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
> +		PERF_BR_SYSRET,		/* X86_BR_SYSRET */
> +		PERF_BR_INT,		/* X86_BR_INT */
> +		PERF_BR_IRET,		/* X86_BR_IRET */
> +		PERF_BR_JCC,		/* X86_BR_JCC */
> +		PERF_BR_JMP,		/* X86_BR_JMP */
> +		PERF_BR_IRQ,		/* X86_BR_IRQ */
> +		PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
> +		PERF_BR_NONE,		/* X86_BR_ABORT */
> +		PERF_BR_NONE,		/* X86_BR_IN_TX */
> +		PERF_BR_NONE,		/* X86_BR_NO_TX */
> +		PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
> +		PERF_BR_NONE,		/* X86_BR_CALL_STACK */
> +		PERF_BR_IND_JMP,	/* X86_BR_IND_JMP */
> +	};
> +
> +	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
> +	mask = ~(~0 << 1);

is that a fancy way to get 1 into the mask? what do I miss?

> +
> +	for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
> +		if (type & mask)
> +			return branch_map[i];

I wonder some bit search would be faster in here, but maybe not big deal

jirka

> +
> +		type >>= 1;
> +	}
> +
> +	return PERF_BR_NONE;
> +}
> +
>  /*
Jin, Yao April 24, 2017, 12:47 a.m. UTC | #2
On 4/23/2017 9:55 PM, Jiri Olsa wrote:
> On Thu, Apr 20, 2017 at 08:07:50PM +0800, Jin Yao wrote:
>
> SNIP
>
>>   
>> +#define X86_BR_TYPE_MAP_MAX	16
>> +
>> +static int
>> +common_branch_type(int type)
>> +{
>> +	int i, mask;
>> +	const int branch_map[X86_BR_TYPE_MAP_MAX] = {
>> +		PERF_BR_CALL,		/* X86_BR_CALL */
>> +		PERF_BR_RET,		/* X86_BR_RET */
>> +		PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
>> +		PERF_BR_SYSRET,		/* X86_BR_SYSRET */
>> +		PERF_BR_INT,		/* X86_BR_INT */
>> +		PERF_BR_IRET,		/* X86_BR_IRET */
>> +		PERF_BR_JCC,		/* X86_BR_JCC */
>> +		PERF_BR_JMP,		/* X86_BR_JMP */
>> +		PERF_BR_IRQ,		/* X86_BR_IRQ */
>> +		PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
>> +		PERF_BR_NONE,		/* X86_BR_ABORT */
>> +		PERF_BR_NONE,		/* X86_BR_IN_TX */
>> +		PERF_BR_NONE,		/* X86_BR_NO_TX */
>> +		PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
>> +		PERF_BR_NONE,		/* X86_BR_CALL_STACK */
>> +		PERF_BR_IND_JMP,	/* X86_BR_IND_JMP */
>> +	};
>> +
>> +	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
>> +	mask = ~(~0 << 1);
> is that a fancy way to get 1 into the mask? what do I miss?
>
>> +
>> +	for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
>> +		if (type & mask)
>> +			return branch_map[i];
> I wonder some bit search would be faster in here, but maybe not big deal
>
> jirka

I just think the branch_map[] doesn't contain many entries (16 entries 
here), so maybe checking 1 bit one time should be acceptable. I just 
want to keep the code simple.

But if the number of entries is more (e.g. 64), maybe it'd better check 
2 or 4 bits one time.

Thanks
Jin Yao

>
>> +
>> +		type >>= 1;
>> +	}
>> +
>> +	return PERF_BR_NONE;
>> +}
>> +
>>   /*
Jin, Yao May 8, 2017, 12:49 a.m. UTC | #3
On 4/24/2017 8:47 AM, Jin, Yao wrote:
>
>
> On 4/23/2017 9:55 PM, Jiri Olsa wrote:
>> On Thu, Apr 20, 2017 at 08:07:50PM +0800, Jin Yao wrote:
>>
>> SNIP
>>
>>>   +#define X86_BR_TYPE_MAP_MAX    16
>>> +
>>> +static int
>>> +common_branch_type(int type)
>>> +{
>>> +    int i, mask;
>>> +    const int branch_map[X86_BR_TYPE_MAP_MAX] = {
>>> +        PERF_BR_CALL,        /* X86_BR_CALL */
>>> +        PERF_BR_RET,        /* X86_BR_RET */
>>> +        PERF_BR_SYSCALL,    /* X86_BR_SYSCALL */
>>> +        PERF_BR_SYSRET,        /* X86_BR_SYSRET */
>>> +        PERF_BR_INT,        /* X86_BR_INT */
>>> +        PERF_BR_IRET,        /* X86_BR_IRET */
>>> +        PERF_BR_JCC,        /* X86_BR_JCC */
>>> +        PERF_BR_JMP,        /* X86_BR_JMP */
>>> +        PERF_BR_IRQ,        /* X86_BR_IRQ */
>>> +        PERF_BR_IND_CALL,    /* X86_BR_IND_CALL */
>>> +        PERF_BR_NONE,        /* X86_BR_ABORT */
>>> +        PERF_BR_NONE,        /* X86_BR_IN_TX */
>>> +        PERF_BR_NONE,        /* X86_BR_NO_TX */
>>> +        PERF_BR_CALL,        /* X86_BR_ZERO_CALL */
>>> +        PERF_BR_NONE,        /* X86_BR_CALL_STACK */
>>> +        PERF_BR_IND_JMP,    /* X86_BR_IND_JMP */
>>> +    };
>>> +
>>> +    type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
>>> +    mask = ~(~0 << 1);
>> is that a fancy way to get 1 into the mask? what do I miss?
>>
>>> +
>>> +    for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
>>> +        if (type & mask)
>>> +            return branch_map[i];
>> I wonder some bit search would be faster in here, but maybe not big deal
>>
>> jirka
>
> I just think the branch_map[] doesn't contain many entries (16 entries 
> here), so maybe checking 1 bit one time should be acceptable. I just 
> want to keep the code simple.
>
> But if the number of entries is more (e.g. 64), maybe it'd better 
> check 2 or 4 bits one time.
>
> Thanks
> Jin Yao
>

Hi,

Is this explanation OK? Since for tools part, it's Acked-by: Jiri Olsa. 
I just want to know  if the kernel part is OK either?

Thanks
Jin Yao
Jiri Olsa May 9, 2017, 8:26 a.m. UTC | #4
On Mon, Apr 24, 2017 at 08:47:14AM +0800, Jin, Yao wrote:
> 
> 
> On 4/23/2017 9:55 PM, Jiri Olsa wrote:
> > On Thu, Apr 20, 2017 at 08:07:50PM +0800, Jin Yao wrote:
> > 
> > SNIP
> > 
> > > +#define X86_BR_TYPE_MAP_MAX	16
> > > +
> > > +static int
> > > +common_branch_type(int type)
> > > +{
> > > +	int i, mask;
> > > +	const int branch_map[X86_BR_TYPE_MAP_MAX] = {
> > > +		PERF_BR_CALL,		/* X86_BR_CALL */
> > > +		PERF_BR_RET,		/* X86_BR_RET */
> > > +		PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
> > > +		PERF_BR_SYSRET,		/* X86_BR_SYSRET */
> > > +		PERF_BR_INT,		/* X86_BR_INT */
> > > +		PERF_BR_IRET,		/* X86_BR_IRET */
> > > +		PERF_BR_JCC,		/* X86_BR_JCC */
> > > +		PERF_BR_JMP,		/* X86_BR_JMP */
> > > +		PERF_BR_IRQ,		/* X86_BR_IRQ */
> > > +		PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
> > > +		PERF_BR_NONE,		/* X86_BR_ABORT */
> > > +		PERF_BR_NONE,		/* X86_BR_IN_TX */
> > > +		PERF_BR_NONE,		/* X86_BR_NO_TX */
> > > +		PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
> > > +		PERF_BR_NONE,		/* X86_BR_CALL_STACK */
> > > +		PERF_BR_IND_JMP,	/* X86_BR_IND_JMP */
> > > +	};
> > > +
> > > +	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
> > > +	mask = ~(~0 << 1);
> > is that a fancy way to get 1 into the mask? what do I miss?

you did not comment on this one

> > 
> > > +
> > > +	for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
> > > +		if (type & mask)
> > > +			return branch_map[i];
> > I wonder some bit search would be faster in here, but maybe not big deal
> > 
> > jirka
> 
> I just think the branch_map[] doesn't contain many entries (16 entries
> here), so maybe checking 1 bit one time should be acceptable. I just want to
> keep the code simple.
> 
> But if the number of entries is more (e.g. 64), maybe it'd better check 2 or
> 4 bits one time.

ook

jirka
Jin, Yao May 9, 2017, 11:57 a.m. UTC | #5
On 5/9/2017 4:26 PM, Jiri Olsa wrote:
> On Mon, Apr 24, 2017 at 08:47:14AM +0800, Jin, Yao wrote:
>>
>> On 4/23/2017 9:55 PM, Jiri Olsa wrote:
>>> On Thu, Apr 20, 2017 at 08:07:50PM +0800, Jin Yao wrote:
>>>
>>> SNIP
>>>
>>>> +#define X86_BR_TYPE_MAP_MAX	16
>>>> +
>>>> +static int
>>>> +common_branch_type(int type)
>>>> +{
>>>> +	int i, mask;
>>>> +	const int branch_map[X86_BR_TYPE_MAP_MAX] = {
>>>> +		PERF_BR_CALL,		/* X86_BR_CALL */
>>>> +		PERF_BR_RET,		/* X86_BR_RET */
>>>> +		PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
>>>> +		PERF_BR_SYSRET,		/* X86_BR_SYSRET */
>>>> +		PERF_BR_INT,		/* X86_BR_INT */
>>>> +		PERF_BR_IRET,		/* X86_BR_IRET */
>>>> +		PERF_BR_JCC,		/* X86_BR_JCC */
>>>> +		PERF_BR_JMP,		/* X86_BR_JMP */
>>>> +		PERF_BR_IRQ,		/* X86_BR_IRQ */
>>>> +		PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
>>>> +		PERF_BR_NONE,		/* X86_BR_ABORT */
>>>> +		PERF_BR_NONE,		/* X86_BR_IN_TX */
>>>> +		PERF_BR_NONE,		/* X86_BR_NO_TX */
>>>> +		PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
>>>> +		PERF_BR_NONE,		/* X86_BR_CALL_STACK */
>>>> +		PERF_BR_IND_JMP,	/* X86_BR_IND_JMP */
>>>> +	};
>>>> +
>>>> +	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
>>>> +	mask = ~(~0 << 1);
>>> is that a fancy way to get 1 into the mask? what do I miss?
> you did not comment on this one

Sorry, I misunderstood that this comment and the next comment had the 
same meaning.

In the previous version, I used the switch/case to convert from X86_BR 
to PERF_BR. I got a comment from community that it'd better use a lookup 
table for conversion.

Since each bit in type represents a X86_BR type so I use a mask (0x1) to 
filter the bit. Yes, it looks I can also directly set 0x1 to mask.

I write the code "mask = ~(~0 << 1)" according to my coding habits. If 
you think I should change the code to "mask = 0x1", that's OK  :)

>>>> +
>>>> +	for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
>>>> +		if (type & mask)
>>>> +			return branch_map[i];
>>> I wonder some bit search would be faster in here, but maybe not big deal
>>>
>>> jirka
>> I just think the branch_map[] doesn't contain many entries (16 entries
>> here), so maybe checking 1 bit one time should be acceptable. I just want to
>> keep the code simple.
>>
>> But if the number of entries is more (e.g. 64), maybe it'd better check 2 or
>> 4 bits one time.
> ook
>
> jirka
Sorry, what's the meaning of ook? Does it mean "OK"?

Thanks
Jin Yao
Jiri Olsa May 9, 2017, 12:39 p.m. UTC | #6
On Tue, May 09, 2017 at 07:57:11PM +0800, Jin, Yao wrote:

SNIP

> > > > > +
> > > > > +	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
> > > > > +	mask = ~(~0 << 1);
> > > > is that a fancy way to get 1 into the mask? what do I miss?
> > you did not comment on this one
> 
> Sorry, I misunderstood that this comment and the next comment had the same
> meaning.
> 
> In the previous version, I used the switch/case to convert from X86_BR to
> PERF_BR. I got a comment from community that it'd better use a lookup table
> for conversion.
> 
> Since each bit in type represents a X86_BR type so I use a mask (0x1) to
> filter the bit. Yes, it looks I can also directly set 0x1 to mask.
> 
> I write the code "mask = ~(~0 << 1)" according to my coding habits. If you
> think I should change the code to "mask = 0x1", that's OK  :)

im ok with that.. was just wondering for the reason
I guess compiler will make it single constant assignment anyway

> 
> > > > > +
> > > > > +	for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
> > > > > +		if (type & mask)
> > > > > +			return branch_map[i];
> > > > I wonder some bit search would be faster in here, but maybe not big deal
> > > > 
> > > > jirka
> > > I just think the branch_map[] doesn't contain many entries (16 entries
> > > here), so maybe checking 1 bit one time should be acceptable. I just want to
> > > keep the code simple.
> > > 
> > > But if the number of entries is more (e.g. 64), maybe it'd better check 2 or
> > > 4 bits one time.
> > ook
> > 
> > jirka
> Sorry, what's the meaning of ook? Does it mean "OK"?

just means ok ;-)

thanks,
jirka
Jin, Yao May 10, 2017, 12:18 a.m. UTC | #7
On 5/9/2017 8:39 PM, Jiri Olsa wrote:
> On Tue, May 09, 2017 at 07:57:11PM +0800, Jin, Yao wrote:
>
> SNIP
>
>>>>>> +
>>>>>> +	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
>>>>>> +	mask = ~(~0 << 1);
>>>>> is that a fancy way to get 1 into the mask? what do I miss?
>>> you did not comment on this one
>> Sorry, I misunderstood that this comment and the next comment had the same
>> meaning.
>>
>> In the previous version, I used the switch/case to convert from X86_BR to
>> PERF_BR. I got a comment from community that it'd better use a lookup table
>> for conversion.
>>
>> Since each bit in type represents a X86_BR type so I use a mask (0x1) to
>> filter the bit. Yes, it looks I can also directly set 0x1 to mask.
>>
>> I write the code "mask = ~(~0 << 1)" according to my coding habits. If you
>> think I should change the code to "mask = 0x1", that's OK  :)
> im ok with that.. was just wondering for the reason
> I guess compiler will make it single constant assignment anyway

I think so.  The compiler should be clever enough for this optimization.
>>>>>> +
>>>>>> +	for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
>>>>>> +		if (type & mask)
>>>>>> +			return branch_map[i];
>>>>> I wonder some bit search would be faster in here, but maybe not big deal
>>>>>
>>>>> jirka
>>>> I just think the branch_map[] doesn't contain many entries (16 entries
>>>> here), so maybe checking 1 bit one time should be acceptable. I just want to
>>>> keep the code simple.
>>>>
>>>> But if the number of entries is more (e.g. 64), maybe it'd better check 2 or
>>>> 4 bits one time.
>>> ook
>>>
>>> jirka
>> Sorry, what's the meaning of ook? Does it mean "OK"?
> just means ok ;-)
>
> thanks,
> jirka

Thanks so much!

Jin Yao
diff mbox

Patch

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index f924629..f10a7ed 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@  enum {
 	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
 	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
 	X86_BR_IND_JMP		= 1 << 17,/* indirect jump */
+
+	X86_BR_TYPE_SAVE	= 1 << 18,/* indicate to save branch type */
+
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -510,6 +513,7 @@  static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[i].in_tx	= 0;
 		cpuc->lbr_entries[i].abort	= 0;
 		cpuc->lbr_entries[i].cycles	= 0;
+		cpuc->lbr_entries[i].type	= 0;
 		cpuc->lbr_entries[i].reserved	= 0;
 	}
 	cpuc->lbr_stack.nr = i;
@@ -596,6 +600,7 @@  static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[out].in_tx	 = in_tx;
 		cpuc->lbr_entries[out].abort	 = abort;
 		cpuc->lbr_entries[out].cycles	 = cycles;
+		cpuc->lbr_entries[out].type	 = 0;
 		cpuc->lbr_entries[out].reserved	 = 0;
 		out++;
 	}
@@ -673,6 +678,10 @@  static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 
 	if (br_type & PERF_SAMPLE_BRANCH_CALL)
 		mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+	if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+		mask |= X86_BR_TYPE_SAVE;
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
@@ -926,6 +935,44 @@  static int branch_type(unsigned long from, unsigned long to, int abort)
 	return ret;
 }
 
+#define X86_BR_TYPE_MAP_MAX	16
+
+static int
+common_branch_type(int type)
+{
+	int i, mask;
+	const int branch_map[X86_BR_TYPE_MAP_MAX] = {
+		PERF_BR_CALL,		/* X86_BR_CALL */
+		PERF_BR_RET,		/* X86_BR_RET */
+		PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
+		PERF_BR_SYSRET,		/* X86_BR_SYSRET */
+		PERF_BR_INT,		/* X86_BR_INT */
+		PERF_BR_IRET,		/* X86_BR_IRET */
+		PERF_BR_JCC,		/* X86_BR_JCC */
+		PERF_BR_JMP,		/* X86_BR_JMP */
+		PERF_BR_IRQ,		/* X86_BR_IRQ */
+		PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
+		PERF_BR_NONE,		/* X86_BR_ABORT */
+		PERF_BR_NONE,		/* X86_BR_IN_TX */
+		PERF_BR_NONE,		/* X86_BR_NO_TX */
+		PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
+		PERF_BR_NONE,		/* X86_BR_CALL_STACK */
+		PERF_BR_IND_JMP,	/* X86_BR_IND_JMP */
+	};
+
+	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+	mask = ~(~0 << 1);
+
+	for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
+		if (type & mask)
+			return branch_map[i];
+
+		type >>= 1;
+	}
+
+	return PERF_BR_NONE;
+}
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -942,7 +989,8 @@  intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	bool compress = false;
 
 	/* if sampling all branches, then nothing to filter */
-	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+	if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
 		return;
 
 	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -963,6 +1011,9 @@  intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 			cpuc->lbr_entries[i].from = 0;
 			compress = true;
 		}
+
+		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+			cpuc->lbr_entries[i].type = common_branch_type(type);
 	}
 
 	if (!compress)