Message ID | 20200727233431.4103-1-bimmy.pujari@intel.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | bpf: Add bpf_ktime_get_real_ns | expand |
On Mon, Jul 27, 2020 at 4:35 PM <bimmy.pujari@intel.com> wrote: > > From: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > > The existing bpf helper functions to get timestamp return the time > elapsed since system boot. This timestamp is not particularly useful > where epoch timestamp is required or more than one server is involved > and time sync is required. Instead, you want to use CLOCK_REALTIME, > which provides epoch timestamp. > Hence add bfp_ktime_get_real_ns() based around CLOCK_REALTIME. > > Signed-off-by: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > Signed-off-by: Bimmy Pujari <bimmy.pujari@intel.com> Looks good. Except I am not sure whether we want to call it real_ns or epoch_timestamp_ns, or epoch_ns. Please add a selftest for this function. Also, when resending the patch, please specify which tree it applies to, like "PATCH bpf-next" or "PATCH bpf". Thanks, Song > --- > drivers/media/rc/bpf-lirc.c | 2 ++ > include/linux/bpf.h | 1 + > include/uapi/linux/bpf.h | 7 +++++++ > kernel/bpf/core.c | 1 + > kernel/bpf/helpers.c | 14 ++++++++++++++ > kernel/trace/bpf_trace.c | 2 ++ > tools/include/uapi/linux/bpf.h | 7 +++++++ > 7 files changed, 34 insertions(+) > > diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c > index 5bb144435c16..1cae0cfdcbaf 100644 > --- a/drivers/media/rc/bpf-lirc.c > +++ b/drivers/media/rc/bpf-lirc.c > @@ -103,6 +103,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_map_peek_elem_proto; > case BPF_FUNC_ktime_get_ns: > return &bpf_ktime_get_ns_proto; > + case BPF_FUNC_ktime_get_real_ns: > + return &bpf_ktime_get_real_ns_proto; > case BPF_FUNC_ktime_get_boot_ns: > return &bpf_ktime_get_boot_ns_proto; > case BPF_FUNC_tail_call: > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 72221aea1c60..9c00ad932f08 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1639,6 +1639,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; > extern const struct bpf_func_proto bpf_tail_call_proto; > extern const struct bpf_func_proto bpf_ktime_get_ns_proto; > extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; > +extern const struct bpf_func_proto bpf_ktime_get_real_ns_proto; > extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; > extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; > extern const struct bpf_func_proto bpf_get_current_comm_proto; > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 54d0c886e3ba..7742c10fbc95 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -3377,6 +3377,12 @@ union bpf_attr { > * A non-negative value equal to or less than *size* on success, > * or a negative error in case of failure. > * > + * u64 bpf_ktime_get_real_ns(void) > + * Description > + * Return the real time in nanoseconds. > + * See: **clock_gettime**\ (**CLOCK_REALTIME**) > + * Return > + * Current *ktime*. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -3521,6 +3527,7 @@ union bpf_attr { > FN(skc_to_tcp_request_sock), \ > FN(skc_to_udp6_sock), \ > FN(get_task_stack), \ > + FN(ktime_get_real_ns), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 7be02e555ab9..0f25f3413208 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -2211,6 +2211,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; > const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; > const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; > const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; > +const struct bpf_func_proto bpf_ktime_get_real_ns_proto __weak; > const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; > > const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index be43ab3e619f..b726fd077698 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { > .ret_type = RET_INTEGER, > }; > > +BPF_CALL_0(bpf_ktime_get_real_ns) > +{ > + /* NMI safe access to clock realtime */ > + return ktime_get_real_fast_ns(); > +} > + > +const struct bpf_func_proto bpf_ktime_get_real_ns_proto = { > + .func = bpf_ktime_get_real_ns, > + .gpl_only = true, > + .ret_type = RET_INTEGER, > +}; > + > BPF_CALL_0(bpf_ktime_get_boot_ns) > { > /* NMI safe access to clock boottime */ > @@ -633,6 +645,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) > return &bpf_tail_call_proto; > case BPF_FUNC_ktime_get_ns: > return &bpf_ktime_get_ns_proto; > + case BPF_FUNC_ktime_get_real_ns: > + return &bpf_ktime_get_real_ns_proto; > case BPF_FUNC_ktime_get_boot_ns: > return &bpf_ktime_get_boot_ns_proto; > case BPF_FUNC_ringbuf_output: > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index 3cc0dcb60ca2..4e048dfb527b 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -1116,6 +1116,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_map_peek_elem_proto; > case BPF_FUNC_ktime_get_ns: > return &bpf_ktime_get_ns_proto; > + case BPF_FUNC_ktime_get_real_ns: > + return &bpf_ktime_get_real_ns_proto; > case BPF_FUNC_ktime_get_boot_ns: > return &bpf_ktime_get_boot_ns_proto; > case BPF_FUNC_tail_call: > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index 54d0c886e3ba..9ca51f82dd35 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -3377,6 +3377,12 @@ union bpf_attr { > * A non-negative value equal to or less than *size* on success, > * or a negative error in case of failure. > * > + * u64 bpf_ktime_get_real_ns(void) > + * Description > + * Return the real time in nanoseconds. > + * See: **clock_gettime**\ (**CLOCK_REALTIME**) > + * Return > + * Current *ktime*. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -3521,6 +3527,7 @@ union bpf_attr { > FN(skc_to_tcp_request_sock), \ > FN(skc_to_udp6_sock), \ > FN(get_task_stack), \ > + FN(ktime_get_real_ns), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > -- > 2.17.1 >
On Mon, Jul 27, 2020 at 4:35 PM <bimmy.pujari@intel.com> wrote: > > From: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > > The existing bpf helper functions to get timestamp return the time > elapsed since system boot. This timestamp is not particularly useful > where epoch timestamp is required or more than one server is involved > and time sync is required. Instead, you want to use CLOCK_REALTIME, > which provides epoch timestamp. > Hence add bfp_ktime_get_real_ns() based around CLOCK_REALTIME. > This doesn't seem like a good idea. With time-since-boot it's very easy to translate timestamp into a real time on the host. Having get_real_ns() variant might just encourage people to assume precise wall-clock timestamps that can be compared between within or even across different hosts. REALCLOCK can jump around, you can get duplicate timestamps, timestamps can go back in time, etc. It's just not a good way to measure time. Also, you mention the need for time sync. It's an extremely hard thing to have synchronized time between two different physical hosts, as anyone that has dealt with distributed systems will attest. Having this helper will just create a dangerous illusion that it is possible and will just cause more problems down the road for people. > Signed-off-by: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > Signed-off-by: Bimmy Pujari <bimmy.pujari@intel.com> > --- > drivers/media/rc/bpf-lirc.c | 2 ++ > include/linux/bpf.h | 1 + > include/uapi/linux/bpf.h | 7 +++++++ > kernel/bpf/core.c | 1 + > kernel/bpf/helpers.c | 14 ++++++++++++++ > kernel/trace/bpf_trace.c | 2 ++ > tools/include/uapi/linux/bpf.h | 7 +++++++ > 7 files changed, 34 insertions(+) [...]
On Tue, Jul 28, 2020 at 10:57 AM Maciej Żenczykowski <maze@google.com> wrote: > > On Mon, Jul 27, 2020 at 10:01 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: >> >> On Mon, Jul 27, 2020 at 4:35 PM <bimmy.pujari@intel.com> wrote: >> > >> > From: Ashkan Nikravesh <ashkan.nikravesh@intel.com> >> > >> > The existing bpf helper functions to get timestamp return the time >> > elapsed since system boot. This timestamp is not particularly useful >> > where epoch timestamp is required or more than one server is involved >> > and time sync is required. Instead, you want to use CLOCK_REALTIME, >> > which provides epoch timestamp. >> > Hence add bfp_ktime_get_real_ns() based around CLOCK_REALTIME. >> > >> >> This doesn't seem like a good idea. > > > I disagree. > >> >> With time-since-boot it's very >> easy to translate timestamp into a real time on the host. > > > While it's easy to do, it's annoying, because you need to hardcode the offset into the bpf program > (which has security implications) or use another bpf array lookup/read (perf implications). There is no array lookup/read if you are using a global variable for this. > >> >> Having >> get_real_ns() variant might just encourage people to assume precise >> wall-clock timestamps that can be compared between within or even >> across different hosts. > > > In some environments they *can*. Within a datacenter it is pretty common > to have extremely tightly synchronized clocks across thousands of machines. In some, yes, which also means that in some other they can't. So I'm still worried about misuses of REALCLOCK, within (internal daemons within the company) our outside (BCC tools and alike) of data centers. Especially if people will start using it to measure elapsed time between events. I'd rather not have to explain over and over again that REALCLOCK is not for measuring passage of time. > >> >> REALCLOCK can jump around, you can get >> duplicate timestamps, timestamps can go back in time, etc. > > > That's only true if you allow it to. On a server machine, once it's booted Right, but BPF is used so widely that it will be true at least in some cases. > up and time is synchronized, it no longer jumps around, only the frequency > is slowly adjusted to keep things in sync. > >> >> It's just not a good way to measure time. > > > It doesn't change the fact that some packets/protocols include real world timestamps. > We already have the since-boot time fetchers, so this doesn't prevent you from > using that - if that is what you want. > > Also one thing to remember is that a lot of time you only need ~1s precision. > In which case 'real' can simply be a lot easier to deal with. > Again, I'm worried about ensuing confusion by users trying to pick which variant to use. I'm not worried about those who understand the difference and can pick the right one for their needs, I'm worried about those that don't even give it a second thought. Given it's rather simple to convert boot time into wall-clock time, it doesn't seem like a necessary addition. >> >> Also, you mention the need for time sync. It's an extremely hard thing >> to have synchronized time between two different physical hosts, as >> anyone that has dealt with distributed systems will attest. Having >> this helper will just create a dangerous illusion that it is possible >> and will just cause more problems down the road for people. > > > It is possible. But yes. It is very very hard. > You can read up on Google TrueTime as an example real world implementation. Thank you, I did, though quite a while ago. Notice, I didn't say it's impossible, right? ;) But then Google TrueTime provides a range of time within confidence intervals, not a single seemingly-deterministic timestamp. And it needs custom hardware, so it's not realistic to expect to have it everywhere :) > >> >> >> > Signed-off-by: Ashkan Nikravesh <ashkan.nikravesh@intel.com> >> > Signed-off-by: Bimmy Pujari <bimmy.pujari@intel.com> >> > --- >> > drivers/media/rc/bpf-lirc.c | 2 ++ >> > include/linux/bpf.h | 1 + >> > include/uapi/linux/bpf.h | 7 +++++++ >> > kernel/bpf/core.c | 1 + >> > kernel/bpf/helpers.c | 14 ++++++++++++++ >> > kernel/trace/bpf_trace.c | 2 ++ >> > tools/include/uapi/linux/bpf.h | 7 +++++++ >> > 7 files changed, 34 insertions(+) >> >> [...] > > > Maciej Żenczykowski, Kernel Networking Developer @ Google
On 7/28/20 12:28 PM, Andrii Nakryiko wrote: > In some, yes, which also means that in some other they can't. So I'm > still worried about misuses of REALCLOCK, within (internal daemons > within the company) our outside (BCC tools and alike) of data centers. > Especially if people will start using it to measure elapsed time > between events. I'd rather not have to explain over and over again > that REALCLOCK is not for measuring passage of time. Why is documenting the type of clock and its limitations not sufficient? Users are going to make mistakes and use of gettimeofday to measure time differences is a common one for userspace code. That should not define or limit the ability to correctly and most directly do something in bpf. I have a patch to export local_clock as bpf_ktime_get_fast_ns. It too can be abused given that it has limitations (can not be used across CPUs and does not correlate to any exported clock), but it too has important use cases (twice as fast as bpf_ktime_get_ns and useful for per-cpu delta-time needs). Users have to know what they are doing; making mistakes is part of learning. Proper documentation is all you can do.
On Tue, Jul 28, 2020 at 1:00 PM Nikravesh, Ashkan <ashkan.nikravesh@intel.com> wrote: > > > > ________________________________ > From: Andrii Nakryiko <andrii.nakryiko@gmail.com> > Sent: Tuesday, July 28, 2020 11:28 AM > To: Maciej Żenczykowski <maze@google.com> > Cc: Pujari, Bimmy <bimmy.pujari@intel.com>; bpf <bpf@vger.kernel.org>; Networking <netdev@vger.kernel.org>; mchehab@kernel.org <mchehab@kernel.org>; Alexei Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>; Martin Lau <kafai@fb.com>; Nikravesh, Ashkan <ashkan.nikravesh@intel.com> > Subject: Re: [PATCH] bpf: Add bpf_ktime_get_real_ns > > On Tue, Jul 28, 2020 at 10:57 AM Maciej Żenczykowski <maze@google.com> wrote: > > > > On Mon, Jul 27, 2020 at 10:01 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > >> > >> On Mon, Jul 27, 2020 at 4:35 PM <bimmy.pujari@intel.com> wrote: > >> > > >> > From: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > >> > > >> > The existing bpf helper functions to get timestamp return the time > >> > elapsed since system boot. This timestamp is not particularly useful > >> > where epoch timestamp is required or more than one server is involved > >> > and time sync is required. Instead, you want to use CLOCK_REALTIME, > >> > which provides epoch timestamp. > >> > Hence add bfp_ktime_get_real_ns() based around CLOCK_REALTIME. > >> > > >> > >> This doesn't seem like a good idea. > > > > > > I disagree. > > > >> > >> With time-since-boot it's very > >> easy to translate timestamp into a real time on the host. > > > > > > While it's easy to do, it's annoying, because you need to hardcode the offset into the bpf program > > (which has security implications) or use another bpf array lookup/read (perf implications). > > There is no array lookup/read if you are using a global variable for this. > > [AN] Yes, as Maciej mentioned, converting since-boot-time to epoch in kernel has performance implication when the epoch timestamp/offset is provided by user-space and hard coding the offset doesn't seem to be a good idea. I am not sure what you refer by using a global variable. > This is global variable: /* this gets initialized from user-space, * see tools/bpf/runqslower for examples, * or many of selftests doing this */ const volatile __u64 boot_to_wall_off_ns = 0; SEC("...") int my_handler(...) { u64 wallclock_ns = bpf_ktime_get_boot_ns() + boot_to_wall_off_ns; ... } There is no map lookup involved, boot_to_wall_off_ns is accessed directly. > > > >> > >> Having > >> get_real_ns() variant might just encourage people to assume precise > >> wall-clock timestamps that can be compared between within or even > >> across different hosts. > > > > > > In some environments they *can*. Within a datacenter it is pretty common > > to have extremely tightly synchronized clocks across thousands of machines. > > In some, yes, which also means that in some other they can't. So I'm > still worried about misuses of REALCLOCK, within (internal daemons > within the company) our outside (BCC tools and alike) of data centers. > Especially if people will start using it to measure elapsed time > between events. I'd rather not have to explain over and over again > that REALCLOCK is not for measuring passage of time. > > [AN] Time synchronization fidelity or accuracy requirement is driven by the use-case. A time synchronization is not the question here, you assume that your time synchronization accuracy is sufficient for your use-case. You can always aim for better accuracy. For instance, the proposed helper function can be used as a mechanism to communicate the timestamp. For this, one use-case could be to measure e2e latency by adding egress timestamp to the packet. To do that, we sync the clock across all the hosts using NTP within a data center. You are talking about use cases where you guys are conscious about implications of using wallclock timestamps, and I'm talking about general BPF users that might not know about any of these nuances, but seeing bpf_ktime_get_real_ns() would be happy that it's "real" and would just use it for everything. > For instance, the proposed helper function can be used as a mechanism to communicate the timestamp. See example above, can `bpf_ktime_get_boot_ns() + boot_to_wall_off_ns` be used similarly for cross-host timestamp comparison? > > > > >> > >> REALCLOCK can jump around, you can get > >> duplicate timestamps, timestamps can go back in time, etc. > > > > > > That's only true if you allow it to. On a server machine, once it's booted > > Right, but BPF is used so widely that it will be true at least in some cases. > > > up and time is synchronized, it no longer jumps around, only the frequency > > is slowly adjusted to keep things in sync. > > > >> > >> It's just not a good way to measure time. > > > > > > It doesn't change the fact that some packets/protocols include real world timestamps. > > We already have the since-boot time fetchers, so this doesn't prevent you from > > using that - if that is what you want. > > > > Also one thing to remember is that a lot of time you only need ~1s precision. > > In which case 'real' can simply be a lot easier to deal with. > > > > Again, I'm worried about ensuing confusion by users trying to pick > which variant to use. I'm not worried about those who understand the > difference and can pick the right one for their needs, I'm worried > about those that don't even give it a second thought. Given it's > rather simple to convert boot time into wall-clock time, it doesn't > seem like a necessary addition. > > >> > >> Also, you mention the need for time sync. It's an extremely hard thing > >> to have synchronized time between two different physical hosts, as > >> anyone that has dealt with distributed systems will attest. Having > >> this helper will just create a dangerous illusion that it is possible > >> and will just cause more problems down the road for people. > > > > > > It is possible. But yes. It is very very hard. > > You can read up on Google TrueTime as an example real world implementation. > > Thank you, I did, though quite a while ago. Notice, I didn't say it's > impossible, right? ;) But then Google TrueTime provides a range of > time within confidence intervals, not a single seemingly-deterministic > timestamp. And it needs custom hardware, so it's not realistic to > expect to have it everywhere :) > > > > >> > >> > >> > Signed-off-by: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > >> > Signed-off-by: Bimmy Pujari <bimmy.pujari@intel.com> > >> > --- > >> > drivers/media/rc/bpf-lirc.c | 2 ++ > >> > include/linux/bpf.h | 1 + > >> > include/uapi/linux/bpf.h | 7 +++++++ > >> > kernel/bpf/core.c | 1 + > >> > kernel/bpf/helpers.c | 14 ++++++++++++++ > >> > kernel/trace/bpf_trace.c | 2 ++ > >> > tools/include/uapi/linux/bpf.h | 7 +++++++ > >> > 7 files changed, 34 insertions(+) > >> > >> [...] > > > > > > Maciej Żenczykowski, Kernel Networking Developer @ Google
On Tue, Jul 28, 2020 at 1:57 PM David Ahern <dsahern@gmail.com> wrote: > > On 7/28/20 12:28 PM, Andrii Nakryiko wrote: > > In some, yes, which also means that in some other they can't. So I'm > > still worried about misuses of REALCLOCK, within (internal daemons > > within the company) our outside (BCC tools and alike) of data centers. > > Especially if people will start using it to measure elapsed time > > between events. I'd rather not have to explain over and over again > > that REALCLOCK is not for measuring passage of time. > > Why is documenting the type of clock and its limitations not sufficient? > Users are going to make mistakes and use of gettimeofday to measure time > differences is a common one for userspace code. That should not define > or limit the ability to correctly and most directly do something in bpf. > > I have a patch to export local_clock as bpf_ktime_get_fast_ns. It too > can be abused given that it has limitations (can not be used across CPUs > and does not correlate to any exported clock), but it too has important > use cases (twice as fast as bpf_ktime_get_ns and useful for per-cpu > delta-time needs). > > Users have to know what they are doing; making mistakes is part of > learning. Proper documentation is all you can do. I don't believe that's all we can do. Designing APIs that are less error-prone is at least one way to go about that. One can find plenty of examples where well-documented and standardized APIs are nevertheless misused regularly. Also, "users read and follow documentation" doesn't match my experience, unfortunately.
On 7/28/20 11:15 PM, Andrii Nakryiko wrote: > On Tue, Jul 28, 2020 at 1:57 PM David Ahern <dsahern@gmail.com> wrote: >> >> On 7/28/20 12:28 PM, Andrii Nakryiko wrote: >>> In some, yes, which also means that in some other they can't. So I'm >>> still worried about misuses of REALCLOCK, within (internal daemons >>> within the company) our outside (BCC tools and alike) of data centers. >>> Especially if people will start using it to measure elapsed time >>> between events. I'd rather not have to explain over and over again >>> that REALCLOCK is not for measuring passage of time. >> >> Why is documenting the type of clock and its limitations not sufficient? >> Users are going to make mistakes and use of gettimeofday to measure time >> differences is a common one for userspace code. That should not define >> or limit the ability to correctly and most directly do something in bpf. >> >> I have a patch to export local_clock as bpf_ktime_get_fast_ns. It too >> can be abused given that it has limitations (can not be used across CPUs >> and does not correlate to any exported clock), but it too has important >> use cases (twice as fast as bpf_ktime_get_ns and useful for per-cpu >> delta-time needs). >> >> Users have to know what they are doing; making mistakes is part of >> learning. Proper documentation is all you can do. > > I don't believe that's all we can do. Designing APIs that are less > error-prone is at least one way to go about that. One can find plenty > of examples where well-documented and standardized APIs are > nevertheless misused regularly. Also, "users read and follow > documentation" doesn't match my experience, unfortunately. > This API is about exposing a standard, well-known clock source to bpf programs. Your argument is no because some users might use it incorrectly, and I think that argument is wrong. Don't make people who know what they are doing jump through hoops because a novice might make a mistake.
On Mon Jul 27, 2020 at 10:01 PM PDT, Andrii Nakryiko wrote: > On Mon, Jul 27, 2020 at 4:35 PM <bimmy.pujari@intel.com> wrote: > > > > From: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > > > > The existing bpf helper functions to get timestamp return the time > > elapsed since system boot. This timestamp is not particularly useful > > where epoch timestamp is required or more than one server is involved > > and time sync is required. Instead, you want to use CLOCK_REALTIME, > > which provides epoch timestamp. > > Hence add bfp_ktime_get_real_ns() based around CLOCK_REALTIME. > > > > This doesn't seem like a good idea. With time-since-boot it's very > easy to translate timestamp into a real time on the host. For bpftrace, we have a need to get millisecond-level precision on timestamps. bpf has nanosecond level precision via bpf_ktime_get[_boot]_ns(), but to the best of my knowledge userspace doesn't have a high precision boot timestamp. There's /proc/stat's btime, but that's second-level precision. There's also /proc/uptime which has millisecond-level precision but you need to make a second call to get current time. Between those two calls there could be some unknown delta. For millisecond we could probably get away with calculating a delta and warning on large delta but maybe one day we'll want microsecond-level precision. I'll probably put up a patch to add nanoseconds to btime (new field in /proc/stat) to see how it's received. But either this patch or my patch would work for bpftrace. [...] Thanks, Daniel
On Tue, Aug 18, 2020 at 2:11 PM Daniel Xu <dxu@dxuuu.xyz> wrote: > > On Mon Jul 27, 2020 at 10:01 PM PDT, Andrii Nakryiko wrote: > > On Mon, Jul 27, 2020 at 4:35 PM <bimmy.pujari@intel.com> wrote: > > > > > > From: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > > > > > > The existing bpf helper functions to get timestamp return the time > > > elapsed since system boot. This timestamp is not particularly useful > > > where epoch timestamp is required or more than one server is involved > > > and time sync is required. Instead, you want to use CLOCK_REALTIME, > > > which provides epoch timestamp. > > > Hence add bfp_ktime_get_real_ns() based around CLOCK_REALTIME. > > > > > > > This doesn't seem like a good idea. With time-since-boot it's very > > easy to translate timestamp into a real time on the host. > > For bpftrace, we have a need to get millisecond-level precision on > timestamps. bpf has nanosecond level precision via > bpf_ktime_get[_boot]_ns(), but to the best of my knowledge userspace > doesn't have a high precision boot timestamp. > > There's /proc/stat's btime, but that's second-level precision. There's > also /proc/uptime which has millisecond-level precision but you need to > make a second call to get current time. Between those two calls there > could be some unknown delta. For millisecond we could probably get away > with calculating a delta and warning on large delta but maybe one day > we'll want microsecond-level precision. > > I'll probably put up a patch to add nanoseconds to btime (new field in > /proc/stat) to see how it's received. But either this patch or my patch > would work for bpftrace. > > [...] > > Thanks, > Daniel Not sure what problem you're trying to solve and thus what exactly you need... but you can probably get something very very close with 1 or 2 of clock_gettime(CLOCK_{BOOTTIME,MONOTONIC,REALTIME}) possibly in a triple vdso call sandwich and iterated a few times and picking the one with smallest delta between 1st and 3rd calls. And then assuming the avg of 1st and 3rd matches the 2nd. ie. 5 times do: t1[i] = clock_gettime(REALTIME) t2[i] = clock_gettime(BOOTTIME) t3[i] = clock_gettime(REALTIME) pick i so t3[i] - t1[i] is smallest t2[i] is near equivalent to (t1[i] + t3[i]) / 2 which basically gives you a REAL to BOOT offset.
Hi Maciej, On Tue Aug 18, 2020 at 2:19 PM PDT, Maciej Żenczykowski wrote: > On Tue, Aug 18, 2020 at 2:11 PM Daniel Xu <dxu@dxuuu.xyz> wrote: > > > > On Mon Jul 27, 2020 at 10:01 PM PDT, Andrii Nakryiko wrote: > > > On Mon, Jul 27, 2020 at 4:35 PM <bimmy.pujari@intel.com> wrote: > > > > > > > > From: Ashkan Nikravesh <ashkan.nikravesh@intel.com> > > > > > > > > The existing bpf helper functions to get timestamp return the time > > > > elapsed since system boot. This timestamp is not particularly useful > > > > where epoch timestamp is required or more than one server is involved > > > > and time sync is required. Instead, you want to use CLOCK_REALTIME, > > > > which provides epoch timestamp. > > > > Hence add bfp_ktime_get_real_ns() based around CLOCK_REALTIME. > > > > > > > > > > This doesn't seem like a good idea. With time-since-boot it's very > > > easy to translate timestamp into a real time on the host. > > > > For bpftrace, we have a need to get millisecond-level precision on > > timestamps. bpf has nanosecond level precision via > > bpf_ktime_get[_boot]_ns(), but to the best of my knowledge userspace > > doesn't have a high precision boot timestamp. > > > > There's /proc/stat's btime, but that's second-level precision. There's > > also /proc/uptime which has millisecond-level precision but you need to > > make a second call to get current time. Between those two calls there > > could be some unknown delta. For millisecond we could probably get away > > with calculating a delta and warning on large delta but maybe one day > > we'll want microsecond-level precision. > > > > I'll probably put up a patch to add nanoseconds to btime (new field in > > /proc/stat) to see how it's received. But either this patch or my patch > > would work for bpftrace. > > > > [...] > > > > Thanks, > > Daniel > > Not sure what problem you're trying to solve and thus what exactly you > need... but you can probably get something very very close with 1 or 2 > of clock_gettime(CLOCK_{BOOTTIME,MONOTONIC,REALTIME}) possibly in a > triple vdso call sandwich and iterated a few times and picking the one > with smallest delta between 1st and 3rd calls. And then assuming the > avg of 1st and 3rd matches the 2nd. > ie. > > 5 times do: > t1[i] = clock_gettime(REALTIME) > t2[i] = clock_gettime(BOOTTIME) > t3[i] = clock_gettime(REALTIME) > > pick i so t3[i] - t1[i] is smallest > > t2[i] is near equivalent to (t1[i] + t3[i]) / 2 > > which basically gives you a REAL to BOOT offset. I tried out the triple vdso sandwich and I got pretty good results (~30ns ballpark). Thanks for the tip. Daniel
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 5bb144435c16..1cae0cfdcbaf 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -103,6 +103,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_real_ns: + return &bpf_ktime_get_real_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 72221aea1c60..9c00ad932f08 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1639,6 +1639,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_real_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54d0c886e3ba..7742c10fbc95 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3377,6 +3377,12 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * u64 bpf_ktime_get_real_ns(void) + * Description + * Return the real time in nanoseconds. + * See: **clock_gettime**\ (**CLOCK_REALTIME**) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3521,6 +3527,7 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(ktime_get_real_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7be02e555ab9..0f25f3413208 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2211,6 +2211,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_real_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be43ab3e619f..b726fd077698 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -155,6 +155,18 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_real_ns) +{ + /* NMI safe access to clock realtime */ + return ktime_get_real_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_real_ns_proto = { + .func = bpf_ktime_get_real_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_ktime_get_boot_ns) { /* NMI safe access to clock boottime */ @@ -633,6 +645,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_real_ns: + return &bpf_ktime_get_real_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ringbuf_output: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3cc0dcb60ca2..4e048dfb527b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1116,6 +1116,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_real_ns: + return &bpf_ktime_get_real_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54d0c886e3ba..9ca51f82dd35 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3377,6 +3377,12 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * u64 bpf_ktime_get_real_ns(void) + * Description + * Return the real time in nanoseconds. + * See: **clock_gettime**\ (**CLOCK_REALTIME**) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3521,6 +3527,7 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(ktime_get_real_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper