
[v1,1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC

Message ID 2bf90acf7d29641ba6643934ff8dbba897dbd2d9.1718873074.git.jialong.yang@shingroup.cn (mailing list archive)
State Changes Requested
Series [v1,1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC

Commit Message

Yang Jialong 杨佳龙 June 20, 2024, 8:51 a.m. UTC
mmiotrace is a useful tool for tracing MMIO accesses. At present it is
only supported on the x86 and x86_64 platforms. This patch adds support
for powerpc.
The manual is still Documentation/trace/mmiotrace.rst, i.e. the user API
is unchanged, so existing users will find it easy to pick up.
Almost all files are copied from arch/x86/mm; the only differences stem
from the hardware and from architecture-specific code.

LINK: https://lore.kernel.org/lkml/20080127195536.50809974@daedalus.pq.iki.fi/

Signed-off-by: Jialong Yang <jialong.yang@shingroup.cn>
---
 arch/powerpc/Kconfig.debug       |   3 +
 arch/powerpc/mm/Makefile         |   1 +
 arch/powerpc/mm/kmmio.c          | 649 +++++++++++++++++++++++++++++++
 arch/powerpc/mm/mmio-mod.c       | 414 ++++++++++++++++++++
 arch/powerpc/mm/mmiotrace_arch.c | 149 +++++++
 arch/powerpc/mm/mmiotrace_arch.h |  25 ++
 arch/powerpc/mm/pf_in.c          | 185 +++++++++
 arch/powerpc/mm/pf_in.h          |  33 ++
 8 files changed, 1459 insertions(+)
 create mode 100644 arch/powerpc/mm/kmmio.c
 create mode 100644 arch/powerpc/mm/mmio-mod.c
 create mode 100644 arch/powerpc/mm/mmiotrace_arch.c
 create mode 100644 arch/powerpc/mm/mmiotrace_arch.h
 create mode 100644 arch/powerpc/mm/pf_in.c
 create mode 100644 arch/powerpc/mm/pf_in.h

Comments

kernel test robot June 27, 2024, 12:31 p.m. UTC | #1
Hi Jialong,

kernel test robot noticed the following build errors:

[auto build test ERROR on powerpc/next]
[also build test ERROR on powerpc/fixes linus/master v6.10-rc5 next-20240626]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jialong-Yang/powerpc-mmiotrace-bind-ioremap-and-page-fault-to-active-mmiotrace/20240624-163027
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
patch link:    https://lore.kernel.org/r/2bf90acf7d29641ba6643934ff8dbba897dbd2d9.1718873074.git.jialong.yang%40shingroup.cn
patch subject: [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC
config: powerpc-randconfig-r113-20240627 (https://download.01.org/0day-ci/archive/20240627/202406271946.A6jwFfaY-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 13.2.0
reproduce: (https://download.01.org/0day-ci/archive/20240627/202406271946.A6jwFfaY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202406271946.A6jwFfaY-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   arch/powerpc/mm/kmmio.c: In function 'pmd_mkinvalid':
>> arch/powerpc/mm/kmmio.c:140:16: error: implicit declaration of function '__pmd_raw' [-Werror=implicit-function-declaration]
     140 |         return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
         |                ^~~~~~~~~
>> arch/powerpc/mm/kmmio.c:140:26: error: implicit declaration of function 'pmd_raw'; did you mean 'pmd_bad'? [-Werror=implicit-function-declaration]
     140 |         return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
         |                          ^~~~~~~
         |                          pmd_bad
   In file included from include/linux/byteorder/big_endian.h:5,
                    from arch/powerpc/include/uapi/asm/byteorder.h:14,
                    from include/asm-generic/bitops/le.h:6,
                    from arch/powerpc/include/asm/bitops.h:325,
                    from include/linux/bitops.h:63,
                    from include/linux/thread_info.h:27,
                    from arch/powerpc/include/asm/ptrace.h:342,
                    from arch/powerpc/include/asm/hw_irq.h:12,
                    from arch/powerpc/include/asm/irqflags.h:12,
                    from include/linux/irqflags.h:18,
                    from include/asm-generic/cmpxchg-local.h:6,
                    from arch/powerpc/include/asm/cmpxchg.h:755,
                    from arch/powerpc/include/asm/atomic.h:11,
                    from include/linux/atomic.h:7,
                    from include/linux/rcupdate.h:25,
                    from include/linux/rculist.h:11,
                    from arch/powerpc/mm/kmmio.c:10:
>> arch/powerpc/mm/kmmio.c:140:70: error: '_PAGE_INVALID' undeclared (first use in this function); did you mean 'RPM_INVALID'?
     140 |         return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
         |                                                                      ^~~~~~~~~~~~~
   include/uapi/linux/byteorder/big_endian.h:38:51: note: in definition of macro '__cpu_to_be64'
      38 | #define __cpu_to_be64(x) ((__force __be64)(__u64)(x))
         |                                                   ^
   arch/powerpc/mm/kmmio.c:140:42: note: in expansion of macro 'cpu_to_be64'
     140 |         return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
         |                                          ^~~~~~~~~~~
   arch/powerpc/mm/kmmio.c:140:70: note: each undeclared identifier is reported only once for each function it appears in
     140 |         return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
         |                                                                      ^~~~~~~~~~~~~
   include/uapi/linux/byteorder/big_endian.h:38:51: note: in definition of macro '__cpu_to_be64'
      38 | #define __cpu_to_be64(x) ((__force __be64)(__u64)(x))
         |                                                   ^
   arch/powerpc/mm/kmmio.c:140:42: note: in expansion of macro 'cpu_to_be64'
     140 |         return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
         |                                          ^~~~~~~~~~~
   arch/powerpc/mm/kmmio.c: In function 'kmmio_handler':
>> arch/powerpc/mm/kmmio.c:318:32: error: 'struct pt_regs' has no member named 'softe'
     318 |         ctx->saved_softe = regs->softe;
         |                                ^~
   arch/powerpc/mm/kmmio.c:330:13: error: 'struct pt_regs' has no member named 'softe'
     330 |         regs->softe = IRQS_DISABLED; // soft interrupt
         |             ^~
>> arch/powerpc/mm/kmmio.c:332:9: error: 'local_paca' undeclared (first use in this function); did you mean 'local_lock'?
     332 |         local_paca->srr_valid = 0;
         |         ^~~~~~~~~~
         |         local_lock
   arch/powerpc/mm/kmmio.c: In function 'post_kmmio_handler':
   arch/powerpc/mm/kmmio.c:383:13: error: 'struct pt_regs' has no member named 'softe'
     383 |         regs->softe = ctx->saved_softe;
         |             ^~
   arch/powerpc/mm/kmmio.c: In function 'pmd_mkinvalid':
>> arch/powerpc/mm/kmmio.c:141:1: warning: control reaches end of non-void function [-Wreturn-type]
     141 | }
         | ^
   cc1: some warnings being treated as errors


vim +/__pmd_raw +140 arch/powerpc/mm/kmmio.c

   137	
   138	static inline pmd_t pmd_mkinvalid(pmd_t pmd)
   139	{
 > 140		return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
 > 141	}
   142	
   143	static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
   144	{
   145		pmd_t new_pmd;
   146		pmdval_t v = pmd_val(*pmd);
   147	
   148		if (clear) {
   149			*old = v;
   150			new_pmd = pmd_mkinvalid(*pmd);
   151		} else {
   152			/* Presume this has been called with clear==true previously */
   153			new_pmd = __pmd(*old);
   154		}
   155		*pmd = new_pmd;
   156	}
   157	
   158	static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old, unsigned long addr)
   159	{
   160		pteval_t v = pte_val(*pte);
   161	
   162		if (clear) {
   163			*old = v;
   164			/* Nothing should care about address */
   165			pte_clear(&init_mm, addr, pte);
   166		} else {
   167			/* Presume this has been called with clear==true previously */
   168			set_pte_at(&init_mm, addr, pte, __pte(*old));
   169		}
   170	}
   171	
   172	static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
   173	{
   174		unsigned int level;
   175		pte_t *pte = lookup_address(f->addr, &level);
   176	
   177		if (!pte) {
   178			pr_err("no pte for addr 0x%08lx\n", f->addr);
   179			return -1;
   180		}
   181	
   182		if (level == PMD_SHIFT)
   183			clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
   184		else if (level == PAGE_SHIFT)
   185			clear_pte_presence(pte, clear, &f->old_presence, f->addr);
   186		else {
   187			pr_err("unexpected page level 0x%x.\n", level);
   188			return -1;
   189		}
   190	
   191		mmap_read_lock(&init_mm);
   192		struct vm_area_struct *vma = find_vma(&init_mm, f->addr);
   193	
   194		mmap_read_unlock(&init_mm);
   195	
   196		flush_tlb_page(vma, f->addr);
   197	
   198		return 0;
   199	}
   200	
   201	/*
   202	 * Mark the given page as not present. Access to it will trigger a fault.
   203	 *
   204	 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
   205	 * protection is ignored here. RCU read lock is assumed held, so the struct
   206	 * will not disappear unexpectedly. Furthermore, the caller must guarantee,
   207	 * that double arming the same virtual address (page) cannot occur.
   208	 *
   209	 * Double disarming on the other hand is allowed, and may occur when a fault
   210	 * and mmiotrace shutdown happen simultaneously.
   211	 */
   212	static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
   213	{
   214		int ret;
   215	
   216		WARN_ONCE(f->armed, pr_fmt("kmmio page already armed.\n"));
   217		if (f->armed) {
   218			pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
   219				f->addr, f->count, !!f->old_presence);
   220		}
   221		ret = clear_page_presence(f, true);
   222		WARN_ONCE(ret < 0, pr_fmt("arming at 0x%08lx failed.\n"),
   223			  f->addr);
   224		f->armed = true;
   225		return ret;
   226	}
   227	
   228	/** Restore the given page to saved presence state. */
   229	static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
   230	{
   231		int ret = clear_page_presence(f, false);
   232	
   233		WARN_ONCE(ret < 0,
   234				KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
   235		f->armed = false;
   236	}
   237	
   238	/*
   239	 * This is being called from do_page_fault().
   240	 *
   241	 * We may be in an interrupt or a critical section. Also prefetching may
   242	 * trigger a page fault. We may be in the middle of process switch.
   243	 * We cannot take any locks, because we could be executing especially
   244	 * within a kmmio critical section.
   245	 *
   246	 * Local interrupts are disabled, so preemption cannot happen.
   247	 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
   248	 */
   249	/*
   250	 * Interrupts are disabled on entry as trap3 is an interrupt gate
   251	 * and they remain disabled throughout this function.
   252	 */
   253	int kmmio_handler(struct pt_regs *regs, unsigned long addr)
   254	{
   255		struct kmmio_context *ctx;
   256		struct kmmio_fault_page *faultpage;
   257		int ret = 0; /* default to fault not handled */
   258		unsigned long page_base = addr;
   259		unsigned int l;
   260		pte_t *pte = lookup_address(addr, &l);
   261	
   262		if (!pte)
   263			return -EINVAL;
   264		page_base &= page_level_mask(l);
   265	
   266		/*
   267		 * Hold the RCU read lock over single stepping to avoid looking
   268		 * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
   269		 * also disables preemption and prevents process switch during
   270		 * the single stepping. We can only handle one active kmmio trace
   271		 * per cpu, so ensure that we finish it before something else
   272		 * gets to run.
   273		 */
   274		rcu_read_lock_sched_notrace();
   275	
   276		faultpage = get_kmmio_fault_page(page_base);
   277		if (!faultpage) {
   278			/*
   279			 * Either this page fault is not caused by kmmio, or
   280			 * another CPU just pulled the kmmio probe from under
   281			 * our feet. The latter case should not be possible.
   282			 */
   283			goto no_kmmio;
   284		}
   285	
   286		ctx = this_cpu_ptr(&kmmio_ctx);
   287		if (ctx->active) {
   288			if (page_base == ctx->addr) {
   289				/*
   290				 * A second fault on the same page means some other
   291				 * condition needs handling by do_page_fault(), the
   292				 * page really not being present is the most common.
   293				 */
   294				pr_debug("secondary hit for 0x%08lx CPU %d.\n",
   295					 addr, smp_processor_id());
   296	
   297				if (!faultpage->old_presence)
   298					pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
   299						addr, smp_processor_id());
   300			} else {
   301				/*
   302				 * Prevent overwriting already in-flight context.
   303				 * This should not happen, let's hope disarming at
   304				 * least prevents a panic.
   305				 */
   306				pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
   307					 smp_processor_id(), addr);
   308				pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
   309				disarm_kmmio_fault_page(faultpage);
   310			}
   311			goto no_kmmio;
   312		}
   313		ctx->active++;
   314	
   315		ctx->fpage = faultpage;
   316		ctx->probe = get_kmmio_probe(page_base);
   317		ctx->saved_flags = (regs->msr & (MSR_SE | MSR_EE));
 > 318		ctx->saved_softe = regs->softe;
   319		ctx->addr = page_base;
   320	
   321		if (ctx->probe && ctx->probe->pre_handler)
   322			ctx->probe->pre_handler(ctx->probe, regs, addr);
   323	
   324		/*
   325		 * Enable single-stepping and disable interrupts for the faulting
   326		 * context. Local interrupts must not get enabled during stepping.
   327		 */
   328		regs->msr |= MSR_SE;         // single step
   329		regs->msr &= ~MSR_EE;        // hard interrupt
   330		regs->softe = IRQS_DISABLED; // soft interrupt
   331	
 > 332		local_paca->srr_valid = 0;
   333	
   334		/* Now we set present bit in PTE and single step. */
   335		disarm_kmmio_fault_page(ctx->fpage);
   336	
   337		/*
   338		 * If another cpu accesses the same page while we are stepping,
   339		 * the access will not be caught. It will simply succeed and the
   340		 * only downside is we lose the event. If this becomes a problem,
   341		 * the user should drop to single cpu before tracing.
   342		 */
   343	
   344		return 1; /* fault handled */
   345	
   346	no_kmmio:
   347		rcu_read_unlock_sched_notrace();
   348		return ret;
   349	}
   350
Michael Ellerman June 28, 2024, 7:02 a.m. UTC | #2
Jialong Yang <jialong.yang@shingroup.cn> writes:
> mmiotrace is a useful tool for tracing MMIO accesses. At present it is
> only supported on the x86 and x86_64 platforms.

I've never used mmiotrace, and don't know it well.

I'm not necessarily opposed to merging it, but AFAIK it was mostly used
for reverse engineering proprietary drivers, where the driver itself
couldn't be easily instrumented. Is that what you're using it for?

For drivers where we have the source wouldn't it be easier to just use
tracepoints in the MMIO accessors?

Is it still in-use/maintained on the x86 side?

> This patch adds support for powerpc.
> The manual is still Documentation/trace/mmiotrace.rst, i.e. the user API
> is unchanged, so existing users will find it easy to pick up.
> Almost all files are copied from arch/x86/mm; the only differences stem
> from the hardware and from architecture-specific code.
>
> LINK: https://lore.kernel.org/lkml/20080127195536.50809974@daedalus.pq.iki.fi/
>
> Signed-off-by: Jialong Yang <jialong.yang@shingroup.cn>
> ---
>  arch/powerpc/Kconfig.debug       |   3 +
>  arch/powerpc/mm/Makefile         |   1 +
>  arch/powerpc/mm/kmmio.c          | 649 +++++++++++++++++++++++++++++++
>  arch/powerpc/mm/mmio-mod.c       | 414 ++++++++++++++++++++
>  arch/powerpc/mm/mmiotrace_arch.c | 149 +++++++
>  arch/powerpc/mm/mmiotrace_arch.h |  25 ++
>  arch/powerpc/mm/pf_in.c          | 185 +++++++++
>  arch/powerpc/mm/pf_in.h          |  33 ++
>  8 files changed, 1459 insertions(+)
  
At a glance most of that code could be shared between arches. I don't
think I can merge that as-is, without some attempt to split the generic
parts out.

cheers
Yang Jialong 杨佳龙 June 28, 2024, 8:21 a.m. UTC | #3
On 2024/6/28 15:02, Michael Ellerman wrote:
> Jialong Yang <jialong.yang@shingroup.cn> writes:
>> mmiotrace is a useful tool for tracing MMIO accesses. At present it is
>> only supported on the x86 and x86_64 platforms.
> I've never used mmiotrace, and don't know it well.
>
> I'm not necessarily opposed to merging it, but AFAIK it was mostly used
> for reverse engineering proprietary drivers, where the driver itself
> couldn't be easily instrumented. Is that what you're using it for?

Yes, just as you guessed. We have used it for network stack debugging on
ppc64le.


>
> For drivers where we have the source wouldn't it be easier to just use
> tracepoints in the MMIO accessors?


Tracepoints need to be pre-defined, and in a big driver it is not easy to
cover every point that accesses registers in the I/O area. A tracepoint is
also only a filter at the C function level (see the rough sketch below).
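
For illustration only, here is a hand-written wrapper in that "C function
level" style, with trace_printk() used as a stand-in for a real tracepoint;
my_traced_readl() is an invented name, not an existing kernel API:

#include <linux/io.h>
#include <linux/kernel.h>

/*
 * Hypothetical wrapper: every accessor (or call site) has to be changed by
 * hand like this, which is what becomes painful in a large driver.
 */
static inline u32 my_traced_readl(const volatile void __iomem *addr)
{
	u32 val = readl(addr);

	trace_printk("readl %px -> 0x%x\n", addr, val);
	return val;
}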

mmiotrace is similar to putting tracepoints in writel()/readl(), but it goes
deeper: it is an asm-level filter. It does not care what was done at the C
level; it only sees what the assembly actually did, such as stw (store word)
or lwz (load word), as if standing in the view of the device.
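
To make the "view of the device" point concrete, here is a rough,
self-contained user-space sketch of the D-form decode that pf_in.c performs
on the faulting instruction word (the encoding below was picked by hand for
the example, it is not from a real trace):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* stw r3,0(r4): D-form, primary opcode 36, RS = r3, RA = r4 */
	uint32_t insn = 0x90640000;
	uint32_t primary = insn >> 26;               /* ISA bits 0-5   */
	uint32_t rs_rt   = (insn >> 21) & 0x1f;      /* ISA bits 6-10  */
	uint32_t ra      = (insn >> 16) & 0x1f;      /* ISA bits 11-15 */
	int32_t  offset  = (int16_t)(insn & 0xffff); /* ISA bits 16-31 */

	printf("opcode %u, value in r%u, address in r%u + %d\n",
	       primary, rs_rt, ra, offset);
	return 0;
}

mmiotrace just sees this store hitting the armed page; it does not care
which C helper or driver function generated it.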


>
> Is it still in-use/maintained on the x86 side?


Here are the yearly patch counts for the core files on the x86 side:

|      | mmio_mod.c | kmmio.c | pf_in.* | testmmiotrace.c |
|------+------------+---------+---------+-----------------|
| 2022 |          1 |       3 |         |                 |
| 2021 |          2 |       1 |         |                 |
| 2020 |          4 |       4 |         |               1 |
| 2019 |          2 |       1 |       1 |               4 |
| 2018 |            |       2 |         |                 |
| 2017 |          2 |       2 |         |               1 |
| 2016 |          1 |       2 |       1 |                 |
| 2014 |            |       1 |         |                 |
| 2013 |          1 |         |         |                 |
| 2012 |          1 |         |         |                 |
| 2011 |          3 |         |       1 |                 |
| 2010 |          1 |       3 |       2 |               1 |
| 2009 |          4 |      19 |         |               3 |
| 2008 |         13 |       5 |       2 |               3 |

>
>> This patch adds support for powerpc.
>> The manual is still Documentation/trace/mmiotrace.rst, i.e. the user API
>> is unchanged, so existing users will find it easy to pick up.
>> Almost all files are copied from arch/x86/mm; the only differences stem
>> from the hardware and from architecture-specific code.
>>
>> LINK: https://lore.kernel.org/lkml/20080127195536.50809974@daedalus.pq.iki.fi/
>>
>> Signed-off-by: Jialong Yang <jialong.yang@shingroup.cn>
>> ---
>>   arch/powerpc/Kconfig.debug       |   3 +
>>   arch/powerpc/mm/Makefile         |   1 +
>>   arch/powerpc/mm/kmmio.c          | 649 +++++++++++++++++++++++++++++++
>>   arch/powerpc/mm/mmio-mod.c       | 414 ++++++++++++++++++++
>>   arch/powerpc/mm/mmiotrace_arch.c | 149 +++++++
>>   arch/powerpc/mm/mmiotrace_arch.h |  25 ++
>>   arch/powerpc/mm/pf_in.c          | 185 +++++++++
>>   arch/powerpc/mm/pf_in.h          |  33 ++
>>   8 files changed, 1459 insertions(+)
>    
> At a glance most of that code could be shared between arches. I don't
> think I can merge that as-is, without some attempt to split the generic
> parts out.


Right.

I just copied them from arch/x86/mm. Much of the code is not arch-specific.


> cheers
>

Patch

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 8c80b154e814..8a69188aa75a 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -1,5 +1,8 @@ 
 # SPDX-License-Identifier: GPL-2.0
 
+config HAVE_MMIOTRACE_SUPPORT
+	def_bool y
+
 config PPC_DISABLE_WERROR
 	bool "Don't build arch/powerpc code with -Werror"
 	help
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0fe2f085c05a..cb92049f1239 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -17,3 +17,4 @@  obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
 obj-$(CONFIG_PTDUMP_CORE)	+= ptdump/
 obj-$(CONFIG_KASAN)		+= kasan/
+obj-$(CONFIG_MMIOTRACE) += kmmio.o mmio-mod.o pf_in.o mmiotrace_arch.o
diff --git a/arch/powerpc/mm/kmmio.c b/arch/powerpc/mm/kmmio.c
new file mode 100644
index 000000000000..f4374e721b37
--- /dev/null
+++ b/arch/powerpc/mm/kmmio.c
@@ -0,0 +1,649 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Support for MMIO probes.
+ * Derived from arch/x86/mm/kmmio.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/paca.h>
+#include <linux/errno.h>
+#include <linux/mmiotrace.h>
+
+#include "mmiotrace_arch.h"
+
+typedef unsigned long	pteval_t;
+typedef unsigned long	pmdval_t;
+
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+struct kmmio_fault_page {
+	struct list_head list;
+	struct kmmio_fault_page *release_next;
+	unsigned long addr; /* the requested address */
+	pteval_t old_presence; /* page presence prior to arming */
+	bool armed;
+
+	/*
+	 * Number of times this page has been registered as a part
+	 * of a probe. If zero, page is disarmed and this may be freed.
+	 * Used only by writers (RCU) and post_kmmio_handler().
+	 * Protected by kmmio_lock, when linked into kmmio_page_table.
+	 */
+	int count;
+
+	bool scheduled_for_release;
+};
+
+struct kmmio_delayed_release {
+	struct rcu_head rcu;
+	struct kmmio_fault_page *release_list;
+};
+
+struct kmmio_context {
+	struct kmmio_fault_page *fpage;
+	struct kmmio_probe *probe;
+	unsigned long saved_flags;
+	unsigned long saved_softe;
+	unsigned long addr;
+	int active;
+};
+
+/*
+ * The kmmio_lock is taken in int3 context, which is treated as NMI context.
+ * This causes lockdep to complain about it bein in both NMI and normal
+ * context. Hide it from lockdep, as it should not have any other locks
+ * taken under it, and this is only enabled for debugging mmio anyway.
+ */
+static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+/* Protected by kmmio_lock */
+unsigned int kmmio_count;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+static struct list_head *kmmio_page_list(unsigned long addr)
+{
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+
+	if (!pte)
+		return NULL;
+	addr &= page_level_mask(l);
+
+	return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
+}
+
+/* Accessed per-cpu */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+	struct kmmio_probe *p;
+
+	list_for_each_entry_rcu(p, &kmmio_probes, list) {
+		if (addr >= p->addr && addr < (p->addr + p->len))
+			return p;
+	}
+	return NULL;
+}
+
+/* You must be holding RCU read lock. */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
+{
+	struct list_head *head;
+	struct kmmio_fault_page *f;
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+
+	if (!pte)
+		return NULL;
+	addr &= page_level_mask(l);
+	head = kmmio_page_list(addr);
+	list_for_each_entry_rcu(f, head, list) {
+		if (f->addr == addr)
+			return f;
+	}
+	return NULL;
+}
+
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
+{
+	return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
+}
+
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
+{
+	pmd_t new_pmd;
+	pmdval_t v = pmd_val(*pmd);
+
+	if (clear) {
+		*old = v;
+		new_pmd = pmd_mkinvalid(*pmd);
+	} else {
+		/* Presume this has been called with clear==true previously */
+		new_pmd = __pmd(*old);
+	}
+	*pmd = new_pmd;
+}
+
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old, unsigned long addr)
+{
+	pteval_t v = pte_val(*pte);
+
+	if (clear) {
+		*old = v;
+		/* Nothing should care about address */
+		pte_clear(&init_mm, addr, pte);
+	} else {
+		/* Presume this has been called with clear==true previously */
+		set_pte_at(&init_mm, addr, pte, __pte(*old));
+	}
+}
+
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
+{
+	unsigned int level;
+	pte_t *pte = lookup_address(f->addr, &level);
+
+	if (!pte) {
+		pr_err("no pte for addr 0x%08lx\n", f->addr);
+		return -1;
+	}
+
+	if (level == PMD_SHIFT)
+		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
+	else if (level == PAGE_SHIFT)
+		clear_pte_presence(pte, clear, &f->old_presence, f->addr);
+	else {
+		pr_err("unexpected page level 0x%x.\n", level);
+		return -1;
+	}
+
+	mmap_read_lock(&init_mm);
+	struct vm_area_struct *vma = find_vma(&init_mm, f->addr);
+
+	mmap_read_unlock(&init_mm);
+
+	flush_tlb_page(vma, f->addr);
+
+	return 0;
+}
+
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+	int ret;
+
+	WARN_ONCE(f->armed, pr_fmt("kmmio page already armed.\n"));
+	if (f->armed) {
+		pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
+			f->addr, f->count, !!f->old_presence);
+	}
+	ret = clear_page_presence(f, true);
+	WARN_ONCE(ret < 0, pr_fmt("arming at 0x%08lx failed.\n"),
+		  f->addr);
+	f->armed = true;
+	return ret;
+}
+
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+	int ret = clear_page_presence(f, false);
+
+	WARN_ONCE(ret < 0,
+			KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
+	f->armed = false;
+}
+
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefetching may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+	struct kmmio_context *ctx;
+	struct kmmio_fault_page *faultpage;
+	int ret = 0; /* default to fault not handled */
+	unsigned long page_base = addr;
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+
+	if (!pte)
+		return -EINVAL;
+	page_base &= page_level_mask(l);
+
+	/*
+	 * Hold the RCU read lock over single stepping to avoid looking
+	 * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
+	 * also disables preemption and prevents process switch during
+	 * the single stepping. We can only handle one active kmmio trace
+	 * per cpu, so ensure that we finish it before something else
+	 * gets to run.
+	 */
+	rcu_read_lock_sched_notrace();
+
+	faultpage = get_kmmio_fault_page(page_base);
+	if (!faultpage) {
+		/*
+		 * Either this page fault is not caused by kmmio, or
+		 * another CPU just pulled the kmmio probe from under
+		 * our feet. The latter case should not be possible.
+		 */
+		goto no_kmmio;
+	}
+
+	ctx = this_cpu_ptr(&kmmio_ctx);
+	if (ctx->active) {
+		if (page_base == ctx->addr) {
+			/*
+			 * A second fault on the same page means some other
+			 * condition needs handling by do_page_fault(), the
+			 * page really not being present is the most common.
+			 */
+			pr_debug("secondary hit for 0x%08lx CPU %d.\n",
+				 addr, smp_processor_id());
+
+			if (!faultpage->old_presence)
+				pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
+					addr, smp_processor_id());
+		} else {
+			/*
+			 * Prevent overwriting already in-flight context.
+			 * This should not happen, let's hope disarming at
+			 * least prevents a panic.
+			 */
+			pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
+				 smp_processor_id(), addr);
+			pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
+			disarm_kmmio_fault_page(faultpage);
+		}
+		goto no_kmmio;
+	}
+	ctx->active++;
+
+	ctx->fpage = faultpage;
+	ctx->probe = get_kmmio_probe(page_base);
+	ctx->saved_flags = (regs->msr & (MSR_SE | MSR_EE));
+	ctx->saved_softe = regs->softe;
+	ctx->addr = page_base;
+
+	if (ctx->probe && ctx->probe->pre_handler)
+		ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+	/*
+	 * Enable single-stepping and disable interrupts for the faulting
+	 * context. Local interrupts must not get enabled during stepping.
+	 */
+	regs->msr |= MSR_SE;         // single step
+	regs->msr &= ~MSR_EE;        // hard interrupt
+	regs->softe = IRQS_DISABLED; // soft interrupt
+
+	local_paca->srr_valid = 0;
+
+	/* Now we set present bit in PTE and single step. */
+	disarm_kmmio_fault_page(ctx->fpage);
+
+	/*
+	 * If another cpu accesses the same page while we are stepping,
+	 * the access will not be caught. It will simply succeed and the
+	 * only downside is we lose the event. If this becomes a problem,
+	 * the user should drop to single cpu before tracing.
+	 */
+
+	return 1; /* fault handled */
+
+no_kmmio:
+	rcu_read_unlock_sched_notrace();
+	return ret;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * This must always get called as the pair to kmmio_handler().
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+	int ret = 0;
+	struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx);
+
+	if (!ctx->active) {
+		/*
+		 * debug traps without an active context are due to either
+		 * something external causing them (f.e. using a debugger while
+		 * mmio tracing enabled), or erroneous behaviour
+		 */
+		pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id());
+		goto out;
+	}
+
+	if (ctx->probe && ctx->probe->post_handler)
+		ctx->probe->post_handler(ctx->probe, condition, regs);
+
+	/* Prevent racing against release_kmmio_fault_page(). */
+	arch_spin_lock(&kmmio_lock);
+	if (ctx->fpage->count)
+		arm_kmmio_fault_page(ctx->fpage);
+	arch_spin_unlock(&kmmio_lock);
+
+	// disabled single step in entry of single_step_exception.
+	// regs->msr &= ~MSR_SE;
+	regs->msr |= ctx->saved_flags;
+	regs->softe = ctx->saved_softe;
+
+	/* These were acquired in kmmio_handler(). */
+	ctx->active--;
+	BUG_ON(ctx->active);
+	rcu_read_unlock_sched_notrace();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, flags
+	 * will have TF set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (!(regs->msr & MSR_SE))
+		ret = 1;
+out:
+	return ret;
+}
+
+/* You must be holding kmmio_lock. */
+static int add_kmmio_fault_page(unsigned long addr)
+{
+	struct kmmio_fault_page *f;
+
+	f = get_kmmio_fault_page(addr);
+	if (f) {
+		if (!f->count)
+			arm_kmmio_fault_page(f);
+		f->count++;
+		return 0;
+	}
+
+	f = kzalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -1;
+
+	f->count = 1;
+	f->addr = addr;
+
+	if (arm_kmmio_fault_page(f)) {
+		kfree(f);
+		return -1;
+	}
+
+	list_add_rcu(&f->list, kmmio_page_list(f->addr));
+
+	return 0;
+}
+
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long addr,
+				struct kmmio_fault_page **release_list)
+{
+	struct kmmio_fault_page *f;
+
+	f = get_kmmio_fault_page(addr);
+	if (!f)
+		return;
+
+	f->count--;
+	BUG_ON(f->count < 0);
+	if (!f->count) {
+		disarm_kmmio_fault_page(f);
+		if (!f->scheduled_for_release) {
+			f->release_next = *release_list;
+			*release_list = f;
+			f->scheduled_for_release = true;
+		}
+	}
+}
+
+/*
+ * With page-unaligned ioremaps, one or two armed pages may contain
+ * addresses from outside the intended mapping. Events for these addresses
+ * are currently silently dropped. The events may result only from programming
+ * mistakes by accessing addresses before the beginning or past the end of a
+ * mapping.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long flags;
+	int ret = 0;
+	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
+	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+	unsigned int l;
+	pte_t *pte;
+
+	local_irq_save(flags);
+	arch_spin_lock(&kmmio_lock);
+	if (get_kmmio_probe(addr)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	pte = lookup_address(addr, &l);
+	if (!pte) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	kmmio_count++;
+	list_add_rcu(&p->list, &kmmio_probes);
+	while (size < size_lim) {
+		if (add_kmmio_fault_page(addr + size))
+			pr_err("Unable to set page fault.\n");
+		size += page_level_size(l);
+	}
+out:
+	arch_spin_unlock(&kmmio_lock);
+	local_irq_restore(flags);
+
+	/*
+	 * XXX: What should I do here?
+	 * Here was a call to global_flush_tlb(), but it does not exist
+	 * anymore. It seems it's not needed after all.
+	 */
+	return ret;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr = container_of(
+						head,
+						struct kmmio_delayed_release,
+						rcu);
+	struct kmmio_fault_page *f = dr->release_list;
+
+	while (f) {
+		struct kmmio_fault_page *next = f->release_next;
+
+		BUG_ON(f->count);
+		kfree(f);
+		f = next;
+	}
+	kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr =
+		container_of(head, struct kmmio_delayed_release, rcu);
+	struct kmmio_fault_page *f = dr->release_list;
+	struct kmmio_fault_page **prevp = &dr->release_list;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	arch_spin_lock(&kmmio_lock);
+	while (f) {
+		if (!f->count) {
+			list_del_rcu(&f->list);
+			prevp = &f->release_next;
+		} else {
+			*prevp = f->release_next;
+			f->release_next = NULL;
+			f->scheduled_for_release = false;
+		}
+		f = *prevp;
+	}
+	arch_spin_unlock(&kmmio_lock);
+	local_irq_restore(flags);
+
+	/* This is the real RCU destroy call. */
+	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ *    Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ *    Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ *    Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long flags;
+	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
+	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+	struct kmmio_fault_page *release_list = NULL;
+	struct kmmio_delayed_release *drelease;
+	unsigned int l;
+	pte_t *pte;
+
+	pte = lookup_address(addr, &l);
+	if (!pte)
+		return;
+
+	local_irq_save(flags);
+	arch_spin_lock(&kmmio_lock);
+	while (size < size_lim) {
+		release_kmmio_fault_page(addr + size, &release_list);
+		size += page_level_size(l);
+	}
+	list_del_rcu(&p->list);
+	kmmio_count--;
+	arch_spin_unlock(&kmmio_lock);
+	local_irq_restore(flags);
+
+	if (!release_list)
+		return;
+
+	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+	if (!drelease)
+		return;
+
+	drelease->release_list = release_list;
+
+	/*
+	 * This is not really RCU here. We have just disarmed a set of
+	 * pages so that they cannot trigger page faults anymore. However,
+	 * we cannot remove the pages from kmmio_page_table,
+	 * because a probe hit might be in flight on another CPU. The
+	 * pages are collected into a list, and they will be removed from
+	 * kmmio_page_table when it is certain that no probe hit related to
+	 * these pages can be in flight. RCU grace period sounds like a
+	 * good choice.
+	 *
+	 * If we removed the pages too early, kmmio page fault handler might
+	 * not find the respective kmmio_fault_page and determine it's not
+	 * a kmmio fault, when it actually is. This would lead to madness.
+	 */
+	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
+{
+	struct die_args *arg = args;
+
+	if (val == DIE_SSTEP && post_kmmio_handler(0, arg->regs) == 1)
+		return NOTIFY_STOP;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_die = {
+	.notifier_call = kmmio_die_notifier
+};
+
+int kmmio_init(void)
+{
+	int i;
+
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+		INIT_LIST_HEAD(&kmmio_page_table[i]);
+
+	return register_die_notifier(&nb_die);
+}
+
+void kmmio_cleanup(void)
+{
+	int i;
+
+	unregister_die_notifier(&nb_die);
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+			  pr_fmt("kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"));
+	}
+}
diff --git a/arch/powerpc/mm/mmio-mod.c b/arch/powerpc/mm/mmio-mod.c
new file mode 100644
index 000000000000..68ba9f028678
--- /dev/null
+++ b/arch/powerpc/mm/mmio-mod.c
@@ -0,0 +1,414 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/x86/mm/mmio-mod.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#define pr_fmt(fmt) "mmiotrace: " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/mmiotrace.h>
+#include <linux/pgtable.h>
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include "pf_in.h"
+#include "mmiotrace_arch.h"
+
+struct remap_trace {
+	struct list_head list;
+	struct kmmio_probe probe;
+	resource_size_t phys;
+	unsigned long id;
+};
+
+/* Accessed per-cpu. */
+static DEFINE_PER_CPU(struct trap_reason, pf_reason);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
+
+static DEFINE_MUTEX(mmiotrace_mutex);
+static DEFINE_SPINLOCK(trace_lock);
+static atomic_t mmiotrace_enabled;
+static LIST_HEAD(trace_list);		/* struct remap_trace */
+
+/*
+ * Locking in this file:
+ * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
+ * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
+ *   and trace_lock.
+ * - Routines depending on is_enabled() must take trace_lock.
+ * - trace_list users must hold trace_lock.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
+ * - pre/post callbacks assume the effect of is_enabled() being true.
+ */
+
+/* module parameters */
+static unsigned long	filter_offset;
+static bool		nommiotrace;
+
+module_param(filter_offset, ulong, 0);
+module_param(nommiotrace, bool, 0);
+
+MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
+MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
+
+static bool is_enabled(void)
+{
+	return atomic_read(&mmiotrace_enabled);
+}
+
+static void print_pte(unsigned long address)
+{
+	unsigned int level;
+	pte_t *pte = lookup_address(address, &level);
+
+	if (!pte) {
+		pr_err("Error in %s: no pte for page 0x%08lx\n",
+		       __func__, address);
+		return;
+	}
+
+	if (level == PMD_SHIFT) {
+		pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
+			 address);
+		BUG();
+	}
+	pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
+		address,
+		(unsigned long long)pte_val(*pte),
+		(unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
+}
+
+/*
+ * For some reason the pre/post pairs have been called in an
+ * unmatched order. Report and die.
+ */
+static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
+{
+	const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+
+	pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
+		 addr, my_reason->addr);
+	print_pte(addr);
+	pr_emerg("faulting IP is at %pS\n", (void *)regs->nip);
+	pr_emerg("last faulting IP was at %pS\n", (void *)my_reason->ip);
+	put_cpu_var(pf_reason);
+	BUG();
+}
+
+static void pre(struct kmmio_probe *p, struct pt_regs *regs,
+						unsigned long addr)
+{
+	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+	struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+	const unsigned long instptr = instruction_pointer(regs);
+	struct opcode_t *opcode = get_opcode((unsigned int *)instptr);
+	enum mm_io_opcode type = get_ins_type(opcode);
+	struct remap_trace *trace = p->private;
+
+	/* it doesn't make sense to have more than one active trace per cpu */
+	if (my_reason->active_traces)
+		die_kmmio_nesting_error(regs, addr);
+	else
+		my_reason->active_traces++;
+
+	if (!opcode) {
+		pr_warn("The ins may be not included in src. Tell the dever follow info:");
+		pr_warn("ins_addr: 0x%lx    ins: 0x%lx", instptr, *(unsigned long *)instptr);
+	}
+
+	my_reason->opcode = opcode;
+
+	my_reason->addr = addr;
+	my_reason->ip = instptr;
+
+	my_trace->phys = addr - trace->probe.addr + trace->phys;
+	my_trace->map_id = trace->id;
+
+	my_trace->pc = instptr;
+
+	my_trace->opcode = type;
+	my_trace->width = get_ins_width(opcode);
+
+	if (type == MMIO_WRITE)
+		my_trace->value = get_ins_val(my_reason, regs);
+
+	put_cpu_var(cpu_trace);
+	put_cpu_var(pf_reason);
+}
+
+static void post(struct kmmio_probe *p, unsigned long condition,
+							struct pt_regs *regs)
+{
+	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+	struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+	struct opcode_t *opcode = my_reason->opcode;
+	enum mm_io_opcode type = get_ins_type(opcode);
+
+	/* this should always return the active_trace count to 0 */
+	my_reason->active_traces--;
+	if (my_reason->active_traces) {
+		pr_emerg("unexpected post handler");
+		BUG();
+	}
+
+	if (type == MMIO_READ)
+		my_trace->value = get_ins_val(my_reason, regs);
+
+	mmio_trace_rw(my_trace);
+	put_cpu_var(cpu_trace);
+	put_cpu_var(pf_reason);
+}
+
+static void ioremap_trace_core(resource_size_t offset, unsigned long size,
+							void __iomem *addr)
+{
+	static atomic_t next_id;
+	struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+	/* These are page-unaligned. */
+	struct mmiotrace_map map = {
+		.phys = offset,
+		.virt = (unsigned long)addr,
+		.len = size,
+		.opcode = MMIO_PROBE
+	};
+
+	if (!trace) {
+		pr_err("kmalloc failed in ioremap\n");
+		return;
+	}
+
+	*trace = (struct remap_trace) {
+		.probe = {
+			.addr = (unsigned long)addr,
+			.len = size,
+			.pre_handler = pre,
+			.post_handler = post,
+			.private = trace
+		},
+		.phys = offset,
+		.id = atomic_inc_return(&next_id)
+	};
+	map.map_id = trace->id;
+
+	spin_lock_irq(&trace_lock);
+	if (!is_enabled()) {
+		kfree(trace);
+		goto not_enabled;
+	}
+
+	mmio_trace_mapping(&map);
+	list_add_tail(&trace->list, &trace_list);
+	if (!nommiotrace)
+		register_kmmio_probe(&trace->probe);
+
+not_enabled:
+	spin_unlock_irq(&trace_lock);
+}
+
+void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+						void __iomem *addr)
+{
+	pr_err("ioremap_*(0x%llx, 0x%lx) = %p\n",
+		 (unsigned long long)offset, size, addr);
+	if (!is_enabled()) /* recheck and proper locking in *_core() */
+		return;
+
+	pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
+		 (unsigned long long)offset, size, addr);
+	if ((filter_offset) && (offset != filter_offset))
+		return;
+	ioremap_trace_core(offset, size, addr);
+}
+
+static void iounmap_trace_core(volatile void __iomem *addr)
+{
+	struct mmiotrace_map map = {
+		.phys = 0,
+		.virt = (unsigned long)addr,
+		.len = 0,
+		.opcode = MMIO_UNPROBE
+	};
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+	struct remap_trace *found_trace = NULL;
+
+	pr_debug("Unmapping %p.\n", addr);
+
+	spin_lock_irq(&trace_lock);
+	if (!is_enabled())
+		goto not_enabled;
+
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		if ((unsigned long)addr == trace->probe.addr) {
+			if (!nommiotrace)
+				unregister_kmmio_probe(&trace->probe);
+			list_del(&trace->list);
+			found_trace = trace;
+			break;
+		}
+	}
+	map.map_id = (found_trace) ? found_trace->id : -1;
+	mmio_trace_mapping(&map);
+
+not_enabled:
+	spin_unlock_irq(&trace_lock);
+	if (found_trace) {
+		synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+		kfree(found_trace);
+	}
+}
+
+void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+	might_sleep();
+	if (is_enabled()) /* recheck and proper locking in *_core() */
+		iounmap_trace_core(addr);
+}
+
+int mmiotrace_printk(const char *fmt, ...)
+{
+	int ret = 0;
+	va_list args;
+	unsigned long flags;
+
+	va_start(args, fmt);
+
+	spin_lock_irqsave(&trace_lock, flags);
+	if (is_enabled())
+		ret = mmio_trace_printk(fmt, args);
+	spin_unlock_irqrestore(&trace_lock, flags);
+
+	va_end(args);
+	return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
+static void clear_trace_list(void)
+{
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+
+	/*
+	 * No locking required, because the caller ensures we are in a
+	 * critical section via mutex, and is_enabled() is false,
+	 * i.e. nothing can traverse or modify this list.
+	 * Caller also ensures is_enabled() cannot change.
+	 */
+	list_for_each_entry(trace, &trace_list, list) {
+		pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
+			  trace->probe.addr, trace->probe.len);
+		if (!nommiotrace)
+			unregister_kmmio_probe(&trace->probe);
+	}
+	synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		list_del(&trace->list);
+		kfree(trace);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static cpumask_var_t downed_cpus;
+
+static void enter_uniprocessor(void)
+{
+	int cpu;
+	int err;
+
+	if (!cpumask_available(downed_cpus) &&
+	    !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
+		pr_notice("Failed to allocate mask\n");
+		goto out;
+	}
+
+	cpus_read_lock();
+	cpumask_copy(downed_cpus, cpu_online_mask);
+	cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
+	if (num_online_cpus() > 1)
+		pr_notice("Disabling non-boot CPUs...\n");
+	cpus_read_unlock();
+
+	for_each_cpu(cpu, downed_cpus) {
+		err = remove_cpu(cpu);
+		if (!err)
+			pr_info("CPU%d is down.\n", cpu);
+		else
+			pr_err("Error taking CPU%d down: %d\n", cpu, err);
+	}
+out:
+	if (num_online_cpus() > 1)
+		pr_warn("multiple CPUs still online, may miss events.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+	int cpu;
+	int err;
+
+	if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus))
+		return;
+	pr_notice("Re-enabling CPUs...\n");
+	for_each_cpu(cpu, downed_cpus) {
+		err = add_cpu(cpu);
+		if (!err)
+			pr_info("enabled CPU%d.\n", cpu);
+		else
+			pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
+	}
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static void enter_uniprocessor(void)
+{
+	if (num_online_cpus() > 1)
+		pr_warn("multiple CPUs are online, may miss events. Suggest booting with maxcpus=1 kernel argument.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+}
+#endif
+
+void enable_mmiotrace(void)
+{
+	mutex_lock(&mmiotrace_mutex);
+	if (is_enabled())
+		goto out;
+
+	if (nommiotrace)
+		pr_info("MMIO tracing disabled.\n");
+	kmmio_init();
+	enter_uniprocessor();
+	spin_lock_irq(&trace_lock);
+	atomic_inc(&mmiotrace_enabled);
+	spin_unlock_irq(&trace_lock);
+	pr_info("enabled.\n");
+out:
+	mutex_unlock(&mmiotrace_mutex);
+}
+
+void disable_mmiotrace(void)
+{
+	mutex_lock(&mmiotrace_mutex);
+	if (!is_enabled())
+		goto out;
+
+	spin_lock_irq(&trace_lock);
+	atomic_dec(&mmiotrace_enabled);
+	BUG_ON(is_enabled());
+	spin_unlock_irq(&trace_lock);
+
+	clear_trace_list(); /* guarantees: no more kmmio callbacks */
+	leave_uniprocessor();
+	kmmio_cleanup();
+	pr_info("disabled.\n");
+out:
+	mutex_unlock(&mmiotrace_mutex);
+}
diff --git a/arch/powerpc/mm/mmiotrace_arch.c b/arch/powerpc/mm/mmiotrace_arch.c
new file mode 100644
index 000000000000..ccc8032384ef
--- /dev/null
+++ b/arch/powerpc/mm/mmiotrace_arch.c
@@ -0,0 +1,149 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/powerpc/mm/pgtable.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/hugetlb.h>
+
+#include "mmiotrace_arch.h"
+
+static pte_t *mmiotrace_find_linux_pte(pgd_t *pgdp, unsigned long ea,
+			bool *is_thp, unsigned int *hpage_shift)
+{
+	p4d_t p4d, *p4dp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t *ret_pte;
+	hugepd_t *hpdp = NULL;
+	unsigned int pdshift;
+
+	if (hpage_shift)
+		*hpage_shift = 0;
+
+	if (is_thp)
+		*is_thp = false;
+
+	/*
+	 * Always operate on the local stack value. This make sure the
+	 * value don't get updated by a parallel THP split/collapse,
+	 * page fault or a page unmap. The return pte_t * is still not
+	 * stable. So should be checked there for above conditions.
+	 * Top level is an exception because it is folded into p4d.
+	 */
+	p4dp = p4d_offset(pgdp, ea);
+	p4d  = READ_ONCE(*p4dp);
+	pdshift = P4D_SHIFT;
+
+	if (p4d_none(p4d))
+		return NULL;
+
+	if (p4d_leaf(p4d)) {
+		ret_pte = (pte_t *)p4dp;
+		goto out;
+	}
+
+	if (is_hugepd(__hugepd(p4d_val(p4d)))) {
+		hpdp = (hugepd_t *)&p4d;
+		goto out_huge;
+	}
+
+	/*
+	 * Even if we end up with an unmap, the pgtable will not
+	 * be freed, because we do an rcu free and here we are
+	 * irq disabled
+	 */
+	pdshift = PUD_SHIFT;
+	pudp = pud_offset(&p4d, ea);
+	pud  = READ_ONCE(*pudp);
+
+	if (pud_none(pud))
+		return NULL;
+
+	if (pud_leaf(pud)) {
+		ret_pte = (pte_t *)pudp;
+		goto out;
+	}
+
+	if (is_hugepd(__hugepd(pud_val(pud)))) {
+		hpdp = (hugepd_t *)&pud;
+		goto out_huge;
+	}
+
+	pdshift = PMD_SHIFT;
+	pmdp = pmd_offset(&pud, ea);
+	pmd  = READ_ONCE(*pmdp);
+
+	/*
+	 * A hugepage collapse is captured by this condition, see
+	 * pmdp_collapse_flush.
+	 */
+	if (pmd_none(pmd))
+		return NULL;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * A hugepage split is captured by this condition, see
+	 * pmdp_invalidate.
+	 *
+	 * Huge page modification can be caught here too.
+	 */
+	if (pmd_is_serializing(pmd))
+		return NULL;
+#endif
+
+	if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
+		if (is_thp)
+			*is_thp = true;
+		ret_pte = (pte_t *)pmdp;
+		goto out;
+	}
+
+	if (pmd_leaf(pmd)) {
+		ret_pte = (pte_t *)pmdp;
+		goto out;
+	}
+
+	if (is_hugepd(__hugepd(pmd_val(pmd)))) {
+		hpdp = (hugepd_t *)&pmd;
+		goto out_huge;
+	}
+
+	pdshift = PAGE_SHIFT;
+
+	if (hpage_shift)
+		*hpage_shift = pdshift;
+
+	return pte_offset_kernel(&pmd, ea);
+
+out_huge:
+	if (!hpdp)
+		return NULL;
+
+	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
+	pdshift = hugepd_shift(*hpdp);
+out:
+	if (hpage_shift)
+		*hpage_shift = pdshift;
+	return ret_pte;
+}
+
+pte_t *lookup_address(unsigned long address, unsigned int *shift)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pte_t *pte = mmiotrace_find_linux_pte(pgd_offset_k(address), address, NULL, shift);
+
+	local_irq_restore(flags);
+
+	return pte;
+}
diff --git a/arch/powerpc/mm/mmiotrace_arch.h b/arch/powerpc/mm/mmiotrace_arch.h
new file mode 100644
index 000000000000..f4a5bff24a07
--- /dev/null
+++ b/arch/powerpc/mm/mmiotrace_arch.h
@@ -0,0 +1,25 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Derived from arch/powerpc/mm/pgtable.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#ifndef __MMIOTRACE_ARCH_
+#define __MMIOTRACE_ARCH_
+#include <asm/pgtable.h>
+
+static inline int page_level_shift(unsigned int level)
+{
+	return level;
+}
+static inline unsigned long page_level_size(unsigned int level)
+{
+	return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(unsigned int level)
+{
+	return ~(page_level_size(level) - 1);
+}
+
+pte_t *lookup_address(unsigned long address, unsigned int *level);
+#endif // __MMIOTRACE_ARCH_
diff --git a/arch/powerpc/mm/pf_in.c b/arch/powerpc/mm/pf_in.c
new file mode 100644
index 000000000000..e6c90b383e7f
--- /dev/null
+++ b/arch/powerpc/mm/pf_in.c
@@ -0,0 +1,185 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/x86/mm/pf_in.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#include <linux/ptrace.h> /* struct pt_regs */
+#include "pf_in.h"
+#include <linux/printk.h>
+#include <linux/mmiotrace.h>
+
+/* D 32 0x80000000 B lwz Load Word and Zero */
+/* D 33 0x84000000 B lwzu Load Word and Zero with Update */
+/* D 34 0x88000000 B lbz Load Byte and Zero */
+/* D 35 0x8C000000 B lbzu Load Byte and Zero with Update */
+/* D 36 0x90000000 B stw Store Word */
+/* D 37 0x94000000 B stwu Store Word with Update */
+/* D 38 0x98000000 B stb Store Byte */
+/* D 39 0x9C000000 B stbu Store Byte with Update */
+/* D 40 0xA0000000 B lhz Load Halfword and Zero */
+/* D 41 0xA4000000 B lhzu Load Halfword and Zero with Update */
+/* D 42 0xA8000000 B lha Load Halfword Algebraic */
+/* D 43 0xAC000000 B lhau Load Halfword Algebraic with Update */
+/* D 44 0xB0000000 B sth Store Halfword */
+/* D 45 0xB4000000 B sthu Store Halfword with Update */
+/* D 46 0xB8000000 B lmw Load Multiple Word */
+/* D 47 0xBC000000 B stmw Store Multiple Word */
+/* D 48 0xC0000000 FP lfs Load Floating-Point Single */
+/* D 49 0xC4000000 FP lfsu Load Floating-Point Single with Update */
+/* D 50 0xC8000000 FP lfd Load Floating-Point Double */
+/* D 51 0xCC000000 FP lfdu Load Floating-Point Double with Update */
+/* D 52 0xD0000000 FP stfs Store Floating-Point Single */
+/* D 53 0xD4000000 FP stfsu Store Floating-Point Single with Update */
+/* D 54 0xD8000000 FP stfd Store Floating-Point Double */
+/* D 55 0xDC000000 FP stfdu Store Floating-Point Double with Update */
+/* DQ 56 0xE0000000 P 58 LSQ lq Load Quadword */
+/* DS 57 0xE4000000 140 FP.out Lfdp Load Floating-Point Double Pair */
+/* DS 58 0xE8000000 53 64 Ld Load Doubleword */
+/* DS 58 0xE8000001 53 64 Ldu Load Doubleword with Update */
+/* DS 58 0xE8000002 52 64 Lwa Load Word Algebraic */
+/* DS 62 0xF8000000 57 64 std Store Doubleword */
+/* DS 62 0xF8000001 57 64 stdu Store Doubleword with Update */
+/* DS 62 0xF8000002 59 LSQ stq Store Quadword */
+
+// D-form:
+// 0-5    6-10    11-15   16-31
+// opcode RT      RA      Offset
+
+// DQ-form:
+// 0-5    6-10  11-15  16-27
+// opcode RT    RA     Offset
+
+// DS-form:
+// 0-5    6-10  11-15  16-29  30-31
+// opcode RT    RA     Offset opcode
+
+#define D_OPCODE_MASK GENMASK(31, 26)
+#define DQ_OPCODE_MASK D_OPCODE_MASK
+#define DS_OPCODE_MASK (D_OPCODE_MASK | GENMASK(1, 0))
+#define RS_RT_OFFSET 21UL
+#define RS_RT_MASK GENMASK(25, 21)
+#define RA_MASK GENMASK(20, 16)
+#define D_OFFSET GENMASK(15, 0)
+#define DQ_OFFSET GENMASK(15, 4)
+#define DS_OFFSET GENMASK(15, 2)
+
+struct opcode_t opcodes[] = {
+	{0x80000000, D_FORMAT, "lwz", },
+	{0x84000000, D_FORMAT, "lwzu", },
+	{0x88000000, D_FORMAT, "lbz", },
+	{0x8C000000, D_FORMAT, "lbzu", },
+	{0x90000000, D_FORMAT, "stw", },
+	{0x94000000, D_FORMAT, "stwu", },
+	{0x98000000, D_FORMAT, "stb", },
+	{0x9C000000, D_FORMAT, "stbu", },
+	{0xA0000000, D_FORMAT, "lhz", },
+	{0xA4000000, D_FORMAT, "lhzu", },
+	{0xA8000000, D_FORMAT, "lha", },
+	{0xAC000000, D_FORMAT, "lhau", },
+	{0xB0000000, D_FORMAT, "sth", },
+	{0xB4000000, D_FORMAT, "sthu", },
+	{0xB8000000, D_FORMAT, "lmw", },
+	{0xBC000000, D_FORMAT, "stmw", },
+	{0xC0000000, D_FORMAT, "lfs", },
+	{0xC4000000, D_FORMAT, "lfsu", },
+	{0xC8000000, D_FORMAT, "lfd", },
+	{0xCC000000, D_FORMAT, "lfdu", },
+	{0xD0000000, D_FORMAT, "stfs", },
+	{0xD4000000, D_FORMAT, "stfsu", },
+	{0xD8000000, D_FORMAT, "stfd", },
+	{0xDC000000, D_FORMAT, "stfdu", },
+	{0xE0000000, DQ_FORMAT, "lq", },
+	{0xE4000000, DS_FORMAT, "lfdp", },
+	{0xE8000000, DS_FORMAT, "ld", },
+	{0xE8000001, DS_FORMAT, "ldu", },
+	{0xE8000002, DS_FORMAT, "lwa", },
+	{0xF8000000, DS_FORMAT, "std", },
+	{0xF8000001, DS_FORMAT, "stdu", },
+	{0xF8000002, DS_FORMAT, "stq", }
+};
+
+struct opcode_t *get_opcode(unsigned int *addr)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(opcodes); i++) {
+		switch (opcodes[i].form) {
+		case D_FORMAT:
+			if (opcodes[i].opcode == (*addr & D_OPCODE_MASK))
+				return &opcodes[i];
+			break;
+		case DQ_FORMAT:
+			if (opcodes[i].opcode == (*addr & DQ_OPCODE_MASK))
+				return &opcodes[i];
+			break;
+		case DS_FORMAT:
+			if (opcodes[i].opcode == (*addr & DS_OPCODE_MASK))
+				return &opcodes[i];
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+inline enum mm_io_opcode get_ins_type(struct opcode_t *opcode)
+{
+	if (!opcode)
+		return MMIO_UNKNOWN_OP;
+
+	if (opcode->name[0] == 'l')
+		return MMIO_READ;
+
+	if (opcode->name[0] == 's')
+		return MMIO_WRITE;
+
+	return MMIO_UNKNOWN_OP;
+}
+
+unsigned int get_ins_width(struct opcode_t *opcode)
+{
+	char width_ch;
+
+	if (!opcode)
+		return 0;
+
+	if (opcode->name[0] == 'l')
+		width_ch = opcode->name[1];
+
+	if (opcode->name[0] == 's')
+		width_ch = opcode->name[2];
+
+	switch (width_ch) {
+	case 'b': /* byte */
+		return 1;
+	case 'h': /* half word */
+		return sizeof(long) / 2;
+	case 'w': /* word */
+		/* return sizeof(long); */
+	case 'm': /* multi words(can be calculated out by (32-RT) * sizeof(long)) */
+	case 'f': /* float(not too much. So ignore word number) */
+	case 'd': /* double words */
+		/* return 2 * sizeof(long); */
+	case 'q': /* quad words */
+		/* return 4 * sizeof(long); */
+	default:
+		return sizeof(long);
+	}
+}
+
+unsigned long get_ins_val(struct trap_reason *reason, struct pt_regs *regs)
+{
+	struct opcode_t *opcode = reason->opcode;
+	unsigned int ins = *(unsigned int *)(reason->ip);
+	unsigned int reg_no;
+	unsigned long mask = ~0UL;
+
+	if (!opcode)
+		return 0;
+
+	mask >>= 8 * (sizeof(long) - get_ins_width(opcode));
+	reg_no = (ins & RS_RT_MASK) >> RS_RT_OFFSET;
+
+	return regs->gpr[reg_no] & mask;
+}
diff --git a/arch/powerpc/mm/pf_in.h b/arch/powerpc/mm/pf_in.h
new file mode 100644
index 000000000000..905ba4937137
--- /dev/null
+++ b/arch/powerpc/mm/pf_in.h
@@ -0,0 +1,33 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Derived from arch/x86/mm/pf_in.h:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@shingroup.cn)
+ */
+
+#ifndef __PF_H_
+#define __PF_H_
+
+enum OPCODE_FORMAT {
+	D_FORMAT,
+	DQ_FORMAT,
+	DS_FORMAT,
+};
+
+struct opcode_t {
+	unsigned int opcode;
+	enum OPCODE_FORMAT form;
+	const char *name;
+};
+
+struct trap_reason {
+	unsigned long addr;
+	unsigned long ip;
+	struct opcode_t *opcode;
+	int active_traces;
+};
+
+struct opcode_t *get_opcode(unsigned int *addr);
+enum mm_io_opcode get_ins_type(struct opcode_t *opcode);
+unsigned int get_ins_width(struct opcode_t *opcode);
+unsigned long get_ins_val(struct trap_reason *reason, struct pt_regs *regs);
+#endif /* __PF_H_ */