diff mbox

[2/3] s390: implement pci instructions

Message ID 1415629216-59652-3-git-send-email-blaschka@linux.vnet.ibm.com
State New
Headers show

Commit Message

Frank Blaschka Nov. 10, 2014, 2:20 p.m. UTC
From: Frank Blaschka <frank.blaschka@de.ibm.com>

This patch implements the s390 pci instructions in qemu. It allows
accessing and driving pci devices attached to the s390 pci bus.
Because of platform constraints, devices using IO BARs are not
supported. Also, a device has to support MSI/MSI-X to run on s390.

Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
---
 target-s390x/Makefile.objs |   2 +-
 target-s390x/kvm.c         |  52 ++++
 target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
 target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
 4 files changed, 1141 insertions(+), 1 deletion(-)
 create mode 100644 target-s390x/pci_ic.c
 create mode 100644 target-s390x/pci_ic.h

Comments

Alexander Graf Nov. 10, 2014, 3:56 p.m. UTC | #1
On 10.11.14 15:20, Frank Blaschka wrote:
> From: Frank Blaschka <frank.blaschka@de.ibm.com>
> 
> This patch implements the s390 pci instructions in qemu. It allows
> accessing and driving pci devices attached to the s390 pci bus.
> Because of platform constraints, devices using IO BARs are not
> supported. Also, a device has to support MSI/MSI-X to run on s390.
> 
> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> ---
>  target-s390x/Makefile.objs |   2 +-
>  target-s390x/kvm.c         |  52 ++++
>  target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>  target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>  4 files changed, 1141 insertions(+), 1 deletion(-)
>  create mode 100644 target-s390x/pci_ic.c
>  create mode 100644 target-s390x/pci_ic.h
> 
> diff --git a/target-s390x/Makefile.objs b/target-s390x/Makefile.objs
> index 2c57494..cc71400 100644
> --- a/target-s390x/Makefile.objs
> +++ b/target-s390x/Makefile.objs
> @@ -2,4 +2,4 @@ obj-y += translate.o helper.o cpu.o interrupt.o
>  obj-y += int_helper.o fpu_helper.o cc_helper.o mem_helper.o misc_helper.o
>  obj-y += gdbstub.o
>  obj-$(CONFIG_SOFTMMU) += machine.o ioinst.o arch_dump.o
> -obj-$(CONFIG_KVM) += kvm.o
> +obj-$(CONFIG_KVM) += kvm.o pci_ic.o
> diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
> index 5b10a25..d59e740 100644
> --- a/target-s390x/kvm.c
> +++ b/target-s390x/kvm.c
> @@ -40,6 +40,7 @@
>  #include "exec/gdbstub.h"
>  #include "trace.h"
>  #include "qapi-event.h"
> +#include "pci_ic.h"
>  
>  /* #define DEBUG_KVM */
>  
> @@ -56,6 +57,7 @@
>  #define IPA0_B2                         0xb200
>  #define IPA0_B9                         0xb900
>  #define IPA0_EB                         0xeb00
> +#define IPA0_E3                         0xe300
>  
>  #define PRIV_B2_SCLP_CALL               0x20
>  #define PRIV_B2_CSCH                    0x30
> @@ -76,8 +78,17 @@
>  #define PRIV_B2_XSCH                    0x76
>  
>  #define PRIV_EB_SQBS                    0x8a
> +#define PRIV_EB_PCISTB                  0xd0
> +#define PRIV_EB_SIC                     0xd1
>  
>  #define PRIV_B9_EQBS                    0x9c
> +#define PRIV_B9_CLP                     0xa0
> +#define PRIV_B9_PCISTG                  0xd0
> +#define PRIV_B9_PCILG                   0xd2
> +#define PRIV_B9_RPCIT                   0xd3
> +
> +#define PRIV_E3_MPCIFC                  0xd0
> +#define PRIV_E3_STPCIFC                 0xd4
>  
>  #define DIAG_IPL                        0x308
>  #define DIAG_KVM_HYPERCALL              0x500
> @@ -814,6 +825,18 @@ static int handle_b9(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
>      int r = 0;
>  
>      switch (ipa1) {
> +    case PRIV_B9_CLP:
> +        r = kvm_clp_service_call(cpu, run);
> +        break;
> +    case PRIV_B9_PCISTG:
> +        r = kvm_pcistg_service_call(cpu, run);
> +        break;
> +    case PRIV_B9_PCILG:
> +        r = kvm_pcilg_service_call(cpu, run);
> +        break;
> +    case PRIV_B9_RPCIT:
> +        r = kvm_rpcit_service_call(cpu, run);
> +        break;
>      case PRIV_B9_EQBS:
>          /* just inject exception */
>          r = -1;
> @@ -832,6 +855,12 @@ static int handle_eb(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
>      int r = 0;
>  
>      switch (ipa1) {
> +    case PRIV_EB_PCISTB:
> +        r = kvm_pcistb_service_call(cpu, run);
> +        break;
> +    case PRIV_EB_SIC:
> +        r = kvm_sic_service_call(cpu, run);
> +        break;
>      case PRIV_EB_SQBS:
>          /* just inject exception */
>          r = -1;
> @@ -845,6 +874,26 @@ static int handle_eb(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
>      return r;
>  }
>  
> +static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl)
> +{
> +    int r = 0;
> +
> +    switch (ipbl) {
> +    case PRIV_E3_MPCIFC:
> +        r = kvm_mpcifc_service_call(cpu, run);
> +        break;
> +    case PRIV_E3_STPCIFC:
> +        r = kvm_stpcifc_service_call(cpu, run);
> +        break;
> +    default:
> +        r = -1;
> +        DPRINTF("KVM: unhandled PRIV: 0xe3%x\n", ipbl);
> +        break;
> +    }
> +
> +    return r;
> +}
> +
>  static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
>  {
>      CPUS390XState *env = &cpu->env;
> @@ -1041,6 +1090,9 @@ static int handle_instruction(S390CPU *cpu, struct kvm_run *run)
>      case IPA0_EB:
>          r = handle_eb(cpu, run, ipa1);
>          break;
> +    case IPA0_E3:
> +        r = handle_e3(cpu, run, run->s390_sieic.ipb & 0xff);
> +        break;
>      case IPA0_DIAG:
>          r = handle_diag(cpu, run, run->s390_sieic.ipb);
>          break;
> diff --git a/target-s390x/pci_ic.c b/target-s390x/pci_ic.c
> new file mode 100644
> index 0000000..6c05faf
> --- /dev/null
> +++ b/target-s390x/pci_ic.c
> @@ -0,0 +1,753 @@
> +/*
> + * s390 PCI intercepts
> + *
> + * Copyright 2014 IBM Corp.
> + * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com>
> + *            Hong Bo Li <lihbbj@cn.ibm.com>
> + *            Yi Min Zhao <zyimin@cn.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or (at
> + * your option) any later version. See the COPYING file in the top-level
> + * directory.
> + */
> +
> +#include <sys/types.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +
> +#include <linux/kvm.h>
> +#include <asm/ptrace.h>
> +#include <hw/pci/pci.h>
> +#include <hw/pci/pci_host.h>
> +#include <net/net.h>
> +
> +#include "qemu-common.h"
> +#include "qemu/timer.h"
> +#include "migration/qemu-file.h"
> +#include "sysemu/sysemu.h"
> +#include "sysemu/kvm.h"
> +#include "cpu.h"
> +#include "sysemu/device_tree.h"
> +#include "monitor/monitor.h"
> +#include "pci_ic.h"
> +
> +#include "hw/hw.h"
> +#include "hw/pci/pci.h"
> +#include "hw/pci/pci_bridge.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_host.h"
> +#include "hw/s390x/s390-pci-bus.h"
> +#include "exec/exec-all.h"
> +#include "exec/memory-internal.h"
> +
> +/* #define DEBUG_S390PCI_IC */
> +#ifdef DEBUG_S390PCI_IC
> +#define DPRINTF(fmt, ...) \
> +    do { fprintf(stderr, "s390pci_ic: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif
> +
> +static uint64_t resume_token;

Why is this a global variable?

> +
> +static uint8_t barsize(uint64_t size)
> +{
> +    uint64_t mask = 1;
> +    int i;
> +
> +    if (!size) {
> +        return 0;
> +    }
> +
> +    for (i = 0; i < 64; i++) {
> +        if (size & mask) {
> +            break;
> +        }
> +        mask = (mask << 1);
> +    }
> +
> +    return i;
> +}

Isn't there an existing helper for this in the PCI layer?

In fact, please check whether it makes sense to move some of the code to
hw/ rather than target-s390x.

> +
> +static void s390_set_status_code(CPUS390XState *env,
> +                                 uint8_t r, uint64_t status_code)
> +{
> +    env->regs[r] &= ~0xff000000;
> +    env->regs[r] |= (status_code & 0xff) << 24;
> +}
> +
> +static int list_pci(ClpReqRspListPci *rrb, uint8_t *cc)
> +{
> +    S390PCIBusDevice *pbdev;
> +    uint32_t res_code, initial_l2, g_l2, finish;
> +    int rc, idx;
> +
> +    rc = 0;
> +    if (be16_to_cpu(rrb->request.hdr.len) != 32) {
> +        res_code = CLP_RC_LEN;
> +        rc = -EINVAL;
> +        goto out;
> +    }
> +
> +    if ((be32_to_cpu(rrb->request.fmt) & CLP_MASK_FMT) != 0) {
> +        res_code = CLP_RC_FMT;
> +        rc = -EINVAL;
> +        goto out;
> +    }
> +
> +    if ((be32_to_cpu(rrb->request.fmt) & ~CLP_MASK_FMT) != 0 ||
> +        rrb->request.reserved1 != 0 ||
> +        rrb->request.reserved2 != 0) {
> +        res_code = CLP_RC_RESNOT0;
> +        rc = -EINVAL;
> +        goto out;
> +    }
> +
> +    if (be64_to_cpu(rrb->request.resume_token) == 0) {
> +        resume_token = 0;
> +    } else if (be64_to_cpu(rrb->request.resume_token) != resume_token) {
> +        res_code = CLP_RC_LISTPCI_BADRT;
> +        rc = -EINVAL;
> +        goto out;
> +    }
> +
> +    if (be16_to_cpu(rrb->response.hdr.len) < 48) {
> +        res_code = CLP_RC_8K;
> +        rc = -EINVAL;
> +        goto out;
> +    }
> +
> +    initial_l2 = be16_to_cpu(rrb->response.hdr.len);
> +    if ((initial_l2 - LIST_PCI_HDR_LEN) % sizeof(ClpFhListEntry)
> +        != 0) {
> +        rc = -EINVAL;
> +        *cc = 3;
> +        goto out;
> +    }
> +
> +    rrb->response.fmt = 0;
> +    rrb->response.reserved1 = rrb->response.reserved2 = 0;
> +    rrb->response.mdd = cpu_to_be32(FH_VIRT);
> +    rrb->response.max_fn = cpu_to_be16(PCI_MAX_FUNCTIONS);
> +    rrb->response.entry_size = sizeof(ClpFhListEntry);
> +    finish = 0;
> +    idx = resume_token;
> +    g_l2 = LIST_PCI_HDR_LEN;
> +    do {
> +        pbdev = s390_pci_find_dev_by_idx(idx);
> +        if (!pbdev) {
> +            finish = 1;
> +            break;
> +        }
> +        rrb->response.fh_list[idx - resume_token].device_id =
> +            pci_get_word(pbdev->pdev->config + PCI_DEVICE_ID);
> +        rrb->response.fh_list[idx - resume_token].vendor_id =
> +            pci_get_word(pbdev->pdev->config + PCI_VENDOR_ID);
> +        rrb->response.fh_list[idx - resume_token].config =
> +            cpu_to_be32(0x80000000);
> +        rrb->response.fh_list[idx - resume_token].fid = cpu_to_be32(pbdev->fid);
> +        rrb->response.fh_list[idx - resume_token].fh = cpu_to_be32(pbdev->fh);
> +
> +        g_l2 += sizeof(ClpFhListEntry);
> +        DPRINTF("g_l2 %d vendor id 0x%x device id 0x%x fid 0x%x fh 0x%x\n",
> +            g_l2,
> +            rrb->response.fh_list[idx - resume_token].vendor_id,
> +            rrb->response.fh_list[idx - resume_token].device_id,
> +            rrb->response.fh_list[idx - resume_token].fid,
> +            rrb->response.fh_list[idx - resume_token].fh);
> +        idx++;
> +    } while (g_l2 < initial_l2);
> +
> +    if (finish == 1) {
> +        resume_token = 0;
> +    } else {
> +        resume_token = idx;
> +    }
> +    rrb->response.resume_token = cpu_to_be64(resume_token);
> +    rrb->response.hdr.len = cpu_to_be16(g_l2);
> +    rrb->response.hdr.rsp = cpu_to_be16(CLP_RC_OK);
> +out:
> +    if (rc) {
> +        DPRINTF("list pci failed rc 0x%x\n", rc);
> +        rrb->response.hdr.rsp = cpu_to_be16(res_code);
> +    }
> +    return rc;
> +}
> +
> +int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run)

Please separate kvm_ calls from the actual implementation. Do all the
parameter extraction in the kvm_ function and then forward on to a
generic function that doesn't need to know about kvm_run anymore.

kvm specific c file:

int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run)
{
    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
    return clp_service_call(cpu, r2);
}

io / pci specific c file:

int clp_service_call(S390CPU *cpu, uint8_t r2)
{
    ...
}

> +{
> +    ClpReqHdr *reqh;
> +    ClpRspHdr *resh;
> +    S390PCIBusDevice *pbdev;
> +    uint32_t req_len;
> +    uint32_t res_len;
> +    uint8_t *buffer;
> +    uint8_t cc = 0;
> +    CPUS390XState *env = &cpu->env;
> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> +    int i;
> +
> +    buffer = g_malloc0(4096 * 2);

Do you really need this? Couldn't you make the pointers be actual
structs on the stack and just read/write from them directly?

The compiler should be smart enough to throw away elements that aren't
used anymore to conserve memory.

> +    cpu_synchronize_state(CPU(cpu));
> +
> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> +        return 0;
> +    }
> +
> +    cpu_physical_memory_rw(env->regs[r2], buffer, sizeof(*reqh), 0);
> +    reqh = (ClpReqHdr *)buffer;
> +    req_len = be16_to_cpu(reqh->len);
> +    if (req_len < 16 || req_len > 8184 || (req_len % 8 != 0)) {
> +        program_interrupt(env, PGM_OPERAND, 4);
> +        return 0;
> +    }
> +
> +    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + sizeof(*resh), 0);
> +    resh = (ClpRspHdr *)(buffer + req_len);
> +    res_len = be16_to_cpu(resh->len);
> +    if (res_len < 8 || res_len > 8176 || (res_len % 8 != 0)) {
> +        program_interrupt(env, PGM_OPERAND, 4);
> +        return 0;
> +    }
> +    if ((req_len + res_len) > 8192) {
> +        program_interrupt(env, PGM_OPERAND, 4);
> +        return 0;
> +    }
> +
> +    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + res_len, 0);
> +
> +    if (req_len != 32) {
> +        resh->rsp = cpu_to_be16(CLP_RC_LEN);
> +        goto out;
> +    }
> +
> +    switch (reqh->cmd) {
> +    case CLP_LIST_PCI: {
> +        ClpReqRspListPci *rrb = (ClpReqRspListPci *)buffer;
> +        list_pci(rrb, &cc);
> +        break;
> +    }
> +    case CLP_SET_PCI_FN: {
> +        ClpReqSetPci *reqsetpci = (ClpReqSetPci *)reqh;
> +        ClpRspSetPci *ressetpci = (ClpRspSetPci *)resh;
> +
> +        pbdev = s390_pci_find_dev_by_fh(be32_to_cpu(reqsetpci->fh));
> +        if (!pbdev) {
> +                ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FH);
> +                goto out;
> +        }
> +
> +        switch (reqsetpci->oc) {
> +        case CLP_SET_ENABLE_PCI_FN:
> +            pbdev->fh = pbdev->fh | 1 << ENABLE_BIT_OFFSET;
> +            ressetpci->fh = cpu_to_be32(pbdev->fh);
> +            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_OK);
> +            break;
> +        case CLP_SET_DISABLE_PCI_FN:
> +            pbdev->fh = pbdev->fh & ~(1 << ENABLE_BIT_OFFSET);
> +            ressetpci->fh = cpu_to_be32(pbdev->fh);
> +            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_OK);
> +            break;
> +        default:
> +            DPRINTF("unknown set pci command\n");
> +            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FHOP);
> +            break;
> +        }
> +        break;
> +    }
> +    case CLP_QUERY_PCI_FN: {
> +        ClpReqQueryPci *reqquery = (ClpReqQueryPci *)reqh;
> +        ClpRspQueryPci *resquery = (ClpRspQueryPci *)resh;
> +
> +        pbdev = s390_pci_find_dev_by_fh(reqquery->fh);
> +        if (!pbdev) {
> +            DPRINTF("query pci no pci dev\n");
> +            resquery->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FH);
> +            goto out;
> +        }
> +
> +        for (i = 0; i < PCI_BAR_COUNT; i++) {
> +            uint64_t data = pci_host_config_read_common(pbdev->pdev,
> +                0x10 + (i * 4), pci_config_size(pbdev->pdev), 4);
> +
> +            resquery->bar[i] = bswap32(data);
> +            resquery->bar_size[i] = barsize(pbdev->pdev->io_regions[i].size);
> +            DPRINTF("bar %d addr 0x%x size 0x%lx barsize 0x%x\n", i,
> +                    resquery->bar[i], pbdev->pdev->io_regions[i].size,
> +                    resquery->bar_size[i]);
> +        }
> +
> +        resquery->sdma = ZPCI_SDMA_ADDR;
> +        resquery->edma = ZPCI_EDMA_ADDR;
> +        resquery->pchid = 0;
> +        resquery->ug = 1;
> +        resquery->uid = pbdev->fid;
> +
> +        resquery->hdr.rsp = CLP_RC_OK;
> +        break;
> +    }
> +    case CLP_QUERY_PCI_FNGRP: {
> +        ClpRspQueryPciGrp *resgrp = (ClpRspQueryPciGrp *)resh;
> +        resgrp->fr = 1;
> +        resgrp->dasm = 0;
> +        resgrp->msia = ZPCI_MSI_ADDR;
> +        resgrp->mui = 0;
> +        resgrp->i = 128;
> +        resgrp->version = 0;
> +
> +        resgrp->hdr.rsp = CLP_RC_OK;
> +        break;
> +    }
> +    default:
> +        DPRINTF("unknown clp command\n");
> +        resh->rsp = cpu_to_be16(CLP_RC_CMD);
> +        break;
> +    }
> +
> +out:
> +    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + res_len, 1);

... ah, to write back. Wouldn't it be cleaner to do this explicitly?

> +    g_free(buffer);
> +    setcc(cpu, cc);
> +    return 0;
> +}
> +
> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
> +{
> +    CPUS390XState *env = &cpu->env;
> +    S390PCIBusDevice *pbdev;
> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> +    PciLgStg *rp;
> +    uint64_t offset;
> +    uint64_t data;
> +    uint8_t len;
> +
> +    cpu_synchronize_state(CPU(cpu));
> +
> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> +        return 0;
> +    }
> +
> +    if (r2 & 0x1) {
> +        program_interrupt(env, PGM_SPECIFICATION, 4);
> +        return 0;
> +    }
> +
> +    rp = (PciLgStg *)&env->regs[r2];
> +    offset = env->regs[r2 + 1];
> +
> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> +    if (!pbdev) {
> +        DPRINTF("pcilg no pci dev\n");
> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> +        return 0;
> +    }
> +
> +    len = rp->len & 0xF;
> +    if (rp->pcias < 6) {
> +        if ((8 - (offset & 0x7)) < len) {
> +            program_interrupt(env, PGM_OPERAND, 4);
> +            return 0;
> +        }
> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
> +        io_mem_read(mr, offset, &data, len);
> +    } else if (rp->pcias == 15) {
> +        if ((4 - (offset & 0x3)) < len) {
> +            program_interrupt(env, PGM_OPERAND, 4);
> +            return 0;
> +        }
> +        data =  pci_host_config_read_common(
> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
> +
> +        switch (len) {
> +        case 1:
> +            break;
> +        case 2:
> +            data = cpu_to_le16(data);
> +            break;
> +        case 4:
> +            data = cpu_to_le32(data);
> +            break;
> +        case 8:
> +            data = cpu_to_le64(data);
> +            break;

Why? Also, this is wrong. cpu_to_le64() converts between host endianness
and LE. So if you're running this on an LE host, you won't swap the
value and will get a broken result.

If you know that the value is always swapped, use bswapxx().

> +        default:
> +            program_interrupt(env, PGM_OPERAND, 4);
> +            return 0;
> +        }
> +    } else {
> +        DPRINTF("invalid space\n");
> +        setcc(cpu, ZPCI_PCI_LS_ERR);
> +        s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
> +        return 0;
> +    }
> +
> +    env->regs[r1] = data;
> +    setcc(cpu, ZPCI_PCI_LS_OK);
> +    return 0;
> +}
> +
> +static void update_msix_table_msg_data(S390PCIBusDevice *pbdev, uint64_t offset,
> +                                       uint64_t *data, uint8_t len)
> +{
> +    uint32_t msg_data;
> +
> +    if (offset % PCI_MSIX_ENTRY_SIZE != 8) {
> +        return;
> +    }
> +
> +    if (len != 4) {
> +        DPRINTF("access msix table msg data but len is %d\n", len);
> +        return;
> +    }
> +
> +    msg_data = (pbdev->fid << ZPCI_MSI_VEC_BITS) | le32_to_cpu(*data);
> +    *data = cpu_to_le32(msg_data);
> +    DPRINTF("update msix msg_data to 0x%x\n", msg_data);
> +}
> +
> +static int trap_msix(S390PCIBusDevice *pbdev, uint64_t offset, uint8_t pcias)
> +{
> +    if (pbdev->msix.available && pbdev->msix.table_bar == pcias &&
> +        offset >= pbdev->msix.table_offset &&
> +        offset <= pbdev->msix.table_offset +
> +                  (pbdev->msix.entries - 1) * PCI_MSIX_ENTRY_SIZE) {
> +        return 1;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +int kvm_pcistg_service_call(S390CPU *cpu, struct kvm_run *run)
> +{
> +    CPUS390XState *env = &cpu->env;
> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> +    PciLgStg *rp;
> +    uint64_t offset, data;
> +    S390PCIBusDevice *pbdev;
> +    uint8_t len;
> +
> +    cpu_synchronize_state(CPU(cpu));
> +
> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> +        return 0;
> +    }
> +
> +    if (r2 & 0x1) {
> +        program_interrupt(env, PGM_SPECIFICATION, 4);
> +        return 0;
> +    }
> +
> +    rp = (PciLgStg *)&env->regs[r2];
> +    offset = env->regs[r2 + 1];
> +
> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> +    if (!pbdev) {
> +        DPRINTF("pcistg no pci dev\n");
> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> +        return 0;
> +    }
> +
> +    data = env->regs[r1];
> +    len = rp->len & 0xF;
> +    if (rp->pcias < 6) {
> +        if ((8 - (offset & 0x7)) < len) {
> +            program_interrupt(env, PGM_OPERAND, 4);
> +            return 0;
> +        }
> +        MemoryRegion *mr;
> +        if (trap_msix(pbdev, offset, rp->pcias)) {
> +            offset = offset - pbdev->msix.table_offset;
> +            mr = &pbdev->pdev->msix_table_mmio;
> +            update_msix_table_msg_data(pbdev, offset, &data, len);
> +        } else {
> +            mr = pbdev->pdev->io_regions[rp->pcias].memory;
> +        }
> +
> +        io_mem_write(mr, offset, data, len);
> +    } else if (rp->pcias == 15) {
> +        if ((4 - (offset & 0x3)) < len) {
> +            program_interrupt(env, PGM_OPERAND, 4);
> +            return 0;
> +        }
> +        switch (len) {
> +        case 1:
> +            break;
> +        case 2:
> +            data = le16_to_cpu(data);
> +            break;
> +        case 4:
> +            data = le32_to_cpu(data);
> +            break;
> +        case 8:
> +            data = le64_to_cpu(data);
> +            break;
> +        default:
> +            program_interrupt(env, PGM_OPERAND, 4);
> +            return 0;
> +        }

I guess you want a generic function similar to qemu_bswap_len() that
supports 64bit?

> +
> +        pci_host_config_write_common(pbdev->pdev, offset,
> +                                     pci_config_size(pbdev->pdev),
> +                                     data, len);
> +    } else {
> +        DPRINTF("pcistg invalid space\n");
> +        setcc(cpu, ZPCI_PCI_LS_ERR);
> +        s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
> +        return 0;
> +    }
> +
> +    setcc(cpu, ZPCI_PCI_LS_OK);
> +    return 0;
> +}
> +
> +int kvm_rpcit_service_call(S390CPU *cpu, struct kvm_run *run)
> +{
> +    CPUS390XState *env = &cpu->env;
> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> +    uint32_t fh;
> +    uint64_t pte;
> +    S390PCIBusDevice *pbdev;
> +    ram_addr_t size;
> +    int flags;
> +    IOMMUTLBEntry entry;
> +
> +    cpu_synchronize_state(CPU(cpu));
> +
> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> +        return 0;
> +    }
> +
> +    if (r2 & 0x1) {
> +        program_interrupt(env, PGM_SPECIFICATION, 4);
> +        return 0;
> +    }
> +
> +    fh = env->regs[r1] >> 32;
> +    size = env->regs[r2 + 1];
> +
> +    pbdev = s390_pci_find_dev_by_fh(fh);
> +
> +    if (!pbdev) {
> +        DPRINTF("rpcit no pci dev\n");
> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> +        return 0;
> +    }
> +
> +    pte = s390_guest_io_table_walk(s390_pci_get_table_origin(pbdev->g_iota),
> +                                   env->regs[r2]);
> +    flags = pte & ZPCI_PTE_FLAG_MASK;
> +    entry.target_as = &address_space_memory;
> +    entry.iova = env->regs[r2];
> +    entry.translated_addr = pte & ZPCI_PTE_ADDR_MASK;
> +    entry.addr_mask = size - 1;
> +
> +    if (flags & ZPCI_PTE_INVALID) {
> +        entry.perm = IOMMU_NONE;
> +    } else {
> +        entry.perm = IOMMU_RW;
> +    }

Deja vu? This is the iommu translation function, no? Can't you somehow
just call it?

> +
> +    memory_region_notify_iommu(pci_device_iommu_address_space(
> +                               pbdev->pdev)->root, entry);
> +
> +    setcc(cpu, ZPCI_PCI_LS_OK);
> +    return 0;
> +}
> +
> +int kvm_sic_service_call(S390CPU *cpu, struct kvm_run *run)
> +{
> +    qemu_log_mask(LOG_UNIMP, "SIC missing\n");
> +    return 0;
> +}
> +
> +int kvm_pcistb_service_call(S390CPU *cpu, struct kvm_run *run)
> +{
> +    CPUS390XState *env = &cpu->env;
> +    uint8_t r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
> +    uint8_t r3 = run->s390_sieic.ipa & 0x000f;
> +    PciStb *rp;
> +    uint64_t gaddr;
> +    uint64_t *uaddr, *pu;
> +    hwaddr len;
> +    S390PCIBusDevice *pbdev;
> +    MemoryRegion *mr;
> +    int i;
> +
> +    cpu_synchronize_state(CPU(cpu));
> +
> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> +        program_interrupt(env, PGM_PRIVILEGED, 6);
> +        return 0;
> +    }
> +
> +    rp = (PciStb *)&env->regs[r1];
> +    if (rp->pcias > 5) {
> +        DPRINTF("pcistb invalid space\n");
> +        setcc(cpu, ZPCI_PCI_LS_ERR);
> +        s390_set_status_code(env, r1, ZPCI_PCI_ST_INVAL_AS);
> +        return 0;
> +    }
> +
> +    switch (rp->len) {
> +    case 16:
> +    case 32:
> +    case 64:
> +    case 128:
> +        break;
> +    default:
> +        program_interrupt(env, PGM_SPECIFICATION, 6);
> +        return 0;
> +    }
> +
> +    gaddr = get_base_disp_rsy(cpu, run);
> +    len = rp->len;
> +
> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> +    if (!pbdev) {
> +        DPRINTF("pcistb no pci dev fh 0x%x\n", rp->fh);
> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> +        return 0;
> +    }
> +
> +    uaddr = cpu_physical_memory_map(gaddr, &len, 0);
> +    mr = pbdev->pdev->io_regions[rp->pcias].memory;
> +    if (!memory_region_access_valid(mr, env->regs[r3], rp->len, true)) {
> +        cpu_physical_memory_unmap(uaddr, len, 0, len);
> +        program_interrupt(env, PGM_ADDRESSING, 6);
> +        return 0;
> +    }
> +
> +    pu = uaddr;
> +    for (i = 0; i < rp->len / 8; i++) {
> +        io_mem_write(mr, env->regs[r3] + i * 8, *pu, 8);

Please don't overoptimize and just use individual ldq_phys() operations
here for each memory access. In general, try to avoid
cpu_physical_memory_map().

> +        pu++;
> +    }
> +
> +    cpu_physical_memory_unmap(uaddr, len, 0, len);
> +    setcc(cpu, ZPCI_PCI_LS_OK);
> +    return 0;
> +}
> +
> +static int reg_irqs(CPUS390XState *env, S390PCIBusDevice *pbdev, ZpciFib fib)
> +{
> +    int ret;
> +    S390FLICState *fs = s390_get_flic();
> +    S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs);
> +
> +    ret = css_register_io_adapter(S390_PCIPT_ADAPTER,
> +                                  FIB_DATA_ISC(fib.data), true, false,
> +                                  &pbdev->routes.adapter.adapter_id);
> +    assert(ret == 0);
> +
> +    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id, fib.aisb, true);
> +    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id, fib.aibv, true);
> +
> +    pbdev->routes.adapter.summary_addr = fib.aisb;
> +    pbdev->routes.adapter.summary_offset = FIB_DATA_AISBO(fib.data);
> +    pbdev->routes.adapter.ind_addr = fib.aibv;
> +    pbdev->routes.adapter.ind_offset = FIB_DATA_AIBVO(fib.data);
> +
> +    DPRINTF("reg_irqs adapter id %d\n", pbdev->routes.adapter.adapter_id);
> +    return 0;
> +}
> +
> +static int dereg_irqs(S390PCIBusDevice *pbdev)
> +{
> +    S390FLICState *fs = s390_get_flic();
> +    S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs);
> +
> +    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id,
> +                        pbdev->routes.adapter.ind_addr, false);
> +
> +    pbdev->routes.adapter.summary_addr = 0;
> +    pbdev->routes.adapter.summary_offset = 0;
> +    pbdev->routes.adapter.ind_addr = 0;
> +    pbdev->routes.adapter.ind_offset = 0;
> +
> +    DPRINTF("dereg_irqs adapter id %d\n", pbdev->routes.adapter.adapter_id);
> +    return 0;
> +}
> +
> +int kvm_mpcifc_service_call(S390CPU *cpu, struct kvm_run *run)
> +{
> +    CPUS390XState *env = &cpu->env;
> +    uint8_t r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
> +    uint8_t oc;
> +    uint32_t fh;
> +    uint64_t fiba;
> +    ZpciFib fib;
> +    S390PCIBusDevice *pbdev;
> +
> +    cpu_synchronize_state(CPU(cpu));
> +
> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> +        program_interrupt(env, PGM_PRIVILEGED, 6);
> +        return 0;
> +    }
> +
> +    oc = env->regs[r1] & 0xff;
> +    fh = env->regs[r1] >> 32;
> +    fiba = get_base_disp_rxy(cpu, run);
> +
> +    if (fiba & 0x7) {
> +        program_interrupt(env, PGM_SPECIFICATION, 6);
> +        return 0;
> +    }
> +
> +    pbdev = s390_pci_find_dev_by_fh(fh);
> +    if (!pbdev) {
> +        DPRINTF("mpcifc no pci dev fh 0x%x\n", fh);
> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> +        return 0;
> +    }
> +
> +    cpu_physical_memory_rw(fiba, (uint8_t *)&fib, sizeof(fib), 0);

I also find cpu_physical_memory_rw() pretty hard to read. Meanwhile,
it's been deprecated in favor of cpu_physical_memory_read() and
cpu_physical_memory_write(), which make the code more readable.

> +
> +    switch (oc) {
> +    case ZPCI_MOD_FC_REG_INT: {
> +        pbdev->isc = FIB_DATA_ISC(fib.data);
> +        reg_irqs(env, pbdev, fib);
> +        break;
> +    }
> +    case ZPCI_MOD_FC_DEREG_INT:
> +        dereg_irqs(pbdev);
> +        break;
> +    case ZPCI_MOD_FC_REG_IOAT:
> +        if (fib.pba > fib.pal) {
> +            program_interrupt(&cpu->env, PGM_OPERAND, 6);
> +            return 0;
> +        }
> +        pbdev->g_iota = fib.iota;
> +        break;
> +    case ZPCI_MOD_FC_DEREG_IOAT:
> +        break;
> +    case ZPCI_MOD_FC_REREG_IOAT:
> +        break;
> +    case ZPCI_MOD_FC_RESET_ERROR:
> +        break;
> +    case ZPCI_MOD_FC_RESET_BLOCK:
> +        break;
> +    case ZPCI_MOD_FC_SET_MEASURE:
> +        break;
> +    default:
> +        program_interrupt(&cpu->env, PGM_OPERAND, 6);
> +        return 0;
> +    }
> +
> +    setcc(cpu, ZPCI_PCI_LS_OK);
> +    return 0;
> +}
> +
> +int kvm_stpcifc_service_call(S390CPU *cpu, struct kvm_run *run)
> +{
> +    qemu_log_mask(LOG_UNIMP, "STPCIFC missing\n");
> +    return 0;
> +}
> diff --git a/target-s390x/pci_ic.h b/target-s390x/pci_ic.h
> new file mode 100644
> index 0000000..0eb6c27
> --- /dev/null
> +++ b/target-s390x/pci_ic.h
> @@ -0,0 +1,335 @@
> +/*
> + * s390 PCI intercept definitions
> + *
> + * Copyright 2014 IBM Corp.
> + * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com>
> + *            Hong Bo Li <lihbbj@cn.ibm.com>
> + *            Yi Min Zhao <zyimin@cn.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or (at
> + * your option) any later version. See the COPYING file in the top-level
> + * directory.
> + */
> +
> +#ifndef PCI_IC_S390X_H
> +#define PCI_IC_S390X_H
> +
> +#include <sysemu/dma.h>
> +
> +/* CLP common request & response block size */
> +#define CLP_BLK_SIZE 4096
> +#define PCI_BAR_COUNT 6
> +#define PCI_MAX_FUNCTIONS 4096
> +
> +typedef struct ClpReqHdr {
> +    __uint16_t len;
> +    __uint16_t cmd;
> +} QEMU_PACKED ClpReqHdr;
> +
> +typedef struct ClpRspHdr {
> +    __uint16_t len;
> +    __uint16_t rsp;
> +} QEMU_PACKED ClpRspHdr;
> +
> +/* CLP Response Codes */
> +#define CLP_RC_OK         0x0010  /* Command request successfully */
> +#define CLP_RC_CMD        0x0020  /* Command code not recognized */
> +#define CLP_RC_PERM       0x0030  /* Command not authorized */
> +#define CLP_RC_FMT        0x0040  /* Invalid command request format */
> +#define CLP_RC_LEN        0x0050  /* Invalid command request length */
> +#define CLP_RC_8K         0x0060  /* Command requires 8K LPCB */
> +#define CLP_RC_RESNOT0    0x0070  /* Reserved field not zero */
> +#define CLP_RC_NODATA     0x0080  /* No data available */
> +#define CLP_RC_FC_UNKNOWN 0x0100  /* Function code not recognized */
> +
> +/*
> + * Call Logical Processor - Command Codes
> + */
> +#define CLP_LIST_PCI            0x0002
> +#define CLP_QUERY_PCI_FN        0x0003
> +#define CLP_QUERY_PCI_FNGRP     0x0004
> +#define CLP_SET_PCI_FN          0x0005
> +
> +/* PCI function handle list entry */
> +typedef struct ClpFhListEntry {
> +    __uint16_t device_id;
> +    __uint16_t vendor_id;
> +#define CLP_FHLIST_MASK_CONFIG 0x80000000
> +    __uint32_t config;
> +    __uint32_t fid;
> +    __uint32_t fh;
> +} QEMU_PACKED ClpFhListEntry;
> +
> +#define CLP_RC_SETPCIFN_FH      0x0101 /* Invalid PCI fn handle */
> +#define CLP_RC_SETPCIFN_FHOP    0x0102 /* Fn handle not valid for op */
> +#define CLP_RC_SETPCIFN_DMAAS   0x0103 /* Invalid DMA addr space */
> +#define CLP_RC_SETPCIFN_RES     0x0104 /* Insufficient resources */
> +#define CLP_RC_SETPCIFN_ALRDY   0x0105 /* Fn already in requested state */
> +#define CLP_RC_SETPCIFN_ERR     0x0106 /* Fn in permanent error state */
> +#define CLP_RC_SETPCIFN_RECPND  0x0107 /* Error recovery pending */
> +#define CLP_RC_SETPCIFN_BUSY    0x0108 /* Fn busy */
> +#define CLP_RC_LISTPCI_BADRT    0x010a /* Resume token not recognized */
> +#define CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */
> +
> +/* request or response block header length */
> +#define LIST_PCI_HDR_LEN 32
> +
> +/* Number of function handles fitting in response block */
> +#define CLP_FH_LIST_NR_ENTRIES \
> +    ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \
> +        / sizeof(ClpFhListEntry))
> +
> +#define CLP_SET_ENABLE_PCI_FN  0 /* Yes, 0 enables it */
> +#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */
> +
> +#define CLP_UTIL_STR_LEN 64
> +
> +#define CLP_MASK_FMT 0xf0000000
> +
> +/* List PCI functions request */
> +typedef struct ClpReqListPci {
> +    ClpReqHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +    __uint64_t resume_token;
> +    __uint64_t reserved2;
> +} QEMU_PACKED ClpReqListPci;
> +
> +/* List PCI functions response */
> +typedef struct ClpRspListPci {
> +    ClpRspHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +    __uint64_t resume_token;
> +    __uint32_t mdd;
> +    __uint16_t max_fn;
> +    __uint8_t reserved2;
> +    __uint8_t entry_size;
> +    ClpFhListEntry fh_list[CLP_FH_LIST_NR_ENTRIES];
> +} QEMU_PACKED ClpRspListPci;
> +
> +/* Query PCI function request */
> +typedef struct ClpReqQueryPci {
> +    ClpReqHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +    __uint32_t fh; /* function handle */
> +    __uint32_t reserved2;
> +    __uint64_t reserved3;
> +} QEMU_PACKED ClpReqQueryPci;
> +
> +/* Query PCI function response */
> +typedef struct ClpRspQueryPci {
> +    ClpRspHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +    __uint16_t vfn; /* virtual fn number */
> +#define CLP_RSP_QPCI_MASK_UTIL  0x100
> +#define CLP_RSP_QPCI_MASK_PFGID 0xff
> +    __uint16_t ug;
> +    __uint32_t fid; /* pci function id */
> +    __uint8_t bar_size[PCI_BAR_COUNT];
> +    __uint16_t pchid;
> +    __uint32_t bar[PCI_BAR_COUNT];
> +    __uint64_t reserved2;
> +    __uint64_t sdma; /* start dma as */
> +    __uint64_t edma; /* end dma as */
> +    __uint32_t reserved3[11];
> +    __uint32_t uid;
> +    __uint8_t util_str[CLP_UTIL_STR_LEN]; /* utility string */
> +} QEMU_PACKED ClpRspQueryPci;
> +
> +/* Query PCI function group request */
> +typedef struct ClpReqQueryPciGrp {
> +    ClpReqHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +#define CLP_REQ_QPCIG_MASK_PFGID 0xff
> +    __uint32_t g;
> +    __uint32_t reserved2;
> +    __uint64_t reserved3;
> +} QEMU_PACKED ClpReqQueryPciGrp;
> +
> +/* Query PCI function group response */
> +typedef struct ClpRspQueryPciGrp {
> +    ClpRspHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +#define CLP_RSP_QPCIG_MASK_NOI 0xfff
> +    __uint16_t i;
> +    __uint8_t version;
> +#define CLP_RSP_QPCIG_MASK_FRAME   0x2
> +#define CLP_RSP_QPCIG_MASK_REFRESH 0x1
> +    __uint8_t fr;
> +    __uint16_t reserved2;
> +    __uint16_t mui;
> +    __uint64_t reserved3;
> +    __uint64_t dasm; /* dma address space mask */
> +    __uint64_t msia; /* MSI address */
> +    __uint64_t reserved4;
> +    __uint64_t reserved5;
> +} QEMU_PACKED ClpRspQueryPciGrp;
> +
> +/* Set PCI function request */
> +typedef struct ClpReqSetPci {
> +    ClpReqHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +    __uint32_t fh; /* function handle */
> +    __uint16_t reserved2;
> +    __uint8_t oc; /* operation controls */
> +    __uint8_t ndas; /* number of dma spaces */
> +    __uint64_t reserved3;
> +} QEMU_PACKED ClpReqSetPci;
> +
> +/* Set PCI function response */
> +typedef struct ClpRspSetPci {
> +    ClpRspHdr hdr;
> +    __uint32_t fmt;
> +    __uint64_t reserved1;
> +    __uint32_t fh; /* function handle */
> +    __uint32_t reserved3;
> +    __uint64_t reserved4;
> +} QEMU_PACKED ClpRspSetPci;
> +
> +typedef struct ClpReqRspListPci {
> +    ClpReqListPci request;
> +    ClpRspListPci response;
> +} QEMU_PACKED ClpReqRspListPci;
> +
> +typedef struct ClpReqRspSetPci {
> +    ClpReqSetPci request;
> +    ClpRspSetPci response;
> +} QEMU_PACKED ClpReqRspSetPci;
> +
> +typedef struct ClpReqRspQueryPci {
> +    ClpReqQueryPci request;
> +    ClpRspQueryPci response;
> +} QEMU_PACKED ClpReqRspQueryPci;
> +
> +typedef struct ClpReqRspQueryPciGrp {
> +    ClpReqQueryPciGrp request;
> +    ClpRspQueryPciGrp response;
> +} QEMU_PACKED ClpReqRspQueryPciGrp;
> +
> +typedef struct PciLgStg {
> +    uint32_t fh;
> +    uint8_t status;
> +    uint8_t pcias;
> +    uint8_t reserved;
> +    uint8_t len;
> +} QEMU_PACKED PciLgStg;
> +
> +typedef struct PciStb {
> +    uint32_t fh;
> +    uint8_t status;
> +    uint8_t pcias;
> +    uint8_t reserved;
> +    uint8_t len;
> +} QEMU_PACKED PciStb;
> +
> +/* Load/Store status codes */
> +#define ZPCI_PCI_ST_FUNC_NOT_ENABLED        4
> +#define ZPCI_PCI_ST_FUNC_IN_ERR             8
> +#define ZPCI_PCI_ST_BLOCKED                 12
> +#define ZPCI_PCI_ST_INSUF_RES               16
> +#define ZPCI_PCI_ST_INVAL_AS                20
> +#define ZPCI_PCI_ST_FUNC_ALREADY_ENABLED    24
> +#define ZPCI_PCI_ST_DMA_AS_NOT_ENABLED      28
> +#define ZPCI_PCI_ST_2ND_OP_IN_INV_AS        36
> +#define ZPCI_PCI_ST_FUNC_NOT_AVAIL          40
> +#define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE     44
> +
> +/* Load/Store return codes */
> +#define ZPCI_PCI_LS_OK              0
> +#define ZPCI_PCI_LS_ERR             1
> +#define ZPCI_PCI_LS_BUSY            2
> +#define ZPCI_PCI_LS_INVAL_HANDLE    3
> +
> +/* Modify PCI Function Controls */
> +#define ZPCI_MOD_FC_REG_INT     2
> +#define ZPCI_MOD_FC_DEREG_INT   3
> +#define ZPCI_MOD_FC_REG_IOAT    4
> +#define ZPCI_MOD_FC_DEREG_IOAT  5
> +#define ZPCI_MOD_FC_REREG_IOAT  6
> +#define ZPCI_MOD_FC_RESET_ERROR 7
> +#define ZPCI_MOD_FC_RESET_BLOCK 9
> +#define ZPCI_MOD_FC_SET_MEASURE 10
> +
> +/* FIB function controls */
> +#define ZPCI_FIB_FC_ENABLED     0x80
> +#define ZPCI_FIB_FC_ERROR       0x40
> +#define ZPCI_FIB_FC_LS_BLOCKED  0x20
> +#define ZPCI_FIB_FC_DMAAS_REG   0x10
> +
> +/* FIB function controls */
> +#define ZPCI_FIB_FC_ENABLED     0x80
> +#define ZPCI_FIB_FC_ERROR       0x40
> +#define ZPCI_FIB_FC_LS_BLOCKED  0x20
> +#define ZPCI_FIB_FC_DMAAS_REG   0x10
> +
> +/* Function Information Block */
> +typedef struct ZpciFib {
> +    __uint8_t fmt;   /* format */
> +    __uint8_t reserved1[7];
> +    __uint8_t fc;                  /* function controls */
> +    __uint8_t reserved2;
> +    __uint16_t reserved3;
> +    __uint32_t reserved4;
> +    __uint64_t pba;                /* PCI base address */
> +    __uint64_t pal;                /* PCI address limit */
> +    __uint64_t iota;               /* I/O Translation Anchor */
> +#define FIB_DATA_ISC(x)    (((x) >> 28) & 0x7)
> +#define FIB_DATA_NOI(x)    (((x) >> 16) & 0xfff)
> +#define FIB_DATA_AIBVO(x) (((x) >> 8) & 0x3f)
> +#define FIB_DATA_SUM(x)    (((x) >> 7) & 0x1)
> +#define FIB_DATA_AISBO(x)  ((x) & 0x3f)
> +    __uint32_t data;
> +    __uint32_t reserved5;
> +    __uint64_t aibv;               /* Adapter int bit vector address */
> +    __uint64_t aisb;               /* Adapter int summary bit address */
> +    __uint64_t fmb_addr;           /* Function measurement address and key */
> +    __uint32_t reserved6;
> +    __uint32_t gd;
> +} QEMU_PACKED ZpciFib;
> +
> +static inline uint64_t get_base_disp_rxy(S390CPU *cpu, struct kvm_run *run)
> +{
> +    CPUS390XState *env = &cpu->env;
> +    uint32_t x2 = (run->s390_sieic.ipa & 0x000f);
> +    uint32_t base2 = run->s390_sieic.ipb >> 28;
> +    uint32_t disp2 = ((run->s390_sieic.ipb & 0x0fff0000) >> 16) +
> +                     ((run->s390_sieic.ipb & 0xff00) << 4);
> +
> +    if (disp2 & 0x80000) {
> +        disp2 += 0xfff00000;
> +    }
> +
> +    return (base2 ? env->regs[base2] : 0) +
> +           (x2 ? env->regs[x2] : 0) + (long)(int)disp2;
> +}
> +
> +static inline uint64_t get_base_disp_rsy(S390CPU *cpu, struct kvm_run *run)
> +{
> +    CPUS390XState *env = &cpu->env;
> +    uint32_t base2 = run->s390_sieic.ipb >> 28;
> +    uint32_t disp2 = ((run->s390_sieic.ipb & 0x0fff0000) >> 16) +
> +                     ((run->s390_sieic.ipb & 0xff00) << 4);
> +
> +    if (disp2 & 0x80000) {
> +        disp2 += 0xfff00000;
> +    }
> +
> +    return (base2 ? env->regs[base2] : 0) + (long)(int)disp2;
> +}

Same comment as in the previous patch here: please try to avoid putting
code into a header file.

> +
> +int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run);
> +int kvm_rpcit_service_call(S390CPU *cpu, struct kvm_run *run);
> +int kvm_sic_service_call(S390CPU *cpu, struct kvm_run *run);
> +int kvm_pcistb_service_call(S390CPU *cpu, struct kvm_run *run);
> +int kvm_mpcifc_service_call(S390CPU *cpu, struct kvm_run *run);
> +int kvm_pcistg_service_call(S390CPU *cpu, struct kvm_run *run);
> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run);
> +int kvm_stpcifc_service_call(S390CPU *cpu, struct kvm_run *run);

Hrm. Maybe we could add some registration hook similar to spapr's hcall
or rtas callback registration that would allow us to encapsulate this a
bit better?

Then you'd only have to spawn a PHB device which could register for
these service calls.


Alex
Frank Blaschka Nov. 11, 2014, 12:10 p.m. UTC | #2
On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
> 
> 
> On 10.11.14 15:20, Frank Blaschka wrote:
> > From: Frank Blaschka <frank.blaschka@de.ibm.com>
> > 
> > This patch implements the s390 pci instructions in qemu. It allows
> > to access and drive pci devices attached to the s390 pci bus.
> > Because of platform constrains devices using IO BARs are not
> > supported. Also a device has to support MSI/MSI-X to run on s390.
> > 
> > Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> > ---
> >  target-s390x/Makefile.objs |   2 +-
> >  target-s390x/kvm.c         |  52 ++++
> >  target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
> >  target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
> >  4 files changed, 1141 insertions(+), 1 deletion(-)
> >  create mode 100644 target-s390x/pci_ic.c
> >  create mode 100644 target-s390x/pci_ic.h
> > 
> > diff --git a/target-s390x/Makefile.objs b/target-s390x/Makefile.objs
> > index 2c57494..cc71400 100644
> > --- a/target-s390x/Makefile.objs
> > +++ b/target-s390x/Makefile.objs
> > @@ -2,4 +2,4 @@ obj-y += translate.o helper.o cpu.o interrupt.o
> >  obj-y += int_helper.o fpu_helper.o cc_helper.o mem_helper.o misc_helper.o
> >  obj-y += gdbstub.o
> >  obj-$(CONFIG_SOFTMMU) += machine.o ioinst.o arch_dump.o
> > -obj-$(CONFIG_KVM) += kvm.o
> > +obj-$(CONFIG_KVM) += kvm.o pci_ic.o
> > diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
> > index 5b10a25..d59e740 100644
> > --- a/target-s390x/kvm.c
> > +++ b/target-s390x/kvm.c
> > @@ -40,6 +40,7 @@
> >  #include "exec/gdbstub.h"
> >  #include "trace.h"
> >  #include "qapi-event.h"
> > +#include "pci_ic.h"
> >  
> >  /* #define DEBUG_KVM */
> >  
> > @@ -56,6 +57,7 @@
> >  #define IPA0_B2                         0xb200
> >  #define IPA0_B9                         0xb900
> >  #define IPA0_EB                         0xeb00
> > +#define IPA0_E3                         0xe300
> >  
> >  #define PRIV_B2_SCLP_CALL               0x20
> >  #define PRIV_B2_CSCH                    0x30
> > @@ -76,8 +78,17 @@
> >  #define PRIV_B2_XSCH                    0x76
> >  
> >  #define PRIV_EB_SQBS                    0x8a
> > +#define PRIV_EB_PCISTB                  0xd0
> > +#define PRIV_EB_SIC                     0xd1
> >  
> >  #define PRIV_B9_EQBS                    0x9c
> > +#define PRIV_B9_CLP                     0xa0
> > +#define PRIV_B9_PCISTG                  0xd0
> > +#define PRIV_B9_PCILG                   0xd2
> > +#define PRIV_B9_RPCIT                   0xd3
> > +
> > +#define PRIV_E3_MPCIFC                  0xd0
> > +#define PRIV_E3_STPCIFC                 0xd4
> >  
> >  #define DIAG_IPL                        0x308
> >  #define DIAG_KVM_HYPERCALL              0x500
> > @@ -814,6 +825,18 @@ static int handle_b9(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
> >      int r = 0;
> >  
> >      switch (ipa1) {
> > +    case PRIV_B9_CLP:
> > +        r = kvm_clp_service_call(cpu, run);
> > +        break;
> > +    case PRIV_B9_PCISTG:
> > +        r = kvm_pcistg_service_call(cpu, run);
> > +        break;
> > +    case PRIV_B9_PCILG:
> > +        r = kvm_pcilg_service_call(cpu, run);
> > +        break;
> > +    case PRIV_B9_RPCIT:
> > +        r = kvm_rpcit_service_call(cpu, run);
> > +        break;
> >      case PRIV_B9_EQBS:
> >          /* just inject exception */
> >          r = -1;
> > @@ -832,6 +855,12 @@ static int handle_eb(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
> >      int r = 0;
> >  
> >      switch (ipa1) {
> > +    case PRIV_EB_PCISTB:
> > +        r = kvm_pcistb_service_call(cpu, run);
> > +        break;
> > +    case PRIV_EB_SIC:
> > +        r = kvm_sic_service_call(cpu, run);
> > +        break;
> >      case PRIV_EB_SQBS:
> >          /* just inject exception */
> >          r = -1;
> > @@ -845,6 +874,26 @@ static int handle_eb(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
> >      return r;
> >  }
> >  
> > +static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl)
> > +{
> > +    int r = 0;
> > +
> > +    switch (ipbl) {
> > +    case PRIV_E3_MPCIFC:
> > +        r = kvm_mpcifc_service_call(cpu, run);
> > +        break;
> > +    case PRIV_E3_STPCIFC:
> > +        r = kvm_stpcifc_service_call(cpu, run);
> > +        break;
> > +    default:
> > +        r = -1;
> > +        DPRINTF("KVM: unhandled PRIV: 0xe3%x\n", ipbl);
> > +        break;
> > +    }
> > +
> > +    return r;
> > +}
> > +
> >  static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
> >  {
> >      CPUS390XState *env = &cpu->env;
> > @@ -1041,6 +1090,9 @@ static int handle_instruction(S390CPU *cpu, struct kvm_run *run)
> >      case IPA0_EB:
> >          r = handle_eb(cpu, run, ipa1);
> >          break;
> > +    case IPA0_E3:
> > +        r = handle_e3(cpu, run, run->s390_sieic.ipb & 0xff);
> > +        break;
> >      case IPA0_DIAG:
> >          r = handle_diag(cpu, run, run->s390_sieic.ipb);
> >          break;
> > diff --git a/target-s390x/pci_ic.c b/target-s390x/pci_ic.c
> > new file mode 100644
> > index 0000000..6c05faf
> > --- /dev/null
> > +++ b/target-s390x/pci_ic.c
> > @@ -0,0 +1,753 @@
> > +/*
> > + * s390 PCI intercepts
> > + *
> > + * Copyright 2014 IBM Corp.
> > + * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com>
> > + *            Hong Bo Li <lihbbj@cn.ibm.com>
> > + *            Yi Min Zhao <zyimin@cn.ibm.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or (at
> > + * your option) any later version. See the COPYING file in the top-level
> > + * directory.
> > + */
> > +
> > +#include <sys/types.h>
> > +#include <sys/ioctl.h>
> > +#include <sys/mman.h>
> > +
> > +#include <linux/kvm.h>
> > +#include <asm/ptrace.h>
> > +#include <hw/pci/pci.h>
> > +#include <hw/pci/pci_host.h>
> > +#include <net/net.h>
> > +
> > +#include "qemu-common.h"
> > +#include "qemu/timer.h"
> > +#include "migration/qemu-file.h"
> > +#include "sysemu/sysemu.h"
> > +#include "sysemu/kvm.h"
> > +#include "cpu.h"
> > +#include "sysemu/device_tree.h"
> > +#include "monitor/monitor.h"
> > +#include "pci_ic.h"
> > +
> > +#include "hw/hw.h"
> > +#include "hw/pci/pci.h"
> > +#include "hw/pci/pci_bridge.h"
> > +#include "hw/pci/pci_bus.h"
> > +#include "hw/pci/pci_host.h"
> > +#include "hw/s390x/s390-pci-bus.h"
> > +#include "exec/exec-all.h"
> > +#include "exec/memory-internal.h"
> > +
> > +/* #define DEBUG_S390PCI_IC */
> > +#ifdef DEBUG_S390PCI_IC
> > +#define DPRINTF(fmt, ...) \
> > +    do { fprintf(stderr, "s390pci_ic: " fmt, ## __VA_ARGS__); } while (0)
> > +#else
> > +#define DPRINTF(fmt, ...) \
> > +    do { } while (0)
> > +#endif
> > +
> > +static uint64_t resume_token;
> 
> global variable? Why?
>

Hi Alex,

Thanks for the review. I will try to address all issues from patches 1/3 and 2/3.
If I do not agree with a change, I will try to explain ...
 
> > +
> > +static uint8_t barsize(uint64_t size)
> > +{
> > +    uint64_t mask = 1;
> > +    int i;
> > +
> > +    if (!size) {
> > +        return 0;
> > +    }
> > +
> > +    for (i = 0; i < 64; i++) {
> > +        if (size & mask) {
> > +            break;
> > +        }
> > +        mask = (mask << 1);
> > +    }
> > +
> > +    return i;
> > +}
> 
> Isn't there an existing helper for this in the PCI layer?
>

I did not find one. This function is used to fill an s390-specific length
in an instruction intercept (an architecture-specific encoding of the length).
 
> In fact, please check whether it makes sense to move some of the code to
> hw/ rather than target-s390x.
> 
> > +
> > +static void s390_set_status_code(CPUS390XState *env,
> > +                                 uint8_t r, uint64_t status_code)
> > +{
> > +    env->regs[r] &= ~0xff000000;
> > +    env->regs[r] |= (status_code & 0xff) << 24;
> > +}
> > +
> > +static int list_pci(ClpReqRspListPci *rrb, uint8_t *cc)
> > +{
> > +    S390PCIBusDevice *pbdev;
> > +    uint32_t res_code, initial_l2, g_l2, finish;
> > +    int rc, idx;
> > +
> > +    rc = 0;
> > +    if (be16_to_cpu(rrb->request.hdr.len) != 32) {
> > +        res_code = CLP_RC_LEN;
> > +        rc = -EINVAL;
> > +        goto out;
> > +    }
> > +
> > +    if ((be32_to_cpu(rrb->request.fmt) & CLP_MASK_FMT) != 0) {
> > +        res_code = CLP_RC_FMT;
> > +        rc = -EINVAL;
> > +        goto out;
> > +    }
> > +
> > +    if ((be32_to_cpu(rrb->request.fmt) & ~CLP_MASK_FMT) != 0 ||
> > +        rrb->request.reserved1 != 0 ||
> > +        rrb->request.reserved2 != 0) {
> > +        res_code = CLP_RC_RESNOT0;
> > +        rc = -EINVAL;
> > +        goto out;
> > +    }
> > +
> > +    if (be64_to_cpu(rrb->request.resume_token) == 0) {
> > +        resume_token = 0;
> > +    } else if (be64_to_cpu(rrb->request.resume_token) != resume_token) {
> > +        res_code = CLP_RC_LISTPCI_BADRT;
> > +        rc = -EINVAL;
> > +        goto out;
> > +    }
> > +
> > +    if (be16_to_cpu(rrb->response.hdr.len) < 48) {
> > +        res_code = CLP_RC_8K;
> > +        rc = -EINVAL;
> > +        goto out;
> > +    }
> > +
> > +    initial_l2 = be16_to_cpu(rrb->response.hdr.len);
> > +    if ((initial_l2 - LIST_PCI_HDR_LEN) % sizeof(ClpFhListEntry)
> > +        != 0) {
> > +        rc = -EINVAL;
> > +        *cc = 3;
> > +        goto out;
> > +    }
> > +
> > +    rrb->response.fmt = 0;
> > +    rrb->response.reserved1 = rrb->response.reserved2 = 0;
> > +    rrb->response.mdd = cpu_to_be32(FH_VIRT);
> > +    rrb->response.max_fn = cpu_to_be16(PCI_MAX_FUNCTIONS);
> > +    rrb->response.entry_size = sizeof(ClpFhListEntry);
> > +    finish = 0;
> > +    idx = resume_token;
> > +    g_l2 = LIST_PCI_HDR_LEN;
> > +    do {
> > +        pbdev = s390_pci_find_dev_by_idx(idx);
> > +        if (!pbdev) {
> > +            finish = 1;
> > +            break;
> > +        }
> > +        rrb->response.fh_list[idx - resume_token].device_id =
> > +            pci_get_word(pbdev->pdev->config + PCI_DEVICE_ID);
> > +        rrb->response.fh_list[idx - resume_token].vendor_id =
> > +            pci_get_word(pbdev->pdev->config + PCI_VENDOR_ID);
> > +        rrb->response.fh_list[idx - resume_token].config =
> > +            cpu_to_be32(0x80000000);
> > +        rrb->response.fh_list[idx - resume_token].fid = cpu_to_be32(pbdev->fid);
> > +        rrb->response.fh_list[idx - resume_token].fh = cpu_to_be32(pbdev->fh);
> > +
> > +        g_l2 += sizeof(ClpFhListEntry);
> > +        DPRINTF("g_l2 %d vendor id 0x%x device id 0x%x fid 0x%x fh 0x%x\n",
> > +            g_l2,
> > +            rrb->response.fh_list[idx - resume_token].vendor_id,
> > +            rrb->response.fh_list[idx - resume_token].device_id,
> > +            rrb->response.fh_list[idx - resume_token].fid,
> > +            rrb->response.fh_list[idx - resume_token].fh);
> > +        idx++;
> > +    } while (g_l2 < initial_l2);
> > +
> > +    if (finish == 1) {
> > +        resume_token = 0;
> > +    } else {
> > +        resume_token = idx;
> > +    }
> > +    rrb->response.resume_token = cpu_to_be64(resume_token);
> > +    rrb->response.hdr.len = cpu_to_be16(g_l2);
> > +    rrb->response.hdr.rsp = cpu_to_be16(CLP_RC_OK);
> > +out:
> > +    if (rc) {
> > +        DPRINTF("list pci failed rc 0x%x\n", rc);
> > +        rrb->response.hdr.rsp = cpu_to_be16(res_code);
> > +    }
> > +    return rc;
> > +}
> > +
> > +int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run)
> 
> Please separate kvm_ calls from the actual implementation. Do all the
> parameter extraction in the kvm_ function and then forward on to a
> generic function that doesn't need to know about kvm_run anymore.
> 
> kvm specific c file:
> 
> int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run)
> {
>     uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>     return clp_service_call(cpu, r2);
> }
> 
> io / pci specific c file:
> 
> int clp_service_call(S390CPU *cpu, uint8_t r2)
> {
>     ...
> }
>

I already had in mind to separate the clp and pci instruction implementation
from kvm and move them to hw. Will do some major rework and move code around.
 
> > +{
> > +    ClpReqHdr *reqh;
> > +    ClpRspHdr *resh;
> > +    S390PCIBusDevice *pbdev;
> > +    uint32_t req_len;
> > +    uint32_t res_len;
> > +    uint8_t *buffer;
> > +    uint8_t cc = 0;
> > +    CPUS390XState *env = &cpu->env;
> > +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> > +    int i;
> > +
> > +    buffer = g_malloc0(4096 * 2);
> 
> Do you really need this? Couldn't you make the pointers be actual
> structs on the stack and just read/write from them directly?
> 
> The compiler should be smart enough to throw away elements that aren't
> used anymore to conserve memory.
> 
> > +    cpu_synchronize_state(CPU(cpu));
> > +
> > +    if (env->psw.mask & PSW_MASK_PSTATE) {
> > +        program_interrupt(env, PGM_PRIVILEGED, 4);
> > +        return 0;
> > +    }
> > +
> > +    cpu_physical_memory_rw(env->regs[r2], buffer, sizeof(*reqh), 0);
> > +    reqh = (ClpReqHdr *)buffer;
> > +    req_len = be16_to_cpu(reqh->len);
> > +    if (req_len < 16 || req_len > 8184 || (req_len % 8 != 0)) {
> > +        program_interrupt(env, PGM_OPERAND, 4);
> > +        return 0;
> > +    }
> > +
> > +    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + sizeof(*resh), 0);
> > +    resh = (ClpRspHdr *)(buffer + req_len);
> > +    res_len = be16_to_cpu(resh->len);
> > +    if (res_len < 8 || res_len > 8176 || (res_len % 8 != 0)) {
> > +        program_interrupt(env, PGM_OPERAND, 4);
> > +        return 0;
> > +    }
> > +    if ((req_len + res_len) > 8192) {
> > +        program_interrupt(env, PGM_OPERAND, 4);
> > +        return 0;
> > +    }
> > +
> > +    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + res_len, 0);
> > +
> > +    if (req_len != 32) {
> > +        resh->rsp = cpu_to_be16(CLP_RC_LEN);
> > +        goto out;
> > +    }
> > +
> > +    switch (reqh->cmd) {
> > +    case CLP_LIST_PCI: {
> > +        ClpReqRspListPci *rrb = (ClpReqRspListPci *)buffer;
> > +        list_pci(rrb, &cc);
> > +        break;
> > +    }
> > +    case CLP_SET_PCI_FN: {
> > +        ClpReqSetPci *reqsetpci = (ClpReqSetPci *)reqh;
> > +        ClpRspSetPci *ressetpci = (ClpRspSetPci *)resh;
> > +
> > +        pbdev = s390_pci_find_dev_by_fh(be32_to_cpu(reqsetpci->fh));
> > +        if (!pbdev) {
> > +                ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FH);
> > +                goto out;
> > +        }
> > +
> > +        switch (reqsetpci->oc) {
> > +        case CLP_SET_ENABLE_PCI_FN:
> > +            pbdev->fh = pbdev->fh | 1 << ENABLE_BIT_OFFSET;
> > +            ressetpci->fh = cpu_to_be32(pbdev->fh);
> > +            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_OK);
> > +            break;
> > +        case CLP_SET_DISABLE_PCI_FN:
> > +            pbdev->fh = pbdev->fh & ~(1 << ENABLE_BIT_OFFSET);
> > +            ressetpci->fh = cpu_to_be32(pbdev->fh);
> > +            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_OK);
> > +            break;
> > +        default:
> > +            DPRINTF("unknown set pci command\n");
> > +            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FHOP);
> > +            break;
> > +        }
> > +        break;
> > +    }
> > +    case CLP_QUERY_PCI_FN: {
> > +        ClpReqQueryPci *reqquery = (ClpReqQueryPci *)reqh;
> > +        ClpRspQueryPci *resquery = (ClpRspQueryPci *)resh;
> > +
> > +        pbdev = s390_pci_find_dev_by_fh(reqquery->fh);
> > +        if (!pbdev) {
> > +            DPRINTF("query pci no pci dev\n");
> > +            resquery->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FH);
> > +            goto out;
> > +        }
> > +
> > +        for (i = 0; i < PCI_BAR_COUNT; i++) {
> > +            uint64_t data = pci_host_config_read_common(pbdev->pdev,
> > +                0x10 + (i * 4), pci_config_size(pbdev->pdev), 4);
> > +
> > +            resquery->bar[i] = bswap32(data);
> > +            resquery->bar_size[i] = barsize(pbdev->pdev->io_regions[i].size);
> > +            DPRINTF("bar %d addr 0x%x size 0x%lx barsize 0x%x\n", i,
> > +                    resquery->bar[i], pbdev->pdev->io_regions[i].size,
> > +                    resquery->bar_size[i]);
> > +        }
> > +
> > +        resquery->sdma = ZPCI_SDMA_ADDR;
> > +        resquery->edma = ZPCI_EDMA_ADDR;
> > +        resquery->pchid = 0;
> > +        resquery->ug = 1;
> > +        resquery->uid = pbdev->fid;
> > +
> > +        resquery->hdr.rsp = CLP_RC_OK;
> > +        break;
> > +    }
> > +    case CLP_QUERY_PCI_FNGRP: {
> > +        ClpRspQueryPciGrp *resgrp = (ClpRspQueryPciGrp *)resh;
> > +        resgrp->fr = 1;
> > +        resgrp->dasm = 0;
> > +        resgrp->msia = ZPCI_MSI_ADDR;
> > +        resgrp->mui = 0;
> > +        resgrp->i = 128;
> > +        resgrp->version = 0;
> > +
> > +        resgrp->hdr.rsp = CLP_RC_OK;
> > +        break;
> > +    }
> > +    default:
> > +        DPRINTF("unknown clp command\n");
> > +        resh->rsp = cpu_to_be16(CLP_RC_CMD);
> > +        break;
> > +    }
> > +
> > +out:
> > +    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + res_len, 1);
> 
> ... ah, to write back. Wouldn't it be cleaner to do this explicitly?
> 
> > +    g_free(buffer);
> > +    setcc(cpu, cc);
> > +    return 0;
> > +}
> > +
> > +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    CPUS390XState *env = &cpu->env;
> > +    S390PCIBusDevice *pbdev;
> > +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> > +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> > +    PciLgStg *rp;
> > +    uint64_t offset;
> > +    uint64_t data;
> > +    uint8_t len;
> > +
> > +    cpu_synchronize_state(CPU(cpu));
> > +
> > +    if (env->psw.mask & PSW_MASK_PSTATE) {
> > +        program_interrupt(env, PGM_PRIVILEGED, 4);
> > +        return 0;
> > +    }
> > +
> > +    if (r2 & 0x1) {
> > +        program_interrupt(env, PGM_SPECIFICATION, 4);
> > +        return 0;
> > +    }
> > +
> > +    rp = (PciLgStg *)&env->regs[r2];
> > +    offset = env->regs[r2 + 1];
> > +
> > +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> > +    if (!pbdev) {
> > +        DPRINTF("pcilg no pci dev\n");
> > +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> > +        return 0;
> > +    }
> > +
> > +    len = rp->len & 0xF;
> > +    if (rp->pcias < 6) {
> > +        if ((8 - (offset & 0x7)) < len) {
> > +            program_interrupt(env, PGM_OPERAND, 4);
> > +            return 0;
> > +        }
> > +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
> > +        io_mem_read(mr, offset, &data, len);
> > +    } else if (rp->pcias == 15) {
> > +        if ((4 - (offset & 0x3)) < len) {
> > +            program_interrupt(env, PGM_OPERAND, 4);
> > +            return 0;
> > +        }
> > +        data =  pci_host_config_read_common(
> > +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
> > +
> > +        switch (len) {
> > +        case 1:
> > +            break;
> > +        case 2:
> > +            data = cpu_to_le16(data);
> > +            break;
> > +        case 4:
> > +            data = cpu_to_le32(data);
> > +            break;
> > +        case 8:
> > +            data = cpu_to_le64(data);
> > +            break;
> 
> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
> and LE. So if you're running this on an LE host, you won't swap the
> value and get a broken result.
> 
> If you know that the value is always swapped, use bswapxx().
>

Actually, the code is right and required for a big-endian host :-)
pcilg/pcistg provide access to the PCI config space, which is defined
to be in PCI byte order (little endian). Since pci_host_config_read_common
already does an LE-to-CPU conversion, we have to convert back to PCI byte order.
Doing an unconditional swap would be a bug on a little-endian host.

> > +        default:
> > +            program_interrupt(env, PGM_OPERAND, 4);
> > +            return 0;
> > +        }
> > +    } else {
> > +        DPRINTF("invalid space\n");
> > +        setcc(cpu, ZPCI_PCI_LS_ERR);
> > +        s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
> > +        return 0;
> > +    }
> > +
> > +    env->regs[r1] = data;
> > +    setcc(cpu, ZPCI_PCI_LS_OK);
> > +    return 0;
> > +}
> > +
> > +static void update_msix_table_msg_data(S390PCIBusDevice *pbdev, uint64_t offset,
> > +                                       uint64_t *data, uint8_t len)
> > +{
> > +    uint32_t msg_data;
> > +
> > +    if (offset % PCI_MSIX_ENTRY_SIZE != 8) {
> > +        return;
> > +    }
> > +
> > +    if (len != 4) {
> > +        DPRINTF("access msix table msg data but len is %d\n", len);
> > +        return;
> > +    }
> > +
> > +    msg_data = (pbdev->fid << ZPCI_MSI_VEC_BITS) | le32_to_cpu(*data);
> > +    *data = cpu_to_le32(msg_data);
> > +    DPRINTF("update msix msg_data to 0x%x\n", msg_data);
> > +}
> > +
> > +static int trap_msix(S390PCIBusDevice *pbdev, uint64_t offset, uint8_t pcias)
> > +{
> > +    if (pbdev->msix.available && pbdev->msix.table_bar == pcias &&
> > +        offset >= pbdev->msix.table_offset &&
> > +        offset <= pbdev->msix.table_offset +
> > +                  (pbdev->msix.entries - 1) * PCI_MSIX_ENTRY_SIZE) {
> > +        return 1;
> > +    } else {
> > +        return 0;
> > +    }
> > +}
> > +
> > +int kvm_pcistg_service_call(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    CPUS390XState *env = &cpu->env;
> > +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> > +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> > +    PciLgStg *rp;
> > +    uint64_t offset, data;
> > +    S390PCIBusDevice *pbdev;
> > +    uint8_t len;
> > +
> > +    cpu_synchronize_state(CPU(cpu));
> > +
> > +    if (env->psw.mask & PSW_MASK_PSTATE) {
> > +        program_interrupt(env, PGM_PRIVILEGED, 4);
> > +        return 0;
> > +    }
> > +
> > +    if (r2 & 0x1) {
> > +        program_interrupt(env, PGM_SPECIFICATION, 4);
> > +        return 0;
> > +    }
> > +
> > +    rp = (PciLgStg *)&env->regs[r2];
> > +    offset = env->regs[r2 + 1];
> > +
> > +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> > +    if (!pbdev) {
> > +        DPRINTF("pcistg no pci dev\n");
> > +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> > +        return 0;
> > +    }
> > +
> > +    data = env->regs[r1];
> > +    len = rp->len & 0xF;
> > +    if (rp->pcias < 6) {
> > +        if ((8 - (offset & 0x7)) < len) {
> > +            program_interrupt(env, PGM_OPERAND, 4);
> > +            return 0;
> > +        }
> > +        MemoryRegion *mr;
> > +        if (trap_msix(pbdev, offset, rp->pcias)) {
> > +            offset = offset - pbdev->msix.table_offset;
> > +            mr = &pbdev->pdev->msix_table_mmio;
> > +            update_msix_table_msg_data(pbdev, offset, &data, len);
> > +        } else {
> > +            mr = pbdev->pdev->io_regions[rp->pcias].memory;
> > +        }
> > +
> > +        io_mem_write(mr, offset, data, len);
> > +    } else if (rp->pcias == 15) {
> > +        if ((4 - (offset & 0x3)) < len) {
> > +            program_interrupt(env, PGM_OPERAND, 4);
> > +            return 0;
> > +        }
> > +        switch (len) {
> > +        case 1:
> > +            break;
> > +        case 2:
> > +            data = le16_to_cpu(data);
> > +            break;
> > +        case 4:
> > +            data = le32_to_cpu(data);
> > +            break;
> > +        case 8:
> > +            data = le64_to_cpu(data);
> > +            break;
> > +        default:
> > +            program_interrupt(env, PGM_OPERAND, 4);
> > +            return 0;
> > +        }
> 
> I guess you want a generic function similar to qemu_bswap_len() that
> supports 64bit?
> 
> > +
> > +        pci_host_config_write_common(pbdev->pdev, offset,
> > +                                     pci_config_size(pbdev->pdev),
> > +                                     data, len);
> > +    } else {
> > +        DPRINTF("pcistg invalid space\n");
> > +        setcc(cpu, ZPCI_PCI_LS_ERR);
> > +        s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
> > +        return 0;
> > +    }
> > +
> > +    setcc(cpu, ZPCI_PCI_LS_OK);
> > +    return 0;
> > +}
> > +
> > +int kvm_rpcit_service_call(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    CPUS390XState *env = &cpu->env;
> > +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> > +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> > +    uint32_t fh;
> > +    uint64_t pte;
> > +    S390PCIBusDevice *pbdev;
> > +    ram_addr_t size;
> > +    int flags;
> > +    IOMMUTLBEntry entry;
> > +
> > +    cpu_synchronize_state(CPU(cpu));
> > +
> > +    if (env->psw.mask & PSW_MASK_PSTATE) {
> > +        program_interrupt(env, PGM_PRIVILEGED, 4);
> > +        return 0;
> > +    }
> > +
> > +    if (r2 & 0x1) {
> > +        program_interrupt(env, PGM_SPECIFICATION, 4);
> > +        return 0;
> > +    }
> > +
> > +    fh = env->regs[r1] >> 32;
> > +    size = env->regs[r2 + 1];
> > +
> > +    pbdev = s390_pci_find_dev_by_fh(fh);
> > +
> > +    if (!pbdev) {
> > +        DPRINTF("rpcit no pci dev\n");
> > +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> > +        return 0;
> > +    }
> > +
> > +    pte = s390_guest_io_table_walk(s390_pci_get_table_origin(pbdev->g_iota),
> > +                                   env->regs[r2]);
> > +    flags = pte & ZPCI_PTE_FLAG_MASK;
> > +    entry.target_as = &address_space_memory;
> > +    entry.iova = env->regs[r2];
> > +    entry.translated_addr = pte & ZPCI_PTE_ADDR_MASK;
> > +    entry.addr_mask = size - 1;
> > +
> > +    if (flags & ZPCI_PTE_INVALID) {
> > +        entry.perm = IOMMU_NONE;
> > +    } else {
> > +        entry.perm = IOMMU_RW;
> > +    }
> 
> Deja vu? This is the iommu translation function, no? Can't you somehow
> just call it?
>

Yes, you are so right. I can't believe I didn't see this before.
 
> > +
> > +    memory_region_notify_iommu(pci_device_iommu_address_space(
> > +                               pbdev->pdev)->root, entry);
> > +
> > +    setcc(cpu, ZPCI_PCI_LS_OK);
> > +    return 0;
> > +}
> > +
> > +int kvm_sic_service_call(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    qemu_log_mask(LOG_UNIMP, "SIC missing\n");
> > +    return 0;
> > +}
> > +
> > +int kvm_pcistb_service_call(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    CPUS390XState *env = &cpu->env;
> > +    uint8_t r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
> > +    uint8_t r3 = run->s390_sieic.ipa & 0x000f;
> > +    PciStb *rp;
> > +    uint64_t gaddr;
> > +    uint64_t *uaddr, *pu;
> > +    hwaddr len;
> > +    S390PCIBusDevice *pbdev;
> > +    MemoryRegion *mr;
> > +    int i;
> > +
> > +    cpu_synchronize_state(CPU(cpu));
> > +
> > +    if (env->psw.mask & PSW_MASK_PSTATE) {
> > +        program_interrupt(env, PGM_PRIVILEGED, 6);
> > +        return 0;
> > +    }
> > +
> > +    rp = (PciStb *)&env->regs[r1];
> > +    if (rp->pcias > 5) {
> > +        DPRINTF("pcistb invalid space\n");
> > +        setcc(cpu, ZPCI_PCI_LS_ERR);
> > +        s390_set_status_code(env, r1, ZPCI_PCI_ST_INVAL_AS);
> > +        return 0;
> > +    }
> > +
> > +    switch (rp->len) {
> > +    case 16:
> > +    case 32:
> > +    case 64:
> > +    case 128:
> > +        break;
> > +    default:
> > +        program_interrupt(env, PGM_SPECIFICATION, 6);
> > +        return 0;
> > +    }
> > +
> > +    gaddr = get_base_disp_rsy(cpu, run);
> > +    len = rp->len;
> > +
> > +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> > +    if (!pbdev) {
> > +        DPRINTF("pcistb no pci dev fh 0x%x\n", rp->fh);
> > +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> > +        return 0;
> > +    }
> > +
> > +    uaddr = cpu_physical_memory_map(gaddr, &len, 0);
> > +    mr = pbdev->pdev->io_regions[rp->pcias].memory;
> > +    if (!memory_region_access_valid(mr, env->regs[r3], rp->len, true)) {
> > +        cpu_physical_memory_unmap(uaddr, len, 0, len);
> > +        program_interrupt(env, PGM_ADDRESSING, 6);
> > +        return 0;
> > +    }
> > +
> > +    pu = uaddr;
> > +    for (i = 0; i < rp->len / 8; i++) {
> > +        io_mem_write(mr, env->regs[r3] + i * 8, *pu, 8);
> 
> Please don't overoptimize and just use individual ldq_phys() operations
> here for each memory access. In general, try to avoid
> cpu_physical_memory_map().
> 
> > +        pu++;
> > +    }
> > +
> > +    cpu_physical_memory_unmap(uaddr, len, 0, len);
> > +    setcc(cpu, ZPCI_PCI_LS_OK);
> > +    return 0;
> > +}
> > +
> > +static int reg_irqs(CPUS390XState *env, S390PCIBusDevice *pbdev, ZpciFib fib)
> > +{
> > +    int ret;
> > +    S390FLICState *fs = s390_get_flic();
> > +    S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs);
> > +
> > +    ret = css_register_io_adapter(S390_PCIPT_ADAPTER,
> > +                                  FIB_DATA_ISC(fib.data), true, false,
> > +                                  &pbdev->routes.adapter.adapter_id);
> > +    assert(ret == 0);
> > +
> > +    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id, fib.aisb, true);
> > +    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id, fib.aibv, true);
> > +
> > +    pbdev->routes.adapter.summary_addr = fib.aisb;
> > +    pbdev->routes.adapter.summary_offset = FIB_DATA_AISBO(fib.data);
> > +    pbdev->routes.adapter.ind_addr = fib.aibv;
> > +    pbdev->routes.adapter.ind_offset = FIB_DATA_AIBVO(fib.data);
> > +
> > +    DPRINTF("reg_irqs adapter id %d\n", pbdev->routes.adapter.adapter_id);
> > +    return 0;
> > +}
> > +
> > +static int dereg_irqs(S390PCIBusDevice *pbdev)
> > +{
> > +    S390FLICState *fs = s390_get_flic();
> > +    S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs);
> > +
> > +    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id,
> > +                        pbdev->routes.adapter.ind_addr, false);
> > +
> > +    pbdev->routes.adapter.summary_addr = 0;
> > +    pbdev->routes.adapter.summary_offset = 0;
> > +    pbdev->routes.adapter.ind_addr = 0;
> > +    pbdev->routes.adapter.ind_offset = 0;
> > +
> > +    DPRINTF("dereg_irqs adapter id %d\n", pbdev->routes.adapter.adapter_id);
> > +    return 0;
> > +}
> > +
> > +int kvm_mpcifc_service_call(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    CPUS390XState *env = &cpu->env;
> > +    uint8_t r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
> > +    uint8_t oc;
> > +    uint32_t fh;
> > +    uint64_t fiba;
> > +    ZpciFib fib;
> > +    S390PCIBusDevice *pbdev;
> > +
> > +    cpu_synchronize_state(CPU(cpu));
> > +
> > +    if (env->psw.mask & PSW_MASK_PSTATE) {
> > +        program_interrupt(env, PGM_PRIVILEGED, 6);
> > +        return 0;
> > +    }
> > +
> > +    oc = env->regs[r1] & 0xff;
> > +    fh = env->regs[r1] >> 32;
> > +    fiba = get_base_disp_rxy(cpu, run);
> > +
> > +    if (fiba & 0x7) {
> > +        program_interrupt(env, PGM_SPECIFICATION, 6);
> > +        return 0;
> > +    }
> > +
> > +    pbdev = s390_pci_find_dev_by_fh(fh);
> > +    if (!pbdev) {
> > +        DPRINTF("mpcifc no pci dev fh 0x%x\n", fh);
> > +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> > +        return 0;
> > +    }
> > +
> > +    cpu_physical_memory_rw(fiba, (uint8_t *)&fib, sizeof(fib), 0);
> 
> I also find cpu_physical_memory_rw() pretty hard to read. Meanwhile,
> it's been deprecated by cpu_physical_memory_read() and
> cpu_physical_memory_write() which make the code more readable.
> 
> > +
> > +    switch (oc) {
> > +    case ZPCI_MOD_FC_REG_INT: {
> > +        pbdev->isc = FIB_DATA_ISC(fib.data);
> > +        reg_irqs(env, pbdev, fib);
> > +        break;
> > +    }
> > +    case ZPCI_MOD_FC_DEREG_INT:
> > +        dereg_irqs(pbdev);
> > +        break;
> > +    case ZPCI_MOD_FC_REG_IOAT:
> > +        if (fib.pba > fib.pal) {
> > +            program_interrupt(&cpu->env, PGM_OPERAND, 6);
> > +            return 0;
> > +        }
> > +        pbdev->g_iota = fib.iota;
> > +        break;
> > +    case ZPCI_MOD_FC_DEREG_IOAT:
> > +        break;
> > +    case ZPCI_MOD_FC_REREG_IOAT:
> > +        break;
> > +    case ZPCI_MOD_FC_RESET_ERROR:
> > +        break;
> > +    case ZPCI_MOD_FC_RESET_BLOCK:
> > +        break;
> > +    case ZPCI_MOD_FC_SET_MEASURE:
> > +        break;
> > +    default:
> > +        program_interrupt(&cpu->env, PGM_OPERAND, 6);
> > +        return 0;
> > +    }
> > +
> > +    setcc(cpu, ZPCI_PCI_LS_OK);
> > +    return 0;
> > +}
> > +
> > +int kvm_stpcifc_service_call(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    qemu_log_mask(LOG_UNIMP, "STPCIFC missing\n");
> > +    return 0;
> > +}
> > diff --git a/target-s390x/pci_ic.h b/target-s390x/pci_ic.h
> > new file mode 100644
> > index 0000000..0eb6c27
> > --- /dev/null
> > +++ b/target-s390x/pci_ic.h
> > @@ -0,0 +1,335 @@
> > +/*
> > + * s390 PCI intercept definitions
> > + *
> > + * Copyright 2014 IBM Corp.
> > + * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com>
> > + *            Hong Bo Li <lihbbj@cn.ibm.com>
> > + *            Yi Min Zhao <zyimin@cn.ibm.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or (at
> > + * your option) any later version. See the COPYING file in the top-level
> > + * directory.
> > + */
> > +
> > +#ifndef PCI_IC_S390X_H
> > +#define PCI_IC_S390X_H
> > +
> > +#include <sysemu/dma.h>
> > +
> > +/* CLP common request & response block size */
> > +#define CLP_BLK_SIZE 4096
> > +#define PCI_BAR_COUNT 6
> > +#define PCI_MAX_FUNCTIONS 4096
> > +
> > +typedef struct ClpReqHdr {
> > +    __uint16_t len;
> > +    __uint16_t cmd;
> > +} QEMU_PACKED ClpReqHdr;
> > +
> > +typedef struct ClpRspHdr {
> > +    __uint16_t len;
> > +    __uint16_t rsp;
> > +} QEMU_PACKED ClpRspHdr;
> > +
> > +/* CLP Response Codes */
> > +#define CLP_RC_OK         0x0010  /* Command request successfully */
> > +#define CLP_RC_CMD        0x0020  /* Command code not recognized */
> > +#define CLP_RC_PERM       0x0030  /* Command not authorized */
> > +#define CLP_RC_FMT        0x0040  /* Invalid command request format */
> > +#define CLP_RC_LEN        0x0050  /* Invalid command request length */
> > +#define CLP_RC_8K         0x0060  /* Command requires 8K LPCB */
> > +#define CLP_RC_RESNOT0    0x0070  /* Reserved field not zero */
> > +#define CLP_RC_NODATA     0x0080  /* No data available */
> > +#define CLP_RC_FC_UNKNOWN 0x0100  /* Function code not recognized */
> > +
> > +/*
> > + * Call Logical Processor - Command Codes
> > + */
> > +#define CLP_LIST_PCI            0x0002
> > +#define CLP_QUERY_PCI_FN        0x0003
> > +#define CLP_QUERY_PCI_FNGRP     0x0004
> > +#define CLP_SET_PCI_FN          0x0005
> > +
> > +/* PCI function handle list entry */
> > +typedef struct ClpFhListEntry {
> > +    __uint16_t device_id;
> > +    __uint16_t vendor_id;
> > +#define CLP_FHLIST_MASK_CONFIG 0x80000000
> > +    __uint32_t config;
> > +    __uint32_t fid;
> > +    __uint32_t fh;
> > +} QEMU_PACKED ClpFhListEntry;
> > +
> > +#define CLP_RC_SETPCIFN_FH      0x0101 /* Invalid PCI fn handle */
> > +#define CLP_RC_SETPCIFN_FHOP    0x0102 /* Fn handle not valid for op */
> > +#define CLP_RC_SETPCIFN_DMAAS   0x0103 /* Invalid DMA addr space */
> > +#define CLP_RC_SETPCIFN_RES     0x0104 /* Insufficient resources */
> > +#define CLP_RC_SETPCIFN_ALRDY   0x0105 /* Fn already in requested state */
> > +#define CLP_RC_SETPCIFN_ERR     0x0106 /* Fn in permanent error state */
> > +#define CLP_RC_SETPCIFN_RECPND  0x0107 /* Error recovery pending */
> > +#define CLP_RC_SETPCIFN_BUSY    0x0108 /* Fn busy */
> > +#define CLP_RC_LISTPCI_BADRT    0x010a /* Resume token not recognized */
> > +#define CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */
> > +
> > +/* request or response block header length */
> > +#define LIST_PCI_HDR_LEN 32
> > +
> > +/* Number of function handles fitting in response block */
> > +#define CLP_FH_LIST_NR_ENTRIES \
> > +    ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \
> > +        / sizeof(ClpFhListEntry))
> > +
> > +#define CLP_SET_ENABLE_PCI_FN  0 /* Yes, 0 enables it */
> > +#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */
> > +
> > +#define CLP_UTIL_STR_LEN 64
> > +
> > +#define CLP_MASK_FMT 0xf0000000
> > +
> > +/* List PCI functions request */
> > +typedef struct ClpReqListPci {
> > +    ClpReqHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +    __uint64_t resume_token;
> > +    __uint64_t reserved2;
> > +} QEMU_PACKED ClpReqListPci;
> > +
> > +/* List PCI functions response */
> > +typedef struct ClpRspListPci {
> > +    ClpRspHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +    __uint64_t resume_token;
> > +    __uint32_t mdd;
> > +    __uint16_t max_fn;
> > +    __uint8_t reserved2;
> > +    __uint8_t entry_size;
> > +    ClpFhListEntry fh_list[CLP_FH_LIST_NR_ENTRIES];
> > +} QEMU_PACKED ClpRspListPci;
> > +
> > +/* Query PCI function request */
> > +typedef struct ClpReqQueryPci {
> > +    ClpReqHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +    __uint32_t fh; /* function handle */
> > +    __uint32_t reserved2;
> > +    __uint64_t reserved3;
> > +} QEMU_PACKED ClpReqQueryPci;
> > +
> > +/* Query PCI function response */
> > +typedef struct ClpRspQueryPci {
> > +    ClpRspHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +    __uint16_t vfn; /* virtual fn number */
> > +#define CLP_RSP_QPCI_MASK_UTIL  0x100
> > +#define CLP_RSP_QPCI_MASK_PFGID 0xff
> > +    __uint16_t ug;
> > +    __uint32_t fid; /* pci function id */
> > +    __uint8_t bar_size[PCI_BAR_COUNT];
> > +    __uint16_t pchid;
> > +    __uint32_t bar[PCI_BAR_COUNT];
> > +    __uint64_t reserved2;
> > +    __uint64_t sdma; /* start dma as */
> > +    __uint64_t edma; /* end dma as */
> > +    __uint32_t reserved3[11];
> > +    __uint32_t uid;
> > +    __uint8_t util_str[CLP_UTIL_STR_LEN]; /* utility string */
> > +} QEMU_PACKED ClpRspQueryPci;
> > +
> > +/* Query PCI function group request */
> > +typedef struct ClpReqQueryPciGrp {
> > +    ClpReqHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +#define CLP_REQ_QPCIG_MASK_PFGID 0xff
> > +    __uint32_t g;
> > +    __uint32_t reserved2;
> > +    __uint64_t reserved3;
> > +} QEMU_PACKED ClpReqQueryPciGrp;
> > +
> > +/* Query PCI function group response */
> > +typedef struct ClpRspQueryPciGrp {
> > +    ClpRspHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +#define CLP_RSP_QPCIG_MASK_NOI 0xfff
> > +    __uint16_t i;
> > +    __uint8_t version;
> > +#define CLP_RSP_QPCIG_MASK_FRAME   0x2
> > +#define CLP_RSP_QPCIG_MASK_REFRESH 0x1
> > +    __uint8_t fr;
> > +    __uint16_t reserved2;
> > +    __uint16_t mui;
> > +    __uint64_t reserved3;
> > +    __uint64_t dasm; /* dma address space mask */
> > +    __uint64_t msia; /* MSI address */
> > +    __uint64_t reserved4;
> > +    __uint64_t reserved5;
> > +} QEMU_PACKED ClpRspQueryPciGrp;
> > +
> > +/* Set PCI function request */
> > +typedef struct ClpReqSetPci {
> > +    ClpReqHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +    __uint32_t fh; /* function handle */
> > +    __uint16_t reserved2;
> > +    __uint8_t oc; /* operation controls */
> > +    __uint8_t ndas; /* number of dma spaces */
> > +    __uint64_t reserved3;
> > +} QEMU_PACKED ClpReqSetPci;
> > +
> > +/* Set PCI function response */
> > +typedef struct ClpRspSetPci {
> > +    ClpRspHdr hdr;
> > +    __uint32_t fmt;
> > +    __uint64_t reserved1;
> > +    __uint32_t fh; /* function handle */
> > +    __uint32_t reserved3;
> > +    __uint64_t reserved4;
> > +} QEMU_PACKED ClpRspSetPci;
> > +
> > +typedef struct ClpReqRspListPci {
> > +    ClpReqListPci request;
> > +    ClpRspListPci response;
> > +} QEMU_PACKED ClpReqRspListPci;
> > +
> > +typedef struct ClpReqRspSetPci {
> > +    ClpReqSetPci request;
> > +    ClpRspSetPci response;
> > +} QEMU_PACKED ClpReqRspSetPci;
> > +
> > +typedef struct ClpReqRspQueryPci {
> > +    ClpReqQueryPci request;
> > +    ClpRspQueryPci response;
> > +} QEMU_PACKED ClpReqRspQueryPci;
> > +
> > +typedef struct ClpReqRspQueryPciGrp {
> > +    ClpReqQueryPciGrp request;
> > +    ClpRspQueryPciGrp response;
> > +} QEMU_PACKED ClpReqRspQueryPciGrp;
> > +
> > +typedef struct PciLgStg {
> > +    uint32_t fh;
> > +    uint8_t status;
> > +    uint8_t pcias;
> > +    uint8_t reserved;
> > +    uint8_t len;
> > +} QEMU_PACKED PciLgStg;
> > +
> > +typedef struct PciStb {
> > +    uint32_t fh;
> > +    uint8_t status;
> > +    uint8_t pcias;
> > +    uint8_t reserved;
> > +    uint8_t len;
> > +} QEMU_PACKED PciStb;
> > +
> > +/* Load/Store status codes */
> > +#define ZPCI_PCI_ST_FUNC_NOT_ENABLED        4
> > +#define ZPCI_PCI_ST_FUNC_IN_ERR             8
> > +#define ZPCI_PCI_ST_BLOCKED                 12
> > +#define ZPCI_PCI_ST_INSUF_RES               16
> > +#define ZPCI_PCI_ST_INVAL_AS                20
> > +#define ZPCI_PCI_ST_FUNC_ALREADY_ENABLED    24
> > +#define ZPCI_PCI_ST_DMA_AS_NOT_ENABLED      28
> > +#define ZPCI_PCI_ST_2ND_OP_IN_INV_AS        36
> > +#define ZPCI_PCI_ST_FUNC_NOT_AVAIL          40
> > +#define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE     44
> > +
> > +/* Load/Store return codes */
> > +#define ZPCI_PCI_LS_OK              0
> > +#define ZPCI_PCI_LS_ERR             1
> > +#define ZPCI_PCI_LS_BUSY            2
> > +#define ZPCI_PCI_LS_INVAL_HANDLE    3
> > +
> > +/* Modify PCI Function Controls */
> > +#define ZPCI_MOD_FC_REG_INT     2
> > +#define ZPCI_MOD_FC_DEREG_INT   3
> > +#define ZPCI_MOD_FC_REG_IOAT    4
> > +#define ZPCI_MOD_FC_DEREG_IOAT  5
> > +#define ZPCI_MOD_FC_REREG_IOAT  6
> > +#define ZPCI_MOD_FC_RESET_ERROR 7
> > +#define ZPCI_MOD_FC_RESET_BLOCK 9
> > +#define ZPCI_MOD_FC_SET_MEASURE 10
> > +
> > +/* FIB function controls */
> > +#define ZPCI_FIB_FC_ENABLED     0x80
> > +#define ZPCI_FIB_FC_ERROR       0x40
> > +#define ZPCI_FIB_FC_LS_BLOCKED  0x20
> > +#define ZPCI_FIB_FC_DMAAS_REG   0x10
> > +
> > +/* FIB function controls */
> > +#define ZPCI_FIB_FC_ENABLED     0x80
> > +#define ZPCI_FIB_FC_ERROR       0x40
> > +#define ZPCI_FIB_FC_LS_BLOCKED  0x20
> > +#define ZPCI_FIB_FC_DMAAS_REG   0x10
> > +
> > +/* Function Information Block */
> > +typedef struct ZpciFib {
> > +    __uint8_t fmt;   /* format */
> > +    __uint8_t reserved1[7];
> > +    __uint8_t fc;                  /* function controls */
> > +    __uint8_t reserved2;
> > +    __uint16_t reserved3;
> > +    __uint32_t reserved4;
> > +    __uint64_t pba;                /* PCI base address */
> > +    __uint64_t pal;                /* PCI address limit */
> > +    __uint64_t iota;               /* I/O Translation Anchor */
> > +#define FIB_DATA_ISC(x)    (((x) >> 28) & 0x7)
> > +#define FIB_DATA_NOI(x)    (((x) >> 16) & 0xfff)
> > +#define FIB_DATA_AIBVO(x) (((x) >> 8) & 0x3f)
> > +#define FIB_DATA_SUM(x)    (((x) >> 7) & 0x1)
> > +#define FIB_DATA_AISBO(x)  ((x) & 0x3f)
> > +    __uint32_t data;
> > +    __uint32_t reserved5;
> > +    __uint64_t aibv;               /* Adapter int bit vector address */
> > +    __uint64_t aisb;               /* Adapter int summary bit address */
> > +    __uint64_t fmb_addr;           /* Function measurement address and key */
> > +    __uint32_t reserved6;
> > +    __uint32_t gd;
> > +} QEMU_PACKED ZpciFib;
> > +
> > +static inline uint64_t get_base_disp_rxy(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    CPUS390XState *env = &cpu->env;
> > +    uint32_t x2 = (run->s390_sieic.ipa & 0x000f);
> > +    uint32_t base2 = run->s390_sieic.ipb >> 28;
> > +    uint32_t disp2 = ((run->s390_sieic.ipb & 0x0fff0000) >> 16) +
> > +                     ((run->s390_sieic.ipb & 0xff00) << 4);
> > +
> > +    if (disp2 & 0x80000) {
> > +        disp2 += 0xfff00000;
> > +    }
> > +
> > +    return (base2 ? env->regs[base2] : 0) +
> > +           (x2 ? env->regs[x2] : 0) + (long)(int)disp2;
> > +}
> > +
> > +static inline uint64_t get_base_disp_rsy(S390CPU *cpu, struct kvm_run *run)
> > +{
> > +    CPUS390XState *env = &cpu->env;
> > +    uint32_t base2 = run->s390_sieic.ipb >> 28;
> > +    uint32_t disp2 = ((run->s390_sieic.ipb & 0x0fff0000) >> 16) +
> > +                     ((run->s390_sieic.ipb & 0xff00) << 4);
> > +
> > +    if (disp2 & 0x80000) {
> > +        disp2 += 0xfff00000;
> > +    }
> > +
> > +    return (base2 ? env->regs[base2] : 0) + (long)(int)disp2;
> > +}
> 
> Same comment as in the previous patch here, please try to avoid putting
> code into a header file.
> 
> > +
> > +int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run);
> > +int kvm_rpcit_service_call(S390CPU *cpu, struct kvm_run *run);
> > +int kvm_sic_service_call(S390CPU *cpu, struct kvm_run *run);
> > +int kvm_pcistb_service_call(S390CPU *cpu, struct kvm_run *run);
> > +int kvm_mpcifc_service_call(S390CPU *cpu, struct kvm_run *run);
> > +int kvm_pcistg_service_call(S390CPU *cpu, struct kvm_run *run);
> > +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run);
> > +int kvm_stpcifc_service_call(S390CPU *cpu, struct kvm_run *run);
> 
> Hrm. Maybe we could add some registration hook similar to spapr's hcall
> or rtas callback registration that would allow us to encapsulate this a
> bit better?
> 
> Then you'd only have to spawn a PHB device which could register for
> these service calls.
> 
> 
> Alex
> 
>
Alexander Graf Nov. 11, 2014, 12:16 p.m. UTC | #3
On 11.11.14 13:10, Frank Blaschka wrote:
> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>>
>>
>> On 10.11.14 15:20, Frank Blaschka wrote:
>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>
>>> This patch implements the s390 pci instructions in qemu. It allows
>>> to access and drive pci devices attached to the s390 pci bus.
>>> Because of platform constrains devices using IO BARs are not
>>> supported. Also a device has to support MSI/MSI-X to run on s390.
>>>
>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
>>> ---
>>>  target-s390x/Makefile.objs |   2 +-
>>>  target-s390x/kvm.c         |  52 ++++
>>>  target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>>>  target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>>>  4 files changed, 1141 insertions(+), 1 deletion(-)
>>>  create mode 100644 target-s390x/pci_ic.c
>>>  create mode 100644 target-s390x/pci_ic.h
>>>

[...]

>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
>>> +{
>>> +    CPUS390XState *env = &cpu->env;
>>> +    S390PCIBusDevice *pbdev;
>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>>> +    PciLgStg *rp;
>>> +    uint64_t offset;
>>> +    uint64_t data;
>>> +    uint8_t len;
>>> +
>>> +    cpu_synchronize_state(CPU(cpu));
>>> +
>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
>>> +        return 0;
>>> +    }
>>> +
>>> +    if (r2 & 0x1) {
>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
>>> +        return 0;
>>> +    }
>>> +
>>> +    rp = (PciLgStg *)&env->regs[r2];
>>> +    offset = env->regs[r2 + 1];
>>> +
>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
>>> +    if (!pbdev) {
>>> +        DPRINTF("pcilg no pci dev\n");
>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
>>> +        return 0;
>>> +    }
>>> +
>>> +    len = rp->len & 0xF;
>>> +    if (rp->pcias < 6) {
>>> +        if ((8 - (offset & 0x7)) < len) {
>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>> +            return 0;
>>> +        }
>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
>>> +        io_mem_read(mr, offset, &data, len);
>>> +    } else if (rp->pcias == 15) {
>>> +        if ((4 - (offset & 0x3)) < len) {
>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>> +            return 0;
>>> +        }
>>> +        data =  pci_host_config_read_common(
>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
>>> +
>>> +        switch (len) {
>>> +        case 1:
>>> +            break;
>>> +        case 2:
>>> +            data = cpu_to_le16(data);
>>> +            break;
>>> +        case 4:
>>> +            data = cpu_to_le32(data);
>>> +            break;
>>> +        case 8:
>>> +            data = cpu_to_le64(data);
>>> +            break;
>>
>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
>> and LE. So if you're running this on an LE host, you won't swap the
>> value and get a broken result.
>>
>> If you know that the value is always swapped, use bswapxx().
>>
> 
> Actually the code is right and required for a big endian host :-)
> pcilg/pcistg provide access to the PCI config space which is defined
> as PCI byte order (little endian). Since pci_host_config_read_common does
> already a le to cpu conversion we have to convert back to PCI byte order.
> Doing an unconditional swap would be a bug on a little endian host.

Why would it be a bug? The value you end up writing is the contents of a
register and thus doesn't have endianness. So if QEMU were an LE process,
the value of data would be the same as on a BE QEMU before your swab.
After the swab, it would be bswap'ed on BE, but not on LE. So LE hosts break.


Alex
Peter Maydell Nov. 11, 2014, 12:17 p.m. UTC | #4
On 11 November 2014 12:10, Frank Blaschka <blaschka@linux.vnet.ibm.com> wrote:
> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>> > +static uint8_t barsize(uint64_t size)
>> > +{
>> > +    uint64_t mask = 1;
>> > +    int i;
>> > +
>> > +    if (!size) {
>> > +        return 0;
>> > +    }
>> > +
>> > +    for (i = 0; i < 64; i++) {
>> > +        if (size & mask) {
>> > +            break;
>> > +        }
>> > +        mask = (mask << 1);
>> > +    }
>> > +
>> > +    return i;
>> > +}
>>
>> Isn't there an existing helper for this in the PCI layer?
>>
>
> Did not find one, this function is used to fill a s390 specific len
> in an instruction intercept (architecture specific encoding of the len).

If you do need to implement this here then you should probably
be using ctz64(). I think what you have here is equivalent to

    return size ? ctz64(size) : 0;

but you should check that.

thanks
-- PMM
Frank Blaschka Nov. 11, 2014, 12:39 p.m. UTC | #5
On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
> 
> 
> On 11.11.14 13:10, Frank Blaschka wrote:
> > On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
> >>
> >>
> >> On 10.11.14 15:20, Frank Blaschka wrote:
> >>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>>
> >>> This patch implements the s390 pci instructions in qemu. It allows
> >>> to access and drive pci devices attached to the s390 pci bus.
> >>> Because of platform constrains devices using IO BARs are not
> >>> supported. Also a device has to support MSI/MSI-X to run on s390.
> >>>
> >>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>> ---
> >>>  target-s390x/Makefile.objs |   2 +-
> >>>  target-s390x/kvm.c         |  52 ++++
> >>>  target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
> >>>  target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
> >>>  4 files changed, 1141 insertions(+), 1 deletion(-)
> >>>  create mode 100644 target-s390x/pci_ic.c
> >>>  create mode 100644 target-s390x/pci_ic.h
> >>>
> 
> [...]
> 
> >>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
> >>> +{
> >>> +    CPUS390XState *env = &cpu->env;
> >>> +    S390PCIBusDevice *pbdev;
> >>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> >>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> >>> +    PciLgStg *rp;
> >>> +    uint64_t offset;
> >>> +    uint64_t data;
> >>> +    uint8_t len;
> >>> +
> >>> +    cpu_synchronize_state(CPU(cpu));
> >>> +
> >>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> >>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> >>> +        return 0;
> >>> +    }
> >>> +
> >>> +    if (r2 & 0x1) {
> >>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
> >>> +        return 0;
> >>> +    }
> >>> +
> >>> +    rp = (PciLgStg *)&env->regs[r2];
> >>> +    offset = env->regs[r2 + 1];
> >>> +
> >>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> >>> +    if (!pbdev) {
> >>> +        DPRINTF("pcilg no pci dev\n");
> >>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> >>> +        return 0;
> >>> +    }
> >>> +
> >>> +    len = rp->len & 0xF;
> >>> +    if (rp->pcias < 6) {
> >>> +        if ((8 - (offset & 0x7)) < len) {
> >>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>> +            return 0;
> >>> +        }
> >>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
> >>> +        io_mem_read(mr, offset, &data, len);
> >>> +    } else if (rp->pcias == 15) {
> >>> +        if ((4 - (offset & 0x3)) < len) {
> >>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>> +            return 0;
> >>> +        }
> >>> +        data =  pci_host_config_read_common(
> >>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
> >>> +
> >>> +        switch (len) {
> >>> +        case 1:
> >>> +            break;
> >>> +        case 2:
> >>> +            data = cpu_to_le16(data);
> >>> +            break;
> >>> +        case 4:
> >>> +            data = cpu_to_le32(data);
> >>> +            break;
> >>> +        case 8:
> >>> +            data = cpu_to_le64(data);
> >>> +            break;
> >>
> >> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
> >> and LE. So if you're running this on an LE host, you won't swap the
> >> value and get a broken result.
> >>
> >> If you know that the value is always swapped, use bswapxx().
> >>
> > 
> > Actually the code is right and required for a big endian host :-)
> > pcilg/pcistg provide access to the PCI config space which is defined
> > as PCI byte order (little endian). Since pci_host_config_read_common does
> > already a le to cpu conversion we have to convert back to PCI byte order.
> > Doing an unconditional swap would be a bug on a little endian host.
> 
> Why would it be a bug? The value you end up writing is contents of a
> register and thus doesn't have endianness. So if QEMU was an LE process,

No, the s390 guest executing pcilg instruction expects to receive config space data
in PCI byte order.

> the value of data would be identical as on a BE QEMU before your swab.
> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
>

Again, on a BE host we do the swap because pci_host_config_read_common reads
the value and byte-swaps it, but we need PCI byte order here, not BE.

On an LE host pci_host_config_read_common does not do a byte swap, so we do not
have to convert back to PCI byte order.
 
> 
> Alex
>
Frank Blaschka Nov. 11, 2014, 12:40 p.m. UTC | #6
On Tue, Nov 11, 2014 at 12:17:17PM +0000, Peter Maydell wrote:
> On 11 November 2014 12:10, Frank Blaschka <blaschka@linux.vnet.ibm.com> wrote:
> > On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
> >> > +static uint8_t barsize(uint64_t size)
> >> > +{
> >> > +    uint64_t mask = 1;
> >> > +    int i;
> >> > +
> >> > +    if (!size) {
> >> > +        return 0;
> >> > +    }
> >> > +
> >> > +    for (i = 0; i < 64; i++) {
> >> > +        if (size & mask) {
> >> > +            break;
> >> > +        }
> >> > +        mask = (mask << 1);
> >> > +    }
> >> > +
> >> > +    return i;
> >> > +}
> >>
> >> Isn't there an existing helper for this in the PCI layer?
> >>
> >
> > Did not find one, this function is used to fill a s390 specific len
> > in an instruction intercept (architecture specific encoding of the len).
> 
> If you do need to implement this here then you should probably
> be using ctz64(). I think what you have here is equivalent to
> 
>     return size ? ctz64(size) : 0;
> 
> but you should check that.

will do thx!

> 
> thanks
> -- PMM
>
Alexander Graf Nov. 11, 2014, 12:51 p.m. UTC | #7
> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
> 
>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
>> 
>> 
>>> On 11.11.14 13:10, Frank Blaschka wrote:
>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>>>> 
>>>> 
>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>> 
>>>>> This patch implements the s390 pci instructions in qemu. It allows
>>>>> to access and drive pci devices attached to the s390 pci bus.
>>>>> Because of platform constrains devices using IO BARs are not
>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
>>>>> 
>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>> ---
>>>>> target-s390x/Makefile.objs |   2 +-
>>>>> target-s390x/kvm.c         |  52 ++++
>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
>>>>> create mode 100644 target-s390x/pci_ic.c
>>>>> create mode 100644 target-s390x/pci_ic.h
>>>>> 
>> 
>> [...]
>> 
>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
>>>>> +{
>>>>> +    CPUS390XState *env = &cpu->env;
>>>>> +    S390PCIBusDevice *pbdev;
>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>>>>> +    PciLgStg *rp;
>>>>> +    uint64_t offset;
>>>>> +    uint64_t data;
>>>>> +    uint8_t len;
>>>>> +
>>>>> +    cpu_synchronize_state(CPU(cpu));
>>>>> +
>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
>>>>> +        return 0;
>>>>> +    }
>>>>> +
>>>>> +    if (r2 & 0x1) {
>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
>>>>> +        return 0;
>>>>> +    }
>>>>> +
>>>>> +    rp = (PciLgStg *)&env->regs[r2];
>>>>> +    offset = env->regs[r2 + 1];
>>>>> +
>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
>>>>> +    if (!pbdev) {
>>>>> +        DPRINTF("pcilg no pci dev\n");
>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
>>>>> +        return 0;
>>>>> +    }
>>>>> +
>>>>> +    len = rp->len & 0xF;
>>>>> +    if (rp->pcias < 6) {
>>>>> +        if ((8 - (offset & 0x7)) < len) {
>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>> +            return 0;
>>>>> +        }
>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
>>>>> +        io_mem_read(mr, offset, &data, len);
>>>>> +    } else if (rp->pcias == 15) {
>>>>> +        if ((4 - (offset & 0x3)) < len) {
>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>> +            return 0;
>>>>> +        }
>>>>> +        data =  pci_host_config_read_common(
>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
>>>>> +
>>>>> +        switch (len) {
>>>>> +        case 1:
>>>>> +            break;
>>>>> +        case 2:
>>>>> +            data = cpu_to_le16(data);
>>>>> +            break;
>>>>> +        case 4:
>>>>> +            data = cpu_to_le32(data);
>>>>> +            break;
>>>>> +        case 8:
>>>>> +            data = cpu_to_le64(data);
>>>>> +            break;
>>>> 
>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
>>>> and LE. So if you're running this on an LE host, you won't swap the
>>>> value and get a broken result.
>>>> 
>>>> If you know that the value is always swapped, use bswapxx().
>>>> 
>>> 
>>> Actually the code is right and required for a big endian host :-)
>>> pcilg/pcistg provide access to the PCI config space which is defined
>>> as PCI byte order (little endian). Since pci_host_config_read_common does
>>> already a le to cpu conversion we have to convert back to PCI byte order.
>>> Doing an unconditional swap would be a bug on a little endian host.
>> 
>> Why would it be a bug? The value you end up writing is contents of a
>> register and thus doesn't have endianness. So if QEMU was an LE process,
> 
> No, the s390 guest executing pcilg instruction expects to receive config space data
> in PCI byte order.
> 
>> the value of data would be identical as on a BE QEMU before your swab.
>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
>> 
> 
> Again on BE endian host we do the swap because of pci_host_config_read_common does
> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
> 
> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
> convert back to PCI byte order.

We always maintain the PCI config space in LE byte order in memory; that's why there is a bswap in its read function. The return value of the read function, however, is always the same, regardless of whether the host is LE or BE. If I do a read of size 4, I will always get 0x1 returned, not 0x01000000.

So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.


Alex
Frank Blaschka Nov. 11, 2014, 2:08 p.m. UTC | #8
On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
> 
> 
> 
> > Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
> > 
> >> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
> >> 
> >> 
> >>> On 11.11.14 13:10, Frank Blaschka wrote:
> >>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
> >>>> 
> >>>> 
> >>>>> On 10.11.14 15:20, Frank Blaschka wrote:
> >>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>>>> 
> >>>>> This patch implements the s390 pci instructions in qemu. It allows
> >>>>> to access and drive pci devices attached to the s390 pci bus.
> >>>>> Because of platform constrains devices using IO BARs are not
> >>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
> >>>>> 
> >>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>>>> ---
> >>>>> target-s390x/Makefile.objs |   2 +-
> >>>>> target-s390x/kvm.c         |  52 ++++
> >>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
> >>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
> >>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
> >>>>> create mode 100644 target-s390x/pci_ic.c
> >>>>> create mode 100644 target-s390x/pci_ic.h
> >>>>> 
> >> 
> >> [...]
> >> 
> >>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
> >>>>> +{
> >>>>> +    CPUS390XState *env = &cpu->env;
> >>>>> +    S390PCIBusDevice *pbdev;
> >>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> >>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> >>>>> +    PciLgStg *rp;
> >>>>> +    uint64_t offset;
> >>>>> +    uint64_t data;
> >>>>> +    uint8_t len;
> >>>>> +
> >>>>> +    cpu_synchronize_state(CPU(cpu));
> >>>>> +
> >>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> >>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> >>>>> +        return 0;
> >>>>> +    }
> >>>>> +
> >>>>> +    if (r2 & 0x1) {
> >>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
> >>>>> +        return 0;
> >>>>> +    }
> >>>>> +
> >>>>> +    rp = (PciLgStg *)&env->regs[r2];
> >>>>> +    offset = env->regs[r2 + 1];
> >>>>> +
> >>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> >>>>> +    if (!pbdev) {
> >>>>> +        DPRINTF("pcilg no pci dev\n");
> >>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> >>>>> +        return 0;
> >>>>> +    }
> >>>>> +
> >>>>> +    len = rp->len & 0xF;
> >>>>> +    if (rp->pcias < 6) {
> >>>>> +        if ((8 - (offset & 0x7)) < len) {
> >>>>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>>>> +            return 0;
> >>>>> +        }
> >>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
> >>>>> +        io_mem_read(mr, offset, &data, len);
> >>>>> +    } else if (rp->pcias == 15) {
> >>>>> +        if ((4 - (offset & 0x3)) < len) {
> >>>>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>>>> +            return 0;
> >>>>> +        }
> >>>>> +        data =  pci_host_config_read_common(
> >>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
> >>>>> +
> >>>>> +        switch (len) {
> >>>>> +        case 1:
> >>>>> +            break;
> >>>>> +        case 2:
> >>>>> +            data = cpu_to_le16(data);
> >>>>> +            break;
> >>>>> +        case 4:
> >>>>> +            data = cpu_to_le32(data);
> >>>>> +            break;
> >>>>> +        case 8:
> >>>>> +            data = cpu_to_le64(data);
> >>>>> +            break;
> >>>> 
> >>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
> >>>> and LE. So if you're running this on an LE host, you won't swap the
> >>>> value and get a broken result.
> >>>> 
> >>>> If you know that the value is always swapped, use bswapxx().
> >>>> 
> >>> 
> >>> Actually the code is right and required for a big endian host :-)
> >>> pcilg/pcistg provide access to the PCI config space which is defined
> >>> as PCI byte order (little endian). Since pci_host_config_read_common does
> >>> already a le to cpu conversion we have to convert back to PCI byte order.
> >>> Doing an unconditional swap would be a bug on a little endian host.
> >> 
> >> Why would it be a bug? The value you end up writing is contents of a
> >> register and thus doesn't have endianness. So if QEMU was an LE process,
> > 
> > No, the s390 guest executing pcilg instruction expects to receive config space data
> > in PCI byte order.
> > 
> >> the value of data would be identical as on a BE QEMU before your swab.
> >> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
> >> 
> > 
> > Again on BE endian host we do the swap because of pci_host_config_read_common does
> > read the value and do a byte swap for that value, but we need PCI byte order not BE here.
> > 
> > On LE host pci_host_config_read_common does not do a byte swap so we do not have to
> > convert back to PCI byte order.
> 
> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
> 
> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
> 
So you are telling me that pci_host_config_read_common does not end up in pci_default_read_config?

uint32_t pci_default_read_config(PCIDevice *d,
                                 uint32_t address, int len)
{
    uint32_t val = 0;

    memcpy(&val, d->config + address, len);
    return le32_to_cpu(val);
}

What did I miss?

> 
> Alex
> 
>
Alexander Graf Nov. 11, 2014, 3:24 p.m. UTC | #9
On 11.11.14 15:08, Frank Blaschka wrote:
> On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
>>
>>
>>
>>> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
>>>
>>>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
>>>>
>>>>
>>>>> On 11.11.14 13:10, Frank Blaschka wrote:
>>>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>>>>>>
>>>>>>
>>>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
>>>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>
>>>>>>> This patch implements the s390 pci instructions in qemu. It allows
>>>>>>> to access and drive pci devices attached to the s390 pci bus.
>>>>>>> Because of platform constrains devices using IO BARs are not
>>>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
>>>>>>>
>>>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>> ---
>>>>>>> target-s390x/Makefile.objs |   2 +-
>>>>>>> target-s390x/kvm.c         |  52 ++++
>>>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>>>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
>>>>>>> create mode 100644 target-s390x/pci_ic.c
>>>>>>> create mode 100644 target-s390x/pci_ic.h
>>>>>>>
>>>>
>>>> [...]
>>>>
>>>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
>>>>>>> +{
>>>>>>> +    CPUS390XState *env = &cpu->env;
>>>>>>> +    S390PCIBusDevice *pbdev;
>>>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
>>>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>>>>>>> +    PciLgStg *rp;
>>>>>>> +    uint64_t offset;
>>>>>>> +    uint64_t data;
>>>>>>> +    uint8_t len;
>>>>>>> +
>>>>>>> +    cpu_synchronize_state(CPU(cpu));
>>>>>>> +
>>>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
>>>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    if (r2 & 0x1) {
>>>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    rp = (PciLgStg *)&env->regs[r2];
>>>>>>> +    offset = env->regs[r2 + 1];
>>>>>>> +
>>>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
>>>>>>> +    if (!pbdev) {
>>>>>>> +        DPRINTF("pcilg no pci dev\n");
>>>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    len = rp->len & 0xF;
>>>>>>> +    if (rp->pcias < 6) {
>>>>>>> +        if ((8 - (offset & 0x7)) < len) {
>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>> +            return 0;
>>>>>>> +        }
>>>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
>>>>>>> +        io_mem_read(mr, offset, &data, len);
>>>>>>> +    } else if (rp->pcias == 15) {
>>>>>>> +        if ((4 - (offset & 0x3)) < len) {
>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>> +            return 0;
>>>>>>> +        }
>>>>>>> +        data =  pci_host_config_read_common(
>>>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
>>>>>>> +
>>>>>>> +        switch (len) {
>>>>>>> +        case 1:
>>>>>>> +            break;
>>>>>>> +        case 2:
>>>>>>> +            data = cpu_to_le16(data);
>>>>>>> +            break;
>>>>>>> +        case 4:
>>>>>>> +            data = cpu_to_le32(data);
>>>>>>> +            break;
>>>>>>> +        case 8:
>>>>>>> +            data = cpu_to_le64(data);
>>>>>>> +            break;
>>>>>>
>>>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
>>>>>> and LE. So if you're running this on an LE host, you won't swap the
>>>>>> value and get a broken result.
>>>>>>
>>>>>> If you know that the value is always swapped, use bswapxx().
>>>>>>
>>>>>
>>>>> Actually the code is right and required for a big endian host :-)
>>>>> pcilg/pcistg provide access to the PCI config space which is defined
>>>>> as PCI byte order (little endian). Since pci_host_config_read_common does
>>>>> already a le to cpu conversion we have to convert back to PCI byte order.
>>>>> Doing an unconditional swap would be a bug on a little endian host.
>>>>
>>>> Why would it be a bug? The value you end up writing is contents of a
>>>> register and thus doesn't have endianness. So if QEMU was an LE process,
>>>
>>> No, the s390 guest executing pcilg instruction expects to receive config space data
>>> in PCI byte order.
>>>
>>>> the value of data would be identical as on a BE QEMU before your swab.
>>>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
>>>>
>>>
>>> Again on BE endian host we do the swap because of pci_host_config_read_common does
>>> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
>>>
>>> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
>>> convert back to PCI byte order.
>>
>> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
>>
>> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
>>
> so you tell me pci_host_config_read_common does not end up in pci_default_read_config?
> 
> uint32_t pci_default_read_config(PCIDevice *d,
>                                  uint32_t address, int len)
> {
>     uint32_t val = 0;
> 
>     memcpy(&val, d->config + address, len);
>     return le32_to_cpu(val);
> }
> 
> What did I miss?

That's exactly where you end up - and the le32_to_cpu() is there to convert
from the PCI config space backing storage to a native number.

Imagine you write 0x12345678 at offset 0. Because PCI config space is
defined to be LE, in the PCI config space memory this gets stored as

78 56 34 12

The reason we do the internal storage of the config space that way is
that it's (in some PCI implementations) legal to access with single byte
granularities. So you could do a pci_config_read(offset = 1) which
should return 0x56.

However, that means we completely nullify any effect of host endianness
in the PCI config layer already. So if you do pci_config_write(offset =
0, size = 4, value = 0x12345678), the contents of d->config will always
be identical, regardless of host endianness. The same holds true for
pci_config_read(offset = 0, size = 4). It will always return 0x12345678.

In your code, you swab that value again. I assume there's a reason
you're swapping it and that it's the way the architecture expects it
(would you mind pointing me to the relevant spec so I can verify?). But if
the architecture expects it, then it expects it regardless of host
endianness. The contents of regs[r1] should always be 0x78563412, no
matter whether we're in an LE or a BE environment.

Does that make sense now?


Alex
Frank Blaschka Nov. 12, 2014, 8:49 a.m. UTC | #10
On Tue, Nov 11, 2014 at 04:24:24PM +0100, Alexander Graf wrote:
> 
> 
> On 11.11.14 15:08, Frank Blaschka wrote:
> > On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
> >>
> >>
> >>
> >>> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
> >>>
> >>>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
> >>>>
> >>>>
> >>>>> On 11.11.14 13:10, Frank Blaschka wrote:
> >>>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
> >>>>>>
> >>>>>>
> >>>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
> >>>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>>>>>>
> >>>>>>> This patch implements the s390 pci instructions in qemu. It allows
> >>>>>>> to access and drive pci devices attached to the s390 pci bus.
> >>>>>>> Because of platform constrains devices using IO BARs are not
> >>>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
> >>>>>>>
> >>>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>>>>>> ---
> >>>>>>> target-s390x/Makefile.objs |   2 +-
> >>>>>>> target-s390x/kvm.c         |  52 ++++
> >>>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
> >>>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
> >>>>>>> create mode 100644 target-s390x/pci_ic.c
> >>>>>>> create mode 100644 target-s390x/pci_ic.h
> >>>>>>>
> >>>>
> >>>> [...]
> >>>>
> >>>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
> >>>>>>> +{
> >>>>>>> +    CPUS390XState *env = &cpu->env;
> >>>>>>> +    S390PCIBusDevice *pbdev;
> >>>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> >>>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> >>>>>>> +    PciLgStg *rp;
> >>>>>>> +    uint64_t offset;
> >>>>>>> +    uint64_t data;
> >>>>>>> +    uint8_t len;
> >>>>>>> +
> >>>>>>> +    cpu_synchronize_state(CPU(cpu));
> >>>>>>> +
> >>>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> >>>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> >>>>>>> +        return 0;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    if (r2 & 0x1) {
> >>>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
> >>>>>>> +        return 0;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    rp = (PciLgStg *)&env->regs[r2];
> >>>>>>> +    offset = env->regs[r2 + 1];
> >>>>>>> +
> >>>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> >>>>>>> +    if (!pbdev) {
> >>>>>>> +        DPRINTF("pcilg no pci dev\n");
> >>>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> >>>>>>> +        return 0;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    len = rp->len & 0xF;
> >>>>>>> +    if (rp->pcias < 6) {
> >>>>>>> +        if ((8 - (offset & 0x7)) < len) {
> >>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>>>>>> +            return 0;
> >>>>>>> +        }
> >>>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
> >>>>>>> +        io_mem_read(mr, offset, &data, len);
> >>>>>>> +    } else if (rp->pcias == 15) {
> >>>>>>> +        if ((4 - (offset & 0x3)) < len) {
> >>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>>>>>> +            return 0;
> >>>>>>> +        }
> >>>>>>> +        data =  pci_host_config_read_common(
> >>>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
> >>>>>>> +
> >>>>>>> +        switch (len) {
> >>>>>>> +        case 1:
> >>>>>>> +            break;
> >>>>>>> +        case 2:
> >>>>>>> +            data = cpu_to_le16(data);
> >>>>>>> +            break;
> >>>>>>> +        case 4:
> >>>>>>> +            data = cpu_to_le32(data);
> >>>>>>> +            break;
> >>>>>>> +        case 8:
> >>>>>>> +            data = cpu_to_le64(data);
> >>>>>>> +            break;
> >>>>>>
> >>>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
> >>>>>> and LE. So if you're running this on an LE host, you won't swap the
> >>>>>> value and get a broken result.
> >>>>>>
> >>>>>> If you know that the value is always swapped, use bswapxx().
> >>>>>>
> >>>>>
> >>>>> Actually the code is right and required for a big endian host :-)
> >>>>> pcilg/pcistg provide access to the PCI config space which is defined
> >>>>> as PCI byte order (little endian). Since pci_host_config_read_common does
> >>>>> already a le to cpu conversion we have to convert back to PCI byte order.
> >>>>> Doing an unconditional swap would be a bug on a little endian host.
> >>>>
> >>>> Why would it be a bug? The value you end up writing is contents of a
> >>>> register and thus doesn't have endianness. So if QEMU was an LE process,
> >>>
> >>> No, the s390 guest executing pcilg instruction expects to receive config space data
> >>> in PCI byte order.
> >>>
> >>>> the value of data would be identical as on a BE QEMU before your swab.
> >>>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
> >>>>
> >>>
> >>> Again on BE endian host we do the swap because of pci_host_config_read_common does
> >>> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
> >>>
> >>> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
> >>> convert back to PCI byte order.
> >>
> >> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
> >>
> >> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
> >>
> > so you tell me pci_host_config_read_common does not end up in pci_default_read_config?
> > 
> > uint32_t pci_default_read_config(PCIDevice *d,
> >                                  uint32_t address, int len)
> > {
> >     uint32_t val = 0;
> > 
> >     memcpy(&val, d->config + address, len);
> >     return le32_to_cpu(val);
> > }
> > 
> > What did I miss?
> 
> That's exactly where you end up in - and it's there to convert from the
> PCI config space backing storage to a native number.
> 
> Imagine you write 0x12345678 at offset 0. Because PCI config space is
> defined to be LE, in the PCI config space memory this gets stored as
> 
> 78 56 34 12
> 
> The reason we do the internal storage of the config space that way is
> that it's (in some PCI implementations) legal to access with single byte
> granularities. So you could do a pci_config_read(offset = 1) which
> should return 0x56.
> 
> However, that means we completely nullify any effect of host endianness
> in the PCI config layer already. So if you do pci_config_write(offset =
> 0, size = 4, value = 0x12345678), the contents of d->config will always
> be identical, regardless of host endianness. The same holds true for
> pci_config_read(offset = 0, size = 4). It will always return 0x12345678.
>

I understood this from the beginning and I completely agree to this.
 
> In your code, you swab that value again. I assume there's a reason
> you're swapping it and that it's the way the architecture expects it

Yes, s390 pcilg architecture states:
Data in the PCI configuration space are treated
as being in little-endian byte ordering

> (mind to point me to the respective spec so I can verify?). But if the
> architecture expects it, then it expects it regardless of host
> endianness. The contents of regs[r1] should always be 0x78563412, no
> matter whether we're in an LE or a BE environment.
> 
> Does that make sense now?
> 
Absolutely. Let's go through an example for qemu running on a BE and on an LE host:

byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
BE            0x78563412             0x12345678                0x78563412
LE            0x78563412             0x78563412                0x78563412

So what is the problem with my code? Adding an unconditional byte swap instead of
cpu_to_le in pcilg would break the architecture for pcilg if qemu is running on an LE
platform.

> 
> Alex
>
Alexander Graf Nov. 12, 2014, 9:08 a.m. UTC | #11
On 12.11.14 09:49, Frank Blaschka wrote:
> On Tue, Nov 11, 2014 at 04:24:24PM +0100, Alexander Graf wrote:
>>
>>
>> On 11.11.14 15:08, Frank Blaschka wrote:
>>> On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
>>>>
>>>>
>>>>
>>>>> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
>>>>>
>>>>>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
>>>>>>
>>>>>>
>>>>>>> On 11.11.14 13:10, Frank Blaschka wrote:
>>>>>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
>>>>>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>>
>>>>>>>>> This patch implements the s390 pci instructions in qemu. It allows
>>>>>>>>> to access and drive pci devices attached to the s390 pci bus.
>>>>>>>>> Because of platform constrains devices using IO BARs are not
>>>>>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>> ---
>>>>>>>>> target-s390x/Makefile.objs |   2 +-
>>>>>>>>> target-s390x/kvm.c         |  52 ++++
>>>>>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>>>>>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
>>>>>>>>> create mode 100644 target-s390x/pci_ic.c
>>>>>>>>> create mode 100644 target-s390x/pci_ic.h
>>>>>>>>>
>>>>>>
>>>>>> [...]
>>>>>>
>>>>>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
>>>>>>>>> +{
>>>>>>>>> +    CPUS390XState *env = &cpu->env;
>>>>>>>>> +    S390PCIBusDevice *pbdev;
>>>>>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
>>>>>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>>>>>>>>> +    PciLgStg *rp;
>>>>>>>>> +    uint64_t offset;
>>>>>>>>> +    uint64_t data;
>>>>>>>>> +    uint8_t len;
>>>>>>>>> +
>>>>>>>>> +    cpu_synchronize_state(CPU(cpu));
>>>>>>>>> +
>>>>>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
>>>>>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
>>>>>>>>> +        return 0;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    if (r2 & 0x1) {
>>>>>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
>>>>>>>>> +        return 0;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    rp = (PciLgStg *)&env->regs[r2];
>>>>>>>>> +    offset = env->regs[r2 + 1];
>>>>>>>>> +
>>>>>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
>>>>>>>>> +    if (!pbdev) {
>>>>>>>>> +        DPRINTF("pcilg no pci dev\n");
>>>>>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
>>>>>>>>> +        return 0;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    len = rp->len & 0xF;
>>>>>>>>> +    if (rp->pcias < 6) {
>>>>>>>>> +        if ((8 - (offset & 0x7)) < len) {
>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>> +            return 0;
>>>>>>>>> +        }
>>>>>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
>>>>>>>>> +        io_mem_read(mr, offset, &data, len);
>>>>>>>>> +    } else if (rp->pcias == 15) {
>>>>>>>>> +        if ((4 - (offset & 0x3)) < len) {
>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>> +            return 0;
>>>>>>>>> +        }
>>>>>>>>> +        data =  pci_host_config_read_common(
>>>>>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
>>>>>>>>> +
>>>>>>>>> +        switch (len) {
>>>>>>>>> +        case 1:
>>>>>>>>> +            break;
>>>>>>>>> +        case 2:
>>>>>>>>> +            data = cpu_to_le16(data);
>>>>>>>>> +            break;
>>>>>>>>> +        case 4:
>>>>>>>>> +            data = cpu_to_le32(data);
>>>>>>>>> +            break;
>>>>>>>>> +        case 8:
>>>>>>>>> +            data = cpu_to_le64(data);
>>>>>>>>> +            break;
>>>>>>>>
>>>>>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
>>>>>>>> and LE. So if you're running this on an LE host, you won't swap the
>>>>>>>> value and get a broken result.
>>>>>>>>
>>>>>>>> If you know that the value is always swapped, use bswapxx().
>>>>>>>>
>>>>>>>
>>>>>>> Actually the code is right and required for a big endian host :-)
>>>>>>> pcilg/pcistg provide access to the PCI config space which is defined
>>>>>>> as PCI byte order (little endian). Since pci_host_config_read_common does
>>>>>>> already a le to cpu conversion we have to convert back to PCI byte order.
>>>>>>> Doing an unconditional swap would be a bug on a little endian host.
>>>>>>
>>>>>> Why would it be a bug? The value you end up writing is contents of a
>>>>>> register and thus doesn't have endianness. So if QEMU was an LE process,
>>>>>
>>>>> No, the s390 guest executing pcilg instruction expects to receive config space data
>>>>> in PCI byte order.
>>>>>
>>>>>> the value of data would be identical as on a BE QEMU before your swab.
>>>>>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
>>>>>>
>>>>>
>>>>> Again on BE endian host we do the swap because of pci_host_config_read_common does
>>>>> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
>>>>>
>>>>> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
>>>>> convert back to PCI byte order.
>>>>
>>>> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
>>>>
>>>> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
>>>>
>>> so you tell me pci_host_config_read_common does not end up in pci_default_read_config?
>>>
>>> uint32_t pci_default_read_config(PCIDevice *d,
>>>                                  uint32_t address, int len)
>>> {
>>>     uint32_t val = 0;
>>>
>>>     memcpy(&val, d->config + address, len);
>>>     return le32_to_cpu(val);
>>> }
>>>
>>> What did I miss?
>>
>> That's exactly where you end up in - and it's there to convert from the
>> PCI config space backing storage to a native number.
>>
>> Imagine you write 0x12345678 at offset 0. Because PCI config space is
>> defined to be LE, in the PCI config space memory this gets stored as
>>
>> 78 56 34 12
>>
>> The reason we do the internal storage of the config space that way is
>> that it's (in some PCI implementations) legal to access with single byte
>> granularities. So you could do a pci_config_read(offset = 1) which
>> should return 0x56.
>>
>> However, that means we completely nullify any effect of host endianness
>> in the PCI config layer already. So if you do pci_config_write(offset =
>> 0, size = 4, value = 0x12345678), the contents of d->config will always
>> be identical, regardless of host endianness. The same holds true for
>> pci_config_read(offset = 0, size = 4). It will always return 0x12345678.
>>
> 
> I understood this from the beginning and I completely agree to this.
>  
>> In your code, you swab that value again. I assume there's a reason
>> you're swapping it and that it's the way the architecture expects it
> 
> Yes, s390 pcilg architecture states:
> Data in the PCI configuration space are treated
> as being in little-endian byte ordering
> 
>> (mind to point me to the respective spec so I can verify?). But if the
>> architecture expects it, then it expects it regardless of host
>> endianness. The contents of regs[r1] should always be 0x78563412, no
>> matter whether we're in an LE or a BE environment.
>>
>> Does that make sense now?
>>
> Absolutely lets make an example for qemu running on BE and LE
> 
> byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
> BE            0x78563412             0x12345678                0x78563412
> LE            0x78563412             0x78563412                0x78563412

No, pci_default_read_config() always returns 0x12345678 because it
returns a register, not memory.


Alex

> 
> So what is the problem with my code? Adding unconditional byte swap instead of
> cpu_to_le in pcilg would break architecture for pcilg if qemu is running on LE
> platform.
> 
>>
>> Alex
>>
>
Paolo Bonzini Nov. 12, 2014, 9:11 a.m. UTC | #12
On 12/11/2014 10:08, Alexander Graf wrote:
> 
> 
> On 12.11.14 09:49, Frank Blaschka wrote:
>> On Tue, Nov 11, 2014 at 04:24:24PM +0100, Alexander Graf wrote:
>>>
>>>
>>> On 11.11.14 15:08, Frank Blaschka wrote:
>>>> On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
>>>>>
>>>>>
>>>>>
>>>>>> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
>>>>>>
>>>>>>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
>>>>>>>
>>>>>>>
>>>>>>>> On 11.11.14 13:10, Frank Blaschka wrote:
>>>>>>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
>>>>>>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>>>
>>>>>>>>>> This patch implements the s390 pci instructions in qemu. It allows
>>>>>>>>>> to access and drive pci devices attached to the s390 pci bus.
>>>>>>>>>> Because of platform constrains devices using IO BARs are not
>>>>>>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>>> ---
>>>>>>>>>> target-s390x/Makefile.objs |   2 +-
>>>>>>>>>> target-s390x/kvm.c         |  52 ++++
>>>>>>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>>>>>>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
>>>>>>>>>> create mode 100644 target-s390x/pci_ic.c
>>>>>>>>>> create mode 100644 target-s390x/pci_ic.h
>>>>>>>>>>
>>>>>>>
>>>>>>> [...]
>>>>>>>
>>>>>>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
>>>>>>>>>> +{
>>>>>>>>>> +    CPUS390XState *env = &cpu->env;
>>>>>>>>>> +    S390PCIBusDevice *pbdev;
>>>>>>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
>>>>>>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>>>>>>>>>> +    PciLgStg *rp;
>>>>>>>>>> +    uint64_t offset;
>>>>>>>>>> +    uint64_t data;
>>>>>>>>>> +    uint8_t len;
>>>>>>>>>> +
>>>>>>>>>> +    cpu_synchronize_state(CPU(cpu));
>>>>>>>>>> +
>>>>>>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
>>>>>>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
>>>>>>>>>> +        return 0;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    if (r2 & 0x1) {
>>>>>>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
>>>>>>>>>> +        return 0;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    rp = (PciLgStg *)&env->regs[r2];
>>>>>>>>>> +    offset = env->regs[r2 + 1];
>>>>>>>>>> +
>>>>>>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
>>>>>>>>>> +    if (!pbdev) {
>>>>>>>>>> +        DPRINTF("pcilg no pci dev\n");
>>>>>>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
>>>>>>>>>> +        return 0;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    len = rp->len & 0xF;
>>>>>>>>>> +    if (rp->pcias < 6) {
>>>>>>>>>> +        if ((8 - (offset & 0x7)) < len) {
>>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>>> +            return 0;
>>>>>>>>>> +        }
>>>>>>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
>>>>>>>>>> +        io_mem_read(mr, offset, &data, len);
>>>>>>>>>> +    } else if (rp->pcias == 15) {
>>>>>>>>>> +        if ((4 - (offset & 0x3)) < len) {
>>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>>> +            return 0;
>>>>>>>>>> +        }
>>>>>>>>>> +        data =  pci_host_config_read_common(
>>>>>>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
>>>>>>>>>> +
>>>>>>>>>> +        switch (len) {
>>>>>>>>>> +        case 1:
>>>>>>>>>> +            break;
>>>>>>>>>> +        case 2:
>>>>>>>>>> +            data = cpu_to_le16(data);
>>>>>>>>>> +            break;
>>>>>>>>>> +        case 4:
>>>>>>>>>> +            data = cpu_to_le32(data);
>>>>>>>>>> +            break;
>>>>>>>>>> +        case 8:
>>>>>>>>>> +            data = cpu_to_le64(data);
>>>>>>>>>> +            break;
>>>>>>>>>
>>>>>>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
>>>>>>>>> and LE. So if you're running this on an LE host, you won't swap the
>>>>>>>>> value and get a broken result.
>>>>>>>>>
>>>>>>>>> If you know that the value is always swapped, use bswapxx().
>>>>>>>>>
>>>>>>>>
>>>>>>>> Actually the code is right and required for a big endian host :-)
>>>>>>>> pcilg/pcistg provide access to the PCI config space which is defined
>>>>>>>> as PCI byte order (little endian). Since pci_host_config_read_common does
>>>>>>>> already a le to cpu conversion we have to convert back to PCI byte order.
>>>>>>>> Doing an unconditional swap would be a bug on a little endian host.
>>>>>>>
>>>>>>> Why would it be a bug? The value you end up writing is contents of a
>>>>>>> register and thus doesn't have endianness. So if QEMU was an LE process,
>>>>>>
>>>>>> No, the s390 guest executing pcilg instruction expects to receive config space data
>>>>>> in PCI byte order.
>>>>>>
>>>>>>> the value of data would be identical as on a BE QEMU before your swab.
>>>>>>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
>>>>>>>
>>>>>>
>>>>>> Again on BE endian host we do the swap because of pci_host_config_read_common does
>>>>>> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
>>>>>>
>>>>>> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
>>>>>> convert back to PCI byte order.
>>>>>
>>>>> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
>>>>>
>>>>> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
>>>>>
>>>> so you tell me pci_host_config_read_common does not end up in pci_default_read_config?
>>>>
>>>> uint32_t pci_default_read_config(PCIDevice *d,
>>>>                                  uint32_t address, int len)
>>>> {
>>>>     uint32_t val = 0;
>>>>
>>>>     memcpy(&val, d->config + address, len);
>>>>     return le32_to_cpu(val);
>>>> }
>>>>
>>>> What did I miss?
>>>
>>> That's exactly where you end up in - and it's there to convert from the
>>> PCI config space backing storage to a native number.
>>>
>>> Imagine you write 0x12345678 at offset 0. Because PCI config space is
>>> defined to be LE, in the PCI config space memory this gets stored as
>>>
>>> 78 56 34 12
>>>
>>> The reason we do the internal storage of the config space that way is
>>> that it's (in some PCI implementations) legal to access with single byte
>>> granularities. So you could do a pci_config_read(offset = 1) which
>>> should return 0x56.
>>>
>>> However, that means we completely nullify any effect of host endianness
>>> in the PCI config layer already. So if you do pci_config_write(offset =
>>> 0, size = 4, value = 0x12345678), the contents of d->config will always
>>> be identical, regardless of host endianness. The same holds true for
>>> pci_config_read(offset = 0, size = 4). It will always return 0x12345678.
>>>
>>
>> I understood this from the beginning and I completely agree to this.
>>  
>>> In your code, you swab that value again. I assume there's a reason
>>> you're swapping it and that it's the way the architecture expects it
>>
>> Yes, s390 pcilg architecture states:
>> Data in the PCI configuration space are treated
>> as being in little-endian byte ordering
>>
>>> (mind to point me to the respective spec so I can verify?). But if the
>>> architecture expects it, then it expects it regardless of host
>>> endianness. The contents of regs[r1] should always be 0x78563412, no
>>> matter whether we're in an LE or a BE environment.
>>>
>>> Does that make sense now?
>>>
>> Absolutely lets make an example for qemu running on BE and LE
>>
>> byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
>> BE            0x78563412             0x12345678                0x78563412
>> LE            0x78563412             0x78563412                0x78563412
> 
> No, pci_default_read_config() always returns 0x12345678 because it
> returns a register, not memory.

So:

      config space    pci_default_read_config     pcilg
      (bytes)         memcpy       le32_to_cpu    (with cpu_to_le)
BE    78 56 34 12     0x78563412   0x12345678     0x78563412
LE    78 56 34 12     0x12345678   0x12345678     0x12345678

Right?

Paolo
Alexander Graf Nov. 12, 2014, 9:13 a.m. UTC | #13
On 12.11.14 10:11, Paolo Bonzini wrote:
> 
> 
> On 12/11/2014 10:08, Alexander Graf wrote:
>>
>>
>> On 12.11.14 09:49, Frank Blaschka wrote:
>>> On Tue, Nov 11, 2014 at 04:24:24PM +0100, Alexander Graf wrote:
>>>>
>>>>
>>>> On 11.11.14 15:08, Frank Blaschka wrote:
>>>>> On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
>>>>>>>
>>>>>>>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> On 11.11.14 13:10, Frank Blaschka wrote:
>>>>>>>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
>>>>>>>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>>>>
>>>>>>>>>>> This patch implements the s390 pci instructions in qemu. It allows
>>>>>>>>>>> to access and drive pci devices attached to the s390 pci bus.
>>>>>>>>>>> Because of platform constrains devices using IO BARs are not
>>>>>>>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>>>> ---
>>>>>>>>>>> target-s390x/Makefile.objs |   2 +-
>>>>>>>>>>> target-s390x/kvm.c         |  52 ++++
>>>>>>>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>>>>>>>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
>>>>>>>>>>> create mode 100644 target-s390x/pci_ic.c
>>>>>>>>>>> create mode 100644 target-s390x/pci_ic.h
>>>>>>>>>>>
>>>>>>>>
>>>>>>>> [...]
>>>>>>>>
>>>>>>>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
>>>>>>>>>>> +{
>>>>>>>>>>> +    CPUS390XState *env = &cpu->env;
>>>>>>>>>>> +    S390PCIBusDevice *pbdev;
>>>>>>>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
>>>>>>>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>>>>>>>>>>> +    PciLgStg *rp;
>>>>>>>>>>> +    uint64_t offset;
>>>>>>>>>>> +    uint64_t data;
>>>>>>>>>>> +    uint8_t len;
>>>>>>>>>>> +
>>>>>>>>>>> +    cpu_synchronize_state(CPU(cpu));
>>>>>>>>>>> +
>>>>>>>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
>>>>>>>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
>>>>>>>>>>> +        return 0;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    if (r2 & 0x1) {
>>>>>>>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
>>>>>>>>>>> +        return 0;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    rp = (PciLgStg *)&env->regs[r2];
>>>>>>>>>>> +    offset = env->regs[r2 + 1];
>>>>>>>>>>> +
>>>>>>>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
>>>>>>>>>>> +    if (!pbdev) {
>>>>>>>>>>> +        DPRINTF("pcilg no pci dev\n");
>>>>>>>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
>>>>>>>>>>> +        return 0;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    len = rp->len & 0xF;
>>>>>>>>>>> +    if (rp->pcias < 6) {
>>>>>>>>>>> +        if ((8 - (offset & 0x7)) < len) {
>>>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>>>> +            return 0;
>>>>>>>>>>> +        }
>>>>>>>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
>>>>>>>>>>> +        io_mem_read(mr, offset, &data, len);
>>>>>>>>>>> +    } else if (rp->pcias == 15) {
>>>>>>>>>>> +        if ((4 - (offset & 0x3)) < len) {
>>>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>>>> +            return 0;
>>>>>>>>>>> +        }
>>>>>>>>>>> +        data =  pci_host_config_read_common(
>>>>>>>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
>>>>>>>>>>> +
>>>>>>>>>>> +        switch (len) {
>>>>>>>>>>> +        case 1:
>>>>>>>>>>> +            break;
>>>>>>>>>>> +        case 2:
>>>>>>>>>>> +            data = cpu_to_le16(data);
>>>>>>>>>>> +            break;
>>>>>>>>>>> +        case 4:
>>>>>>>>>>> +            data = cpu_to_le32(data);
>>>>>>>>>>> +            break;
>>>>>>>>>>> +        case 8:
>>>>>>>>>>> +            data = cpu_to_le64(data);
>>>>>>>>>>> +            break;
>>>>>>>>>>
>>>>>>>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
>>>>>>>>>> and LE. So if you're running this on an LE host, you won't swap the
>>>>>>>>>> value and get a broken result.
>>>>>>>>>>
>>>>>>>>>> If you know that the value is always swapped, use bswapxx().
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Actually the code is right and required for a big endian host :-)
>>>>>>>>> pcilg/pcistg provide access to the PCI config space which is defined
>>>>>>>>> as PCI byte order (little endian). Since pci_host_config_read_common does
>>>>>>>>> already a le to cpu conversion we have to convert back to PCI byte order.
>>>>>>>>> Doing an unconditional swap would be a bug on a little endian host.
>>>>>>>>
>>>>>>>> Why would it be a bug? The value you end up writing is contents of a
>>>>>>>> register and thus doesn't have endianness. So if QEMU was an LE process,
>>>>>>>
>>>>>>> No, the s390 guest executing pcilg instruction expects to receive config space data
>>>>>>> in PCI byte order.
>>>>>>>
>>>>>>>> the value of data would be identical as on a BE QEMU before your swab.
>>>>>>>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
>>>>>>>>
>>>>>>>
>>>>>>> Again on BE endian host we do the swap because of pci_host_config_read_common does
>>>>>>> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
>>>>>>>
>>>>>>> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
>>>>>>> convert back to PCI byte order.
>>>>>>
>>>>>> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
>>>>>>
>>>>>> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
>>>>>>
>>>>> so you tell me pci_host_config_read_common does not end up in pci_default_read_config?
>>>>>
>>>>> uint32_t pci_default_read_config(PCIDevice *d,
>>>>>                                  uint32_t address, int len)
>>>>> {
>>>>>     uint32_t val = 0;
>>>>>
>>>>>     memcpy(&val, d->config + address, len);
>>>>>     return le32_to_cpu(val);
>>>>> }
>>>>>
>>>>> What did I miss?
>>>>
>>>> That's exactly where you end up in - and it's there to convert from the
>>>> PCI config space backing storage to a native number.
>>>>
>>>> Imagine you write 0x12345678 at offset 0. Because PCI config space is
>>>> defined to be LE, in the PCI config space memory this gets stored as
>>>>
>>>> 78 56 34 12
>>>>
>>>> The reason we do the internal storage of the config space that way is
>>>> that it's (in some PCI implementations) legal to access with single byte
>>>> granularities. So you could do a pci_config_read(offset = 1) which
>>>> should return 0x56.
>>>>
>>>> However, that means we completely nullify any effect of host endianness
>>>> in the PCI config layer already. So if you do pci_config_write(offset =
>>>> 0, size = 4, value = 0x12345678), the contents of d->config will always
>>>> be identical, regardless of host endianness. The same holds true for
>>>> pci_config_read(offset = 0, size = 4). It will always return 0x12345678.
>>>>
>>>
>>> I understood this from the beginning and I completely agree to this.
>>>  
>>>> In your code, you swab that value again. I assume there's a reason
>>>> you're swapping it and that it's the way the architecture expects it
>>>
>>> Yes, s390 pcilg architecture states:
>>> Data in the PCI configuration space are treated
>>> as being in little-endian byte ordering
>>>
>>>> (mind to point me to the respective spec so I can verify?). But if the
>>>> architecture expects it, then it expects it regardless of host
>>>> endianness. The contents of regs[r1] should always be 0x78563412, no
>>>> matter whether we're in an LE or a BE environment.
>>>>
>>>> Does that make sense now?
>>>>
>>> Absolutely lets make an example for qemu running on BE and LE
>>>
>>> byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
>>> BE            0x78563412             0x12345678                0x78563412
>>> LE            0x78563412             0x78563412                0x78563412
>>
>> No, pci_default_read_config() always returns 0x12345678 because it
>> returns a register, not memory.
> 
> So:
> 
>       config space    pci_default_read_config     pcilg
>       (bytes)         memcpy       cpu_to_le      (with cpu_to_le)
> BE    78 56 34 12     0x78563412   0x12345678     0x78563412
> LE    78 56 34 12     0x12345678   0x12345678     0x12345678
> 
> Right?

Yes, exactly :).


Alex
Frank Blaschka Nov. 12, 2014, 9:19 a.m. UTC | #14
On Wed, Nov 12, 2014 at 10:08:19AM +0100, Alexander Graf wrote:
> 
> 
> On 12.11.14 09:49, Frank Blaschka wrote:
> > On Tue, Nov 11, 2014 at 04:24:24PM +0100, Alexander Graf wrote:
> >>
> >>
> >> On 11.11.14 15:08, Frank Blaschka wrote:
> >>> On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
> >>>>
> >>>>
> >>>>
> >>>>> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
> >>>>>
> >>>>>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
> >>>>>>
> >>>>>>
> >>>>>>> On 11.11.14 13:10, Frank Blaschka wrote:
> >>>>>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
> >>>>>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>>>>>>>>
> >>>>>>>>> This patch implements the s390 pci instructions in qemu. It allows
> >>>>>>>>> to access and drive pci devices attached to the s390 pci bus.
> >>>>>>>>> Because of platform constrains devices using IO BARs are not
> >>>>>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
> >>>>>>>>>
> >>>>>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> >>>>>>>>> ---
> >>>>>>>>> target-s390x/Makefile.objs |   2 +-
> >>>>>>>>> target-s390x/kvm.c         |  52 ++++
> >>>>>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
> >>>>>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
> >>>>>>>>> create mode 100644 target-s390x/pci_ic.c
> >>>>>>>>> create mode 100644 target-s390x/pci_ic.h
> >>>>>>>>>
> >>>>>>
> >>>>>> [...]
> >>>>>>
> >>>>>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
> >>>>>>>>> +{
> >>>>>>>>> +    CPUS390XState *env = &cpu->env;
> >>>>>>>>> +    S390PCIBusDevice *pbdev;
> >>>>>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
> >>>>>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
> >>>>>>>>> +    PciLgStg *rp;
> >>>>>>>>> +    uint64_t offset;
> >>>>>>>>> +    uint64_t data;
> >>>>>>>>> +    uint8_t len;
> >>>>>>>>> +
> >>>>>>>>> +    cpu_synchronize_state(CPU(cpu));
> >>>>>>>>> +
> >>>>>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
> >>>>>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
> >>>>>>>>> +        return 0;
> >>>>>>>>> +    }
> >>>>>>>>> +
> >>>>>>>>> +    if (r2 & 0x1) {
> >>>>>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
> >>>>>>>>> +        return 0;
> >>>>>>>>> +    }
> >>>>>>>>> +
> >>>>>>>>> +    rp = (PciLgStg *)&env->regs[r2];
> >>>>>>>>> +    offset = env->regs[r2 + 1];
> >>>>>>>>> +
> >>>>>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
> >>>>>>>>> +    if (!pbdev) {
> >>>>>>>>> +        DPRINTF("pcilg no pci dev\n");
> >>>>>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
> >>>>>>>>> +        return 0;
> >>>>>>>>> +    }
> >>>>>>>>> +
> >>>>>>>>> +    len = rp->len & 0xF;
> >>>>>>>>> +    if (rp->pcias < 6) {
> >>>>>>>>> +        if ((8 - (offset & 0x7)) < len) {
> >>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>>>>>>>> +            return 0;
> >>>>>>>>> +        }
> >>>>>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
> >>>>>>>>> +        io_mem_read(mr, offset, &data, len);
> >>>>>>>>> +    } else if (rp->pcias == 15) {
> >>>>>>>>> +        if ((4 - (offset & 0x3)) < len) {
> >>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
> >>>>>>>>> +            return 0;
> >>>>>>>>> +        }
> >>>>>>>>> +        data =  pci_host_config_read_common(
> >>>>>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
> >>>>>>>>> +
> >>>>>>>>> +        switch (len) {
> >>>>>>>>> +        case 1:
> >>>>>>>>> +            break;
> >>>>>>>>> +        case 2:
> >>>>>>>>> +            data = cpu_to_le16(data);
> >>>>>>>>> +            break;
> >>>>>>>>> +        case 4:
> >>>>>>>>> +            data = cpu_to_le32(data);
> >>>>>>>>> +            break;
> >>>>>>>>> +        case 8:
> >>>>>>>>> +            data = cpu_to_le64(data);
> >>>>>>>>> +            break;
> >>>>>>>>
> >>>>>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
> >>>>>>>> and LE. So if you're running this on an LE host, you won't swap the
> >>>>>>>> value and get a broken result.
> >>>>>>>>
> >>>>>>>> If you know that the value is always swapped, use bswapxx().
> >>>>>>>>
> >>>>>>>
> >>>>>>> Actually the code is right and required for a big endian host :-)
> >>>>>>> pcilg/pcistg provide access to the PCI config space which is defined
> >>>>>>> as PCI byte order (little endian). Since pci_host_config_read_common does
> >>>>>>> already a le to cpu conversion we have to convert back to PCI byte order.
> >>>>>>> Doing an unconditional swap would be a bug on a little endian host.
> >>>>>>
> >>>>>> Why would it be a bug? The value you end up writing is contents of a
> >>>>>> register and thus doesn't have endianness. So if QEMU was an LE process,
> >>>>>
> >>>>> No, the s390 guest executing pcilg instruction expects to receive config space data
> >>>>> in PCI byte order.
> >>>>>
> >>>>>> the value of data would be identical as on a BE QEMU before your swab.
> >>>>>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
> >>>>>>
> >>>>>
> >>>>> Again on BE endian host we do the swap because of pci_host_config_read_common does
> >>>>> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
> >>>>>
> >>>>> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
> >>>>> convert back to PCI byte order.
> >>>>
> >>>> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
> >>>>
> >>>> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
> >>>>
> >>> so you tell me pci_host_config_read_common does not end up in pci_default_read_config?
> >>>
> >>> uint32_t pci_default_read_config(PCIDevice *d,
> >>>                                  uint32_t address, int len)
> >>> {
> >>>     uint32_t val = 0;
> >>>
> >>>     memcpy(&val, d->config + address, len);
> >>>     return le32_to_cpu(val);
> >>> }
> >>>
> >>> What did I miss?
> >>
> >> That's exactly where you end up in - and it's there to convert from the
> >> PCI config space backing storage to a native number.
> >>
> >> Imagine you write 0x12345678 at offset 0. Because PCI config space is
> >> defined to be LE, in the PCI config space memory this gets stored as
> >>
> >> 78 56 34 12
> >>
> >> The reason we do the internal storage of the config space that way is
> >> that it's (in some PCI implementations) legal to access with single byte
> >> granularities. So you could do a pci_config_read(offset = 1) which
> >> should return 0x56.
> >>
> >> However, that means we completely nullify any effect of host endianness
> >> in the PCI config layer already. So if you do pci_config_write(offset =
> >> 0, size = 4, value = 0x12345678), the contents of d->config will always
> >> be identical, regardless of host endianness. The same holds true for
> >> pci_config_read(offset = 0, size = 4). It will always return 0x12345678.
> >>
> > 
> > I understood this from the beginning and I completely agree to this.
> >  
> >> In your code, you swab that value again. I assume there's a reason
> >> you're swapping it and that it's the way the architecture expects it
> > 
> > Yes, s390 pcilg architecture states:
> > Data in the PCI configuration space are treated
> > as being in little-endian byte ordering
> > 
> >> (mind to point me to the respective spec so I can verify?). But if the
> >> architecture expects it, then it expects it regardless of host
> >> endianness. The contents of regs[r1] should always be 0x78563412, no
> >> matter whether we're in an LE or a BE environment.
> >>
> >> Does that make sense now?
> >>
> > Absolutely lets make an example for qemu running on BE and LE
> > 
> > byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
> > BE            0x78563412             0x12345678                0x78563412
> > LE            0x78563412             0x78563412                0x78563412
> 
> No, pci_default_read_config() always returns 0x12345678 because it
> returns a register, not memory.
>

You mean the implementation of pci_default_read_config is broken?
If it is supposed to return a register value, it should not do "return le32_to_cpu(val);".
 
> 
> Alex
> 
> > 
> > So what is the problem with my code? Adding unconditional byte swap instead of
> > cpu_to_le in pcilg would break architecture for pcilg if qemu is running on LE
> > platform.
> > 
> >>
> >> Alex
> >>
> > 
>
Alexander Graf Nov. 12, 2014, 9:22 a.m. UTC | #15
On 12.11.14 10:19, Frank Blaschka wrote:
> On Wed, Nov 12, 2014 at 10:08:19AM +0100, Alexander Graf wrote:
>>
>>
>> On 12.11.14 09:49, Frank Blaschka wrote:
>>> On Tue, Nov 11, 2014 at 04:24:24PM +0100, Alexander Graf wrote:
>>>>
>>>>
>>>> On 11.11.14 15:08, Frank Blaschka wrote:
>>>>> On Tue, Nov 11, 2014 at 01:51:25PM +0100, Alexander Graf wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>> Am 11.11.2014 um 13:39 schrieb Frank Blaschka <blaschka@linux.vnet.ibm.com>:
>>>>>>>
>>>>>>>> On Tue, Nov 11, 2014 at 01:16:04PM +0100, Alexander Graf wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> On 11.11.14 13:10, Frank Blaschka wrote:
>>>>>>>>>> On Mon, Nov 10, 2014 at 04:56:21PM +0100, Alexander Graf wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> On 10.11.14 15:20, Frank Blaschka wrote:
>>>>>>>>>>> From: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>>>>
>>>>>>>>>>> This patch implements the s390 pci instructions in qemu. It allows
>>>>>>>>>>> to access and drive pci devices attached to the s390 pci bus.
>>>>>>>>>>> Because of platform constrains devices using IO BARs are not
>>>>>>>>>>> supported. Also a device has to support MSI/MSI-X to run on s390.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
>>>>>>>>>>> ---
>>>>>>>>>>> target-s390x/Makefile.objs |   2 +-
>>>>>>>>>>> target-s390x/kvm.c         |  52 ++++
>>>>>>>>>>> target-s390x/pci_ic.c      | 753 +++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>> target-s390x/pci_ic.h      | 335 ++++++++++++++++++++
>>>>>>>>>>> 4 files changed, 1141 insertions(+), 1 deletion(-)
>>>>>>>>>>> create mode 100644 target-s390x/pci_ic.c
>>>>>>>>>>> create mode 100644 target-s390x/pci_ic.h
>>>>>>>>>>>
>>>>>>>>
>>>>>>>> [...]
>>>>>>>>
>>>>>>>>>>> +int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
>>>>>>>>>>> +{
>>>>>>>>>>> +    CPUS390XState *env = &cpu->env;
>>>>>>>>>>> +    S390PCIBusDevice *pbdev;
>>>>>>>>>>> +    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
>>>>>>>>>>> +    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
>>>>>>>>>>> +    PciLgStg *rp;
>>>>>>>>>>> +    uint64_t offset;
>>>>>>>>>>> +    uint64_t data;
>>>>>>>>>>> +    uint8_t len;
>>>>>>>>>>> +
>>>>>>>>>>> +    cpu_synchronize_state(CPU(cpu));
>>>>>>>>>>> +
>>>>>>>>>>> +    if (env->psw.mask & PSW_MASK_PSTATE) {
>>>>>>>>>>> +        program_interrupt(env, PGM_PRIVILEGED, 4);
>>>>>>>>>>> +        return 0;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    if (r2 & 0x1) {
>>>>>>>>>>> +        program_interrupt(env, PGM_SPECIFICATION, 4);
>>>>>>>>>>> +        return 0;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    rp = (PciLgStg *)&env->regs[r2];
>>>>>>>>>>> +    offset = env->regs[r2 + 1];
>>>>>>>>>>> +
>>>>>>>>>>> +    pbdev = s390_pci_find_dev_by_fh(rp->fh);
>>>>>>>>>>> +    if (!pbdev) {
>>>>>>>>>>> +        DPRINTF("pcilg no pci dev\n");
>>>>>>>>>>> +        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
>>>>>>>>>>> +        return 0;
>>>>>>>>>>> +    }
>>>>>>>>>>> +
>>>>>>>>>>> +    len = rp->len & 0xF;
>>>>>>>>>>> +    if (rp->pcias < 6) {
>>>>>>>>>>> +        if ((8 - (offset & 0x7)) < len) {
>>>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>>>> +            return 0;
>>>>>>>>>>> +        }
>>>>>>>>>>> +        MemoryRegion *mr = pbdev->pdev->io_regions[rp->pcias].memory;
>>>>>>>>>>> +        io_mem_read(mr, offset, &data, len);
>>>>>>>>>>> +    } else if (rp->pcias == 15) {
>>>>>>>>>>> +        if ((4 - (offset & 0x3)) < len) {
>>>>>>>>>>> +            program_interrupt(env, PGM_OPERAND, 4);
>>>>>>>>>>> +            return 0;
>>>>>>>>>>> +        }
>>>>>>>>>>> +        data =  pci_host_config_read_common(
>>>>>>>>>>> +                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
>>>>>>>>>>> +
>>>>>>>>>>> +        switch (len) {
>>>>>>>>>>> +        case 1:
>>>>>>>>>>> +            break;
>>>>>>>>>>> +        case 2:
>>>>>>>>>>> +            data = cpu_to_le16(data);
>>>>>>>>>>> +            break;
>>>>>>>>>>> +        case 4:
>>>>>>>>>>> +            data = cpu_to_le32(data);
>>>>>>>>>>> +            break;
>>>>>>>>>>> +        case 8:
>>>>>>>>>>> +            data = cpu_to_le64(data);
>>>>>>>>>>> +            break;
>>>>>>>>>>
>>>>>>>>>> Why? Also, this is wrong. cpu_to_le64 convert between host endianness
>>>>>>>>>> and LE. So if you're running this on an LE host, you won't swap the
>>>>>>>>>> value and get a broken result.
>>>>>>>>>>
>>>>>>>>>> If you know that the value is always swapped, use bswapxx().
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Actually the code is right and required for a big endian host :-)
>>>>>>>>> pcilg/pcistg provide access to the PCI config space which is defined
>>>>>>>>> as PCI byte order (little endian). Since pci_host_config_read_common does
>>>>>>>>> already a le to cpu conversion we have to convert back to PCI byte order.
>>>>>>>>> Doing an unconditional swap would be a bug on a little endian host.
>>>>>>>>
>>>>>>>> Why would it be a bug? The value you end up writing is contents of a
>>>>>>>> register and thus doesn't have endianness. So if QEMU was an LE process,
>>>>>>>
>>>>>>> No, the s390 guest executing pcilg instruction expects to receive config space data
>>>>>>> in PCI byte order.
>>>>>>>
>>>>>>>> the value of data would be identical as on a BE QEMU before your swab.
>>>>>>>> After the swab, it would be bswap'ed on BE, but not LE. So LE hosts break.
>>>>>>>>
>>>>>>>
>>>>>>> Again on BE endian host we do the swap because of pci_host_config_read_common does
>>>>>>> read the value and do a byte swap for that value, but we need PCI byte order not BE here.
>>>>>>>
>>>>>>> On LE host pci_host_config_read_common does not do a byte swap so we do not have to
>>>>>>> convert back to PCI byte order.
>>>>>>
>>>>>> We maintain the PCI config space always in LE byte order in memory, that's why there is a bwap in its read function. The return result of the read function however is always the same, regardless of LE or BE host. If I do a read of size 4, I will always get 0x1, not 0x01000000 returned.
>>>>>>
>>>>>> So now you need to convert that 0x1 into a 0x01000000 manually here because some architect thought that registers have endianness (which they don't). But you need to do it always, even on an LE host, because the pci config space return value is identical on LE and BE.
>>>>>>
>>>>> so you tell me pci_host_config_read_common does not end up in pci_default_read_config?
>>>>>
>>>>> uint32_t pci_default_read_config(PCIDevice *d,
>>>>>                                  uint32_t address, int len)
>>>>> {
>>>>>     uint32_t val = 0;
>>>>>
>>>>>     memcpy(&val, d->config + address, len);
>>>>>     return le32_to_cpu(val);
>>>>> }
>>>>>
>>>>> What did I miss?
>>>>
>>>> That's exactly where you end up in - and it's there to convert from the
>>>> PCI config space backing storage to a native number.
>>>>
>>>> Imagine you write 0x12345678 at offset 0. Because PCI config space is
>>>> defined to be LE, in the PCI config space memory this gets stored as
>>>>
>>>> 78 56 34 12
>>>>
>>>> The reason we do the internal storage of the config space that way is
>>>> that it's (in some PCI implementations) legal to access with single byte
>>>> granularities. So you could do a pci_config_read(offset = 1) which
>>>> should return 0x56.
>>>>
>>>> However, that means we completely nullify any effect of host endianness
>>>> in the PCI config layer already. So if you do pci_config_write(offset =
>>>> 0, size = 4, value = 0x12345678), the contents of d->config will always
>>>> be identical, regardless of host endianness. The same holds true for
>>>> pci_config_read(offset = 0, size = 4). It will always return 0x12345678.
>>>>
>>>
>>> I understood this from the beginning and I completely agree to this.
>>>  
>>>> In your code, you swab that value again. I assume there's a reason
>>>> you're swapping it and that it's the way the architecture expects it
>>>
>>> Yes, s390 pcilg architecture states:
>>> Data in the PCI configuration space are treated
>>> as being in little-endian byte ordering
>>>
>>>> (mind to point me to the respective spec so I can verify?). But if the
>>>> architecture expects it, then it expects it regardless of host
>>>> endianness. The contents of regs[r1] should always be 0x78563412, no
>>>> matter whether we're in an LE or a BE environment.
>>>>
>>>> Does that make sense now?
>>>>
>>> Absolutely lets make an example for qemu running on BE and LE
>>>
>>> byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
>>> BE            0x78563412             0x12345678                0x78563412
>>> LE            0x78563412             0x78563412                0x78563412
>>
>> No, pci_default_read_config() always returns 0x12345678 because it
>> returns a register, not memory.
>>
> 
> You mean implementation of pci_default_read_config is broken?
> If it should return a register it should not do "return le32_to_cpu(val);"

It has to, to convert from memory (after memcpy) to an actual register
value. Look at the value list in Paolo's email - I really have no idea
how to explain it any better.


Alex
Paolo Bonzini Nov. 12, 2014, 9:36 a.m. UTC | #16
On 12/11/2014 10:22, Alexander Graf wrote:
>>>> Absolutely lets make an example for qemu running on BE and LE
>>>>
>>>> byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
>>>> BE            0x78563412             0x12345678                0x78563412
>>>> LE            0x78563412             0x78563412                0x78563412
>>>
>>> No, pci_default_read_config() always returns 0x12345678 because it
>>> returns a register, not memory.
>>>
>>
>> You mean implementation of pci_default_read_config is broken?
>> If it should return a register it should not do "return le32_to_cpu(val);"
> 
> It has to, to convert from memory (after memcpy) to an actual register
> value. Look at the value list in Paolo's email - I really have no idea
> how to explain it any better.

pci_default_read_config is reading from a *device* register, and has
absolutely zero knowledge of the host CPU endianness.

Another way to explain that the result of pci_default_read_config is
independent of the host endianness, is that the function is basically
doing this:

switch (len) {
    case 1: return d->config[address];
    case 2: return ldw_le_p(&d->config[address]);
    case 4: return ldl_le_p(&d->config[address]);
    default: abort();
}

So if you want to make the outcome big endian, you have to swap
unconditionally.

Paolo
Frank Blaschka Nov. 12, 2014, 2:34 p.m. UTC | #17
On Wed, Nov 12, 2014 at 10:36:03AM +0100, Paolo Bonzini wrote:
> 
> 
> On 12/11/2014 10:22, Alexander Graf wrote:
> >>>> Absolutely lets make an example for qemu running on BE and LE
> >>>>
> >>>> byte order    config space backing   pci_default_read_config   pcilg (with cpu_to_le)
> >>>> BE            0x78563412             0x12345678                0x78563412
> >>>> LE            0x78563412             0x78563412                0x78563412
> >>>
> >>> No, pci_default_read_config() always returns 0x12345678 because it
> >>> returns a register, not memory.
> >>>
> >>
> >> You mean implementation of pci_default_read_config is broken?
> >> If it should return a register it should not do "return le32_to_cpu(val);"
> > 
> > It has to, to convert from memory (after memcpy) to an actual register
> > value. Look at the value list in Paolo's email - I really have no idea
> > how to explain it any better.
> 
> pci_default_read_config is reading from a *device* register, and has
> absolutely zero knowledge of the host CPU endianness.
> 
> Another way to explain that the result of pci_default_read_config is
> independent of the host endianness, is that the function is basically
> doing this:
> 
> switch (len) {
>     case 1: return d->config[address];
>     case 2: return ldw_le_p(&d->config[address]);
>     case 4: return ldl_le_p(&d->config[address]);
>     default: abort();
> }
> 
> So if you want to make the outcome big endian, you have to swap
> unconditionally.
> 
> Paolo

Hi Paolo, Alex,

thx a lot for all the explanation and patience.
I think I have understood your point now. I will change the code to an
unconditional swap. I feel I had a knowledge gap regarding running guest and
host with different byte orders. I hope this gap is filled now ;)

Frank

>
diff mbox

Patch

diff --git a/target-s390x/Makefile.objs b/target-s390x/Makefile.objs
index 2c57494..cc71400 100644
--- a/target-s390x/Makefile.objs
+++ b/target-s390x/Makefile.objs
@@ -2,4 +2,4 @@  obj-y += translate.o helper.o cpu.o interrupt.o
 obj-y += int_helper.o fpu_helper.o cc_helper.o mem_helper.o misc_helper.o
 obj-y += gdbstub.o
 obj-$(CONFIG_SOFTMMU) += machine.o ioinst.o arch_dump.o
-obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM) += kvm.o pci_ic.o
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 5b10a25..d59e740 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -40,6 +40,7 @@ 
 #include "exec/gdbstub.h"
 #include "trace.h"
 #include "qapi-event.h"
+#include "pci_ic.h"
 
 /* #define DEBUG_KVM */
 
@@ -56,6 +57,7 @@ 
 #define IPA0_B2                         0xb200
 #define IPA0_B9                         0xb900
 #define IPA0_EB                         0xeb00
+#define IPA0_E3                         0xe300
 
 #define PRIV_B2_SCLP_CALL               0x20
 #define PRIV_B2_CSCH                    0x30
@@ -76,8 +78,17 @@ 
 #define PRIV_B2_XSCH                    0x76
 
 #define PRIV_EB_SQBS                    0x8a
+#define PRIV_EB_PCISTB                  0xd0
+#define PRIV_EB_SIC                     0xd1
 
 #define PRIV_B9_EQBS                    0x9c
+#define PRIV_B9_CLP                     0xa0
+#define PRIV_B9_PCISTG                  0xd0
+#define PRIV_B9_PCILG                   0xd2
+#define PRIV_B9_RPCIT                   0xd3
+
+#define PRIV_E3_MPCIFC                  0xd0
+#define PRIV_E3_STPCIFC                 0xd4
 
 #define DIAG_IPL                        0x308
 #define DIAG_KVM_HYPERCALL              0x500
@@ -814,6 +825,18 @@  static int handle_b9(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
     int r = 0;
 
     switch (ipa1) {
+    case PRIV_B9_CLP:
+        r = kvm_clp_service_call(cpu, run);
+        break;
+    case PRIV_B9_PCISTG:
+        r = kvm_pcistg_service_call(cpu, run);
+        break;
+    case PRIV_B9_PCILG:
+        r = kvm_pcilg_service_call(cpu, run);
+        break;
+    case PRIV_B9_RPCIT:
+        r = kvm_rpcit_service_call(cpu, run);
+        break;
     case PRIV_B9_EQBS:
         /* just inject exception */
         r = -1;
@@ -832,6 +855,12 @@  static int handle_eb(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
     int r = 0;
 
     switch (ipa1) {
+    case PRIV_EB_PCISTB:
+        r = kvm_pcistb_service_call(cpu, run);
+        break;
+    case PRIV_EB_SIC:
+        r = kvm_sic_service_call(cpu, run);
+        break;
     case PRIV_EB_SQBS:
         /* just inject exception */
         r = -1;
@@ -845,6 +874,26 @@  static int handle_eb(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
     return r;
 }
 
+/*
+ * Dispatch an intercepted 0xe3 instruction on the opcode byte passed in
+ * ipbl.  Returns the result of the matching service call, or -1 when no
+ * handler exists for that opcode.
+ */
+static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl)
+{
+    if (ipbl == PRIV_E3_MPCIFC) {
+        return kvm_mpcifc_service_call(cpu, run);
+    }
+    if (ipbl == PRIV_E3_STPCIFC) {
+        return kvm_stpcifc_service_call(cpu, run);
+    }
+
+    DPRINTF("KVM: unhandled PRIV: 0xe3%x\n", ipbl);
+    return -1;
+}
+
 static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
 {
     CPUS390XState *env = &cpu->env;
@@ -1041,6 +1090,9 @@  static int handle_instruction(S390CPU *cpu, struct kvm_run *run)
     case IPA0_EB:
         r = handle_eb(cpu, run, ipa1);
         break;
+    case IPA0_E3:
+        r = handle_e3(cpu, run, run->s390_sieic.ipb & 0xff);
+        break;
     case IPA0_DIAG:
         r = handle_diag(cpu, run, run->s390_sieic.ipb);
         break;
diff --git a/target-s390x/pci_ic.c b/target-s390x/pci_ic.c
new file mode 100644
index 0000000..6c05faf
--- /dev/null
+++ b/target-s390x/pci_ic.c
@@ -0,0 +1,753 @@ 
+/*
+ * s390 PCI intercepts
+ *
+ * Copyright 2014 IBM Corp.
+ * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com>
+ *            Hong Bo Li <lihbbj@cn.ibm.com>
+ *            Yi Min Zhao <zyimin@cn.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/kvm.h>
+#include <asm/ptrace.h>
+#include <hw/pci/pci.h>
+#include <hw/pci/pci_host.h>
+#include <net/net.h>
+
+#include "qemu-common.h"
+#include "qemu/timer.h"
+#include "migration/qemu-file.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "cpu.h"
+#include "sysemu/device_tree.h"
+#include "monitor/monitor.h"
+#include "pci_ic.h"
+
+#include "hw/hw.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci_host.h"
+#include "hw/s390x/s390-pci-bus.h"
+#include "exec/exec-all.h"
+#include "exec/memory-internal.h"
+
+/* #define DEBUG_S390PCI_IC */
+#ifdef DEBUG_S390PCI_IC
+#define DPRINTF(fmt, ...) \
+    do { fprintf(stderr, "s390pci_ic: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+static uint64_t resume_token;
+
+/*
+ * Return the bit position of the lowest set bit in size.
+ * A size of 0 yields 0.
+ */
+static uint8_t barsize(uint64_t size)
+{
+    uint8_t shift = 0;
+
+    if (size == 0) {
+        return 0;
+    }
+
+    while (shift < 64 && !(size & ((uint64_t)1 << shift))) {
+        shift++;
+    }
+
+    return shift;
+}
+
+/*
+ * Store the low 8 bits of status_code into bits 24-31 of general register r,
+ * leaving every other bit of the register untouched.
+ */
+static void s390_set_status_code(CPUS390XState *env,
+                                 uint8_t r, uint64_t status_code)
+{
+    /*
+     * The mask has to be 64 bits wide: a plain 0xff000000 is a 32-bit
+     * unsigned constant, so its complement would be zero-extended and the
+     * AND would also wipe the upper half of the register.
+     */
+    env->regs[r] &= ~0xff000000ULL;
+    env->regs[r] |= (status_code & 0xff) << 24;
+}
+
+/*
+ * Handle the CLP "list PCI" command on the request/response block rrb.
+ *
+ * After validating the request, the response is filled with one entry per
+ * device returned by s390_pci_find_dev_by_idx(), starting at the index kept
+ * in the file-level resume_token, until the response length supplied by the
+ * guest is used up or no further device is found.
+ *
+ * Returns 0 on success.  On failure a response code is stored in the
+ * response header and -EINVAL is returned; *cc is additionally set to 3
+ * when the response length is not a whole number of list entries.
+ */
+static int list_pci(ClpReqRspListPci *rrb, uint8_t *cc)
+{
+    S390PCIBusDevice *pbdev;
+    uint32_t res_code, initial_l2, g_l2, finish;
+    int rc, idx;
+
+    rc = 0;
+    if (be16_to_cpu(rrb->request.hdr.len) != 32) {
+        res_code = CLP_RC_LEN;
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if ((be32_to_cpu(rrb->request.fmt) & CLP_MASK_FMT) != 0) {
+        res_code = CLP_RC_FMT;
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if ((be32_to_cpu(rrb->request.fmt) & ~CLP_MASK_FMT) != 0 ||
+        rrb->request.reserved1 != 0 ||
+        rrb->request.reserved2 != 0) {
+        res_code = CLP_RC_RESNOT0;
+        rc = -EINVAL;
+        goto out;
+    }
+
+    /* A zero token restarts the listing; any other must match ours. */
+    if (be64_to_cpu(rrb->request.resume_token) == 0) {
+        resume_token = 0;
+    } else if (be64_to_cpu(rrb->request.resume_token) != resume_token) {
+        res_code = CLP_RC_LISTPCI_BADRT;
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if (be16_to_cpu(rrb->response.hdr.len) < 48) {
+        res_code = CLP_RC_8K;
+        rc = -EINVAL;
+        goto out;
+    }
+
+    initial_l2 = be16_to_cpu(rrb->response.hdr.len);
+    if ((initial_l2 - LIST_PCI_HDR_LEN) % sizeof(ClpFhListEntry)
+        != 0) {
+        /*
+         * res_code is written to the response header at "out", so it must
+         * carry a defined value on this path as well.
+         */
+        res_code = CLP_RC_LEN;
+        rc = -EINVAL;
+        *cc = 3;
+        goto out;
+    }
+
+    rrb->response.fmt = 0;
+    rrb->response.reserved1 = rrb->response.reserved2 = 0;
+    rrb->response.mdd = cpu_to_be32(FH_VIRT);
+    rrb->response.max_fn = cpu_to_be16(PCI_MAX_FUNCTIONS);
+    rrb->response.entry_size = sizeof(ClpFhListEntry);
+    finish = 0;
+    idx = resume_token;
+    g_l2 = LIST_PCI_HDR_LEN;
+    do {
+        pbdev = s390_pci_find_dev_by_idx(idx);
+        if (!pbdev) {
+            finish = 1;
+            break;
+        }
+        /* Store the ids big-endian, like the fid and fh fields below. */
+        rrb->response.fh_list[idx - resume_token].device_id =
+            cpu_to_be16(pci_get_word(pbdev->pdev->config + PCI_DEVICE_ID));
+        rrb->response.fh_list[idx - resume_token].vendor_id =
+            cpu_to_be16(pci_get_word(pbdev->pdev->config + PCI_VENDOR_ID));
+        rrb->response.fh_list[idx - resume_token].config =
+            cpu_to_be32(0x80000000);
+        rrb->response.fh_list[idx - resume_token].fid = cpu_to_be32(pbdev->fid);
+        rrb->response.fh_list[idx - resume_token].fh = cpu_to_be32(pbdev->fh);
+
+        g_l2 += sizeof(ClpFhListEntry);
+        DPRINTF("g_l2 %d vendor id 0x%x device id 0x%x fid 0x%x fh 0x%x\n",
+            g_l2,
+            rrb->response.fh_list[idx - resume_token].vendor_id,
+            rrb->response.fh_list[idx - resume_token].device_id,
+            rrb->response.fh_list[idx - resume_token].fid,
+            rrb->response.fh_list[idx - resume_token].fh);
+        idx++;
+    } while (g_l2 < initial_l2);
+
+    /* Remember where to continue unless the device list was exhausted. */
+    if (finish == 1) {
+        resume_token = 0;
+    } else {
+        resume_token = idx;
+    }
+    rrb->response.resume_token = cpu_to_be64(resume_token);
+    rrb->response.hdr.len = cpu_to_be16(g_l2);
+    rrb->response.hdr.rsp = cpu_to_be16(CLP_RC_OK);
+out:
+    if (rc) {
+        DPRINTF("list pci failed rc 0x%x\n", rc);
+        rrb->response.hdr.rsp = cpu_to_be16(res_code);
+    }
+    return rc;
+}
+
+/*
+ * Handle an intercepted CLP instruction.
+ *
+ * The request/response block is copied in from the guest address held in
+ * register r2, the command named in the request header is executed, and the
+ * updated block is copied back before the condition code is set.
+ *
+ * Problem state raises a privileged-operation program interrupt; invalid
+ * request or response lengths raise an operand program interrupt.  Always
+ * returns 0.
+ */
+int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    ClpReqHdr *reqh;
+    ClpRspHdr *resh;
+    S390PCIBusDevice *pbdev;
+    uint32_t req_len;
+    uint32_t res_len;
+    uint8_t *buffer;
+    uint8_t cc = 0;
+    CPUS390XState *env = &cpu->env;
+    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
+    int i;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    if (env->psw.mask & PSW_MASK_PSTATE) {
+        program_interrupt(env, PGM_PRIVILEGED, 4);
+        return 0;
+    }
+
+    /* Allocated only after the privilege check; freed on every exit below. */
+    buffer = g_malloc0(4096 * 2);
+
+    /* Read just the request header first to learn the request length. */
+    cpu_physical_memory_rw(env->regs[r2], buffer, sizeof(*reqh), 0);
+    reqh = (ClpReqHdr *)buffer;
+    req_len = be16_to_cpu(reqh->len);
+    if (req_len < 16 || req_len > 8184 || (req_len % 8 != 0)) {
+        program_interrupt(env, PGM_OPERAND, 4);
+        goto out_free;
+    }
+
+    /* The response header directly follows the request. */
+    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + sizeof(*resh), 0);
+    resh = (ClpRspHdr *)(buffer + req_len);
+    res_len = be16_to_cpu(resh->len);
+    if (res_len < 8 || res_len > 8176 || (res_len % 8 != 0)) {
+        program_interrupt(env, PGM_OPERAND, 4);
+        goto out_free;
+    }
+    if ((req_len + res_len) > 8192) {
+        program_interrupt(env, PGM_OPERAND, 4);
+        goto out_free;
+    }
+
+    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + res_len, 0);
+
+    if (req_len != 32) {
+        resh->rsp = cpu_to_be16(CLP_RC_LEN);
+        goto out;
+    }
+
+    switch (reqh->cmd) {
+    case CLP_LIST_PCI: {
+        ClpReqRspListPci *rrb = (ClpReqRspListPci *)buffer;
+        list_pci(rrb, &cc);
+        break;
+    }
+    case CLP_SET_PCI_FN: {
+        ClpReqSetPci *reqsetpci = (ClpReqSetPci *)reqh;
+        ClpRspSetPci *ressetpci = (ClpRspSetPci *)resh;
+
+        pbdev = s390_pci_find_dev_by_fh(be32_to_cpu(reqsetpci->fh));
+        if (!pbdev) {
+            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FH);
+            goto out;
+        }
+
+        switch (reqsetpci->oc) {
+        case CLP_SET_ENABLE_PCI_FN:
+            pbdev->fh = pbdev->fh | 1 << ENABLE_BIT_OFFSET;
+            ressetpci->fh = cpu_to_be32(pbdev->fh);
+            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_OK);
+            break;
+        case CLP_SET_DISABLE_PCI_FN:
+            pbdev->fh = pbdev->fh & ~(1 << ENABLE_BIT_OFFSET);
+            ressetpci->fh = cpu_to_be32(pbdev->fh);
+            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_OK);
+            break;
+        default:
+            DPRINTF("unknown set pci command\n");
+            ressetpci->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FHOP);
+            break;
+        }
+        break;
+    }
+    case CLP_QUERY_PCI_FN: {
+        ClpReqQueryPci *reqquery = (ClpReqQueryPci *)reqh;
+        ClpRspQueryPci *resquery = (ClpRspQueryPci *)resh;
+
+        /* The handle is big-endian in the block, as in CLP_SET_PCI_FN. */
+        pbdev = s390_pci_find_dev_by_fh(be32_to_cpu(reqquery->fh));
+        if (!pbdev) {
+            DPRINTF("query pci no pci dev\n");
+            resquery->hdr.rsp = cpu_to_be16(CLP_RC_SETPCIFN_FH);
+            goto out;
+        }
+
+        for (i = 0; i < PCI_BAR_COUNT; i++) {
+            uint64_t data = pci_host_config_read_common(pbdev->pdev,
+                0x10 + (i * 4), pci_config_size(pbdev->pdev), 4);
+
+            resquery->bar[i] = bswap32(data);
+            resquery->bar_size[i] = barsize(pbdev->pdev->io_regions[i].size);
+            DPRINTF("bar %d addr 0x%x size 0x%lx barsize 0x%x\n", i,
+                    resquery->bar[i], pbdev->pdev->io_regions[i].size,
+                    resquery->bar_size[i]);
+        }
+
+        resquery->sdma = ZPCI_SDMA_ADDR;
+        resquery->edma = ZPCI_EDMA_ADDR;
+        resquery->pchid = 0;
+        resquery->ug = 1;
+        resquery->uid = pbdev->fid;
+
+        resquery->hdr.rsp = cpu_to_be16(CLP_RC_OK);
+        break;
+    }
+    case CLP_QUERY_PCI_FNGRP: {
+        ClpRspQueryPciGrp *resgrp = (ClpRspQueryPciGrp *)resh;
+        resgrp->fr = 1;
+        resgrp->dasm = 0;
+        resgrp->msia = ZPCI_MSI_ADDR;
+        resgrp->mui = 0;
+        resgrp->i = 128;
+        resgrp->version = 0;
+
+        resgrp->hdr.rsp = cpu_to_be16(CLP_RC_OK);
+        break;
+    }
+    default:
+        DPRINTF("unknown clp command\n");
+        resh->rsp = cpu_to_be16(CLP_RC_CMD);
+        break;
+    }
+
+out:
+    /* Hand the updated block back to the guest and report completion. */
+    cpu_physical_memory_rw(env->regs[r2], buffer, req_len + res_len, 1);
+    setcc(cpu, cc);
+out_free:
+    /* Program-interrupt paths land here: nothing is written back. */
+    g_free(buffer);
+    return 0;
+}
+
+/*
+ * Handle an intercepted pcilg instruction (load from a PCI function).
+ *
+ * Register r2 holds the request descriptor (function handle, address space
+ * and length) and register r2 + 1 the offset.  Address spaces 0-5 read from
+ * the memory region of the matching BAR; address space 15 reads from PCI
+ * configuration space.  The loaded value is placed in register r1.
+ *
+ * Always returns 0; failures are reported to the guest through a program
+ * interrupt or through the condition code and status code.
+ */
+int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    CPUS390XState *env = &cpu->env;
+    S390PCIBusDevice *pbdev;
+    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
+    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
+    PciLgStg *rp;
+    MemoryRegion *mr;
+    uint64_t offset;
+    uint64_t data;
+    uint8_t len;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    if (env->psw.mask & PSW_MASK_PSTATE) {
+        program_interrupt(env, PGM_PRIVILEGED, 4);
+        return 0;
+    }
+
+    /* r2 names an even/odd register pair. */
+    if (r2 & 0x1) {
+        program_interrupt(env, PGM_SPECIFICATION, 4);
+        return 0;
+    }
+
+    rp = (PciLgStg *)&env->regs[r2];
+    offset = env->regs[r2 + 1];
+
+    pbdev = s390_pci_find_dev_by_fh(rp->fh);
+    if (!pbdev) {
+        DPRINTF("pcilg no pci dev\n");
+        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
+        return 0;
+    }
+
+    len = rp->len & 0xF;
+    if (rp->pcias < 6) {
+        /* The access must not cross an 8-byte boundary. */
+        if ((8 - (offset & 0x7)) < len) {
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
+        mr = pbdev->pdev->io_regions[rp->pcias].memory;
+        io_mem_read(mr, offset, &data, len);
+    } else if (rp->pcias == 15) {
+        /* The access must not cross a 4-byte boundary. */
+        if ((4 - (offset & 0x3)) < len) {
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
+        data =  pci_host_config_read_common(
+                   pbdev->pdev, offset, pci_config_size(pbdev->pdev), len);
+
+        /*
+         * The config read returns a plain value that does not depend on the
+         * byte order of the host, while the guest expects the data in PCI
+         * (little-endian) byte order.  The swap therefore has to be
+         * unconditional; cpu_to_leXX() would be a no-op on an LE host.
+         */
+        switch (len) {
+        case 1:
+            break;
+        case 2:
+            data = bswap16(data);
+            break;
+        case 4:
+            data = bswap32(data);
+            break;
+        case 8:
+            data = bswap64(data);
+            break;
+        default:
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
+    } else {
+        DPRINTF("invalid space\n");
+        setcc(cpu, ZPCI_PCI_LS_ERR);
+        s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
+        return 0;
+    }
+
+    env->regs[r1] = data;
+    setcc(cpu, ZPCI_PCI_LS_OK);
+    return 0;
+}
+
+/*
+ * Fold the function id of pbdev into an MSI-X message-data word that is
+ * about to be stored.
+ *
+ * Acts only when offset lies 8 bytes into an MSI-X table entry and the
+ * access is 4 bytes wide; *data is then rewritten in place.  Every other
+ * access leaves *data unchanged.
+ */
+static void update_msix_table_msg_data(S390PCIBusDevice *pbdev, uint64_t offset,
+                                       uint64_t *data, uint8_t len)
+{
+    uint32_t patched;
+
+    if (offset % PCI_MSIX_ENTRY_SIZE == 8) {
+        if (len == 4) {
+            patched = (pbdev->fid << ZPCI_MSI_VEC_BITS) | le32_to_cpu(*data);
+            *data = cpu_to_le32(patched);
+            DPRINTF("update msix msg_data to 0x%x\n", patched);
+        } else {
+            DPRINTF("access msix table msg data but len is %d\n", len);
+        }
+    }
+}
+
+/*
+ * Return 1 when pbdev has MSI-X available, pcias is the BAR holding its
+ * MSI-X table, and offset lies between the start of the table and the start
+ * of its last entry (both inclusive); return 0 otherwise.
+ */
+static int trap_msix(S390PCIBusDevice *pbdev, uint64_t offset, uint8_t pcias)
+{
+    if (!pbdev->msix.available || pbdev->msix.table_bar != pcias) {
+        return 0;
+    }
+    if (offset < pbdev->msix.table_offset) {
+        return 0;
+    }
+
+    return offset <= pbdev->msix.table_offset +
+                     (pbdev->msix.entries - 1) * PCI_MSIX_ENTRY_SIZE;
+}
+
+/*
+ * Handle an intercepted pcistg instruction (store to a PCI function).
+ *
+ * Register r1 holds the data, register r2 the request descriptor (function
+ * handle, address space and length) and register r2 + 1 the offset.
+ * Address spaces 0-5 write to the memory region of the matching BAR, with
+ * stores that hit the MSI-X table redirected to the MSI-X table region;
+ * address space 15 writes to PCI configuration space.
+ *
+ * Always returns 0; failures are reported to the guest through a program
+ * interrupt or through the condition code and status code.
+ */
+int kvm_pcistg_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    CPUS390XState *env = &cpu->env;
+    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
+    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
+    PciLgStg *rp;
+    MemoryRegion *mr;
+    uint64_t offset, data;
+    S390PCIBusDevice *pbdev;
+    uint8_t len;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    if (env->psw.mask & PSW_MASK_PSTATE) {
+        program_interrupt(env, PGM_PRIVILEGED, 4);
+        return 0;
+    }
+
+    /* r2 names an even/odd register pair. */
+    if (r2 & 0x1) {
+        program_interrupt(env, PGM_SPECIFICATION, 4);
+        return 0;
+    }
+
+    rp = (PciLgStg *)&env->regs[r2];
+    offset = env->regs[r2 + 1];
+
+    pbdev = s390_pci_find_dev_by_fh(rp->fh);
+    if (!pbdev) {
+        DPRINTF("pcistg no pci dev\n");
+        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
+        return 0;
+    }
+
+    data = env->regs[r1];
+    len = rp->len & 0xF;
+    if (rp->pcias < 6) {
+        /* The access must not cross an 8-byte boundary. */
+        if ((8 - (offset & 0x7)) < len) {
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
+        if (trap_msix(pbdev, offset, rp->pcias)) {
+            /* Redirect to the MSI-X table, using a table-relative offset. */
+            offset = offset - pbdev->msix.table_offset;
+            mr = &pbdev->pdev->msix_table_mmio;
+            update_msix_table_msg_data(pbdev, offset, &data, len);
+        } else {
+            mr = pbdev->pdev->io_regions[rp->pcias].memory;
+        }
+
+        io_mem_write(mr, offset, data, len);
+    } else if (rp->pcias == 15) {
+        /* The access must not cross a 4-byte boundary. */
+        if ((4 - (offset & 0x3)) < len) {
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
+
+        /*
+         * The guest supplies the data in PCI (little-endian) byte order,
+         * while the config write expects a plain value that does not depend
+         * on the byte order of the host.  The swap therefore has to be
+         * unconditional; leXX_to_cpu() would be a no-op on an LE host.
+         */
+        switch (len) {
+        case 1:
+            break;
+        case 2:
+            data = bswap16(data);
+            break;
+        case 4:
+            data = bswap32(data);
+            break;
+        case 8:
+            data = bswap64(data);
+            break;
+        default:
+            program_interrupt(env, PGM_OPERAND, 4);
+            return 0;
+        }
+
+        pci_host_config_write_common(pbdev->pdev, offset,
+                                     pci_config_size(pbdev->pdev),
+                                     data, len);
+    } else {
+        DPRINTF("pcistg invalid space\n");
+        setcc(cpu, ZPCI_PCI_LS_ERR);
+        s390_set_status_code(env, r2, ZPCI_PCI_ST_INVAL_AS);
+        return 0;
+    }
+
+    setcc(cpu, ZPCI_PCI_LS_OK);
+    return 0;
+}
+
+/*
+ * Handle an intercepted rpcit instruction.
+ *
+ * The function handle comes from the upper half of register r1, the I/O
+ * address from register r2 and the size from register r2 + 1.  The guest
+ * I/O translation table of the device is walked for that address and the
+ * resulting entry is passed to memory_region_notify_iommu().
+ *
+ * Always returns 0; failures are reported to the guest through a program
+ * interrupt or the condition code.
+ */
+int kvm_rpcit_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    CPUS390XState *env = &cpu->env;
+    uint8_t r1 = (run->s390_sieic.ipb & 0x00f00000) >> 20;
+    uint8_t r2 = (run->s390_sieic.ipb & 0x000f0000) >> 16;
+    S390PCIBusDevice *pbdev;
+    IOMMUTLBEntry tlbe;
+    ram_addr_t range;
+    uint64_t pte;
+    uint32_t handle;
+    int pte_flags;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    if (env->psw.mask & PSW_MASK_PSTATE) {
+        program_interrupt(env, PGM_PRIVILEGED, 4);
+        return 0;
+    }
+
+    if (r2 & 0x1) {
+        program_interrupt(env, PGM_SPECIFICATION, 4);
+        return 0;
+    }
+
+    handle = env->regs[r1] >> 32;
+    range = env->regs[r2 + 1];
+
+    pbdev = s390_pci_find_dev_by_fh(handle);
+    if (!pbdev) {
+        DPRINTF("rpcit no pci dev\n");
+        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
+        return 0;
+    }
+
+    pte = s390_guest_io_table_walk(s390_pci_get_table_origin(pbdev->g_iota),
+                                   env->regs[r2]);
+    pte_flags = pte & ZPCI_PTE_FLAG_MASK;
+
+    /* An invalid table entry revokes access, a valid one grants read/write. */
+    tlbe.perm = (pte_flags & ZPCI_PTE_INVALID) ? IOMMU_NONE : IOMMU_RW;
+    tlbe.addr_mask = range - 1;
+    tlbe.translated_addr = pte & ZPCI_PTE_ADDR_MASK;
+    tlbe.iova = env->regs[r2];
+    tlbe.target_as = &address_space_memory;
+
+    memory_region_notify_iommu(pci_device_iommu_address_space(
+                               pbdev->pdev)->root, tlbe);
+
+    setcc(cpu, ZPCI_PCI_LS_OK);
+    return 0;
+}
+
+/*
+ * Placeholder for the SIC intercept: nothing is emulated.  The call is only
+ * recorded through qemu_log_mask(LOG_UNIMP) and 0 is returned; neither
+ * argument is used and no condition code is set.
+ */
+int kvm_sic_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    qemu_log_mask(LOG_UNIMP, "SIC missing\n");
+    return 0;
+}
+
+/*
+ * Handle an intercepted PCISTB instruction.
+ *
+ * Register r1 carries the request (function handle, address space and
+ * length, see PciStb), register r3 the offset inside the selected BAR and
+ * the second operand the guest address of the data.  The data is written to
+ * the BAR's memory region in 8 byte units.
+ *
+ * Always returns 0; failures are reported to the guest through a program
+ * interrupt or through condition code and status code.
+ */
+int kvm_pcistb_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    CPUS390XState *env = &cpu->env;
+    uint8_t r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
+    uint8_t r3 = run->s390_sieic.ipa & 0x000f;
+    PciStb *rp;
+    uint64_t gaddr;
+    uint64_t *uaddr, *pu;
+    hwaddr len;
+    S390PCIBusDevice *pbdev;
+    MemoryRegion *mr;
+    int i;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    if (env->psw.mask & PSW_MASK_PSTATE) {
+        program_interrupt(env, PGM_PRIVILEGED, 6);
+        return 0;
+    }
+
+    /*
+     * The register is overlaid with the request layout.
+     * NOTE(review): with fh as first member this matches the register
+     * contents only on a big-endian host - confirm this is intended.
+     */
+    rp = (PciStb *)&env->regs[r1];
+    /* Only the BARs (address spaces 0..5) are valid targets. */
+    if (rp->pcias > 5) {
+        DPRINTF("pcistb invalid space\n");
+        setcc(cpu, ZPCI_PCI_LS_ERR);
+        s390_set_status_code(env, r1, ZPCI_PCI_ST_INVAL_AS);
+        return 0;
+    }
+
+    switch (rp->len) {
+    case 16:
+    case 32:
+    case 64:
+    case 128:
+        break;
+    default:
+        program_interrupt(env, PGM_SPECIFICATION, 6);
+        return 0;
+    }
+
+    gaddr = get_base_disp_rsy(cpu, run);
+    len = rp->len;
+
+    pbdev = s390_pci_find_dev_by_fh(rp->fh);
+    if (!pbdev) {
+        DPRINTF("pcistb no pci dev fh 0x%x\n", rp->fh);
+        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
+        return 0;
+    }
+
+    /*
+     * The mapping may fail or come back shorter than requested; in both
+     * cases the copy loop below must not run on the buffer.
+     */
+    uaddr = cpu_physical_memory_map(gaddr, &len, 0);
+    if (!uaddr || len < rp->len) {
+        if (uaddr) {
+            cpu_physical_memory_unmap(uaddr, len, 0, 0);
+        }
+        program_interrupt(env, PGM_ADDRESSING, 6);
+        return 0;
+    }
+
+    /* NOTE(review): mr is not checked for an unregistered BAR - confirm */
+    mr = pbdev->pdev->io_regions[rp->pcias].memory;
+    if (!memory_region_access_valid(mr, env->regs[r3], rp->len, true)) {
+        cpu_physical_memory_unmap(uaddr, len, 0, len);
+        program_interrupt(env, PGM_ADDRESSING, 6);
+        return 0;
+    }
+
+    pu = uaddr;
+    for (i = 0; i < rp->len / 8; i++) {
+        io_mem_write(mr, env->regs[r3] + i * 8, *pu, 8);
+        pu++;
+    }
+
+    cpu_physical_memory_unmap(uaddr, len, 0, len);
+    setcc(cpu, ZPCI_PCI_LS_OK);
+    return 0;
+}
+
+/*
+ * Register adapter interrupts for a PCI function.
+ *
+ * An I/O adapter is registered for the interruption subclass found in the
+ * FIB, the summary (aisb) and indicator (aibv) addresses are mapped through
+ * the FLIC and the adapter route of the device is filled in from the FIB.
+ * Registration failure trips an assertion.  env is not used.
+ *
+ * Returns 0.
+ */
+static int reg_irqs(CPUS390XState *env, S390PCIBusDevice *pbdev, ZpciFib fib)
+{
+    S390FLICState *flic = s390_get_flic();
+    S390FLICStateClass *flic_class = S390_FLIC_COMMON_GET_CLASS(flic);
+    int rc;
+
+    rc = css_register_io_adapter(S390_PCIPT_ADAPTER,
+                                 FIB_DATA_ISC(fib.data), true, false,
+                                 &pbdev->routes.adapter.adapter_id);
+    assert(rc == 0);
+
+    /* Map the summary bit first, then the indicator bit vector. */
+    flic_class->io_adapter_map(flic, pbdev->routes.adapter.adapter_id,
+                               fib.aisb, true);
+    flic_class->io_adapter_map(flic, pbdev->routes.adapter.adapter_id,
+                               fib.aibv, true);
+
+    /* Remember the addresses and bit offsets for later delivery. */
+    pbdev->routes.adapter.ind_addr = fib.aibv;
+    pbdev->routes.adapter.ind_offset = FIB_DATA_AIBVO(fib.data);
+    pbdev->routes.adapter.summary_addr = fib.aisb;
+    pbdev->routes.adapter.summary_offset = FIB_DATA_AISBO(fib.data);
+
+    DPRINTF("reg_irqs adapter id %d\n", pbdev->routes.adapter.adapter_id);
+    return 0;
+}
+
+/*
+ * Undo reg_irqs(): unmap the indicator and the summary address of the
+ * device's adapter route through the FLIC and clear the route.
+ *
+ * reg_irqs() maps both addresses, so both have to be unmapped here;
+ * otherwise the summary mapping would stay behind.
+ *
+ * Returns 0.
+ */
+static int dereg_irqs(S390PCIBusDevice *pbdev)
+{
+    S390FLICState *fs = s390_get_flic();
+    S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs);
+
+    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id,
+                        pbdev->routes.adapter.ind_addr, false);
+    fsc->io_adapter_map(fs, pbdev->routes.adapter.adapter_id,
+                        pbdev->routes.adapter.summary_addr, false);
+
+    pbdev->routes.adapter.summary_addr = 0;
+    pbdev->routes.adapter.summary_offset = 0;
+    pbdev->routes.adapter.ind_addr = 0;
+    pbdev->routes.adapter.ind_offset = 0;
+
+    DPRINTF("dereg_irqs adapter id %d\n", pbdev->routes.adapter.adapter_id);
+    return 0;
+}
+
+/*
+ * Handle an intercepted MPCIFC instruction.
+ *
+ * Register r1 holds the operation control in its low byte and the function
+ * handle in its upper 32 bits; the second operand is the guest address of
+ * the function information block (FIB), which has to be 8 byte aligned.
+ * Interrupt (de)registration and I/O address translation registration are
+ * acted upon, the remaining known operation controls are accepted without
+ * doing anything.
+ *
+ * Always returns 0; failures are reported to the guest through a program
+ * interrupt or the condition code.
+ */
+int kvm_mpcifc_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    CPUS390XState *env = &cpu->env;
+    uint8_t r1 = (run->s390_sieic.ipa >> 4) & 0xf;
+    S390PCIBusDevice *pbdev;
+    ZpciFib fib;
+    uint64_t fiba;
+    uint32_t fh;
+    uint8_t oc;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    if (env->psw.mask & PSW_MASK_PSTATE) {
+        program_interrupt(env, PGM_PRIVILEGED, 6);
+        return 0;
+    }
+
+    oc = env->regs[r1] & 0xff;
+    fh = env->regs[r1] >> 32;
+
+    fiba = get_base_disp_rxy(cpu, run);
+    if (fiba & 0x7) {
+        program_interrupt(env, PGM_SPECIFICATION, 6);
+        return 0;
+    }
+
+    pbdev = s390_pci_find_dev_by_fh(fh);
+    if (!pbdev) {
+        DPRINTF("mpcifc no pci dev fh 0x%x\n", fh);
+        setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE);
+        return 0;
+    }
+
+    /* Fetch the FIB from guest memory (last argument 0 selects a read). */
+    cpu_physical_memory_rw(fiba, (uint8_t *)&fib, sizeof(fib), 0);
+
+    switch (oc) {
+    case ZPCI_MOD_FC_REG_INT:
+        pbdev->isc = FIB_DATA_ISC(fib.data);
+        reg_irqs(env, pbdev, fib);
+        break;
+    case ZPCI_MOD_FC_DEREG_INT:
+        dereg_irqs(pbdev);
+        break;
+    case ZPCI_MOD_FC_REG_IOAT:
+        /* The base address must not lie beyond the address limit. */
+        if (fib.pba > fib.pal) {
+            program_interrupt(env, PGM_OPERAND, 6);
+            return 0;
+        }
+        pbdev->g_iota = fib.iota;
+        break;
+    case ZPCI_MOD_FC_DEREG_IOAT:
+    case ZPCI_MOD_FC_REREG_IOAT:
+    case ZPCI_MOD_FC_RESET_ERROR:
+    case ZPCI_MOD_FC_RESET_BLOCK:
+    case ZPCI_MOD_FC_SET_MEASURE:
+        /* Accepted, but nothing is done for these operations. */
+        break;
+    default:
+        program_interrupt(env, PGM_OPERAND, 6);
+        return 0;
+    }
+
+    setcc(cpu, ZPCI_PCI_LS_OK);
+    return 0;
+}
+
+/*
+ * Placeholder for the STPCIFC intercept: nothing is emulated.  The call is
+ * only recorded through qemu_log_mask(LOG_UNIMP) and 0 is returned; neither
+ * argument is used and no condition code is set.
+ */
+int kvm_stpcifc_service_call(S390CPU *cpu, struct kvm_run *run)
+{
+    qemu_log_mask(LOG_UNIMP, "STPCIFC missing\n");
+    return 0;
+}
diff --git a/target-s390x/pci_ic.h b/target-s390x/pci_ic.h
new file mode 100644
index 0000000..0eb6c27
--- /dev/null
+++ b/target-s390x/pci_ic.h
@@ -0,0 +1,335 @@ 
+/*
+ * s390 PCI intercept definitions
+ *
+ * Copyright 2014 IBM Corp.
+ * Author(s): Frank Blaschka <frank.blaschka@de.ibm.com>
+ *            Hong Bo Li <lihbbj@cn.ibm.com>
+ *            Yi Min Zhao <zyimin@cn.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#ifndef PCI_IC_S390X_H
+#define PCI_IC_S390X_H
+
+#include <sysemu/dma.h>
+
+/* CLP common request & response block size */
+#define CLP_BLK_SIZE 4096
+/* Number of BARs; sizes the bar[] and bar_size[] arrays of ClpRspQueryPci */
+#define PCI_BAR_COUNT 6
+/* NOTE(review): presumably the upper bound on PCI functions - confirm */
+#define PCI_MAX_FUNCTIONS 4096
+
+/* Header found at the start of every CLP request block */
+typedef struct ClpReqHdr {
+    uint16_t len;
+    uint16_t cmd;
+} QEMU_PACKED ClpReqHdr;
+
+/* Header found at the start of every CLP response block */
+typedef struct ClpRspHdr {
+    uint16_t len;
+    uint16_t rsp;
+} QEMU_PACKED ClpRspHdr;
+
+/* CLP Response Codes */
+#define CLP_RC_OK         0x0010  /* Command request successful */
+#define CLP_RC_CMD        0x0020  /* Command code not recognized */
+#define CLP_RC_PERM       0x0030  /* Command not authorized */
+#define CLP_RC_FMT        0x0040  /* Invalid command request format */
+#define CLP_RC_LEN        0x0050  /* Invalid command request length */
+#define CLP_RC_8K         0x0060  /* Command requires 8K LPCB */
+#define CLP_RC_RESNOT0    0x0070  /* Reserved field not zero */
+#define CLP_RC_NODATA     0x0080  /* No data available */
+#define CLP_RC_FC_UNKNOWN 0x0100  /* Function code not recognized */
+
+/*
+ * Call Logical Processor - Command Codes
+ */
+#define CLP_LIST_PCI            0x0002  /* List PCI functions */
+#define CLP_QUERY_PCI_FN        0x0003  /* Query PCI function */
+#define CLP_QUERY_PCI_FNGRP     0x0004  /* Query PCI function group */
+#define CLP_SET_PCI_FN          0x0005  /* Set PCI function */
+
+/* PCI function handle list entry, element of ClpRspListPci.fh_list */
+typedef struct ClpFhListEntry {
+    uint16_t device_id;
+    uint16_t vendor_id;
+#define CLP_FHLIST_MASK_CONFIG 0x80000000
+    uint32_t config;
+    uint32_t fid;
+    uint32_t fh;
+} QEMU_PACKED ClpFhListEntry;
+
+/* Response codes specific to individual CLP commands */
+#define CLP_RC_SETPCIFN_FH      0x0101 /* Invalid PCI fn handle */
+#define CLP_RC_SETPCIFN_FHOP    0x0102 /* Fn handle not valid for op */
+#define CLP_RC_SETPCIFN_DMAAS   0x0103 /* Invalid DMA addr space */
+#define CLP_RC_SETPCIFN_RES     0x0104 /* Insufficient resources */
+#define CLP_RC_SETPCIFN_ALRDY   0x0105 /* Fn already in requested state */
+#define CLP_RC_SETPCIFN_ERR     0x0106 /* Fn in permanent error state */
+#define CLP_RC_SETPCIFN_RECPND  0x0107 /* Error recovery pending */
+#define CLP_RC_SETPCIFN_BUSY    0x0108 /* Fn busy */
+#define CLP_RC_LISTPCI_BADRT    0x010a /* Resume token not recognized */
+#define CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */
+
+/* request or response block header length */
+#define LIST_PCI_HDR_LEN 32
+
+/*
+ * Number of function handles fitting in response block: the block size
+ * minus one request and one response header, divided by the entry size.
+ */
+#define CLP_FH_LIST_NR_ENTRIES \
+    ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \
+        / sizeof(ClpFhListEntry))
+
+/* NOTE(review): presumably values for ClpReqSetPci.oc - confirm */
+#define CLP_SET_ENABLE_PCI_FN  0 /* Yes, 0 enables it */
+#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */
+
+/* Size in bytes of ClpRspQueryPci.util_str */
+#define CLP_UTIL_STR_LEN 64
+
+/* NOTE(review): looks like a mask for the fmt fields below - confirm */
+#define CLP_MASK_FMT 0xf0000000
+
+/* List PCI functions request (LIST_PCI_HDR_LEN bytes) */
+typedef struct ClpReqListPci {
+    ClpReqHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+    uint64_t resume_token;
+    uint64_t reserved2;
+} QEMU_PACKED ClpReqListPci;
+
+/*
+ * List PCI functions response: LIST_PCI_HDR_LEN bytes of header fields
+ * followed by the function handle list.
+ */
+typedef struct ClpRspListPci {
+    ClpRspHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+    uint64_t resume_token;
+    uint32_t mdd;
+    uint16_t max_fn;
+    uint8_t reserved2;
+    uint8_t entry_size;
+    ClpFhListEntry fh_list[CLP_FH_LIST_NR_ENTRIES];
+} QEMU_PACKED ClpRspListPci;
+
+/* Query PCI function request */
+typedef struct ClpReqQueryPci {
+    ClpReqHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+    uint32_t fh; /* function handle */
+    uint32_t reserved2;
+    uint64_t reserved3;
+} QEMU_PACKED ClpReqQueryPci;
+
+/* Query PCI function response */
+typedef struct ClpRspQueryPci {
+    ClpRspHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+    uint16_t vfn; /* virtual fn number */
+#define CLP_RSP_QPCI_MASK_UTIL  0x100
+#define CLP_RSP_QPCI_MASK_PFGID 0xff
+    uint16_t ug;
+    uint32_t fid; /* pci function id */
+    uint8_t bar_size[PCI_BAR_COUNT];
+    uint16_t pchid;
+    uint32_t bar[PCI_BAR_COUNT];
+    uint64_t reserved2;
+    uint64_t sdma; /* start dma as */
+    uint64_t edma; /* end dma as */
+    uint32_t reserved3[11];
+    uint32_t uid;
+    uint8_t util_str[CLP_UTIL_STR_LEN]; /* utility string */
+} QEMU_PACKED ClpRspQueryPci;
+
+/* Query PCI function group request */
+typedef struct ClpReqQueryPciGrp {
+    ClpReqHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+#define CLP_REQ_QPCIG_MASK_PFGID 0xff
+    uint32_t g;
+    uint32_t reserved2;
+    uint64_t reserved3;
+} QEMU_PACKED ClpReqQueryPciGrp;
+
+/* Query PCI function group response */
+typedef struct ClpRspQueryPciGrp {
+    ClpRspHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+#define CLP_RSP_QPCIG_MASK_NOI 0xfff
+    uint16_t i;
+    uint8_t version;
+#define CLP_RSP_QPCIG_MASK_FRAME   0x2
+#define CLP_RSP_QPCIG_MASK_REFRESH 0x1
+    uint8_t fr;
+    uint16_t reserved2;
+    uint16_t mui;
+    uint64_t reserved3;
+    uint64_t dasm; /* dma address space mask */
+    uint64_t msia; /* MSI address */
+    uint64_t reserved4;
+    uint64_t reserved5;
+} QEMU_PACKED ClpRspQueryPciGrp;
+
+/* Set PCI function request */
+typedef struct ClpReqSetPci {
+    ClpReqHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+    uint32_t fh; /* function handle */
+    uint16_t reserved2;
+    uint8_t oc; /* operation controls */
+    uint8_t ndas; /* number of dma spaces */
+    uint64_t reserved3;
+} QEMU_PACKED ClpReqSetPci;
+
+/* Set PCI function response */
+typedef struct ClpRspSetPci {
+    ClpRspHdr hdr;
+    uint32_t fmt;
+    uint64_t reserved1;
+    uint32_t fh; /* function handle */
+    uint32_t reserved3;
+    uint64_t reserved4;
+} QEMU_PACKED ClpRspSetPci;
+
+/* List PCI functions request directly followed by its response */
+typedef struct ClpReqRspListPci {
+    ClpReqListPci request;
+    ClpRspListPci response;
+} QEMU_PACKED ClpReqRspListPci;
+
+/* Set PCI function request directly followed by its response */
+typedef struct ClpReqRspSetPci {
+    ClpReqSetPci request;
+    ClpRspSetPci response;
+} QEMU_PACKED ClpReqRspSetPci;
+
+/* Query PCI function request directly followed by its response */
+typedef struct ClpReqRspQueryPci {
+    ClpReqQueryPci request;
+    ClpRspQueryPci response;
+} QEMU_PACKED ClpReqRspQueryPci;
+
+/* Query PCI function group request directly followed by its response */
+typedef struct ClpReqRspQueryPciGrp {
+    ClpReqQueryPciGrp request;
+    ClpRspQueryPciGrp response;
+} QEMU_PACKED ClpReqRspQueryPciGrp;
+
+/*
+ * 8 byte request layout with the same members as PciStb.
+ * NOTE(review): presumably overlaid on a general register by the PCI
+ * load/store handlers, as PciStb is - confirm.
+ */
+typedef struct PciLgStg {
+    uint32_t fh;
+    uint8_t status;
+    uint8_t pcias;
+    uint8_t reserved;
+    uint8_t len;
+} QEMU_PACKED PciLgStg;
+
+/*
+ * 8 byte request layout that kvm_pcistb_service_call() overlays on general
+ * register r1: function handle, status, address space and length.
+ */
+typedef struct PciStb {
+    uint32_t fh;
+    uint8_t status;
+    uint8_t pcias;
+    uint8_t reserved;
+    uint8_t len;
+} QEMU_PACKED PciStb;
+
+/* Load/Store status codes, reported through s390_set_status_code() */
+#define ZPCI_PCI_ST_FUNC_NOT_ENABLED        4
+#define ZPCI_PCI_ST_FUNC_IN_ERR             8
+#define ZPCI_PCI_ST_BLOCKED                 12
+#define ZPCI_PCI_ST_INSUF_RES               16
+#define ZPCI_PCI_ST_INVAL_AS                20
+#define ZPCI_PCI_ST_FUNC_ALREADY_ENABLED    24
+#define ZPCI_PCI_ST_DMA_AS_NOT_ENABLED      28
+#define ZPCI_PCI_ST_2ND_OP_IN_INV_AS        36
+#define ZPCI_PCI_ST_FUNC_NOT_AVAIL          40
+#define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE     44
+
+/* Load/Store return codes; the handlers pass these to setcc() */
+#define ZPCI_PCI_LS_OK              0
+#define ZPCI_PCI_LS_ERR             1
+#define ZPCI_PCI_LS_BUSY            2
+#define ZPCI_PCI_LS_INVAL_HANDLE    3
+
+/*
+ * Modify PCI Function Controls: operation control values that
+ * kvm_mpcifc_service_call() reads from the low byte of register r1.
+ */
+#define ZPCI_MOD_FC_REG_INT     2
+#define ZPCI_MOD_FC_DEREG_INT   3
+#define ZPCI_MOD_FC_REG_IOAT    4
+#define ZPCI_MOD_FC_DEREG_IOAT  5
+#define ZPCI_MOD_FC_REREG_IOAT  6
+#define ZPCI_MOD_FC_RESET_ERROR 7
+#define ZPCI_MOD_FC_RESET_BLOCK 9
+#define ZPCI_MOD_FC_SET_MEASURE 10
+
+/* FIB function controls: bits of ZpciFib.fc */
+#define ZPCI_FIB_FC_ENABLED     0x80
+#define ZPCI_FIB_FC_ERROR       0x40
+#define ZPCI_FIB_FC_LS_BLOCKED  0x20
+#define ZPCI_FIB_FC_DMAAS_REG   0x10
+
+/*
+ * Function Information Block, read from guest memory by
+ * kvm_mpcifc_service_call().  The FIB_DATA_* macros extract the sub-fields
+ * of the data member.
+ */
+typedef struct ZpciFib {
+    uint8_t fmt;                 /* format */
+    uint8_t reserved1[7];
+    uint8_t fc;                  /* function controls */
+    uint8_t reserved2;
+    uint16_t reserved3;
+    uint32_t reserved4;
+    uint64_t pba;                /* PCI base address */
+    uint64_t pal;                /* PCI address limit */
+    uint64_t iota;               /* I/O Translation Anchor */
+#define FIB_DATA_ISC(x)    (((x) >> 28) & 0x7)
+#define FIB_DATA_NOI(x)    (((x) >> 16) & 0xfff)
+#define FIB_DATA_AIBVO(x)  (((x) >> 8) & 0x3f)
+#define FIB_DATA_SUM(x)    (((x) >> 7) & 0x1)
+#define FIB_DATA_AISBO(x)  ((x) & 0x3f)
+    uint32_t data;
+    uint32_t reserved5;
+    uint64_t aibv;               /* Adapter int bit vector address */
+    uint64_t aisb;               /* Adapter int summary bit address */
+    uint64_t fmb_addr;           /* Function measurement address and key */
+    uint32_t reserved6;
+    uint32_t gd;
+} QEMU_PACKED ZpciFib;
+
+/*
+ * Compute the second-operand address of an intercepted instruction from
+ * its index register, base register and 20 bit signed displacement.
+ *
+ * The index register number is the low nibble of ipa, the base register
+ * number the top nibble of ipb.  A register number of 0 contributes 0
+ * instead of the register contents.
+ */
+static inline uint64_t get_base_disp_rxy(S390CPU *cpu, struct kvm_run *run)
+{
+    CPUS390XState *env = &cpu->env;
+    uint32_t ipb = run->s390_sieic.ipb;
+    uint32_t x2 = run->s390_sieic.ipa & 0x000f;
+    uint32_t base2 = ipb >> 28;
+    /* low 12 displacement bits, then the 8 high bits placed above them */
+    uint32_t disp2 = ((ipb >> 16) & 0xfff) | ((ipb & 0xff00) << 4);
+    uint64_t addr = 0;
+
+    /* sign-extend the 20 bit displacement to 32 bits */
+    if (disp2 & 0x80000) {
+        disp2 |= 0xfff00000;
+    }
+
+    if (base2) {
+        addr += env->regs[base2];
+    }
+    if (x2) {
+        addr += env->regs[x2];
+    }
+
+    return addr + (int64_t)(int32_t)disp2;
+}
+
+/*
+ * Compute the second-operand address of an intercepted instruction from
+ * its base register and 20 bit signed displacement (no index register).
+ *
+ * The base register number is the top nibble of ipb; a register number of
+ * 0 contributes 0 instead of the register contents.
+ */
+static inline uint64_t get_base_disp_rsy(S390CPU *cpu, struct kvm_run *run)
+{
+    CPUS390XState *env = &cpu->env;
+    uint32_t ipb = run->s390_sieic.ipb;
+    uint32_t base2 = ipb >> 28;
+    /* low 12 displacement bits, then the 8 high bits placed above them */
+    uint32_t disp2 = ((ipb >> 16) & 0xfff) | ((ipb & 0xff00) << 4);
+    uint64_t addr = 0;
+
+    /* sign-extend the 20 bit displacement to 32 bits */
+    if (disp2 & 0x80000) {
+        disp2 |= 0xfff00000;
+    }
+
+    if (base2) {
+        addr = env->regs[base2];
+    }
+
+    return addr + (int64_t)(int32_t)disp2;
+}
+
+/*
+ * Handlers for the intercepted PCI instructions, one per instruction.
+ * Each takes the intercepting CPU and its kvm_run area.
+ */
+int kvm_clp_service_call(S390CPU *cpu, struct kvm_run *run);
+int kvm_rpcit_service_call(S390CPU *cpu, struct kvm_run *run);
+int kvm_sic_service_call(S390CPU *cpu, struct kvm_run *run);
+int kvm_pcistb_service_call(S390CPU *cpu, struct kvm_run *run);
+int kvm_mpcifc_service_call(S390CPU *cpu, struct kvm_run *run);
+int kvm_pcistg_service_call(S390CPU *cpu, struct kvm_run *run);
+int kvm_pcilg_service_call(S390CPU *cpu, struct kvm_run *run);
+int kvm_stpcifc_service_call(S390CPU *cpu, struct kvm_run *run);
+
+#endif