diff mbox

[V4,2/3] occ: Add support for OPAL-OCC command/response interface

Message ID 1498760408-20178-3-git-send-email-shilpa.bhat@linux.vnet.ibm.com
State Superseded
Headers show

Commit Message

Shilpasri G Bhat June 29, 2017, 6:20 p.m. UTC
This patch adds support for a shared memory based command/response
interface between OCC and OPAL. In HOMER, there is an OPAL command
buffer and an OCC response buffer which is used to send inband
commands to OCC.

The OPAL-OCC command/response sequence is as follows:

1. Check if both 'OCC Progress' bit in OCC response flag and 'Cmd Ready'
   bit in OPAL command flag are set to zero. If yes then proceed with
   below steps to send a command to OCC.
2. Write the command value, request ID and command specific data
   to the OPAL command buffer.
3. Clear the response flag and set the 'Cmd Ready' bit in OPAL command
   flag to indicate command is ready.
4. OCC will poll the command flag every 4ms to check if 'Cmd Ready' bit
   is set by OPAL. If the bit is set then OCC will set the 'OCC Progress'
   bit.
5. OCC will process the command and write the response to the OCC response
   buffer and set the 'Rsp Ready' bit in the response flag and sends an
   interrupt.
8. OPAL will receive the interrupt and queue the response to the host.

Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
---
Changes from V3:
- Rebased on master
- Update the command data declaration for OCC_SENSOR_LIMIT_GROUP_JOB_SCHED
  as per the latest OCC-OPAL interface document

Chnages from V2:
- Add token as a new parameter to opal_occ_command()
- Get rid of the response queue

 hw/occ.c           | 415 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/opal-api.h |  82 ++++++++++-
 2 files changed, 492 insertions(+), 5 deletions(-)

Comments

Stewart Smith July 7, 2017, 8:53 a.m. UTC | #1
Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
> This patch adds support for a shared memory based command/response
> interface between OCC and OPAL. In HOMER, there is an OPAL command
> buffer and an OCC response buffer which is used to send inband
> commands to OCC.
>
> The OPAL-OCC command/response sequence is as follows:
>
> 1. Check if both 'OCC Progress' bit in OCC response flag and 'Cmd Ready'
>    bit in OPAL command flag are set to zero. If yes then proceed with
>    below steps to send a command to OCC.
> 2. Write the command value, request ID and command specific data
>    to the OPAL command buffer.
> 3. Clear the response flag and set the 'Cmd Ready' bit in OPAL command
>    flag to indicate command is ready.
> 4. OCC will poll the command flag every 4ms to check if 'Cmd Ready' bit
>    is set by OPAL. If the bit is set then OCC will set the 'OCC Progress'
>    bit.
> 5. OCC will process the command and write the response to the OCC response
>    buffer and set the 'Rsp Ready' bit in the response flag and sends an
>    interrupt.
> 8. OPAL will receive the interrupt and queue the response to the host.
>
> Signed-off-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
> ---
> Changes from V3:
> - Rebased on master
> - Update the command data declaration for OCC_SENSOR_LIMIT_GROUP_JOB_SCHED
>   as per the latest OCC-OPAL interface document
>
> Chnages from V2:
> - Add token as a new parameter to opal_occ_command()
> - Get rid of the response queue
>
>  hw/occ.c           | 415 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  include/opal-api.h |  82 ++++++++++-
>  2 files changed, 492 insertions(+), 5 deletions(-)
>
> diff --git a/hw/occ.c b/hw/occ.c
> index 832bf42..553c29f 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -120,6 +120,87 @@ struct occ_pstate_table {
>  } __packed;
>  
>  /**
> + * OPAL-OCC Command Response Interface
> + *
> + * OPAL-OCC Command Buffer
> + *
> + * ---------------------------------------------------------------------
> + * | OPAL  |  Cmd    | OPAL |	       | Cmd Data | Cmd Data | OPAL    |
> + * | Cmd   | Request | OCC  | Reserved | Length   | Length   | Cmd     |
> + * | Flags |   ID    | Cmd  |	       | (MSB)    | (LSB)    | Data... |
> + * ---------------------------------------------------------------------
> + * |  ….OPAL Command Data up to max of Cmd Data Length 4090 bytes      |
> + * |								       |
> + * ---------------------------------------------------------------------
> + *
> + * OPAL Command Flag
> + *
> + * -----------------------------------------------------------------
> + * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
> + * | (msb) |	   |	   |	   |	   |	   |	   | (lsb) |
> + * -----------------------------------------------------------------
> + * |Cmd    |       |       |       |       |       |       |       |
> + * |Ready  |	   |	   |	   |	   |	   |	   |	   |
> + * -----------------------------------------------------------------
> + *
> + * struct opal_command_buffer -	Defines the layout of OPAL command buffer
> + * @flag:			Provides general status of the command
> + * @request_id:			Token to identify request
> + * @cmd:			Command sent
> + * @data_size:			Command data length
> + * @data:			Command specific data
> + * @spare:			Unused byte
> + */
> +struct opal_command_buffer {
> +	u8 flag;
> +	u8 request_id;
> +	u8 cmd;
> +	u8 spare;
> +	u16 data_size;
> +	u8 data[MAX_OPAL_CMD_DATA_LENGTH];
> +} __packed;
> +
> +/**
> + * OPAL-OCC Response Buffer
> + *
> + * ---------------------------------------------------------------------
> + * | OCC   |  Cmd    | OPAL | Response | Rsp Data | Rsp Data | OPAL    |
> + * | Rsp   | Request | OCC  |  Status  | Length   | Length   | Rsp     |
> + * | Flags |   ID    | Cmd  |	       | (MSB)    | (LSB)    | Data... |
> + * ---------------------------------------------------------------------
> + * |  ….OPAL Response Data up to max of Rsp Data Length 8698 bytes     |
> + * |								       |
> + * ---------------------------------------------------------------------
> + *
> + * OCC Response Flag
> + *
> + * -----------------------------------------------------------------
> + * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
> + * | (msb) |	   |	   |	   |	   |	   |	   | (lsb) |
> + * -----------------------------------------------------------------
> + * |       |       |       |       |       |       |OCC in  | Rsp  |
> + * |       |	   |	   |	   |	   |	   |progress|Ready |
> + * -----------------------------------------------------------------
> + *
> + * struct occ_response_buffer -	Defines the layout of OCC response buffer
> + * @flag:			Provides general status of the response
> + * @request_id:			Token to identify request
> + * @cmd:			Command requested
> + * @status:			Indicates success/failure status of
> + *				the command
> + * @data_size:			Response data length
> + * @data:			Response specific data
> + */
> +struct occ_response_buffer {
> +	u8 flag;
> +	u8 request_id;
> +	u8 cmd;
> +	u8 status;
> +	u16 data_size;
> +	u8 data[MAX_OCC_RSP_DATA_LENGTH];
> +} __packed;
> +
> +/**
>   * OCC-OPAL Shared Memory Interface Dynamic Data Vx90
>   *
>   * struct occ_dynamic_data -	Contains runtime attributes
> @@ -136,6 +217,8 @@ struct occ_pstate_table {
>   * @max_pwr_cap:		Maximum allowed system power cap in Watts
>   * @cur_pwr_cap:		Current system power cap
>   * @spare/reserved:		Unused data
> + * @cmd:			Opal Command Buffer
> + * @rsp:			OCC Response Buffer
>   */
>  struct occ_dynamic_data {
>  	u8 occ_state;
> @@ -151,7 +234,9 @@ struct occ_dynamic_data {
>  	u16 min_pwr_cap;
>  	u16 max_pwr_cap;
>  	u16 cur_pwr_cap;
> -	u64 reserved;
> +	u8 pad[112];
> +	struct opal_command_buffer cmd;
> +	struct occ_response_buffer rsp;
>  } __packed;
>  
>  static bool occ_reset;
> @@ -843,6 +928,325 @@ done:
>  	unlock(&occ_lock);
>  }
>  
> +#define OCC_RSP_READY		0x01
> +#define OCC_IN_PROGRESS		0x02
> +#define OPAL_CMD_READY		0x80
> +
> +enum occ_state {
> +	OCC_STATE_NOT_RUNNING		= 0x00,
> +	OCC_STATE_STANDBY		= 0x01,
> +	OCC_STATE_OBSERVATION		= 0x02,
> +	OCC_STATE_ACTIVE		= 0x03,
> +	OCC_STATE_SAFE			= 0x04,
> +	OCC_STATE_CHARACTERIZATION	= 0x05,
> +};
> +
> +enum occ_role {
> +	OCC_ROLE_SLAVE		= 0x0,
> +	OCC_ROLE_MASTER		= 0x1,
> +};
> +
> +enum occ_cmd_value {
> +	OCC_CMD_VALUE_AMESTER_PASS_THRU		= 0x41,
> +	OCC_CMD_VALUE_CLEAR_SENSOR_DATA		= 0xD0,
> +	OCC_CMD_VALUE_SET_POWER_CAP		= 0xD1,
> +	OCC_CMD_VALUE_SET_POWER_SHIFTING_RATIO	= 0xD2,
> +	OCC_CMD_VALUE_SELECT_SENSOR_GROUPS	= 0xD3,
> +};
> +
> +struct opal_occ_cmd_info {
> +	enum	occ_cmd_value value;
> +	int	timeout_ms;
> +	u16	state_mask;
> +	u8	role_mask;
> +};
> +
> +static struct opal_occ_cmd_info occ_cmds[] = {
> +	{	OCC_CMD_VALUE_AMESTER_PASS_THRU,
> +		1000,
> +		PPC_BIT16(OCC_STATE_OBSERVATION) |
> +		PPC_BIT16(OCC_STATE_ACTIVE) |
> +		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
> +		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
> +	},
> +	{	OCC_CMD_VALUE_CLEAR_SENSOR_DATA,
> +		1000,
> +		PPC_BIT16(OCC_STATE_OBSERVATION) |
> +		PPC_BIT16(OCC_STATE_ACTIVE) |
> +		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
> +		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
> +	},
> +	{	OCC_CMD_VALUE_SET_POWER_CAP,
> +		1000,
> +		PPC_BIT16(OCC_STATE_OBSERVATION) |
> +		PPC_BIT16(OCC_STATE_ACTIVE) |
> +		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
> +		PPC_BIT8(OCC_ROLE_MASTER)
> +	},
> +	{	OCC_CMD_VALUE_SET_POWER_SHIFTING_RATIO,
> +		1000,
> +		PPC_BIT16(OCC_STATE_OBSERVATION) |
> +		PPC_BIT16(OCC_STATE_ACTIVE) |
> +		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
> +		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
> +	},
> +	{	OCC_CMD_VALUE_SELECT_SENSOR_GROUPS,
> +		1000,
> +		PPC_BIT16(OCC_STATE_OBSERVATION) |
> +		PPC_BIT16(OCC_STATE_ACTIVE) |
> +		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
> +		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
> +	},
> +};
> +
> +static struct cmd_interface {
> +	struct lock queue_lock;
> +	struct timer timeout;
> +	struct opal_command_buffer *cmd;
> +	struct occ_response_buffer *rsp;
> +	struct opal_occ_cmd_rsp_msg *msg;
> +	u32 id;
> +	u32 token;
> +	enum occ_cmd prev_cmd;
> +	u8 occ_role;
> +	u8 *occ_state;
> +	bool cmd_in_progress;
> +} *chips;
> +
> +static int nr_occs;
> +
> +static inline struct cmd_interface *get_chip_cmd_interface(int chip_id)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_occs; i++)
> +		if (chips[i].id == chip_id)
> +			return &chips[i];
> +
> +	return NULL;
> +}
> +
> +static inline bool occ_in_progress(struct cmd_interface *chip)
> +{
> +	return (chip->rsp->flag == OCC_IN_PROGRESS);
> +}
> +
> +static int write_occ_cmd(struct cmd_interface *chip, bool retry)
> +{
> +	struct opal_command_buffer *cmd = chip->cmd;
> +	struct opal_occ_cmd_rsp_msg *msg = chip->msg;
> +
> +	if (!retry && occ_in_progress(chip)) {
> +		chip->cmd_in_progress = false;
> +		return OPAL_OCC_BUSY;
> +	}
> +
> +	cmd->flag = chip->rsp->flag = 0;
> +	cmd->cmd = occ_cmds[msg->cmd].value;
> +	cmd->request_id = msg->request_id;
> +	cmd->data_size = msg->cdata_size;
> +	memcpy(&cmd->data, (u8 *)msg->cdata, msg->cdata_size);
> +	cmd->flag = OPAL_CMD_READY;
> +
> +	schedule_timer(&chip->timeout,
> +		       msecs_to_tb(occ_cmds[msg->cmd].timeout_ms));
> +	chip->prev_cmd = msg->cmd;
> +
> +	return OPAL_ASYNC_COMPLETION;
> +}
> +
> +static int64_t opal_occ_command(int chip_id, struct opal_occ_cmd_rsp_msg *msg,
> +				int token, bool retry)

It looks like retry is not actually retry, but rather 'return if busy,
don't queue' as in write_occ_cmd that's the logic.

> +{
> +	struct cmd_interface *chip;
> +	int rc;
> +
> +	if (!msg || !opal_addr_valid((u8 *)msg->cdata) ||
> +	    !opal_addr_valid((u8 *)msg->rdata))
> +		return OPAL_PARAMETER;

no check if msg is valid addr?

> +
> +	if (msg->cmd >= OCC_CMD_LAST)
> +		return OPAL_UNSUPPORTED;
> +
> +	if (msg->cdata_size > MAX_OPAL_CMD_DATA_LENGTH)
> +		return OPAL_PARAMETER;
> +
> +	chip = get_chip_cmd_interface(chip_id);
> +	if (!chip)
> +		return OPAL_PARAMETER;
> +
> +	if (retry && msg->cmd != chip->prev_cmd)
> +		return OPAL_PARAMETER;
> +
> +	if (!(PPC_BIT8(chip->occ_role) & occ_cmds[msg->cmd].role_mask))
> +		return OPAL_PARAMETER;
> +
> +	if (!(PPC_BIT16(*chip->occ_state) & occ_cmds[msg->cmd].state_mask))
> +		return OPAL_OCC_INVALID_STATE;
> +
> +	lock(&chip->queue_lock);
> +	if (chip->cmd_in_progress) {
> +		rc = OPAL_OCC_BUSY;
> +		goto out;
> +	}
> +
> +	chip->msg = msg;
> +	chip->token = token;
> +	chip->cmd_in_progress = true;
> +	rc = write_occ_cmd(chip, retry);
> +out:
> +	unlock(&chip->queue_lock);
> +	return rc;
> +}
> +
> +static inline bool sanity_check_opal_cmd(struct opal_command_buffer *cmd,
> +					 struct opal_occ_cmd_rsp_msg *msg)
> +{
> +	return ((cmd->cmd == occ_cmds[msg->cmd].value) &&
> +		(cmd->request_id == msg->request_id) &&
> +		(cmd->data_size == msg->cdata_size));
> +}
> +
> +static inline bool check_occ_rsp(struct opal_command_buffer *cmd,
> +				 struct occ_response_buffer *rsp)
> +{
> +	if (cmd->cmd != rsp->cmd) {
> +		prlog(PR_WARNING, "OCC: Command value mismatch in OCC response"
> +		      "rsp->cmd = %d cmd->cmd = %d\n", rsp->cmd, cmd->cmd);
> +		return false;
> +	}
> +
> +	if (cmd->request_id != rsp->request_id) {
> +		prlog(PR_WARNING, "OCC: Request ID mismatch in OCC response"
> +		      "rsp->request_id = %d cmd->request_id = %d\n",
> +		      rsp->request_id, cmd->request_id);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static inline void queue_occ_rsp_msg(int token, int rc)
> +{
> +	int ret;
> +
> +	ret = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
> +	if (ret)
> +		prerror("OCC: Failed to queue OCC response status message\n");
> +}
> +
> +static void occ_cmd_timeout_handler(struct timer *t __unused, void *data,
> +				    uint64_t now __unused)
> +{
> +	struct cmd_interface *chip = data;
> +
> +	lock(&chip->queue_lock);
> +	if (!chip->cmd_in_progress)
> +		goto exit;
> +
> +	chip->cmd_in_progress = false;
> +	queue_occ_rsp_msg(chip->token, OPAL_OCC_CMD_TIMEOUT);
> +exit:
> +	unlock(&chip->queue_lock);
> +}
> +
> +static void read_occ_rsp(struct occ_response_buffer *rsp,
> +			 struct opal_occ_cmd_rsp_msg *msg)
> +{
> +	/* Copy response to host buffer */
> +	msg->status = rsp->status;
> +	msg->rdata_size = rsp->data_size;
> +	memcpy((u8 *)msg->rdata, rsp->data, rsp->data_size);
> +
> +	/* Clear the OCC response flag */
> +	rsp->flag = 0;
> +}
> +
> +static void handle_occ_rsp(uint32_t chip_id)
> +{
> +	struct cmd_interface *chip;
> +	struct opal_command_buffer *cmd;
> +	struct occ_response_buffer *rsp;
> +	struct opal_occ_cmd_rsp_msg *msg;
> +
> +	chip = get_chip_cmd_interface(chip_id);
> +	if (!chip)
> +		return;
> +
> +	cmd = chip->cmd;
> +	rsp = chip->rsp;
> +	msg = chip->msg;
> +
> +	/*Read rsp*/
> +	if (rsp->flag != OCC_RSP_READY)
> +		return;
> +	lock(&chip->queue_lock);
> +	if (!chip->cmd_in_progress)
> +		goto exit;
> +
> +	chip->cmd_in_progress = false;
> +	cancel_timer(&chip->timeout);
> +	if (!sanity_check_opal_cmd(cmd, chip->msg) ||
> +	    !check_occ_rsp(cmd, rsp)) {
> +		queue_occ_rsp_msg(chip->token, OPAL_OCC_CMD_MISMATCH);
> +		goto exit;
> +	}
> +
> +	read_occ_rsp(chip->rsp, msg);
> +	queue_occ_rsp_msg(chip->token, OPAL_SUCCESS);
> +exit:
> +	unlock(&chip->queue_lock);
> +}
> +
> +static void occ_cmd_interface_init(void)
> +{
> +	struct dt_node *power_mgt;
> +	struct occ_dynamic_data *data;
> +	struct occ_pstate_table *pdata;
> +	struct proc_chip *chip;
> +	int i = 0;
> +
> +	chip = next_chip(NULL);
> +	pdata = get_occ_pstate_table(chip);
> +	if (pdata->version != 0x90)
> +		return;
> +
> +	for_each_chip(chip)
> +		nr_occs++;
> +
> +	chips = malloc(sizeof(*chips) * nr_occs);
> +	assert(chips);
> +
> +	for_each_chip(chip) {
> +		pdata = get_occ_pstate_table(chip);
> +		data = get_occ_dynamic_data(chip);
> +		chips[i].id = chip->id;
> +		chips[i].occ_role = pdata->v9.occ_role;
> +		chips[i].occ_state = &data->occ_state;
> +		chips[i].cmd = &data->cmd;
> +		chips[i].rsp = &data->rsp;
> +		init_lock(&chips[i].queue_lock);
> +		chips[i].cmd_in_progress = false;
> +		chips[i].prev_cmd = OCC_CMD_LAST;
> +		init_timer(&chips[i].timeout, occ_cmd_timeout_handler,
> +			   &chips[i]);
> +		i++;
> +	}
> +
> +	power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
> +	if (!power_mgt) {
> +		prerror("OCC: dt node /ibm,opal/power-mgt not found\n");
> +		free(chips);
> +		return;
> +	}
> +
> +	update_max_async_completions(nr_occs);
> +	alloc_opal_msg_list(nr_occs);
> +	dt_add_property_string(power_mgt, "compatible",
> +			       "ibm,opal-occ-cmd-rsp-interface");
> +	opal_register(OPAL_OCC_COMMAND, opal_occ_command, 4);

This means you need to add documentation to doc/opal-api/ describing the
API.


> @@ -910,6 +1314,9 @@ void occ_pstates_init(void)
>  		chip->throttle = 0;
>  	opal_add_poller(occ_throttle_poll, NULL);
>  	occ_pstates_initialized = true;
> +
> +	/* Init OPAL-OCC command-response interface */
> +	occ_cmd_interface_init();
>  }
>  
>  struct occ_load_req {
> @@ -1409,8 +1816,10 @@ void occ_p9_interrupt(uint32_t chip_id)
>  	if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
>  		prd_tmgt_interrupt(chip_id);
>  
> -	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM)
> +	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
>  		occ_throttle_poll(NULL);
> +		handle_occ_rsp(chip_id);
> +	}
>  
>  	if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
>  		p9_i2c_bus_owner_change(chip_id);
> @@ -1435,5 +1844,3 @@ void occ_fsp_init(void)
>  	if (fsp_present())
>  		fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC);
>  }
> -
> -
> diff --git a/include/opal-api.h b/include/opal-api.h
> index edae069..dd92cf9 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -55,6 +55,10 @@
>  #define OPAL_XSCOM_CTR_OFFLINED	-30
>  #define OPAL_XIVE_PROVISIONING	-31
>  #define OPAL_XIVE_FREE_ACTIVE	-32
> +#define OPAL_OCC_INVALID_STATE	-33

Instead of adding new return codes, I think existing ones suffice, or is
there a really big need to have a way to distinguish these errors from
other ones?  e.g. insetad of OPAL_OCC_INVALID_STATE, it could be
OPAL_WRONG_STATE - and this is consistent with other OPAL calls.

> +#define OPAL_OCC_BUSY		-34

OPAL_BUSY

> +#define OPAL_OCC_CMD_TIMEOUT	-35

We probably want an OPAL_TIMEOUT actually... we have OPAL_I2C_TIMEOUT
and OPAL_XSCOM_TIMEOUT

> +#define OPAL_OCC_CMD_MISMATCH	-36

OPAL_PARAMETER

Does that make sense? Any reason not to do that?

>  
>  /* API Tokens (in r0) */
>  #define OPAL_INVALID_CALL		       -1
> @@ -207,7 +211,8 @@
>  #define OPAL_IMC_COUNTERS_INIT			149
>  #define OPAL_IMC_COUNTERS_START			150
>  #define OPAL_IMC_COUNTERS_STOP			151
> -#define OPAL_LAST				151
> +#define OPAL_OCC_COMMAND			152
> +#define OPAL_LAST				152
>  
>  /* Device tree flags */
>  
> @@ -1033,6 +1038,81 @@ typedef struct oppanel_line {
>  	__be64 line_len;
>  } oppanel_line_t;
>  
> +enum occ_cmd {
> +	OCC_CMD_AMESTER_PASS_THRU = 0,

What is amester passthru for? Do we need this as part of our ABI?

> +	OCC_CMD_CLEAR_SENSOR_DATA,

Clear what sensors?

> +	OCC_CMD_SET_POWER_CAP,

Power cap for what? System? Socket?

> +	OCC_CMD_SET_POWER_SHIFTING_RATIO,
> +	OCC_CMD_SELECT_SENSOR_GROUPS,

Basically, this all needs to be documented somewhere, notably in
doc/opal-api/

I also wonder if this approach is the best for the OPAL API. While we do
*currently* have a thing called an OCC that does these functions, is
that *always* going to be the case? Elsewhere in firmware we've been
going down the route of calling things "pm_complex" and sending commands
to/from the pm_complex to do tasks for us.

would it be better if our OPAL API instead described the intended effect
rather than how to achieve said effect?

I wouldn't be opposed to an OPAL_SET_POWER_CAP call, that's an async
call, and accepts parameters in a range that's described in the device
tree. That call could be implemented on P8 too (where you currently set
power caps by doing crazy IPMI incantations to the BMC), and that would
prove that the OPAL API is implementation independent, and we should be
future proof with however we do the intent of capping
system/socket/other-thing power in the future.

How do the sensors work? Do they fit into the existing OPAL Sensors
infrastructure? are they more in-memory sensors? I ask as perhaps the
select/clear calls should be part of either the regular opal sensors API
or the IMC api rather than their own API.

> +
> +enum occ_cmd_data_length {
> +	OCC_CMD_DL_AMESTER_PASS_THRU		= 0, /* Variable data length */
> +	OCC_CMD_DL_CLEAR_SENSOR_DATA		= 4,
> +	OCC_CMD_DL_SET_POWER_CAP		= 2,
> +	OCC_CMD_DL_SET_POWER_SHIFTING_RATIO	= 1,
> +	OCC_CMD_DL_SELECT_SENSOR_GROUPS		= 2,
> +};
> +
> +enum occ_rsp_data_length {
> +	OCC_RSP_DL_AMESTER_PASS_THRU		= 0, /* Variable data length */
> +	OCC_RSP_DL_CLEAR_SENSOR_DATA		= 4,
> +	OCC_RSP_DL_SET_POWER_CAP		= 2,
> +	OCC_RSP_DL_SET_POWER_SHIFTING_RATIO	= 1,
> +	OCC_RSP_DL_SELECT_SENSOR_GROUPS		= 2,
> +};
> +
> +enum occ_sensor_limit_group {
> +	OCC_SENSOR_LIMIT_GROUP_CSM		= 0x10,
> +	OCC_SENSOR_LIMIT_GROUP_PROFILER		= 0x20,
> +	OCC_SENSOR_LIMIT_GROUP_JOB_SCHED	= 0x40,
> +};
> +
> +enum occ_sensor_group_mask {
> +	OCC_SENSOR_GROUP_MASK_PERFORMANCE	= 6,
> +	OCC_SENSOR_GROUP_MASK_POWER		= 8,
> +	OCC_SENSOR_GROUP_MASK_FREQUENCY		= 9,
> +	OCC_SENSOR_GROUP_MASK_TIME		= 10,
> +	OCC_SENSOR_GROUP_MASK_UTILIZATION	= 11,
> +	OCC_SENSOR_GROUP_MASK_TEMPERATURE	= 12,
> +	OCC_SENSOR_GROUP_MASK_VOLTAGE		= 13,
> +	OCC_SENSOR_GROUP_MASK_CURRENT		= 14,
> +};
> +
> +#define MAX_OPAL_CMD_DATA_LENGTH	4090
> +#define MAX_OCC_RSP_DATA_LENGTH		8698
> +
> +enum occ_response_status {
> +	OCC_SUCCESS			= 0x00,
> +	OCC_INVALID_COMMAND		= 0x11,
> +	OCC_INVALID_CMD_DATA_LENGTH	= 0x12,
> +	OCC_INVALID_DATA		= 0x13,
> +	OCC_INTERNAL_ERROR		= 0x15,
> +};
> +
> +struct opal_occ_cmd_rsp_msg {
> +	__be64 cdata;
> +	__be64 rdata;
> +	__be16 cdata_size;
> +	__be16 rdata_size;
> +	u8 cmd;
> +	u8 request_id;
> +	u8 status;
> +};
> +
> +struct opal_occ_cmd_data {
> +	__be16 size;
> +	u8 cmd;
> +	u8 data[];
> +};
> +
> +struct opal_occ_rsp_data {
> +	__be16 size;
> +	u8 status;
> +	u8 data[];
> +};
> +
>  enum opal_prd_msg_type {
>  	OPAL_PRD_MSG_TYPE_INIT = 0,	/* HBRT --> OPAL */
>  	OPAL_PRD_MSG_TYPE_FINI,		/* HBRT/kernel --> OPAL */
> -- 
> 1.8.3.1
>
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Shilpasri G Bhat July 10, 2017, 7:04 a.m. UTC | #2
Hi Stewart,

On 07/07/2017 02:23 PM, Stewart Smith wrote:
> Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:


>> +static inline bool occ_in_progress(struct cmd_interface *chip)
>> +{
>> +	return (chip->rsp->flag == OCC_IN_PROGRESS);
>> +}
>> +
>> +static int write_occ_cmd(struct cmd_interface *chip, bool retry)
>> +{
>> +	struct opal_command_buffer *cmd = chip->cmd;
>> +	struct opal_occ_cmd_rsp_msg *msg = chip->msg;
>> +
>> +	if (!retry && occ_in_progress(chip)) {
>> +		chip->cmd_in_progress = false;
>> +		return OPAL_OCC_BUSY;
>> +	}
>> +
>> +	cmd->flag = chip->rsp->flag = 0;
>> +	cmd->cmd = occ_cmds[msg->cmd].value;
>> +	cmd->request_id = msg->request_id;
>> +	cmd->data_size = msg->cdata_size;
>> +	memcpy(&cmd->data, (u8 *)msg->cdata, msg->cdata_size);
>> +	cmd->flag = OPAL_CMD_READY;
>> +
>> +	schedule_timer(&chip->timeout,
>> +		       msecs_to_tb(occ_cmds[msg->cmd].timeout_ms));
>> +	chip->prev_cmd = msg->cmd;
>> +
>> +	return OPAL_ASYNC_COMPLETION;
>> +}
>> +
>> +static int64_t opal_occ_command(int chip_id, struct opal_occ_cmd_rsp_msg *msg,
>> +				int token, bool retry)
> 
> It looks like retry is not actually retry, but rather 'return if busy,
> don't queue' as in write_occ_cmd that's the logic.

If kernel is retrying the same command due to a command-timeout in the previous
try, then don't check the OCC status flag as OCC might not have been able to
update the status flag. 'retry' flag is used to indicate that kernel is retrying
the command.

Do you want 'retry' state to be stored in OPAL ?

> 
>> +{
>> +	struct cmd_interface *chip;
>> +	int rc;
>> +
>> +	if (!msg || !opal_addr_valid((u8 *)msg->cdata) ||
>> +	    !opal_addr_valid((u8 *)msg->rdata))
>> +		return OPAL_PARAMETER;
> 
> no check if msg is valid addr?

Yeah will add the check for 'msg'.

> 
>> +
>> +	if (msg->cmd >= OCC_CMD_LAST)
>> +		return OPAL_UNSUPPORTED;
>> +
>> +	if (msg->cdata_size > MAX_OPAL_CMD_DATA_LENGTH)
>> +		return OPAL_PARAMETER;
>> +
>> +	chip = get_chip_cmd_interface(chip_id);
>> +	if (!chip)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (retry && msg->cmd != chip->prev_cmd)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (!(PPC_BIT8(chip->occ_role) & occ_cmds[msg->cmd].role_mask))
>> +		return OPAL_PARAMETER;
>> +
>> +	if (!(PPC_BIT16(*chip->occ_state) & occ_cmds[msg->cmd].state_mask))
>> +		return OPAL_OCC_INVALID_STATE;
>> +
>> +	lock(&chip->queue_lock);
>> +	if (chip->cmd_in_progress) {
>> +		rc = OPAL_OCC_BUSY;
>> +		goto out;
>> +	}
>> +
>> +	chip->msg = msg;
>> +	chip->token = token;
>> +	chip->cmd_in_progress = true;
>> +	rc = write_occ_cmd(chip, retry);
>> +out:
>> +	unlock(&chip->queue_lock);
>> +	return rc;
>> +}
>> +
>> +static inline bool sanity_check_opal_cmd(struct opal_command_buffer *cmd,
>> +					 struct opal_occ_cmd_rsp_msg *msg)
>> +{
>> +	return ((cmd->cmd == occ_cmds[msg->cmd].value) &&
>> +		(cmd->request_id == msg->request_id) &&
>> +		(cmd->data_size == msg->cdata_size));
>> +}
>> +
>> +static inline bool check_occ_rsp(struct opal_command_buffer *cmd,
>> +				 struct occ_response_buffer *rsp)
>> +{
>> +	if (cmd->cmd != rsp->cmd) {
>> +		prlog(PR_WARNING, "OCC: Command value mismatch in OCC response"
>> +		      "rsp->cmd = %d cmd->cmd = %d\n", rsp->cmd, cmd->cmd);
>> +		return false;
>> +	}
>> +
>> +	if (cmd->request_id != rsp->request_id) {
>> +		prlog(PR_WARNING, "OCC: Request ID mismatch in OCC response"
>> +		      "rsp->request_id = %d cmd->request_id = %d\n",
>> +		      rsp->request_id, cmd->request_id);
>> +		return false;
>> +	}
>> +
>> +	return true;
>> +}
>> +
>> +static inline void queue_occ_rsp_msg(int token, int rc)
>> +{
>> +	int ret;
>> +
>> +	ret = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
>> +	if (ret)
>> +		prerror("OCC: Failed to queue OCC response status message\n");
>> +}
>> +
>> +static void occ_cmd_timeout_handler(struct timer *t __unused, void *data,
>> +				    uint64_t now __unused)
>> +{
>> +	struct cmd_interface *chip = data;
>> +
>> +	lock(&chip->queue_lock);
>> +	if (!chip->cmd_in_progress)
>> +		goto exit;
>> +
>> +	chip->cmd_in_progress = false;
>> +	queue_occ_rsp_msg(chip->token, OPAL_OCC_CMD_TIMEOUT);
>> +exit:
>> +	unlock(&chip->queue_lock);
>> +}
>> +
>> +static void read_occ_rsp(struct occ_response_buffer *rsp,
>> +			 struct opal_occ_cmd_rsp_msg *msg)
>> +{
>> +	/* Copy response to host buffer */
>> +	msg->status = rsp->status;
>> +	msg->rdata_size = rsp->data_size;
>> +	memcpy((u8 *)msg->rdata, rsp->data, rsp->data_size);
>> +
>> +	/* Clear the OCC response flag */
>> +	rsp->flag = 0;
>> +}
>> +
>> +static void handle_occ_rsp(uint32_t chip_id)
>> +{
>> +	struct cmd_interface *chip;
>> +	struct opal_command_buffer *cmd;
>> +	struct occ_response_buffer *rsp;
>> +	struct opal_occ_cmd_rsp_msg *msg;
>> +
>> +	chip = get_chip_cmd_interface(chip_id);
>> +	if (!chip)
>> +		return;
>> +
>> +	cmd = chip->cmd;
>> +	rsp = chip->rsp;
>> +	msg = chip->msg;
>> +
>> +	/*Read rsp*/
>> +	if (rsp->flag != OCC_RSP_READY)
>> +		return;
>> +	lock(&chip->queue_lock);
>> +	if (!chip->cmd_in_progress)
>> +		goto exit;
>> +
>> +	chip->cmd_in_progress = false;
>> +	cancel_timer(&chip->timeout);
>> +	if (!sanity_check_opal_cmd(cmd, chip->msg) ||
>> +	    !check_occ_rsp(cmd, rsp)) {
>> +		queue_occ_rsp_msg(chip->token, OPAL_OCC_CMD_MISMATCH);
>> +		goto exit;
>> +	}
>> +
>> +	read_occ_rsp(chip->rsp, msg);
>> +	queue_occ_rsp_msg(chip->token, OPAL_SUCCESS);
>> +exit:
>> +	unlock(&chip->queue_lock);
>> +}
>> +
>> +static void occ_cmd_interface_init(void)
>> +{
>> +	struct dt_node *power_mgt;
>> +	struct occ_dynamic_data *data;
>> +	struct occ_pstate_table *pdata;
>> +	struct proc_chip *chip;
>> +	int i = 0;
>> +
>> +	chip = next_chip(NULL);
>> +	pdata = get_occ_pstate_table(chip);
>> +	if (pdata->version != 0x90)
>> +		return;
>> +
>> +	for_each_chip(chip)
>> +		nr_occs++;
>> +
>> +	chips = malloc(sizeof(*chips) * nr_occs);
>> +	assert(chips);
>> +
>> +	for_each_chip(chip) {
>> +		pdata = get_occ_pstate_table(chip);
>> +		data = get_occ_dynamic_data(chip);
>> +		chips[i].id = chip->id;
>> +		chips[i].occ_role = pdata->v9.occ_role;
>> +		chips[i].occ_state = &data->occ_state;
>> +		chips[i].cmd = &data->cmd;
>> +		chips[i].rsp = &data->rsp;
>> +		init_lock(&chips[i].queue_lock);
>> +		chips[i].cmd_in_progress = false;
>> +		chips[i].prev_cmd = OCC_CMD_LAST;
>> +		init_timer(&chips[i].timeout, occ_cmd_timeout_handler,
>> +			   &chips[i]);
>> +		i++;
>> +	}
>> +
>> +	power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
>> +	if (!power_mgt) {
>> +		prerror("OCC: dt node /ibm,opal/power-mgt not found\n");
>> +		free(chips);
>> +		return;
>> +	}
>> +
>> +	update_max_async_completions(nr_occs);
>> +	alloc_opal_msg_list(nr_occs);
>> +	dt_add_property_string(power_mgt, "compatible",
>> +			       "ibm,opal-occ-cmd-rsp-interface");
>> +	opal_register(OPAL_OCC_COMMAND, opal_occ_command, 4);
> 
> This means you need to add documentation to doc/opal-api/ describing the
> API.
> 
> 

Okay.

>> @@ -910,6 +1314,9 @@ void occ_pstates_init(void)
>>  		chip->throttle = 0;
>>  	opal_add_poller(occ_throttle_poll, NULL);
>>  	occ_pstates_initialized = true;
>> +
>> +	/* Init OPAL-OCC command-response interface */
>> +	occ_cmd_interface_init();
>>  }
>>  
>>  struct occ_load_req {
>> @@ -1409,8 +1816,10 @@ void occ_p9_interrupt(uint32_t chip_id)
>>  	if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
>>  		prd_tmgt_interrupt(chip_id);
>>  
>> -	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM)
>> +	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
>>  		occ_throttle_poll(NULL);
>> +		handle_occ_rsp(chip_id);
>> +	}
>>  
>>  	if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
>>  		p9_i2c_bus_owner_change(chip_id);
>> @@ -1435,5 +1844,3 @@ void occ_fsp_init(void)
>>  	if (fsp_present())
>>  		fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC);
>>  }
>> -
>> -
>> diff --git a/include/opal-api.h b/include/opal-api.h
>> index edae069..dd92cf9 100644
>> --- a/include/opal-api.h
>> +++ b/include/opal-api.h
>> @@ -55,6 +55,10 @@
>>  #define OPAL_XSCOM_CTR_OFFLINED	-30
>>  #define OPAL_XIVE_PROVISIONING	-31
>>  #define OPAL_XIVE_FREE_ACTIVE	-32
>> +#define OPAL_OCC_INVALID_STATE	-33
> 
> Instead of adding new return codes, I think existing ones suffice, or is
> there a really big need to have a way to distinguish these errors from
> other ones?  e.g. insetad of OPAL_OCC_INVALID_STATE, it could be
> OPAL_WRONG_STATE - and this is consistent with other OPAL calls.
> 
>> +#define OPAL_OCC_BUSY		-34
> 
> OPAL_BUSY

Yeah OPAL_BUSY works.

> 
>> +#define OPAL_OCC_CMD_TIMEOUT	-35
> 
> We probably want an OPAL_TIMEOUT actually... we have OPAL_I2C_TIMEOUT
> and OPAL_XSCOM_TIMEOUT
> 
>> +#define OPAL_OCC_CMD_MISMATCH	-36
> 
> OPAL_PARAMETER
> 
> Does that make sense? Any reason not to do that?
> 

OPAL_OCC_CMD_TIMEOUT and OPAL_OCC_CMD_MISMATCH is used by kerenl to retry the
OCC command. As OPAL_PARAMETER is also used for the sanity checks of the
parameters passed to opal_call, kernel wont be able to distinguish when to retry.

>>  
>>  /* API Tokens (in r0) */
>>  #define OPAL_INVALID_CALL		       -1
>> @@ -207,7 +211,8 @@
>>  #define OPAL_IMC_COUNTERS_INIT			149
>>  #define OPAL_IMC_COUNTERS_START			150
>>  #define OPAL_IMC_COUNTERS_STOP			151
>> -#define OPAL_LAST				151
>> +#define OPAL_OCC_COMMAND			152
>> +#define OPAL_LAST				152
>>  
>>  /* Device tree flags */
>>  
>> @@ -1033,6 +1038,81 @@ typedef struct oppanel_line {
>>  	__be64 line_len;
>>  } oppanel_line_t;
>>  
>> +enum occ_cmd {
>> +	OCC_CMD_AMESTER_PASS_THRU = 0,
> 
> What is amester passthru for? Do we need this as part of our ABI?

This command will be used by Amester for tracing the sensor values at 250us/2ms
sampling rate. In P8, Amester used IPMI for tracing. In P9, Amester will be able
to communicate to OCC inband using this cmd/rsp interface.

Yeah we need this command to support inband Amester tracing.

> 
>> +	OCC_CMD_CLEAR_SENSOR_DATA,
> 
> Clear what sensors?

Min-max values of sensors stored for different users like CSM, Profiler and
Job-scheduler.

How about OCC_CMD_CLEAR_SENSOR_MIN_MAX instead?

> 
>> +	OCC_CMD_SET_POWER_CAP,
> 
> Power cap for what? System? Socket?

System.

> 
>> +	OCC_CMD_SET_POWER_SHIFTING_RATIO,
>> +	OCC_CMD_SELECT_SENSOR_GROUPS,
> 
> Basically, this all needs to be documented somewhere, notably in
> doc/opal-api/

Okay.

> 
> I also wonder if this approach is the best for the OPAL API. While we do
> *currently* have a thing called an OCC that does these functions, is
> that *always* going to be the case? Elsewhere in firmware we've been
> going down the route of calling things "pm_complex" and sending commands
> to/from the pm_complex to do tasks for us.
> 
> would it be better if our OPAL API instead described the intended effect
> rather than how to achieve said effect?
> 
> I wouldn't be opposed to an OPAL_SET_POWER_CAP call, that's an async
> call, and accepts parameters in a range that's described in the device
> tree. That call could be implemented on P8 too (where you currently set
> power caps by doing crazy IPMI incantations to the BMC), and that would
> prove that the OPAL API is implementation independent, and we should be
> future proof with however we do the intent of capping
> system/socket/other-thing power in the future.

I agree with defining an abstract API for setting powercap as a better approach.

> 
> How do the sensors work? Do they fit into the existing OPAL Sensors
> infrastructure? are they more in-memory sensors? I ask as perhaps the
> select/clear calls should be part of either the regular opal sensors API
> or the IMC api rather than their own API.

Currently, only in-memory environment sensors are exported in the OPAL sensor
infrastructure. Yes there are more perf/util/freq in-memory sensors that are not
exported.

The current sensor framework provides API to only read the sensor values. We
will need new API additions to the sensor framework then to clear min/max of
in-memory sensors and select the sensor groups to be copied to main memory.
So should we have new opal-calls defined for these sensor operations instead of
the OCC cmd/rsp interface?


Thanks and Regards,
Shilpa

>> -- 
>> 1.8.3.1
>>
>> _______________________________________________
>> Skiboot mailing list
>> Skiboot@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/skiboot
>
Stewart Smith July 16, 2017, 11:26 p.m. UTC | #3
Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
> On 07/07/2017 02:23 PM, Stewart Smith wrote:
>> Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
>
>
>>> +static inline bool occ_in_progress(struct cmd_interface *chip)
>>> +{
>>> +	return (chip->rsp->flag == OCC_IN_PROGRESS);
>>> +}
>>> +
>>> +static int write_occ_cmd(struct cmd_interface *chip, bool retry)
>>> +{
>>> +	struct opal_command_buffer *cmd = chip->cmd;
>>> +	struct opal_occ_cmd_rsp_msg *msg = chip->msg;
>>> +
>>> +	if (!retry && occ_in_progress(chip)) {
>>> +		chip->cmd_in_progress = false;
>>> +		return OPAL_OCC_BUSY;
>>> +	}
>>> +
>>> +	cmd->flag = chip->rsp->flag = 0;
>>> +	cmd->cmd = occ_cmds[msg->cmd].value;
>>> +	cmd->request_id = msg->request_id;
>>> +	cmd->data_size = msg->cdata_size;
>>> +	memcpy(&cmd->data, (u8 *)msg->cdata, msg->cdata_size);
>>> +	cmd->flag = OPAL_CMD_READY;
>>> +
>>> +	schedule_timer(&chip->timeout,
>>> +		       msecs_to_tb(occ_cmds[msg->cmd].timeout_ms));
>>> +	chip->prev_cmd = msg->cmd;
>>> +
>>> +	return OPAL_ASYNC_COMPLETION;
>>> +}
>>> +
>>> +static int64_t opal_occ_command(int chip_id, struct opal_occ_cmd_rsp_msg *msg,
>>> +				int token, bool retry)
>> 
>> It looks like retry is not actually retry, but rather 'return if busy,
>> don't queue' as in write_occ_cmd that's the logic.
>
> If kernel is retrying the same command due to a command-timeout in the previous
> try, then don't check the OCC status flag as OCC might not have been able to
> update the status flag. 'retry' flag is used to indicate that kernel is retrying
> the command.
>
> Do you want 'retry' state to be stored in OPAL ?

I think that would be better. If part of the protocol in talking to the
OCC is that if at first you don't succeed, retry without looking at
status, then we should do the retry within OPAL transparently to Linux,
so that the kernel just sees success/fail.

>>> @@ -910,6 +1314,9 @@ void occ_pstates_init(void)
>>>  		chip->throttle = 0;
>>>  	opal_add_poller(occ_throttle_poll, NULL);
>>>  	occ_pstates_initialized = true;
>>> +
>>> +	/* Init OPAL-OCC command-response interface */
>>> +	occ_cmd_interface_init();
>>>  }
>>>  
>>>  struct occ_load_req {
>>> @@ -1409,8 +1816,10 @@ void occ_p9_interrupt(uint32_t chip_id)
>>>  	if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
>>>  		prd_tmgt_interrupt(chip_id);
>>>  
>>> -	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM)
>>> +	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
>>>  		occ_throttle_poll(NULL);
>>> +		handle_occ_rsp(chip_id);
>>> +	}
>>>  
>>>  	if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
>>>  		p9_i2c_bus_owner_change(chip_id);
>>> @@ -1435,5 +1844,3 @@ void occ_fsp_init(void)
>>>  	if (fsp_present())
>>>  		fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC);
>>>  }
>>> -
>>> -
>>> diff --git a/include/opal-api.h b/include/opal-api.h
>>> index edae069..dd92cf9 100644
>>> --- a/include/opal-api.h
>>> +++ b/include/opal-api.h
>>> @@ -55,6 +55,10 @@
>>>  #define OPAL_XSCOM_CTR_OFFLINED	-30
>>>  #define OPAL_XIVE_PROVISIONING	-31
>>>  #define OPAL_XIVE_FREE_ACTIVE	-32
>>> +#define OPAL_OCC_INVALID_STATE	-33
>> 
>> Instead of adding new return codes, I think existing ones suffice, or is
>> there a really big need to have a way to distinguish these errors from
>> other ones?  e.g. insetad of OPAL_OCC_INVALID_STATE, it could be
>> OPAL_WRONG_STATE - and this is consistent with other OPAL calls.
>> 
>>> +#define OPAL_OCC_BUSY		-34
>> 
>> OPAL_BUSY
>
> Yeah OPAL_BUSY works.
>
>> 
>>> +#define OPAL_OCC_CMD_TIMEOUT	-35
>> 
>> We probably want an OPAL_TIMEOUT actually... we have OPAL_I2C_TIMEOUT
>> and OPAL_XSCOM_TIMEOUT
>> 
>>> +#define OPAL_OCC_CMD_MISMATCH	-36
>> 
>> OPAL_PARAMETER
>> 
>> Does that make sense? Any reason not to do that?
>> 
>
> OPAL_OCC_CMD_TIMEOUT and OPAL_OCC_CMD_MISMATCH is used by kerenl to retry the
> OCC command. As OPAL_PARAMETER is also used for the sanity checks of the
> parameters passed to opal_call, kernel wont be able to distinguish
> when to retry.

If the retry logic is in OPAL though, that problem should go away.

>>> @@ -1033,6 +1038,81 @@ typedef struct oppanel_line {
>>>  	__be64 line_len;
>>>  } oppanel_line_t;
>>>  
>>> +enum occ_cmd {
>>> +	OCC_CMD_AMESTER_PASS_THRU = 0,
>> 
>> What is amester passthru for? Do we need this as part of our ABI?
>
> This command will be used by Amester for tracing the sensor values at 250us/2ms
> sampling rate. In P8, Amester used IPMI for tracing. In P9, Amester will be able
> to communicate to OCC inband using this cmd/rsp interface.
>
> Yeah we need this command to support inband Amester tracing.

So, the P8 design is pretty awful, random undocumented out of band IPMI
commands aren't great.

So, what exactly is this call accomplishing though? Is it setting a
sampling frequency? I really dislike these kind of passthrough APIs, as
the care to maintain backwards compatibility expands to many places when
we could have just kept it between OPAL and Linux.

>>> +	OCC_CMD_CLEAR_SENSOR_DATA,
>> 
>> Clear what sensors?
>
> Min-max values of sensors stored for different users like CSM, Profiler and
> Job-scheduler.
>
> How about OCC_CMD_CLEAR_SENSOR_MIN_MAX instead?

I don't think it needs to be OCC specific, why not add an API to clear
any OPAL_SENSOR? For any sensor that isn't min/max and clearable (could
be property in device tree, it'd return OPAL_UNSUPPORTED. That way,
these sensors could be read by existing kernels, although a small patch
would be required to clear them.

>>> +	OCC_CMD_SET_POWER_CAP,
>> 
>> Power cap for what? System? Socket?
>
> System.
>
>>> +	OCC_CMD_SET_POWER_SHIFTING_RATIO,
>>> +	OCC_CMD_SELECT_SENSOR_GROUPS,
>> 
>> Basically, this all needs to be documented somewhere, notably in
>> doc/opal-api/
>
> Okay.
>
>> 
>> I also wonder if this approach is the best for the OPAL API. While we do
>> *currently* have a thing called an OCC that does these functions, is
>> that *always* going to be the case? Elsewhere in firmware we've been
>> going down the route of calling things "pm_complex" and sending commands
>> to/from the pm_complex to do tasks for us.
>> 
>> would it be better if our OPAL API instead described the intended effect
>> rather than how to achieve said effect?
>> 
>> I wouldn't be opposed to an OPAL_SET_POWER_CAP call, that's an async
>> call, and accepts parameters in a range that's described in the device
>> tree. That call could be implemented on P8 too (where you currently set
>> power caps by doing crazy IPMI incantations to the BMC), and that would
>> prove that the OPAL API is implementation independent, and we should be
>> future proof with however we do the intent of capping
>> system/socket/other-thing power in the future.
>
> I agree with defining an abstract API for setting powercap as a better
> approach.

Great! I think it'd end up being a much better approach.

>> How do the sensors work? Do they fit into the existing OPAL Sensors
>> infrastructure? are they more in-memory sensors? I ask as perhaps the
>> select/clear calls should be part of either the regular opal sensors API
>> or the IMC api rather than their own API.
>
> Currently, only in-memory environment sensors are exported in the OPAL sensor
> infrastructure. Yes there are more perf/util/freq in-memory sensors that are not
> exported.
>
> The current sensor framework provides API to only read the sensor values. We
> will need new API additions to the sensor framework then to clear min/max of
> in-memory sensors and select the sensor groups to be copied to main memory.
> So should we have new opal-calls defined for these sensor operations instead of
> the OCC cmd/rsp interface?

Okay, so there's groups of sensors that need enabling? What is the
impact of always enabling them? What specific sensors are they?
Shilpasri G Bhat July 17, 2017, 4:46 a.m. UTC | #4
Hi Stewart,

On 07/17/2017 04:56 AM, Stewart Smith wrote:
> Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
>> On 07/07/2017 02:23 PM, Stewart Smith wrote:
>>> Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com> writes:
>>
>>
>>>> +static inline bool occ_in_progress(struct cmd_interface *chip)
>>>> +{
>>>> +	return (chip->rsp->flag == OCC_IN_PROGRESS);
>>>> +}
>>>> +
>>>> +static int write_occ_cmd(struct cmd_interface *chip, bool retry)
>>>> +{
>>>> +	struct opal_command_buffer *cmd = chip->cmd;
>>>> +	struct opal_occ_cmd_rsp_msg *msg = chip->msg;
>>>> +
>>>> +	if (!retry && occ_in_progress(chip)) {
>>>> +		chip->cmd_in_progress = false;
>>>> +		return OPAL_OCC_BUSY;
>>>> +	}
>>>> +
>>>> +	cmd->flag = chip->rsp->flag = 0;
>>>> +	cmd->cmd = occ_cmds[msg->cmd].value;
>>>> +	cmd->request_id = msg->request_id;
>>>> +	cmd->data_size = msg->cdata_size;
>>>> +	memcpy(&cmd->data, (u8 *)msg->cdata, msg->cdata_size);
>>>> +	cmd->flag = OPAL_CMD_READY;
>>>> +
>>>> +	schedule_timer(&chip->timeout,
>>>> +		       msecs_to_tb(occ_cmds[msg->cmd].timeout_ms));
>>>> +	chip->prev_cmd = msg->cmd;
>>>> +
>>>> +	return OPAL_ASYNC_COMPLETION;
>>>> +}
>>>> +
>>>> +static int64_t opal_occ_command(int chip_id, struct opal_occ_cmd_rsp_msg *msg,
>>>> +				int token, bool retry)
>>>
>>> It looks like retry is not actually retry, but rather 'return if busy,
>>> don't queue' as in write_occ_cmd that's the logic.
>>
>> If kernel is retrying the same command due to a command-timeout in the previous
>> try, then don't check the OCC status flag as OCC might not have been able to
>> update the status flag. 'retry' flag is used to indicate that kernel is retrying
>> the command.
>>
>> Do you want 'retry' state to be stored in OPAL ?
> 
> I think that would be better. If part of the protocol in talking to the
> OCC is that if at first you don't succeed, retry without looking at
> status, then we should do the retry within OPAL transparently to Linux,
> so that the kernel just sees success/fail.

Sure will do it this way.

> 
>>>> @@ -910,6 +1314,9 @@ void occ_pstates_init(void)
>>>>  		chip->throttle = 0;
>>>>  	opal_add_poller(occ_throttle_poll, NULL);
>>>>  	occ_pstates_initialized = true;
>>>> +
>>>> +	/* Init OPAL-OCC command-response interface */
>>>> +	occ_cmd_interface_init();
>>>>  }
>>>>  
>>>>  struct occ_load_req {
>>>> @@ -1409,8 +1816,10 @@ void occ_p9_interrupt(uint32_t chip_id)
>>>>  	if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
>>>>  		prd_tmgt_interrupt(chip_id);
>>>>  
>>>> -	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM)
>>>> +	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
>>>>  		occ_throttle_poll(NULL);
>>>> +		handle_occ_rsp(chip_id);
>>>> +	}
>>>>  
>>>>  	if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
>>>>  		p9_i2c_bus_owner_change(chip_id);
>>>> @@ -1435,5 +1844,3 @@ void occ_fsp_init(void)
>>>>  	if (fsp_present())
>>>>  		fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC);
>>>>  }
>>>> -
>>>> -
>>>> diff --git a/include/opal-api.h b/include/opal-api.h
>>>> index edae069..dd92cf9 100644
>>>> --- a/include/opal-api.h
>>>> +++ b/include/opal-api.h
>>>> @@ -55,6 +55,10 @@
>>>>  #define OPAL_XSCOM_CTR_OFFLINED	-30
>>>>  #define OPAL_XIVE_PROVISIONING	-31
>>>>  #define OPAL_XIVE_FREE_ACTIVE	-32
>>>> +#define OPAL_OCC_INVALID_STATE	-33
>>>
>>> Instead of adding new return codes, I think existing ones suffice, or is
>>> there a really big need to have a way to distinguish these errors from
>>> other ones?  e.g. insetad of OPAL_OCC_INVALID_STATE, it could be
>>> OPAL_WRONG_STATE - and this is consistent with other OPAL calls.
>>>
>>>> +#define OPAL_OCC_BUSY		-34
>>>
>>> OPAL_BUSY
>>
>> Yeah OPAL_BUSY works.
>>
>>>
>>>> +#define OPAL_OCC_CMD_TIMEOUT	-35
>>>
>>> We probably want an OPAL_TIMEOUT actually... we have OPAL_I2C_TIMEOUT
>>> and OPAL_XSCOM_TIMEOUT
>>>
>>>> +#define OPAL_OCC_CMD_MISMATCH	-36
>>>
>>> OPAL_PARAMETER
>>>
>>> Does that make sense? Any reason not to do that?
>>>
>>
>> OPAL_OCC_CMD_TIMEOUT and OPAL_OCC_CMD_MISMATCH is used by kerenl to retry the
>> OCC command. As OPAL_PARAMETER is also used for the sanity checks of the
>> parameters passed to opal_call, kernel wont be able to distinguish
>> when to retry.
> 
> If the retry logic is in OPAL though, that problem should go away.

Yup agree...


> 
>>>> @@ -1033,6 +1038,81 @@ typedef struct oppanel_line {
>>>>  	__be64 line_len;
>>>>  } oppanel_line_t;
>>>>  
>>>> +enum occ_cmd {
>>>> +	OCC_CMD_AMESTER_PASS_THRU = 0,
>>>
>>> What is amester passthru for? Do we need this as part of our ABI?
>>
>> This command will be used by Amester for tracing the sensor values at 250us/2ms
>> sampling rate. In P8, Amester used IPMI for tracing. In P9, Amester will be able
>> to communicate to OCC inband using this cmd/rsp interface.
>>
>> Yeah we need this command to support inband Amester tracing.
> 
> So, the P8 design is pretty awful, random undocumented out of band IPMI
> commands aren't great.
> 
> So, what exactly is this call accomplishing though? Is it setting a
> sampling frequency? I really dislike these kind of passthrough APIs, as
> the care to maintain backwards compatibility expands to many places when
> we could have just kept it between OPAL and Linux.

Amester tracing is one of the passthrough commands that I am aware of.
Yes it will select one of the sampling rates and collect all the samples at
oneshot till the trace buffer is full or till the user specified time. This is
used for sub-millisecond profiling.

> 
>>>> +	OCC_CMD_CLEAR_SENSOR_DATA,
>>>
>>> Clear what sensors?
>>
>> Min-max values of sensors stored for different users like CSM, Profiler and
>> Job-scheduler.
>>
>> How about OCC_CMD_CLEAR_SENSOR_MIN_MAX instead?
> 
> I don't think it needs to be OCC specific, why not add an API to clear
> any OPAL_SENSOR? For any sensor that isn't min/max and clearable (could
> be property in device tree, it'd return OPAL_UNSUPPORTED. That way,
> these sensors could be read by existing kernels, although a small patch
> would be required to clear them.

The current implementation does not allow clearing of individual sensors. OCC
maintains min/max for each sensor for three different owners: CSM agent,
Profiler and Job-Scheduler (all three are CORAL subsystems)

This command will clear the min/max of all the sensors for the given owner.
So you are suggesting we add an API that clears individual sensors, but in OCC
implementation  clearing one sensor will clear all the min/max history of all
sensors belonging to the sensor ownwer.

> 
>>>> +	OCC_CMD_SET_POWER_CAP,
>>>
>>> Power cap for what? System? Socket?
>>
>> System.
>>
>>>> +	OCC_CMD_SET_POWER_SHIFTING_RATIO,
>>>> +	OCC_CMD_SELECT_SENSOR_GROUPS,
>>>
>>> Basically, this all needs to be documented somewhere, notably in
>>> doc/opal-api/
>>
>> Okay.
>>
>>>
>>> I also wonder if this approach is the best for the OPAL API. While we do
>>> *currently* have a thing called an OCC that does these functions, is
>>> that *always* going to be the case? Elsewhere in firmware we've been
>>> going down the route of calling things "pm_complex" and sending commands
>>> to/from the pm_complex to do tasks for us.
>>>
>>> would it be better if our OPAL API instead described the intended effect
>>> rather than how to achieve said effect?
>>>
>>> I wouldn't be opposed to an OPAL_SET_POWER_CAP call, that's an async
>>> call, and accepts parameters in a range that's described in the device
>>> tree. That call could be implemented on P8 too (where you currently set
>>> power caps by doing crazy IPMI incantations to the BMC), and that would
>>> prove that the OPAL API is implementation independent, and we should be
>>> future proof with however we do the intent of capping
>>> system/socket/other-thing power in the future.
>>
>> I agree with defining an abstract API for setting powercap as a better
>> approach.
> 
> Great! I think it'd end up being a much better approach.
> 
>>> How do the sensors work? Do they fit into the existing OPAL Sensors
>>> infrastructure? are they more in-memory sensors? I ask as perhaps the
>>> select/clear calls should be part of either the regular opal sensors API
>>> or the IMC api rather than their own API.
>>
>> Currently, only in-memory environment sensors are exported in the OPAL sensor
>> infrastructure. Yes there are more perf/util/freq in-memory sensors that are not
>> exported.
>>
>> The current sensor framework provides API to only read the sensor values. We
>> will need new API additions to the sensor framework then to clear min/max of
>> in-memory sensors and select the sensor groups to be copied to main memory.
>> So should we have new opal-calls defined for these sensor operations instead of
>> the OCC cmd/rsp interface?
> 
> Okay, so there's groups of sensors that need enabling? What is the
> impact of always enabling them? What specific sensors are they?
> 

Sensors are grouped based on their type, like temperature sensors, power
sensors, utilization sensors and so on. By default all the sensor groups are
enabled and thus all of them will be copied to main memory (at the rate of
100ms). OCC allows to select sensor groups to copy which will allow OCC to
increase the copying frequency with fewer selected sensor groups.

There are totally 5 commands:

1) Amester passthrough
2) Sensor clear min/max
3) Set system powercap
4) Set power-shifting ratio
5) Select sensor groups to copy

2) and 3) will have separate APIs and the remaining will be part of a single
cmd/rsp API. (?)

Thanks and Regards,
Shilpa
diff mbox

Patch

diff --git a/hw/occ.c b/hw/occ.c
index 832bf42..553c29f 100644
--- a/hw/occ.c
+++ b/hw/occ.c
@@ -120,6 +120,87 @@  struct occ_pstate_table {
 } __packed;
 
 /**
+ * OPAL-OCC Command Response Interface
+ *
+ * OPAL-OCC Command Buffer
+ *
+ * ---------------------------------------------------------------------
+ * | OPAL  |  Cmd    | OPAL |	       | Cmd Data | Cmd Data | OPAL    |
+ * | Cmd   | Request | OCC  | Reserved | Length   | Length   | Cmd     |
+ * | Flags |   ID    | Cmd  |	       | (MSB)    | (LSB)    | Data... |
+ * ---------------------------------------------------------------------
+ * |  ….OPAL Command Data up to max of Cmd Data Length 4090 bytes      |
+ * |								       |
+ * ---------------------------------------------------------------------
+ *
+ * OPAL Command Flag
+ *
+ * -----------------------------------------------------------------
+ * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
+ * | (msb) |	   |	   |	   |	   |	   |	   | (lsb) |
+ * -----------------------------------------------------------------
+ * |Cmd    |       |       |       |       |       |       |       |
+ * |Ready  |	   |	   |	   |	   |	   |	   |	   |
+ * -----------------------------------------------------------------
+ *
+ * struct opal_command_buffer -	Defines the layout of OPAL command buffer
+ * @flag:			Provides general status of the command
+ * @request_id:			Token to identify request
+ * @cmd:			Command sent
+ * @data_size:			Command data length
+ * @data:			Command specific data
+ * @spare:			Unused byte
+ */
+struct opal_command_buffer {
+	u8 flag;
+	u8 request_id;
+	u8 cmd;
+	u8 spare;
+	u16 data_size;
+	u8 data[MAX_OPAL_CMD_DATA_LENGTH];
+} __packed;
+
+/**
+ * OPAL-OCC Response Buffer
+ *
+ * ---------------------------------------------------------------------
+ * | OCC   |  Cmd    | OPAL | Response | Rsp Data | Rsp Data | OPAL    |
+ * | Rsp   | Request | OCC  |  Status  | Length   | Length   | Rsp     |
+ * | Flags |   ID    | Cmd  |	       | (MSB)    | (LSB)    | Data... |
+ * ---------------------------------------------------------------------
+ * |  ….OPAL Response Data up to max of Rsp Data Length 8698 bytes     |
+ * |								       |
+ * ---------------------------------------------------------------------
+ *
+ * OCC Response Flag
+ *
+ * -----------------------------------------------------------------
+ * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
+ * | (msb) |	   |	   |	   |	   |	   |	   | (lsb) |
+ * -----------------------------------------------------------------
+ * |       |       |       |       |       |       |OCC in  | Rsp  |
+ * |       |	   |	   |	   |	   |	   |progress|Ready |
+ * -----------------------------------------------------------------
+ *
+ * struct occ_response_buffer -	Defines the layout of OCC response buffer
+ * @flag:			Provides general status of the response
+ * @request_id:			Token to identify request
+ * @cmd:			Command requested
+ * @status:			Indicates success/failure status of
+ *				the command
+ * @data_size:			Response data length
+ * @data:			Response specific data
+ */
+struct occ_response_buffer {
+	u8 flag;
+	u8 request_id;
+	u8 cmd;
+	u8 status;
+	u16 data_size;
+	u8 data[MAX_OCC_RSP_DATA_LENGTH];
+} __packed;
+
+/**
  * OCC-OPAL Shared Memory Interface Dynamic Data Vx90
  *
  * struct occ_dynamic_data -	Contains runtime attributes
@@ -136,6 +217,8 @@  struct occ_pstate_table {
  * @max_pwr_cap:		Maximum allowed system power cap in Watts
  * @cur_pwr_cap:		Current system power cap
  * @spare/reserved:		Unused data
+ * @cmd:			Opal Command Buffer
+ * @rsp:			OCC Response Buffer
  */
 struct occ_dynamic_data {
 	u8 occ_state;
@@ -151,7 +234,9 @@  struct occ_dynamic_data {
 	u16 min_pwr_cap;
 	u16 max_pwr_cap;
 	u16 cur_pwr_cap;
-	u64 reserved;
+	u8 pad[112];
+	struct opal_command_buffer cmd;
+	struct occ_response_buffer rsp;
 } __packed;
 
 static bool occ_reset;
@@ -843,6 +928,325 @@  done:
 	unlock(&occ_lock);
 }
 
+#define OCC_RSP_READY		0x01
+#define OCC_IN_PROGRESS		0x02
+#define OPAL_CMD_READY		0x80
+
+enum occ_state {
+	OCC_STATE_NOT_RUNNING		= 0x00,
+	OCC_STATE_STANDBY		= 0x01,
+	OCC_STATE_OBSERVATION		= 0x02,
+	OCC_STATE_ACTIVE		= 0x03,
+	OCC_STATE_SAFE			= 0x04,
+	OCC_STATE_CHARACTERIZATION	= 0x05,
+};
+
+enum occ_role {
+	OCC_ROLE_SLAVE		= 0x0,
+	OCC_ROLE_MASTER		= 0x1,
+};
+
+enum occ_cmd_value {
+	OCC_CMD_VALUE_AMESTER_PASS_THRU		= 0x41,
+	OCC_CMD_VALUE_CLEAR_SENSOR_DATA		= 0xD0,
+	OCC_CMD_VALUE_SET_POWER_CAP		= 0xD1,
+	OCC_CMD_VALUE_SET_POWER_SHIFTING_RATIO	= 0xD2,
+	OCC_CMD_VALUE_SELECT_SENSOR_GROUPS	= 0xD3,
+};
+
+struct opal_occ_cmd_info {
+	enum	occ_cmd_value value;
+	int	timeout_ms;
+	u16	state_mask;
+	u8	role_mask;
+};
+
+static struct opal_occ_cmd_info occ_cmds[] = {
+	{	OCC_CMD_VALUE_AMESTER_PASS_THRU,
+		1000,
+		PPC_BIT16(OCC_STATE_OBSERVATION) |
+		PPC_BIT16(OCC_STATE_ACTIVE) |
+		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+	},
+	{	OCC_CMD_VALUE_CLEAR_SENSOR_DATA,
+		1000,
+		PPC_BIT16(OCC_STATE_OBSERVATION) |
+		PPC_BIT16(OCC_STATE_ACTIVE) |
+		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+	},
+	{	OCC_CMD_VALUE_SET_POWER_CAP,
+		1000,
+		PPC_BIT16(OCC_STATE_OBSERVATION) |
+		PPC_BIT16(OCC_STATE_ACTIVE) |
+		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+		PPC_BIT8(OCC_ROLE_MASTER)
+	},
+	{	OCC_CMD_VALUE_SET_POWER_SHIFTING_RATIO,
+		1000,
+		PPC_BIT16(OCC_STATE_OBSERVATION) |
+		PPC_BIT16(OCC_STATE_ACTIVE) |
+		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+	},
+	{	OCC_CMD_VALUE_SELECT_SENSOR_GROUPS,
+		1000,
+		PPC_BIT16(OCC_STATE_OBSERVATION) |
+		PPC_BIT16(OCC_STATE_ACTIVE) |
+		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
+		PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE)
+	},
+};
+
+static struct cmd_interface {
+	struct lock queue_lock;
+	struct timer timeout;
+	struct opal_command_buffer *cmd;
+	struct occ_response_buffer *rsp;
+	struct opal_occ_cmd_rsp_msg *msg;
+	u32 id;
+	u32 token;
+	enum occ_cmd prev_cmd;
+	u8 occ_role;
+	u8 *occ_state;
+	bool cmd_in_progress;
+} *chips;
+
+static int nr_occs;
+
+static inline struct cmd_interface *get_chip_cmd_interface(int chip_id)
+{
+	int i;
+
+	for (i = 0; i < nr_occs; i++)
+		if (chips[i].id == chip_id)
+			return &chips[i];
+
+	return NULL;
+}
+
+static inline bool occ_in_progress(struct cmd_interface *chip)
+{
+	return (chip->rsp->flag == OCC_IN_PROGRESS);
+}
+
+static int write_occ_cmd(struct cmd_interface *chip, bool retry)
+{
+	struct opal_command_buffer *cmd = chip->cmd;
+	struct opal_occ_cmd_rsp_msg *msg = chip->msg;
+
+	if (!retry && occ_in_progress(chip)) {
+		chip->cmd_in_progress = false;
+		return OPAL_OCC_BUSY;
+	}
+
+	cmd->flag = chip->rsp->flag = 0;
+	cmd->cmd = occ_cmds[msg->cmd].value;
+	cmd->request_id = msg->request_id;
+	cmd->data_size = msg->cdata_size;
+	memcpy(&cmd->data, (u8 *)msg->cdata, msg->cdata_size);
+	cmd->flag = OPAL_CMD_READY;
+
+	schedule_timer(&chip->timeout,
+		       msecs_to_tb(occ_cmds[msg->cmd].timeout_ms));
+	chip->prev_cmd = msg->cmd;
+
+	return OPAL_ASYNC_COMPLETION;
+}
+
+static int64_t opal_occ_command(int chip_id, struct opal_occ_cmd_rsp_msg *msg,
+				int token, bool retry)
+{
+	struct cmd_interface *chip;
+	int rc;
+
+	if (!msg || !opal_addr_valid((u8 *)msg->cdata) ||
+	    !opal_addr_valid((u8 *)msg->rdata))
+		return OPAL_PARAMETER;
+
+	if (msg->cmd >= OCC_CMD_LAST)
+		return OPAL_UNSUPPORTED;
+
+	if (msg->cdata_size > MAX_OPAL_CMD_DATA_LENGTH)
+		return OPAL_PARAMETER;
+
+	chip = get_chip_cmd_interface(chip_id);
+	if (!chip)
+		return OPAL_PARAMETER;
+
+	if (retry && msg->cmd != chip->prev_cmd)
+		return OPAL_PARAMETER;
+
+	if (!(PPC_BIT8(chip->occ_role) & occ_cmds[msg->cmd].role_mask))
+		return OPAL_PARAMETER;
+
+	if (!(PPC_BIT16(*chip->occ_state) & occ_cmds[msg->cmd].state_mask))
+		return OPAL_OCC_INVALID_STATE;
+
+	lock(&chip->queue_lock);
+	if (chip->cmd_in_progress) {
+		rc = OPAL_OCC_BUSY;
+		goto out;
+	}
+
+	chip->msg = msg;
+	chip->token = token;
+	chip->cmd_in_progress = true;
+	rc = write_occ_cmd(chip, retry);
+out:
+	unlock(&chip->queue_lock);
+	return rc;
+}
+
+static inline bool sanity_check_opal_cmd(struct opal_command_buffer *cmd,
+					 struct opal_occ_cmd_rsp_msg *msg)
+{
+	return ((cmd->cmd == occ_cmds[msg->cmd].value) &&
+		(cmd->request_id == msg->request_id) &&
+		(cmd->data_size == msg->cdata_size));
+}
+
+static inline bool check_occ_rsp(struct opal_command_buffer *cmd,
+				 struct occ_response_buffer *rsp)
+{
+	if (cmd->cmd != rsp->cmd) {
+		prlog(PR_WARNING, "OCC: Command value mismatch in OCC response"
+		      "rsp->cmd = %d cmd->cmd = %d\n", rsp->cmd, cmd->cmd);
+		return false;
+	}
+
+	if (cmd->request_id != rsp->request_id) {
+		prlog(PR_WARNING, "OCC: Request ID mismatch in OCC response"
+		      "rsp->request_id = %d cmd->request_id = %d\n",
+		      rsp->request_id, cmd->request_id);
+		return false;
+	}
+
+	return true;
+}
+
+static inline void queue_occ_rsp_msg(int token, int rc)
+{
+	int ret;
+
+	ret = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
+	if (ret)
+		prerror("OCC: Failed to queue OCC response status message\n");
+}
+
+static void occ_cmd_timeout_handler(struct timer *t __unused, void *data,
+				    uint64_t now __unused)
+{
+	struct cmd_interface *chip = data;
+
+	lock(&chip->queue_lock);
+	if (!chip->cmd_in_progress)
+		goto exit;
+
+	chip->cmd_in_progress = false;
+	queue_occ_rsp_msg(chip->token, OPAL_OCC_CMD_TIMEOUT);
+exit:
+	unlock(&chip->queue_lock);
+}
+
+static void read_occ_rsp(struct occ_response_buffer *rsp,
+			 struct opal_occ_cmd_rsp_msg *msg)
+{
+	/* Copy response to host buffer */
+	msg->status = rsp->status;
+	msg->rdata_size = rsp->data_size;
+	memcpy((u8 *)msg->rdata, rsp->data, rsp->data_size);
+
+	/* Clear the OCC response flag */
+	rsp->flag = 0;
+}
+
+static void handle_occ_rsp(uint32_t chip_id)
+{
+	struct cmd_interface *chip;
+	struct opal_command_buffer *cmd;
+	struct occ_response_buffer *rsp;
+	struct opal_occ_cmd_rsp_msg *msg;
+
+	chip = get_chip_cmd_interface(chip_id);
+	if (!chip)
+		return;
+
+	cmd = chip->cmd;
+	rsp = chip->rsp;
+	msg = chip->msg;
+
+	/*Read rsp*/
+	if (rsp->flag != OCC_RSP_READY)
+		return;
+	lock(&chip->queue_lock);
+	if (!chip->cmd_in_progress)
+		goto exit;
+
+	chip->cmd_in_progress = false;
+	cancel_timer(&chip->timeout);
+	if (!sanity_check_opal_cmd(cmd, chip->msg) ||
+	    !check_occ_rsp(cmd, rsp)) {
+		queue_occ_rsp_msg(chip->token, OPAL_OCC_CMD_MISMATCH);
+		goto exit;
+	}
+
+	read_occ_rsp(chip->rsp, msg);
+	queue_occ_rsp_msg(chip->token, OPAL_SUCCESS);
+exit:
+	unlock(&chip->queue_lock);
+}
+
+static void occ_cmd_interface_init(void)
+{
+	struct dt_node *power_mgt;
+	struct occ_dynamic_data *data;
+	struct occ_pstate_table *pdata;
+	struct proc_chip *chip;
+	int i = 0;
+
+	chip = next_chip(NULL);
+	pdata = get_occ_pstate_table(chip);
+	if (pdata->version != 0x90)
+		return;
+
+	for_each_chip(chip)
+		nr_occs++;
+
+	chips = malloc(sizeof(*chips) * nr_occs);
+	assert(chips);
+
+	for_each_chip(chip) {
+		pdata = get_occ_pstate_table(chip);
+		data = get_occ_dynamic_data(chip);
+		chips[i].id = chip->id;
+		chips[i].occ_role = pdata->v9.occ_role;
+		chips[i].occ_state = &data->occ_state;
+		chips[i].cmd = &data->cmd;
+		chips[i].rsp = &data->rsp;
+		init_lock(&chips[i].queue_lock);
+		chips[i].cmd_in_progress = false;
+		chips[i].prev_cmd = OCC_CMD_LAST;
+		init_timer(&chips[i].timeout, occ_cmd_timeout_handler,
+			   &chips[i]);
+		i++;
+	}
+
+	power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
+	if (!power_mgt) {
+		prerror("OCC: dt node /ibm,opal/power-mgt not found\n");
+		free(chips);
+		return;
+	}
+
+	update_max_async_completions(nr_occs);
+	alloc_opal_msg_list(nr_occs);
+	dt_add_property_string(power_mgt, "compatible",
+			       "ibm,opal-occ-cmd-rsp-interface");
+	opal_register(OPAL_OCC_COMMAND, opal_occ_command, 4);
+}
+
 /* CPU-OCC PState init */
 /* Called after OCC init on P8 and P9 */
 void occ_pstates_init(void)
@@ -910,6 +1314,9 @@  void occ_pstates_init(void)
 		chip->throttle = 0;
 	opal_add_poller(occ_throttle_poll, NULL);
 	occ_pstates_initialized = true;
+
+	/* Init OPAL-OCC command-response interface */
+	occ_cmd_interface_init();
 }
 
 struct occ_load_req {
@@ -1409,8 +1816,10 @@  void occ_p9_interrupt(uint32_t chip_id)
 	if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
 		prd_tmgt_interrupt(chip_id);
 
-	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM)
+	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
 		occ_throttle_poll(NULL);
+		handle_occ_rsp(chip_id);
+	}
 
 	if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
 		p9_i2c_bus_owner_change(chip_id);
@@ -1435,5 +1844,3 @@  void occ_fsp_init(void)
 	if (fsp_present())
 		fsp_register_client(&fsp_occ_client, FSP_MCLASS_OCC);
 }
-
-
diff --git a/include/opal-api.h b/include/opal-api.h
index edae069..dd92cf9 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -55,6 +55,10 @@ 
 #define OPAL_XSCOM_CTR_OFFLINED	-30
 #define OPAL_XIVE_PROVISIONING	-31
 #define OPAL_XIVE_FREE_ACTIVE	-32
+#define OPAL_OCC_INVALID_STATE	-33
+#define OPAL_OCC_BUSY		-34
+#define OPAL_OCC_CMD_TIMEOUT	-35
+#define OPAL_OCC_CMD_MISMATCH	-36
 
 /* API Tokens (in r0) */
 #define OPAL_INVALID_CALL		       -1
@@ -207,7 +211,8 @@ 
 #define OPAL_IMC_COUNTERS_INIT			149
 #define OPAL_IMC_COUNTERS_START			150
 #define OPAL_IMC_COUNTERS_STOP			151
-#define OPAL_LAST				151
+#define OPAL_OCC_COMMAND			152
+#define OPAL_LAST				152
 
 /* Device tree flags */
 
@@ -1033,6 +1038,81 @@  typedef struct oppanel_line {
 	__be64 line_len;
 } oppanel_line_t;
 
+enum occ_cmd {
+	OCC_CMD_AMESTER_PASS_THRU = 0,
+	OCC_CMD_CLEAR_SENSOR_DATA,
+	OCC_CMD_SET_POWER_CAP,
+	OCC_CMD_SET_POWER_SHIFTING_RATIO,
+	OCC_CMD_SELECT_SENSOR_GROUPS,
+	OCC_CMD_LAST
+};
+
+enum occ_cmd_data_length {
+	OCC_CMD_DL_AMESTER_PASS_THRU		= 0, /* Variable data length */
+	OCC_CMD_DL_CLEAR_SENSOR_DATA		= 4,
+	OCC_CMD_DL_SET_POWER_CAP		= 2,
+	OCC_CMD_DL_SET_POWER_SHIFTING_RATIO	= 1,
+	OCC_CMD_DL_SELECT_SENSOR_GROUPS		= 2,
+};
+
+enum occ_rsp_data_length {
+	OCC_RSP_DL_AMESTER_PASS_THRU		= 0, /* Variable data length */
+	OCC_RSP_DL_CLEAR_SENSOR_DATA		= 4,
+	OCC_RSP_DL_SET_POWER_CAP		= 2,
+	OCC_RSP_DL_SET_POWER_SHIFTING_RATIO	= 1,
+	OCC_RSP_DL_SELECT_SENSOR_GROUPS		= 2,
+};
+
+enum occ_sensor_limit_group {
+	OCC_SENSOR_LIMIT_GROUP_CSM		= 0x10,
+	OCC_SENSOR_LIMIT_GROUP_PROFILER		= 0x20,
+	OCC_SENSOR_LIMIT_GROUP_JOB_SCHED	= 0x40,
+};
+
+enum occ_sensor_group_mask {
+	OCC_SENSOR_GROUP_MASK_PERFORMANCE	= 6,
+	OCC_SENSOR_GROUP_MASK_POWER		= 8,
+	OCC_SENSOR_GROUP_MASK_FREQUENCY		= 9,
+	OCC_SENSOR_GROUP_MASK_TIME		= 10,
+	OCC_SENSOR_GROUP_MASK_UTILIZATION	= 11,
+	OCC_SENSOR_GROUP_MASK_TEMPERATURE	= 12,
+	OCC_SENSOR_GROUP_MASK_VOLTAGE		= 13,
+	OCC_SENSOR_GROUP_MASK_CURRENT		= 14,
+};
+
+#define MAX_OPAL_CMD_DATA_LENGTH	4090
+#define MAX_OCC_RSP_DATA_LENGTH		8698
+
+enum occ_response_status {
+	OCC_SUCCESS			= 0x00,
+	OCC_INVALID_COMMAND		= 0x11,
+	OCC_INVALID_CMD_DATA_LENGTH	= 0x12,
+	OCC_INVALID_DATA		= 0x13,
+	OCC_INTERNAL_ERROR		= 0x15,
+};
+
+struct opal_occ_cmd_rsp_msg {
+	__be64 cdata;
+	__be64 rdata;
+	__be16 cdata_size;
+	__be16 rdata_size;
+	u8 cmd;
+	u8 request_id;
+	u8 status;
+};
+
+struct opal_occ_cmd_data {
+	__be16 size;
+	u8 cmd;
+	u8 data[];
+};
+
+struct opal_occ_rsp_data {
+	__be16 size;
+	u8 status;
+	u8 data[];
+};
+
 enum opal_prd_msg_type {
 	OPAL_PRD_MSG_TYPE_INIT = 0,	/* HBRT --> OPAL */
 	OPAL_PRD_MSG_TYPE_FINI,		/* HBRT/kernel --> OPAL */