diff mbox

[OpenWrt-Devel,6/7] ramips: improve mt7621 spi flash read speed

Message ID 1442928363-6219-6-git-send-email-igvtee@gmail.com
State Rejected
Headers show

Commit Message

Mingyu Li Sept. 22, 2015, 1:26 p.m. UTC
From: michael lee <igvtee@gmail.com>

only support spi flash command (half duplex).
no need chunk io patch. done by driver.

test results on mt7621. use dd read bs=512 with 32128 records
old driver : 30.52s
new driver : 34.31s
new driver + no chunk io : 16.65s
new driver + no chunk io + fast read clock from 10Mhz to 50Mhz : 5.00s

Signed-off-by: Michael Lee <igvtee@gmail.com>
---
 target/linux/ramips/dts/MT7628.dts                 |   1 -
 target/linux/ramips/dts/mt7621.dtsi                |   7 +-
 target/linux/ramips/dts/mt7628an.dtsi              |   6 +-
 .../0044-mtd-add-chunked-read-io-to-m25p80.patch   | 103 ---
 ...0061-SPI-ralink-add-mt7621-SoC-spi-driver.patch | 794 ++++++++++++++-------
 5 files changed, 553 insertions(+), 358 deletions(-)
 delete mode 100644 target/linux/ramips/patches-3.18/0044-mtd-add-chunked-read-io-to-m25p80.patch

Comments

Felix Fietkau Sept. 22, 2015, 1:48 p.m. UTC | #1
On 2015-09-22 15:26, Michael Lee wrote:
> From: michael lee <igvtee@gmail.com>
> 
> only support spi flash command (half duplex).
> no need chunk io patch. done by driver.
> 
> test results on mt7621. use dd read bs=512 with 32128 records
> old driver : 30.52s
> new driver : 34.31s
> new driver + no chunk io : 16.65s
> new driver + no chunk io + fast read clock from 10Mhz to 50Mhz : 5.00s
Where does the performance gain without the clock change come from?

- Felix
John Crispin Sept. 22, 2015, 2:19 p.m. UTC | #2
On 22/09/2015 15:48, Felix Fietkau wrote:
> On 2015-09-22 15:26, Michael Lee wrote:
>> From: michael lee <igvtee@gmail.com>
>>
>> only support spi flash command (half duplex).
>> no need chunk io patch. done by driver.
>>
>> test results on mt7621. use dd read bs=512 with 32128 records
>> old driver : 30.52s
>> new driver : 34.31s
>> new driver + no chunk io : 16.65s
>> new driver + no chunk io + fast read clock from 10Mhz to 50Mhz : 5.00s
> Where does the performance gain without the clock change come from?
> 
> - Felix
> 

Hi,

I am working on a different patch that does several consecutive
full-duplex transactions, which improves the speed by more than 10%. I
have not measured it, but my subjective feeling is more like 50-100%.

Also, this patch has a lot of stuff all mixed into a single patch, making
it nearly impossible to review the code.

And you make the driver m25p80-aware, which is SDK style. We don't want
that in our kernel.

	John
diff mbox

Patch

diff --git a/target/linux/ramips/dts/MT7628.dts b/target/linux/ramips/dts/MT7628.dts
index dd6647f..87d12d2 100644
--- a/target/linux/ramips/dts/MT7628.dts
+++ b/target/linux/ramips/dts/MT7628.dts
@@ -31,7 +31,6 @@ 
 				reg = <0 0>;
 				linux,modalias = "m25p80", "en25q64";
 				spi-max-frequency = <10000000>;
-				m25p,chunked-io = <32>;
 
 				partition@0 {
 					label = "u-boot";
diff --git a/target/linux/ramips/dts/mt7621.dtsi b/target/linux/ramips/dts/mt7621.dtsi
index bc79d39..cd115b1 100644
--- a/target/linux/ramips/dts/mt7621.dtsi
+++ b/target/linux/ramips/dts/mt7621.dtsi
@@ -20,6 +20,10 @@ 
 		compatible = "mti,cpu-interrupt-controller";
 	};
 
+	aliases {
+		spi0 = &spi0;
+	};
+
 	palmbus@1E000000 {
 		compatible = "palmbus";
 		reg = <0x1E000000 0x100000>;
@@ -84,7 +88,7 @@ 
 			no-loopback-test;
 		};
 
-		spi@b00 {
+		spi0: spi@b00 {
 			status = "okay";
 
 			compatible = "ralink,mt7621-spi";
@@ -104,7 +108,6 @@ 
 				#size-cells = <1>;
 				reg = <0 0>;
 				spi-max-frequency = <10000000>;
-				m25p,chunked-io = <32>;
 			};
 		};
 	};
diff --git a/target/linux/ramips/dts/mt7628an.dtsi b/target/linux/ramips/dts/mt7628an.dtsi
index 02f9df3..eb8a6ee 100644
--- a/target/linux/ramips/dts/mt7628an.dtsi
+++ b/target/linux/ramips/dts/mt7628an.dtsi
@@ -20,6 +20,10 @@ 
 		compatible = "mti,cpu-interrupt-controller";
 	};
 
+	aliases {
+		spi0 = &spi0;
+	};
+
 	palmbus@10000000 {
 		compatible = "palmbus";
 		reg = <0x10000000 0x200000>;
@@ -102,7 +106,7 @@ 
 			};
 		};
 
-		spi@b00 {
+		spi0: spi@b00 {
 			compatible = "ralink,mt7621-spi";
 			reg = <0xb00 0x100>;
 
diff --git a/target/linux/ramips/patches-3.18/0044-mtd-add-chunked-read-io-to-m25p80.patch b/target/linux/ramips/patches-3.18/0044-mtd-add-chunked-read-io-to-m25p80.patch
deleted file mode 100644
index 1716e1c..0000000
--- a/target/linux/ramips/patches-3.18/0044-mtd-add-chunked-read-io-to-m25p80.patch
+++ /dev/null
@@ -1,103 +0,0 @@ 
---- a/drivers/mtd/devices/m25p80.c
-+++ b/drivers/mtd/devices/m25p80.c
-@@ -19,6 +19,7 @@
- #include <linux/errno.h>
- #include <linux/module.h>
- #include <linux/device.h>
-+#include <linux/of.h>
- 
- #include <linux/mtd/mtd.h>
- #include <linux/mtd/partitions.h>
-@@ -32,6 +33,7 @@ struct m25p {
- 	struct spi_device	*spi;
- 	struct spi_nor		spi_nor;
- 	struct mtd_info		mtd;
-+	u16			chunk_size;
- 	u8			command[MAX_CMD_SIZE];
- };
- 
-@@ -157,6 +159,61 @@ static int m25p80_read(struct spi_nor *n
- 	return 0;
- }
- 
-+static void m25p80_chunked_write(struct spi_nor *nor, loff_t _from, size_t _len,
-+			size_t *_retlen, const u_char *_buf)
-+{
-+	struct m25p *flash = nor->priv;
-+	int chunk_size;
-+	int retlen = 0;
-+
-+	chunk_size = flash->chunk_size;
-+	if (!chunk_size)
-+		chunk_size = _len;
-+
-+	if (nor->addr_width > 3)
-+		chunk_size -= nor->addr_width - 3;
-+
-+	while (retlen < _len) {
-+		size_t len = min_t(int, chunk_size, _len - retlen);
-+		const u_char *buf = _buf + retlen;
-+		loff_t from = _from + retlen;
-+
-+		nor->wait_till_ready(nor);
-+		nor->write_reg(nor, SPINOR_OP_WREN, NULL, 0, 0);
-+
-+		m25p80_write(nor, from, len, &retlen, buf);
-+	}
-+	*_retlen += retlen;
-+}
-+
-+static int m25p80_chunked_read(struct spi_nor *nor, loff_t _from, size_t _len,
-+			size_t *_retlen, u_char *_buf)
-+{
-+	struct m25p *flash = nor->priv;
-+	int chunk_size;
-+
-+	chunk_size = flash->chunk_size;
-+	if (!chunk_size)
-+		chunk_size = _len;
-+
-+	*_retlen = 0;
-+
-+	while (*_retlen < _len) {
-+		size_t len = min_t(int, chunk_size, _len - *_retlen);
-+		u_char *buf = _buf + *_retlen;
-+		loff_t from = _from + *_retlen;
-+		int retlen = 0;
-+		int ret = m25p80_read(nor, from, len, &retlen, buf);
-+
-+		if (ret)
-+			return ret;
-+
-+		*_retlen += retlen;
-+	}
-+
-+	return 0;
-+}
-+
- static int m25p80_erase(struct spi_nor *nor, loff_t offset)
- {
- 	struct m25p *flash = nor->priv;
-@@ -197,6 +254,7 @@ static int m25p_probe(struct spi_device
- 	struct spi_nor *nor;
- 	enum read_mode mode = SPI_NOR_NORMAL;
- 	char *flash_name = NULL;
-+	u32 val;
- 	int ret;
- 
- 	data = dev_get_platdata(&spi->dev);
-@@ -244,6 +302,14 @@ static int m25p_probe(struct spi_device
- 	if (ret)
- 		return ret;
- 
-+	if (spi->dev.of_node &&
-+	    !of_property_read_u32(spi->dev.of_node, "m25p,chunked-io", &val)) {
-+		dev_warn(&spi->dev, "using chunked io\n");
-+		nor->read = m25p80_chunked_read;
-+		nor->write = m25p80_chunked_write;
-+		flash->chunk_size = val;
-+	}
-+
- 	ppdata.of_node = spi->dev.of_node;
- 
- 	return mtd_device_parse_register(&flash->mtd, NULL, &ppdata,
diff --git a/target/linux/ramips/patches-3.18/0061-SPI-ralink-add-mt7621-SoC-spi-driver.patch b/target/linux/ramips/patches-3.18/0061-SPI-ralink-add-mt7621-SoC-spi-driver.patch
index 2ba1ee8..539e8bd 100644
--- a/target/linux/ramips/patches-3.18/0061-SPI-ralink-add-mt7621-SoC-spi-driver.patch
+++ b/target/linux/ramips/patches-3.18/0061-SPI-ralink-add-mt7621-SoC-spi-driver.patch
@@ -1,6 +1,6 @@ 
 --- a/drivers/spi/Kconfig
 +++ b/drivers/spi/Kconfig
-@@ -439,6 +439,12 @@
+@@ -439,6 +439,12 @@ config SPI_RT2880
  	help
  	  This selects a driver for the Ralink RT288x/RT305x SPI Controller.
  
@@ -15,7 +15,7 @@ 
  	depends on ARCH_S3C24XX
 --- a/drivers/spi/Makefile
 +++ b/drivers/spi/Makefile
-@@ -46,6 +46,7 @@
+@@ -46,6 +46,7 @@ obj-$(CONFIG_SPI_LM70_LLP)		+= spi-lm70l
  obj-$(CONFIG_SPI_MPC512x_PSC)		+= spi-mpc512x-psc.o
  obj-$(CONFIG_SPI_MPC52xx_PSC)		+= spi-mpc52xx-psc.o
  obj-$(CONFIG_SPI_MPC52xx)		+= spi-mpc52xx.o
@@ -25,13 +25,14 @@ 
  obj-$(CONFIG_SPI_OC_TINY)		+= spi-oc-tiny.o
 --- /dev/null
 +++ b/drivers/spi/spi-mt7621.c
-@@ -0,0 +1,479 @@
+@@ -0,0 +1,771 @@
 +/*
 + * spi-mt7621.c -- MediaTek MT7621 SPI controller driver
 + *
 + * Copyright (C) 2011 Sergiy <piratfm@gmail.com>
 + * Copyright (C) 2011-2013 Gabor Juhos <juhosg@openwrt.org>
 + * Copyright (C) 2014-2015 Felix Fietkau <nbd@openwrt.org>
++ * Copyright (C) 2015 Michael Lee <igvtee@gmail.com>
 + *
 + * Some parts are based on spi-orion.c:
 + *   Author: Shadi Ammouri <shadi@marvell.com>
@@ -53,49 +54,140 @@ 
 +#include <linux/of_device.h>
 +#include <linux/platform_device.h>
 +#include <linux/swab.h>
++#include <linux/mtd/spi-nor.h>
 +
 +#include <ralink_regs.h>
 +
-+#define SPI_BPW_MASK(bits) BIT((bits) - 1)
-+
 +#define DRIVER_NAME			"spi-mt7621"
-+/* in usec */
-+#define RALINK_SPI_WAIT_MAX_LOOP	2000
-+
-+/* SPISTAT register bit field */
-+#define SPISTAT_BUSY			BIT(0)
 +
 +#define MT7621_SPI_TRANS	0x00
-+#define SPITRANS_BUSY		BIT(16)
-+
 +#define MT7621_SPI_OPCODE	0x04
 +#define MT7621_SPI_DATA0	0x08
-+#define MT7621_SPI_DATA4	0x18
-+#define SPI_CTL_TX_RX_CNT_MASK	0xff
-+#define SPI_CTL_START		BIT(8)
-+
-+#define MT7621_SPI_POLAR	0x38
 +#define MT7621_SPI_MASTER	0x28
 +#define MT7621_SPI_MOREBUF	0x2c
++#define MT7621_SPI_QUEUE_CTL	0x30
++#define MT7621_SPI_STATUS	0x34
++#define MT7621_SPI_POLAR	0x38
 +#define MT7621_SPI_SPACE	0x3c
 +
-+#define MT7621_CPHA		BIT(5)
-+#define MT7621_CPOL		BIT(4)
-+#define MT7621_LSB_FIRST	BIT(3)
++/* MT7621_SPI_TRANS */
++#define SPITRANS_ADDREXT_MASK		0xff
++#define SPITRANS_ADDREXT_OFFSET		24
++#define SPITRANS_ADDRSIZE_MASK		0x3
++#define SPITRANS_ADDRSIZE_OFFSET	19
++#define SPITRANS_BUSY			BIT(16)
++#define SPITRANS_START			BIT(8)
++#define SPITRANS_BYTECNT_MASK		0xf
++#define SPITRANS_MISO_OFFSET		4
++#define SPITRANS_MOSI_OFFSET		0
++
++/* MT7621_SPI_OPCODE */
++#define SPIOP_MB_OPCODE_OFFSET		24
++#define SPIOP_MB_ADDR_MASK		0xffffff
++
++/* MT7621_SPI_MASTER */
++#define SPIMASTER_CS_MASK		0x7
++#define SPIMASTER_CS_OFFSET		29
++#define SPIMASTER_CLK_HIGH		BIT(28)
++#define SPIMASTER_CLKSEL_MASK		0xfff
++#define SPIMASTER_CLKSEL_OFFSET		16
++#define SPIMASTER_CSDSEL_MASK		0x1f
++#define SPIMASTER_CSDSEL_OFFSET		11
++#define SPIMASTER_FULL_DUPLEX		BIT(10)
++#define SPIMASTER_INTR_ENABLE		BIT(9)
++#define SPIMASTER_START_6CLK		BIT(8)
++#define SPIMASTER_PREFETCH_ENABLE	BIT(7)
++#define SPIMASTER_BIDIR_MODE		BIT(6)
++#define SPIMASTER_CPHA			BIT(5)
++#define SPIMASTER_CPOL			BIT(4)
++#define SPIMASTER_LSB			BIT(3)
++#define SPIMASTER_MB_MODE		BIT(2)
++#define SPIMASTER_SERIAL_MASK		0x3
++
++/* MT7621_SPI_MOREBUF */
++#define SPIMB_CMD_MASK			0x3f
++#define SPIMB_CMD_OFFSET		24
++#define SPIMB_MISO_MASK			0x1ff
++#define SPIMB_MISO_OFFSET		12
++#define SPIMB_MOSI_MASK			0x1ff
++#define SPIMB_MOSI_OFFSET		0
++
++/* MT7621_SPI_QUEUE_CTL */
++#define SPIQCTL_PAGE_MASK		0x3f
++#define SPIQCTL_PAGE_OFFSET		26
++#define SPIQCTL_BUSY			BIT(12)
++#define SPIQCTL_ADDRSIZE_MASK		0x3
++#define SPIQCTL_ADDRSIZER_OFFSET	10
++#define SPIQCTL_ADDRSIZE_OFFSET		8
++#define SPIQCTL_MOSI_MASK		0xf
++#define SPIQCTL_FASTSEL_MASK		0x7
++
++/* MT7621_SPI_STATUS */
++#define SPISTA_MODE_MASK		0x3
++#define SPISTA_MODE_OFFSET		4
++#define SPISTA_OK			BIT(0)
++
++/* MT7621_SPI_POLAR */
++#define SPIPOL_CSPOL_MASK		0xff
++#define SPIPOL_CSPOL_OFFSET		0
++#define SPIPOL_CSPOL_HIGH		1
++
++/* MT7621_SPI_SPACE */
++#define SPISPA_CS_MASK			0x7
++#define SPISPA_CS_OFFSET		12
++#define SPISPA_CLKSEL_MASK		0xfff
++#define SPISPA_CLKSEL_OFFSET		0
++
++#define MT7621_SPI_MODE_BITS	(SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST | \
++		SPI_CS_HIGH)
++
++struct mt7621_mb_reg {
++	u32 mosi_bit:12,
++	    miso_bit:12,
++	    cmd_bit:8;
++};
 +
-+#define RT2880_SPI_MODE_BITS	(SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST | SPI_CS_HIGH)
++struct mt7621_spi_data {
++	union {
++		u32 mb_reg;
++		struct mt7621_mb_reg cnt;
++	};
++	union {
++		u32 data[9];
++		u8 buf[36];
++	};
++};
 +
-+struct mt7621_spi;
++/* cmd flags */
++#define SPI_CMD_ADDR	BIT(0)
++#define SPI_CMD_DATA	BIT(1)
++#define SPI_CMD_TX	BIT(2)
++#define SPI_CMD_RX	BIT(3)
++#define SPI_CMD_EN4B	BIT(4)
++#define SPI_CMD_EX4B	BIT(5)
++
++/* cmd status */
++#define SPI_STATE_OPCODE	0
++#define SPI_STATE_DATA		1
++
++struct mt7621_spi_cmd {
++	u16 status;
++	u16 flags;
++	u32 addr;
++	u32 opaddr_len;
++	const u8 *opaddr_data;
++};
 +
 +struct mt7621_spi {
 +	struct spi_master	*master;
 +	void __iomem		*base;
-+	unsigned int		sys_freq;
-+	unsigned int		speed;
++	u32			speed;
++	u16			wait_loops;
++	u16			mode;
 +	struct clk		*clk;
-+	spinlock_t		lock;
++	int			addr_width;
 +
-+	struct mt7621_spi_ops	*ops;
++	struct mt7621_spi_cmd	cmd;
 +};
 +
 +static inline struct mt7621_spi *spidev_to_mt7621_spi(struct spi_device *spi)
@@ -113,292 +205,483 @@ 
 +	iowrite32(val, rs->base + reg);
 +}
 +
-+static void mt7621_spi_reset(struct mt7621_spi *rs, int duplex)
++static inline void mt7621_spi_setbits(struct mt7621_spi *rs, u32 reg, u32 mask)
 +{
-+	u32 master = mt7621_spi_read(rs, MT7621_SPI_MASTER);
-+
-+	master &= ~(0xfff << 16);
-+	master |= 1 << 16;
-+	master |= 7 << 29;
-+	master |= 1 << 2;
-+	if (duplex)
-+		master |= 1 << 10;
++	void __iomem *addr = rs->base + reg;
 +
-+	mt7621_spi_write(rs, MT7621_SPI_MASTER, master);
++	iowrite32((ioread32(addr) | mask), addr);
 +}
 +
-+static void mt7621_spi_set_cs(struct spi_device *spi, int enable)
++static inline void mt7621_spi_clrbits(struct mt7621_spi *rs, u32 reg, u32 mask)
 +{
-+	struct mt7621_spi *rs = spidev_to_mt7621_spi(spi);
-+	int cs = spi->chip_select;
-+	u32 polar = 0;
++	void __iomem *addr = rs->base + reg;
 +
-+        mt7621_spi_reset(rs, cs);
-+	if (enable)
-+		polar = BIT(cs);
-+	mt7621_spi_write(rs, MT7621_SPI_POLAR, polar);
++	iowrite32((ioread32(addr) & ~mask), addr);
 +}
 +
-+static int mt7621_spi_prepare(struct spi_device *spi, unsigned int speed)
++static u32 mt7621_spi_baudrate_get(struct spi_device *spi, unsigned int speed)
 +{
 +	struct mt7621_spi *rs = spidev_to_mt7621_spi(spi);
 +	u32 rate;
-+	u32 reg;
-+
-+	dev_dbg(&spi->dev, "speed:%u\n", speed);
++	u32 prescale;
 +
-+	rate = DIV_ROUND_UP(rs->sys_freq, speed);
-+	dev_dbg(&spi->dev, "rate-1:%u\n", rate);
++	/*
++	 * the supported rates are: 2, 3, 4, ... 4096
++	 * round up as we look for equal or less speed
++	 */
++	rate = DIV_ROUND_UP(clk_get_rate(rs->clk), speed);
 +
-+	if (rate > 4097)
-+		return -EINVAL;
++	/* Convert the rate to SPI clock divisor value.	*/
++	prescale = rate - 2;
 +
-+	if (rate < 2)
-+		rate = 2;
-+
-+	reg = mt7621_spi_read(rs, MT7621_SPI_MASTER);
-+	reg &= ~(0xfff << 16);
-+	reg |= (rate - 2) << 16;
++	/* some tolerance. double and add 100 */
++	rs->wait_loops = (8 * HZ * loops_per_jiffy) /
++		(clk_get_rate(rs->clk) / rate);
++	rs->wait_loops = (rs->wait_loops << 1) + 100;
 +	rs->speed = speed;
 +
-+	reg &= ~MT7621_LSB_FIRST;
-+	if (spi->mode & SPI_LSB_FIRST)
-+		reg |= MT7621_LSB_FIRST;
 +
-+	reg &= ~(MT7621_CPHA | MT7621_CPOL);
-+	switch(spi->mode & (SPI_CPOL | SPI_CPHA)) {
-+		case SPI_MODE_0:
-+			break;
-+		case SPI_MODE_1:
-+			reg |= MT7621_CPHA;
-+			break;
-+		case SPI_MODE_2:
-+			reg |= MT7621_CPOL;
-+			break;
-+		case SPI_MODE_3:
-+			reg |= MT7621_CPOL | MT7621_CPHA;
-+			break;
-+	}
-+	mt7621_spi_write(rs, MT7621_SPI_MASTER, reg);
++	dev_dbg(&spi->dev, "speed: %lu/%u, rate: %u, prescal: %u, loops: %hu\n",
++			clk_get_rate(rs->clk) / rate, speed, rate, prescale,
++			rs->wait_loops);
 +
-+	return 0;
++	return (prescale << SPIMASTER_CLKSEL_OFFSET);
 +}
 +
-+static inline int mt7621_spi_wait_till_ready(struct spi_device *spi)
++static void mt7621_spi_set_cs(struct spi_device *spi, bool enable)
 +{
 +	struct mt7621_spi *rs = spidev_to_mt7621_spi(spi);
-+	int i;
++	u32 reg;
 +
-+	for (i = 0; i < RALINK_SPI_WAIT_MAX_LOOP; i++) {
-+		u32 status;
++	if (spi->mode & SPI_CS_HIGH)
++		enable = !enable;
++	enable = !enable;
 +
-+		status = mt7621_spi_read(rs, MT7621_SPI_TRANS);
-+		if ((status & SPITRANS_BUSY) == 0) {
-+			return 0;
-+		}
-+		cpu_relax();
-+		udelay(1);
++	reg = mt7621_spi_read(rs, MT7621_SPI_MASTER);
++	reg &= ~(SPIMASTER_CS_MASK << SPIMASTER_CS_OFFSET);
++
++	if (enable)
++		reg |= (spi->chip_select << SPIMASTER_CS_OFFSET);
++	else {
++		/* when disable just enable cs 8 instead */
++		reg |= (SPIMASTER_CS_MASK << SPIMASTER_CS_OFFSET);
 +	}
 +
++	mt7621_spi_write(rs, MT7621_SPI_MASTER, reg);
++}
++
++static inline int mt7621_spi_wait_ready(struct mt7621_spi *rs, int len)
++{
++	int loop = rs->wait_loops * len;
++
++	while ((mt7621_spi_read(rs, MT7621_SPI_TRANS) & SPITRANS_BUSY) && --loop)
++		cpu_relax();
++
++	if (loop)
++		return 0;
++
 +	return -ETIMEDOUT;
 +}
 +
-+static int mt7621_spi_transfer_half_duplex(struct spi_master *master,
-+					   struct spi_message *m)
++static void mt7621_dump_reg(struct spi_master *master, const char *func)
 +{
 +	struct mt7621_spi *rs = spi_master_get_devdata(master);
-+	struct spi_device *spi = m->spi;
-+	unsigned int speed = spi->max_speed_hz;
-+	struct spi_transfer *t = NULL;
-+	int status = 0;
-+	int i, len = 0;
-+	int rx_len = 0;
-+	u32 data[9] = { 0 };
-+	u32 val;
 +
-+	mt7621_spi_wait_till_ready(spi);
++	dev_dbg(&master->dev, "%s trans: %08x, opcode: %08x, data0: %08x, "
++			"data1: %08x, data2: %08x, data3: %08x, " \
++			"data4: %08x, data5: %08x, data6: %08x, " \
++			"data7: %08x, master: %08x, morebuf: %08x, " \
++			"qctl: %08x, status: %08x, polar: %08x, " \
++			"space: %08x\n",
++			func,
++			mt7621_spi_read(rs, MT7621_SPI_TRANS),
++			mt7621_spi_read(rs, MT7621_SPI_OPCODE),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0 + 4),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0 + 8),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0 + 12),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0 + 16),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0 + 20),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0 + 24),
++			mt7621_spi_read(rs, MT7621_SPI_DATA0 + 28),
++			mt7621_spi_read(rs, MT7621_SPI_MASTER),
++			mt7621_spi_read(rs, MT7621_SPI_MOREBUF),
++			mt7621_spi_read(rs, MT7621_SPI_QUEUE_CTL),
++			mt7621_spi_read(rs, MT7621_SPI_STATUS),
++			mt7621_spi_read(rs, MT7621_SPI_POLAR),
++			mt7621_spi_read(rs, MT7621_SPI_SPACE));
++}
 +
-+	list_for_each_entry(t, &m->transfers, transfer_list) {
-+		const u8 *buf = t->tx_buf;
++static void m25p_addr2cmd(unsigned int addr, u8 *cmd, int addr_width)
++{
++	/* opcode is in cmd[0] */
++	cmd[1] = addr >> (addr_width * 8 -  8);
++	cmd[2] = addr >> (addr_width * 8 - 16);
++	cmd[3] = addr >> (addr_width * 8 - 24);
++	cmd[4] = addr >> (addr_width * 8 - 32);
++}
 +
-+		if (t->rx_buf)
-+			rx_len += t->len;
++static unsigned int m25p_cmd2addr(int addr_width, const u8 *cmd)
++{
++	unsigned int addr;
 +
-+		if (!buf)
-+			continue;
++	/* opcode is in cmd[0] */
++	addr = cmd[1] << (addr_width * 8 - 8);
++	addr |= cmd[2] << (addr_width * 8 - 16);
++	addr |= cmd[3] << (addr_width * 8 - 24);
++	addr |= cmd[4] << (addr_width * 8 - 32);
 +
-+		if (WARN_ON(len + t->len > 36)) {
-+			status = -EIO;
-+			goto msg_done;
-+		}
++	return addr;
++}
 +
-+		for (i = 0; i < t->len; i++, len++)
-+			data[len / 4] |= buf[i] << (8 * (len & 3));
++static int setup_spi_cmd(struct mt7621_spi *rs, const u8 *tx, u32 len,
++		struct mt7621_spi_cmd *cmd)
++{
++	int ret = 0;
++
++	switch (tx[0]) {
++	case SPINOR_OP_READ:
++	case SPINOR_OP_READ_FAST:
++		cmd->flags = SPI_CMD_ADDR;
++	case SPINOR_OP_RDSR:
++	case SPINOR_OP_RDID:
++	case SPINOR_OP_RDFSR:
++	case SPINOR_OP_RDCR:
++		cmd->flags |= SPI_CMD_RX;
++		break;
++	case SPINOR_OP_BRWR:
++		if (tx[1] == BIT(7))
++			cmd->flags = SPI_CMD_EN4B;
++		else
++			cmd->flags = SPI_CMD_EX4B;
++	case SPINOR_OP_WRSR:
++		cmd->flags |= SPI_CMD_DATA | SPI_CMD_TX;
++		break;
++	case SPINOR_OP_PP:
++		cmd->flags = SPI_CMD_ADDR | SPI_CMD_TX;
++		break;
++	case SPINOR_OP_SE:
++		cmd->flags = SPI_CMD_ADDR;
++	case SPINOR_OP_WREN:
++	case SPINOR_OP_WRDI:
++	case SPINOR_OP_CHIP_ERASE:
++		break;
++	case SPINOR_OP_EN4B:
++		cmd->flags = SPI_CMD_EN4B;
++		break;
++	case SPINOR_OP_EX4B:
++		cmd->flags = SPI_CMD_EX4B;
++		break;
++	default:
++		ret = -EINVAL;
++		goto out;
 +	}
 +
-+	if (WARN_ON(rx_len > 32)) {
-+		status = -EIO;
-+		goto msg_done;
-+	}
++	cmd->opaddr_len = len;
++	cmd->opaddr_data = tx;
 +
-+	if (mt7621_spi_prepare(spi, speed)) {
-+		status = -EIO;
-+		goto msg_done;
-+	}
-+	data[0] = swab32(data[0]);
-+	if (len < 4)
-+		data[0] >>= (4 - len) * 8;
++	/* setup 4 bytes address */
++	if (cmd->flags & SPI_CMD_EN4B)
++		rs->addr_width = 4;
++	else if (cmd->flags & SPI_CMD_EX4B)
++		rs->addr_width = 3;
 +
-+	for (i = 0; i < len; i += 4)
-+		mt7621_spi_write(rs, MT7621_SPI_OPCODE + i, data[i / 4]);
++	/* address */
++	if (cmd->flags & SPI_CMD_ADDR)
++		cmd->addr = m25p_cmd2addr(rs->addr_width, tx);
++
++out:
++	return ret;
++}
 +
-+	val = (min_t(int, len, 4) * 8) << 24;
-+	if (len > 4)
-+		val |= (len - 4) * 8;
-+	val |= (rx_len * 8) << 12;
-+	mt7621_spi_write(rs, MT7621_SPI_MOREBUF, val);
++static int mt7621_spi_start(struct mt7621_spi *rs, struct mt7621_spi_data *data)
++{
++	int i, len, ret;
 +
-+	mt7621_spi_set_cs(spi, 1);
++	/* opcode/addr */
++	mt7621_spi_write(rs, MT7621_SPI_OPCODE, data->data[0]);
 +
-+	val = mt7621_spi_read(rs, MT7621_SPI_TRANS);
-+	val |= SPI_CTL_START;
-+	mt7621_spi_write(rs, MT7621_SPI_TRANS, val);
++	/* tx data */
++	len = data->cnt.mosi_bit >> 3;
++	for (i = 0; i < len; i += 4)
++		mt7621_spi_write(rs, (MT7621_SPI_DATA0 + i),
++				data->data[1 + (i / 4)]);
 +
-+	mt7621_spi_wait_till_ready(spi);
++	/* set more buf size */
++	mt7621_spi_write(rs, MT7621_SPI_MOREBUF, data->mb_reg);
 +
-+	mt7621_spi_set_cs(spi, 0);
++	/* start transaction */
++	mt7621_spi_setbits(rs, MT7621_SPI_TRANS, SPITRANS_START);
 +
-+	for (i = 0; i < rx_len; i += 4)
-+		data[i / 4] = mt7621_spi_read(rs, MT7621_SPI_DATA0 + i);
++	len = (data->cnt.cmd_bit + data->cnt.miso_bit + data->cnt.mosi_bit) >> 3;
++	ret = mt7621_spi_wait_ready(rs, len);
++	if (ret)
++		return ret;
 +
-+	m->actual_length = len + rx_len;
++	/* rx data */
++	len = data->cnt.miso_bit >> 3;
++	if (len)
++		for (i = 0; i < len; i += 4)
++			data->data[i / 4] = mt7621_spi_read(rs,
++					(MT7621_SPI_DATA0 + i));
 +
-+	len = 0;
-+	list_for_each_entry(t, &m->transfers, transfer_list) {
-+		u8 *buf = t->rx_buf;
++	return ret;
++}
 +
-+		if (!buf)
-+			continue;
++static int mt7621_read_sr(struct mt7621_spi *rs)
++{
++	struct mt7621_spi_data data = {0};
++	int ret;
 +
-+		for (i = 0; i < t->len; i++, len++)
-+			buf[i] = data[len / 4] >> (8 * (len & 3));
-+	}
++	data.cnt.cmd_bit = 8;
++	data.cnt.miso_bit = 8;
++	data.data[0] = SPINOR_OP_RDSR;
 +
-+msg_done:
-+	m->status = status;
-+	spi_finalize_current_message(master);
++	ret = mt7621_spi_start(rs, &data);
++	if (ret)
++		return ret;
 +
-+	return 0;
++	return (int)data.buf[0];
 +}
 +
-+static int mt7621_spi_transfer_full_duplex(struct spi_master *master,
-+					   struct spi_message *m)
++static int mt7621_wait_till_ready(struct mt7621_spi *rs)
 +{
-+	struct mt7621_spi *rs = spi_master_get_devdata(master);
-+	struct spi_device *spi = m->spi;
-+	unsigned int speed = spi->max_speed_hz;
-+	struct spi_transfer *t = NULL;
-+	int status = 0;
-+	int i, len = 0;
-+	int rx_len = 0;
-+	u32 data[9] = { 0 };
-+	u32 val = 0;
++	unsigned long deadline;
++	int sr;
++
++	deadline = jiffies + (40 * HZ);
 +
-+	mt7621_spi_wait_till_ready(spi);
++	do {
++		usleep_range(10, 100);
++
++		sr = mt7621_read_sr(rs);
++		if (sr < 0)
++			break;
++		else if (!(sr & SR_WIP))
++			return 0;
++	} while (!time_after_eq(jiffies, deadline));
 +
-+	list_for_each_entry(t, &m->transfers, transfer_list) {
-+		const u8 *buf = t->tx_buf;
++	return -ETIMEDOUT;
++}
 +
-+		if (t->rx_buf)
-+			rx_len += t->len;
++static int mt7621_write_enable(struct mt7621_spi *rs)
++{
++	struct mt7621_spi_data data = {0};
++	int ret;
 +
-+		if (!buf)
-+			continue;
++	data.cnt.cmd_bit = 8;
++	data.data[0] = SPINOR_OP_WREN;
 +
-+		if (WARN_ON(len + t->len > 16)) {
-+			status = -EIO;
-+			goto msg_done;
++	ret = mt7621_spi_start(rs, &data);
++
++	return ret;
++}
++
++static int mt7621_spi_transfer_one(struct spi_master *master,
++		struct spi_device *spi, struct spi_transfer *xfer)
++{
++	struct mt7621_spi *rs = spi_master_get_devdata(master);
++	struct mt7621_spi_cmd *cmd;
++	struct mt7621_spi_data data;
++	const u8 *tx;
++	u8 *rx;
++	int len, tx_len, rx_len, ret = 0;
++
++	cmd = &rs->cmd;
++	if (cmd->status == SPI_STATE_OPCODE) {
++		if (!xfer->tx_buf)
++			dev_err(&spi->dev, "only support spi flash device\n");
++
++		memset(cmd, 0, sizeof(*cmd));
++		ret = setup_spi_cmd(rs, xfer->tx_buf, xfer->len, cmd);
++		if (ret < 0) {
++			dev_err(&spi->dev, "unknown spi command %02x\n",
++					*(u8 *)xfer->tx_buf);
++			goto err;
 +		}
 +
-+		for (i = 0; i < t->len; i++, len++)
-+			data[len / 4] |= buf[i] << (8 * (len & 3));
++		/* need data at next transfer */
++		if ((cmd->flags & (SPI_CMD_TX | SPI_CMD_RX)) &&
++				!(cmd->flags & SPI_CMD_DATA)) {
++			cmd->status = SPI_STATE_DATA;
++			return ret;
++		}
++		/* just opcode and address. no need other data */
++		len = 0;
++		tx = rx = NULL;
++	} else {
++		if (((cmd->flags & SPI_CMD_TX) && !xfer->tx_buf) ||
++				((cmd->flags & SPI_CMD_RX) && !xfer->rx_buf)) {
++			dev_err(&spi->dev, "no spi data found\n");
++			ret = -EINVAL;
++			goto err;
++		}
++		len = xfer->len;
++		tx = xfer->tx_buf;
++		rx = xfer->rx_buf;
 +	}
 +
-+	if (WARN_ON(rx_len > 16)) {
-+		status = -EIO;
-+		goto msg_done;
-+	}
++	memcpy(data.buf, cmd->opaddr_data, cmd->opaddr_len);
++	do {
++		/* handle tx data */
++		if (tx) {
++			tx_len = min(len, (int)(36 - cmd->opaddr_len));
++
++			memcpy((data.buf + cmd->opaddr_len), tx, tx_len);
++			tx += tx_len;
++			len -= tx_len;
++			cmd->addr += tx_len;
++		} else
++			tx_len = 0;
++
++		tx_len += cmd->opaddr_len;
++		data.cnt.cmd_bit = min(tx_len * 8, 32);
++		data.cnt.mosi_bit = (tx_len * 8) - data.cnt.cmd_bit;
++
++		/* fill opaddr reg */
++		data.data[0] = cpu_to_be32(data.data[0]);
++		if (data.cnt.cmd_bit < 32)
++			data.data[0] >>= (32 - data.cnt.cmd_bit);
++
++		if (rx) {
++			rx_len = min(len, 32);
++			data.cnt.miso_bit = rx_len * 8;
++		} else
++			data.cnt.miso_bit = 0;
++
++		/* start transfer */
++		ret = mt7621_spi_start(rs, &data);
++		if (ret) {
++			dev_err(&spi->dev, "start wait timeout\n");
++			goto err;
++		}
 +
-+	if (mt7621_spi_prepare(spi, speed)) {
-+		status = -EIO;
-+		goto msg_done;
-+	}
++		/* handle rx data */
++		if (rx) {
++			memcpy(rx, data.buf, rx_len);
++			rx += rx_len;
++			len -= rx_len;
++			cmd->addr += rx_len;
++		}
 +
-+	for (i = 0; i < len; i += 4)
-+		mt7621_spi_write(rs, MT7621_SPI_DATA0 + i, data[i / 4]);
++		/* work around hw limit */
++		if (len) {
++			if (tx) {
++				ret = mt7621_wait_till_ready(rs);
++				if (ret) {
++					dev_err(&spi->dev, "wait timeout\n");
++					goto err;
++				}
++				ret = mt7621_write_enable(rs);
++				if (ret) {
++					dev_err(&spi->dev, "write enable timeout\n");
++					goto err;
++				}
++			}
++			if (cmd->flags & SPI_CMD_ADDR) {
++				memcpy(data.buf, cmd->opaddr_data,
++						cmd->opaddr_len);
++				/* update address for next loop */
++				m25p_addr2cmd(cmd->addr, data.buf,
++						rs->addr_width);
++			}
++		}
++	} while (len);
 +
-+	val |= len * 8;
-+	val |= (rx_len * 8) << 12;
-+	mt7621_spi_write(rs, MT7621_SPI_MOREBUF, val);
++err:
++	cmd->status = SPI_STATE_OPCODE;
++	if (ret)
++		mt7621_dump_reg(master, __func__);
 +
-+	mt7621_spi_set_cs(spi, 1);
++	return ret;
++}
 +
-+	val = mt7621_spi_read(rs, MT7621_SPI_TRANS);
-+	val |= SPI_CTL_START;
-+	mt7621_spi_write(rs, MT7621_SPI_TRANS, val);
++static void spi_set_cs(struct spi_device *spi, bool enable)
++{
++	if (spi->mode & SPI_CS_HIGH)
++		enable = !enable;
 +
-+	mt7621_spi_wait_till_ready(spi);
++	if (spi->master->set_cs)
++		spi->master->set_cs(spi, !enable);
++}
 +
-+	mt7621_spi_set_cs(spi, 0);
++static int mt7621_spi_setup(struct spi_device *spi)
++{
++	struct spi_master *master = spi->master;
++	struct mt7621_spi *rs = spi_master_get_devdata(master);
 +
-+	for (i = 0; i < rx_len; i += 4)
-+		data[i / 4] = mt7621_spi_read(rs, MT7621_SPI_DATA4 + i);
++	if ((spi->max_speed_hz > master->max_speed_hz) ||
++			(spi->max_speed_hz < master->min_speed_hz)) {
++		dev_err(&spi->dev, "invalide requested speed %d Hz\n",
++				spi->max_speed_hz);
++		return -EINVAL;
++	}
 +
-+	//m->actual_length = len + rx_len;
-+	m->actual_length = rx_len;
++	if (!(master->bits_per_word_mask &
++				BIT(spi->bits_per_word - 1))) {
++		dev_err(&spi->dev, "invalide bits_per_word %d\n",
++				spi->bits_per_word);
++		return -EINVAL;
++	}
 +
-+	len = 0;
-+	list_for_each_entry(t, &m->transfers, transfer_list) {
-+		u8 *buf = t->rx_buf;
++	/* chip polarity */
++	if (spi->mode & SPI_CS_HIGH)
++		mt7621_spi_setbits(rs, MT7621_SPI_POLAR,
++				(SPIPOL_CSPOL_HIGH << spi->chip_select));
++	else
++		mt7621_spi_clrbits(rs, MT7621_SPI_POLAR,
++				(SPIPOL_CSPOL_HIGH << spi->chip_select));
 +
-+		if (!buf)
-+			continue;
++	/* enable more buffer mode */
++	mt7621_spi_setbits(rs, MT7621_SPI_MASTER, SPIMASTER_MB_MODE);
 +
-+		for (i = 0; i < t->len; i++, len++)
-+			buf[i] = data[len / 4] >> (8 * (len & 3));
-+	}
++	/* deselected the spi device */
++	spi_set_cs(spi, false);
 +
-+msg_done:
-+	m->status = status;
-+	spi_finalize_current_message(master);
++	mt7621_dump_reg(master, __func__);
 +
 +	return 0;
 +}
 +
-+static int mt7621_spi_transfer_one_message(struct spi_master *master,
-+					   struct spi_message *m)
++static int mt7621_spi_prepare_message(struct spi_master *master,
++		struct spi_message *msg)
 +{
-+	struct spi_device *spi = m->spi;
-+	int cs = spi->chip_select;
-+
-+	if (cs)
-+		return mt7621_spi_transfer_full_duplex(master, m);
-+	return mt7621_spi_transfer_half_duplex(master, m);
-+}
++	struct mt7621_spi *rs = spi_master_get_devdata(master);
++	struct spi_device *spi = msg->spi;
++	u32 reg;
 +
-+static int mt7621_spi_setup(struct spi_device *spi)
-+{
-+	struct mt7621_spi *rs = spidev_to_mt7621_spi(spi);
++	if ((rs->mode == spi->mode) && (rs->speed == spi->max_speed_hz))
++		return 0;
 +
-+	if ((spi->max_speed_hz == 0) ||
-+		(spi->max_speed_hz > (rs->sys_freq / 2)))
-+		spi->max_speed_hz = (rs->sys_freq / 2);
++	reg = mt7621_spi_read(rs, MT7621_SPI_MASTER);
++	reg &= ~((SPIMASTER_CLKSEL_MASK << SPIMASTER_CLKSEL_OFFSET) |
++			SPIMASTER_CPHA | SPIMASTER_CPOL |
++			SPIMASTER_LSB);
 +
-+	if (spi->max_speed_hz < (rs->sys_freq / 4097)) {
-+		dev_err(&spi->dev, "setup: requested speed is too low %d Hz\n",
-+			spi->max_speed_hz);
-+		return -EINVAL;
++	/* LSB */
++	if (spi->mode & SPI_LSB_FIRST)
++		reg |= SPIMASTER_LSB;
++
++	/* spi mode */
++	switch (spi->mode & (SPI_CPOL | SPI_CPHA)) {
++	case SPI_MODE_0:
++		break;
++	case SPI_MODE_1:
++		reg |= SPIMASTER_CPHA;
++		break;
++	case SPI_MODE_2:
++		reg |= SPIMASTER_CPOL;
++		break;
++	case SPI_MODE_3:
++		reg |= SPIMASTER_CPOL | SPIMASTER_CPHA;
++		break;
 +	}
++	rs->mode = spi->mode;
++
++	/* clock divide */
++	reg |= mt7621_spi_baudrate_get(spi, spi->max_speed_hz);
++
++	mt7621_spi_write(rs, MT7621_SPI_MASTER, reg);
 +
 +	return 0;
 +}
@@ -414,17 +697,14 @@ 
 +	const struct of_device_id *match;
 +	struct spi_master *master;
 +	struct mt7621_spi *rs;
-+	unsigned long flags;
 +	void __iomem *base;
 +	struct resource *r;
-+	int status = 0;
 +	struct clk *clk;
-+	struct mt7621_spi_ops *ops;
++	int ret;
 +
 +	match = of_match_device(mt7621_spi_match, &pdev->dev);
 +	if (!match)
 +		return -EINVAL;
-+	ops = (struct mt7621_spi_ops *)match->data;
 +
 +	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 +	base = devm_ioremap_resource(&pdev->dev, r);
@@ -433,45 +713,58 @@ 
 +
 +	clk = devm_clk_get(&pdev->dev, NULL);
 +	if (IS_ERR(clk)) {
-+		dev_err(&pdev->dev, "unable to get SYS clock, err=%d\n",
-+			status);
++		dev_err(&pdev->dev, "unable to get SYS clock\n");
 +		return PTR_ERR(clk);
 +	}
 +
-+	status = clk_prepare_enable(clk);
-+	if (status)
-+		return status;
++	ret = clk_prepare_enable(clk);
++	if (ret)
++		goto err_clk;
 +
 +	master = spi_alloc_master(&pdev->dev, sizeof(*rs));
 +	if (master == NULL) {
-+		dev_info(&pdev->dev, "master allocation failed\n");
-+		return -ENOMEM;
++		dev_err(&pdev->dev, "master allocation failed\n");
++		ret = -ENOMEM;
++		goto err_clk;
 +	}
 +
-+	master->mode_bits = RT2880_SPI_MODE_BITS;
-+
-+	master->setup = mt7621_spi_setup;
-+	master->transfer_one_message = mt7621_spi_transfer_one_message;
-+	master->bits_per_word_mask = SPI_BPW_MASK(8);
-+	master->dev.of_node = pdev->dev.of_node;
 +	master->num_chipselect = 2;
++	master->dev.of_node = pdev->dev.of_node;
++	master->mode_bits = MT7621_SPI_MODE_BITS;
++	master->bits_per_word_mask = SPI_BPW_MASK(8);
++	master->min_speed_hz = clk_get_rate(clk) / 4097;
++	master->max_speed_hz = clk_get_rate(clk) / 2;
++	master->flags = SPI_MASTER_HALF_DUPLEX;
++	master->setup = mt7621_spi_setup;
++	master->prepare_message = mt7621_spi_prepare_message;
++	master->set_cs = mt7621_spi_set_cs;
++	master->transfer_one = mt7621_spi_transfer_one;
 +
 +	dev_set_drvdata(&pdev->dev, master);
 +
 +	rs = spi_master_get_devdata(master);
++	rs->master = master;
 +	rs->base = base;
 +	rs->clk = clk;
-+	rs->master = master;
-+	rs->sys_freq = clk_get_rate(rs->clk);
-+	rs->ops = ops;
-+	dev_info(&pdev->dev, "sys_freq: %u\n", rs->sys_freq);
-+	spin_lock_irqsave(&rs->lock, flags);
++	rs->addr_width = 3;
 +
 +	device_reset(&pdev->dev);
 +
-+	mt7621_spi_reset(rs, 0);
++	ret = devm_spi_register_master(&pdev->dev, master);
++	if (ret < 0) {
++		dev_err(&pdev->dev, "devm_spi_register_master error.\n");
++		goto err_master;
++	}
++
++	return ret;
++
++err_master:
++	spi_master_put(master);
++	kfree(master);
++err_clk:
++	clk_disable_unprepare(clk);
 +
-+	return spi_register_master(master);
++	return ret;
 +}
 +
 +static int mt7621_spi_remove(struct platform_device *pdev)
@@ -482,8 +775,7 @@ 
 +	master = dev_get_drvdata(&pdev->dev);
 +	rs = spi_master_get_devdata(master);
 +
-+	clk_disable(rs->clk);
-+	spi_unregister_master(master);
++	clk_disable_unprepare(rs->clk);
 +
 +	return 0;
 +}