diff mbox series

[8/8] video: Convert UTF-8 input stream to the 437 code page

Message ID 20220110005638.21599-9-andre.przywara@arm.com
State Deferred
Delegated to: Tom Rini
Headers show
Series video: improve UEFI experience on DM_VIDEO | expand

Commit Message

Andre Przywara Jan. 10, 2022, 12:56 a.m. UTC
The bitmap fonts (VGA 8x16 and friends) we import from Linux use the
437 code page to map their glyphs. For U-Boot's own purposes this is
probably fine, but UEFI applications output Unicode, which only matches
in the very basic first 127 characters.

Add a function that converts UTF-8 character sequences into the
respective CP437 code point, as far as the characters defined in there
allow this. This includes quite some international and box drawing
characters, which are used by UEFI applications.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 drivers/video/Makefile            |   1 +
 drivers/video/utf8_cp437.c        | 169 ++++++++++++++++++++++++++++++
 drivers/video/vidconsole-uclass.c |   6 +-
 include/video_console.h           |   9 ++
 4 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 drivers/video/utf8_cp437.c

Comments

Heinrich Schuchardt Jan. 10, 2022, 8:18 a.m. UTC | #1
/On 1/10/22 01:56, Andre Przywara wrote:
> The bitmap fonts (VGA 8x16 and friends) we import from Linux use the
> 437 code page to map their glyphs. For U-Boot's own purposes this is
> probably fine, but UEFI applications output Unicode, which only matches
> in the very basic first 127 characters.
> 
> Add a function that converts UTF-8 character sequences into the
> respective CP437 code point, as far as the characters defined in there
> allow this. This includes quite some international and box drawing
> characters, which are used by UEFI applications.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>   drivers/video/Makefile            |   1 +
>   drivers/video/utf8_cp437.c        | 169 ++++++++++++++++++++++++++++++
>   drivers/video/vidconsole-uclass.c |   6 +-
>   include/video_console.h           |   9 ++
>   4 files changed, 184 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/video/utf8_cp437.c
> 
> diff --git a/drivers/video/Makefile b/drivers/video/Makefile
> index 8956b5f9b00..5f9823dff9e 100644
> --- a/drivers/video/Makefile
> +++ b/drivers/video/Makefile
> @@ -14,6 +14,7 @@ obj-$(CONFIG_DISPLAY) += display-uclass.o
>   obj-$(CONFIG_VIDEO_MIPI_DSI) += dsi-host-uclass.o
>   obj-$(CONFIG_DM_VIDEO) += video-uclass.o vidconsole-uclass.o
>   obj-$(CONFIG_DM_VIDEO) += video_bmp.o
> +obj-$(CONFIG_DM_VIDEO) += utf8_cp437.o
>   obj-$(CONFIG_PANEL) += panel-uclass.o
>   obj-$(CONFIG_DM_PANEL_HX8238D) += hx8238d.o
>   obj-$(CONFIG_SIMPLE_PANEL) += simple_panel.o
> diff --git a/drivers/video/utf8_cp437.c b/drivers/video/utf8_cp437.c
> new file mode 100644
> index 00000000000..cab68b92b6e
> --- /dev/null
> +++ b/drivers/video/utf8_cp437.c

A translation from Unicode to CP437 is needed in the FAT driver (but 
missing), in Unicode Collation Protocol and here in video. So this 
functionality should live in lib/charset.c.

Please, have a look at efi_fat_to_str() in 
lib/efi_loader/efi_unicode_collation.c. We should avoid code duplication.

Maybe we should drop CP1250 support to make our effort simpler? No 
defconfig uses it.

> @@ -0,0 +1,169 @@
> +/*
> + * Convert UTF-8 bytes into a code page 437 character.
> + * Based on the table in the Code_page_437 Wikipedia page.
> + */
> +
> +#include <linux/types.h>
> +
> +uint8_t code_points_00a0[] = {
> +	255, 173, 155, 156,   7, 157,   7,  21,
> +	  7,   7, 166, 174, 170,   7,   7,   7,
> +	248, 241, 253,   7,   7, 230,  20, 250,
> +	  7,   7, 167, 175, 172, 171,   7, 168,
> +	  7,   7,   7,   7, 142, 143, 146, 128,
> +	  7, 144,   7,   7,   7,   7,   7,   7,
> +	  7, 165,   7,   7,   7,   7, 153,   7,
> +	  7,   7,   7,   7, 154,   7,   7, 225,
> +	133, 160, 131,   7, 132, 134, 145, 135,
> +	138, 130, 136, 137, 141, 161, 140, 139,
> +	  7, 164, 149, 162, 147,   7, 148, 246,
> +	  7, 151, 163, 150, 129,   7,   7, 152,
> +};
> +
> +uint8_t code_points_2550[] = {
> +	205, 186, 213, 214, 201, 184, 183, 187,
> +	212, 211, 200, 190, 189, 188, 198, 199,
> +	204, 181, 182, 185, 209, 210, 203, 207,
> +	208, 202, 216, 215, 206
> +};
> +
> +static uint8_t utf8_convert_11bit(uint16_t code)
> +{
> +	switch (code) {
> +	case 0x0192: return 159;
> +	case 0x0393: return 226;
> +	case 0x0398: return 233;
> +	case 0x03A3: return 228;
> +	case 0x03A6: return 232;
> +	case 0x03A9: return 234;
> +	case 0x03B1: return 224;
> +	case 0x03B4: return 235;
> +	case 0x03B5: return 238;
> +	case 0x03C0: return 227;
> +	case 0x03C3: return 229;
> +	case 0x03C4: return 231;
> +	case 0x03C6: return 237;
> +	}
> +
> +	return 0;
> +};
> +
> +static uint8_t utf8_convert_2xxx(uint16_t code)


This is duplicate to include/cp437.h

> +{
> +	switch (code) {
> +	case 0x2022: return 7;
> +	case 0x203C: return 19;
> +	case 0x207F: return 252;
> +	case 0x20A7: return 158;
> +	case 0x2190: return 27;
> +	case 0x2191: return 24;
> +	case 0x2192: return 26;
> +	case 0x2193: return 25;
> +	case 0x2194: return 29;
> +	case 0x2195: return 18;
> +	case 0x21A8: return 23;
> +	case 0x2219: return 249;
> +	case 0x221A: return 251;
> +	case 0x221E: return 236;
> +	case 0x221F: return 28;
> +	case 0x2229: return 239;
> +	case 0x2248: return 247;
> +	case 0x2261: return 240;
> +	case 0x2264: return 243;
> +	case 0x2265: return 242;
> +	case 0x2310: return 169;
> +	case 0x2320: return 244;
> +	case 0x2321: return 245;
> +	case 0x2500: return 196;
> +	case 0x2502: return 179;
> +	case 0x250C: return 218;
> +	case 0x2510: return 191;
> +	case 0x2514: return 192;
> +	case 0x2518: return 217;
> +	case 0x251C: return 195;
> +	case 0x2524: return 180;
> +	case 0x252C: return 194;
> +	case 0x2534: return 193;
> +	case 0x253C: return 197;
> +	case 0x2580: return 223;
> +	case 0x2584: return 220;
> +	case 0x2588: return 219;
> +	case 0x258C: return 221;
> +	case 0x2590: return 222;
> +	case 0x2591: return 176;
> +	case 0x2592: return 177;
> +	case 0x2593: return 178;
> +	case 0x25A0: return 254;
> +	case 0x25AC: return 22;
> +	case 0x25B2: return 30;
> +	case 0x25BA: return 16;
> +	case 0x25BC: return 31;
> +	case 0x25C4: return 17;
> +	case 0x25CB: return 9;
> +	case 0x25D8: return 8;
> +	case 0x25D9: return 10;
> +	case 0x263A: return 1;
> +	case 0x263B: return 2;
> +	case 0x263C: return 15;
> +	case 0x2640: return 12;
> +	case 0x2642: return 11;
> +	case 0x2660: return 6;
> +	case 0x2663: return 5;
> +	case 0x2665: return 3;
> +	case 0x2666: return 4;
> +	case 0x266A: return 13;
> +	case 0x266B: return 14;
> +	}
> +
> +	return 0;
> +}
> +
> +uint8_t convert_uc16_to_cp437(uint16_t code)

We should not duplicate efi_fat_to_str() but use a common function.

> +{
> +	if (code < 0x7f)		// ASCII
> +		return code;
> +	if (code < 0xa0)		// high control characters
> +		return code;
> +	if (code < 0x100)		// international characters
> +		return code_points_00a0[code - 0xa0];
> +	if (code < 0x800)
> +		return utf8_convert_11bit(code);
> +	if (code >= 0x2550 && code < 0x256d)	// block graphics
> +		return code_points_2550[code - 0x2550];

How about ÄÖÜß and other European letters?

> +
> +	return utf8_convert_2xxx(code);
> +}
> +
> +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc)
> +{
> +	int shift;
> +	uint16_t ucs;
> +
> +	if (c < 127)			// ASCII
> +		return c;
> +	if (c == 127)
> +		return 8;		// DEL (?)
> +
> +	switch (c & 0xf0) {
> +	case 0xc0: case 0xd0:		// two bytes sequence
> +		*esc = (1U << 24) | ((c & 0x1f) << 6);
> +		return 0;
> +	case 0xe0:			// three bytes sequence
> +		*esc = (2U << 24) | ((c & 0x0f) << 12);
> +		return 0;
> +	case 0xf0:			// four bytes sequence
> +		*esc = (3U << 24) | ((c & 0x07) << 18);
> +		return 0;
> +	case 0x80: case 0x90: case 0xa0: case 0xb0:	// continuation
> +		shift = (*esc >> 24) - 1;
> +		ucs = *esc & 0xffffff;
> +		if (shift) {
> +			*esc = (shift << 24) | ucs | (c & 0x3f) << (shift * 6);
> +			return 0;
> +		}
> +		*esc = 0;
> +		return convert_uc16_to_cp437(ucs | (c & 0x3f));
> +	}
> +
> +	return 0;
> +}
> diff --git a/drivers/video/vidconsole-uclass.c b/drivers/video/vidconsole-uclass.c
> index 420fd86f9ac..ca6e1a2620c 100644
> --- a/drivers/video/vidconsole-uclass.c
> +++ b/drivers/video/vidconsole-uclass.c
> @@ -546,6 +546,7 @@ static int vidconsole_output_glyph(struct udevice *dev, char ch)
>   int vidconsole_put_char(struct udevice *dev, char ch)
>   {
>   	struct vidconsole_priv *priv = dev_get_uclass_priv(dev);
> +	uint8_t cp437;
>   	int ret;
>   
>   	/*
> @@ -587,7 +588,10 @@ int vidconsole_put_char(struct udevice *dev, char ch)
>   		priv->last_ch = 0;
>   		break;
>   	default:
> -		ret = vidconsole_output_glyph(dev, ch);
> +		cp437 = convert_utf8_to_cp437(ch, &priv->ucs);
> +		if (cp437 == 0)
> +			return 0;
> +		ret = vidconsole_output_glyph(dev, cp437);
>   		if (ret < 0)
>   			return ret;
>   		break;
> diff --git a/include/video_console.h b/include/video_console.h

This should go to include/charset.h.

Best regards

Heinrich

> index a908f1412e8..f2d05e7f4e7 100644
> --- a/include/video_console.h
> +++ b/include/video_console.h
> @@ -83,6 +83,7 @@ struct vidconsole_priv {
>   	int escape_len;
>   	int row_saved;
>   	int col_saved;
> +	u32 ucs;
>   	bool cursor_visible;
>   	char escape_buf[32];
>   };
> @@ -304,6 +305,14 @@ static inline int vidconsole_memmove(struct udevice *dev, void *dst,
>   	return 0;
>   }
>   
> +/*
> + * Convert an UTF-8 byte into the corresponding character in the CP437
> + * code page. Returns 0 if that character is part of a multi-byte sequence.
> + * for which *esc holds the state of. Repeatedly feed in more bytes until
> + * the return value returns a non-0 character.

Please, follow the style described in 
https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html#function-documentation. 
This will allow inclusion in the HTML documentation (see 
https://u-boot.readthedocs.io/en/latest/api/index.html).

Best regards

Heinrich

> + */
> +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc);
> +
>   #endif
>   
>   #endif
Andre Przywara Jan. 10, 2022, 11:08 a.m. UTC | #2
On Mon, 10 Jan 2022 09:18:45 +0100
Heinrich Schuchardt <heinrich.schuchardt@canonical.com> wrote:

Hi Heinrich,

> /On 1/10/22 01:56, Andre Przywara wrote:
> > The bitmap fonts (VGA 8x16 and friends) we import from Linux use the
> > 437 code page to map their glyphs. For U-Boot's own purposes this is
> > probably fine, but UEFI applications output Unicode, which only matches
> > in the very basic first 127 characters.
> > 
> > Add a function that converts UTF-8 character sequences into the
> > respective CP437 code point, as far as the characters defined in there
> > allow this. This includes quite some international and box drawing
> > characters, which are used by UEFI applications.
> > 
> > Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> > ---
> >   drivers/video/Makefile            |   1 +
> >   drivers/video/utf8_cp437.c        | 169 ++++++++++++++++++++++++++++++
> >   drivers/video/vidconsole-uclass.c |   6 +-
> >   include/video_console.h           |   9 ++
> >   4 files changed, 184 insertions(+), 1 deletion(-)
> >   create mode 100644 drivers/video/utf8_cp437.c
> > 
> > diff --git a/drivers/video/Makefile b/drivers/video/Makefile
> > index 8956b5f9b00..5f9823dff9e 100644
> > --- a/drivers/video/Makefile
> > +++ b/drivers/video/Makefile
> > @@ -14,6 +14,7 @@ obj-$(CONFIG_DISPLAY) += display-uclass.o
> >   obj-$(CONFIG_VIDEO_MIPI_DSI) += dsi-host-uclass.o
> >   obj-$(CONFIG_DM_VIDEO) += video-uclass.o vidconsole-uclass.o
> >   obj-$(CONFIG_DM_VIDEO) += video_bmp.o
> > +obj-$(CONFIG_DM_VIDEO) += utf8_cp437.o
> >   obj-$(CONFIG_PANEL) += panel-uclass.o
> >   obj-$(CONFIG_DM_PANEL_HX8238D) += hx8238d.o
> >   obj-$(CONFIG_SIMPLE_PANEL) += simple_panel.o
> > diff --git a/drivers/video/utf8_cp437.c b/drivers/video/utf8_cp437.c
> > new file mode 100644
> > index 00000000000..cab68b92b6e
> > --- /dev/null
> > +++ b/drivers/video/utf8_cp437.c  
> 
> A translation from Unicode to CP437 is needed in the FAT driver (but 
> missing), in Unicode Collation Protocol and here in video. So this 
> functionality should live in lib/charset.c.
> Please, have a look at efi_fat_to_str() in 
> lib/efi_loader/efi_unicode_collation.c. We should avoid code duplication.

Ah, didn't know, thanks for the heads up! Happy to move the code in there,
and will also look for prior art.

> Maybe we should drop CP1250 support to make our effort simpler? No 
> defconfig uses it.

I need to have a closer look.

> 
> > @@ -0,0 +1,169 @@
> > +/*
> > + * Convert UTF-8 bytes into a code page 437 character.
> > + * Based on the table in the Code_page_437 Wikipedia page.
> > + */
> > +
> > +#include <linux/types.h>
> > +
> > +uint8_t code_points_00a0[] = {
> > +	255, 173, 155, 156,   7, 157,   7,  21,
> > +	  7,   7, 166, 174, 170,   7,   7,   7,
> > +	248, 241, 253,   7,   7, 230,  20, 250,
> > +	  7,   7, 167, 175, 172, 171,   7, 168,
> > +	  7,   7,   7,   7, 142, 143, 146, 128,
> > +	  7, 144,   7,   7,   7,   7,   7,   7,
> > +	  7, 165,   7,   7,   7,   7, 153,   7,
> > +	  7,   7,   7,   7, 154,   7,   7, 225,
> > +	133, 160, 131,   7, 132, 134, 145, 135,
> > +	138, 130, 136, 137, 141, 161, 140, 139,
> > +	  7, 164, 149, 162, 147,   7, 148, 246,
> > +	  7, 151, 163, 150, 129,   7,   7, 152,
> > +};
> > +
> > +uint8_t code_points_2550[] = {
> > +	205, 186, 213, 214, 201, 184, 183, 187,
> > +	212, 211, 200, 190, 189, 188, 198, 199,
> > +	204, 181, 182, 185, 209, 210, 203, 207,
> > +	208, 202, 216, 215, 206
> > +};
> > +
> > +static uint8_t utf8_convert_11bit(uint16_t code)
> > +{
> > +	switch (code) {
> > +	case 0x0192: return 159;
> > +	case 0x0393: return 226;
> > +	case 0x0398: return 233;
> > +	case 0x03A3: return 228;
> > +	case 0x03A6: return 232;
> > +	case 0x03A9: return 234;
> > +	case 0x03B1: return 224;
> > +	case 0x03B4: return 235;
> > +	case 0x03B5: return 238;
> > +	case 0x03C0: return 227;
> > +	case 0x03C3: return 229;
> > +	case 0x03C4: return 231;
> > +	case 0x03C6: return 237;
> > +	}
> > +
> > +	return 0;
> > +};
> > +
> > +static uint8_t utf8_convert_2xxx(uint16_t code)  
> 
> 
> This is duplicate to include/cp437.h

But this is the other way around, isn't it? The above is from CP437 to
UCS-2, this one here is from UCS-2 to CP437 (and it's misnamed).
But I can certainly move it in there.

> > +{
> > +	switch (code) {
> > +	case 0x2022: return 7;
> > +	case 0x203C: return 19;
> > +	case 0x207F: return 252;
> > +	case 0x20A7: return 158;
> > +	case 0x2190: return 27;
> > +	case 0x2191: return 24;
> > +	case 0x2192: return 26;
> > +	case 0x2193: return 25;
> > +	case 0x2194: return 29;
> > +	case 0x2195: return 18;
> > +	case 0x21A8: return 23;
> > +	case 0x2219: return 249;
> > +	case 0x221A: return 251;
> > +	case 0x221E: return 236;
> > +	case 0x221F: return 28;
> > +	case 0x2229: return 239;
> > +	case 0x2248: return 247;
> > +	case 0x2261: return 240;
> > +	case 0x2264: return 243;
> > +	case 0x2265: return 242;
> > +	case 0x2310: return 169;
> > +	case 0x2320: return 244;
> > +	case 0x2321: return 245;
> > +	case 0x2500: return 196;
> > +	case 0x2502: return 179;
> > +	case 0x250C: return 218;
> > +	case 0x2510: return 191;
> > +	case 0x2514: return 192;
> > +	case 0x2518: return 217;
> > +	case 0x251C: return 195;
> > +	case 0x2524: return 180;
> > +	case 0x252C: return 194;
> > +	case 0x2534: return 193;
> > +	case 0x253C: return 197;
> > +	case 0x2580: return 223;
> > +	case 0x2584: return 220;
> > +	case 0x2588: return 219;
> > +	case 0x258C: return 221;
> > +	case 0x2590: return 222;
> > +	case 0x2591: return 176;
> > +	case 0x2592: return 177;
> > +	case 0x2593: return 178;
> > +	case 0x25A0: return 254;
> > +	case 0x25AC: return 22;
> > +	case 0x25B2: return 30;
> > +	case 0x25BA: return 16;
> > +	case 0x25BC: return 31;
> > +	case 0x25C4: return 17;
> > +	case 0x25CB: return 9;
> > +	case 0x25D8: return 8;
> > +	case 0x25D9: return 10;
> > +	case 0x263A: return 1;
> > +	case 0x263B: return 2;
> > +	case 0x263C: return 15;
> > +	case 0x2640: return 12;
> > +	case 0x2642: return 11;
> > +	case 0x2660: return 6;
> > +	case 0x2663: return 5;
> > +	case 0x2665: return 3;
> > +	case 0x2666: return 4;
> > +	case 0x266A: return 13;
> > +	case 0x266B: return 14;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +uint8_t convert_uc16_to_cp437(uint16_t code)  
> 
> We should not duplicate efi_fat_to_str() but use a common function.

Definitely, I wasn't aware of this code, thanks for the heads up.

> > +{
> > +	if (code < 0x7f)		// ASCII
> > +		return code;
> > +	if (code < 0xa0)		// high control characters
> > +		return code;
> > +	if (code < 0x100)		// international characters
> > +		return code_points_00a0[code - 0xa0];
> > +	if (code < 0x800)
> > +		return utf8_convert_11bit(code);
> > +	if (code >= 0x2550 && code < 0x256d)	// block graphics
> > +		return code_points_2550[code - 0x2550];  
> 
> How about ÄÖÜß and other European letters?

Any "Übergrößengeschäft" should be covered by the international
characters section above, between 160 and 255. Do you miss anything in
particular?

> > +
> > +	return utf8_convert_2xxx(code);
> > +}
> > +
> > +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc)
> > +{
> > +	int shift;
> > +	uint16_t ucs;
> > +
> > +	if (c < 127)			// ASCII
> > +		return c;
> > +	if (c == 127)
> > +		return 8;		// DEL (?)
> > +
> > +	switch (c & 0xf0) {
> > +	case 0xc0: case 0xd0:		// two bytes sequence
> > +		*esc = (1U << 24) | ((c & 0x1f) << 6);
> > +		return 0;
> > +	case 0xe0:			// three bytes sequence
> > +		*esc = (2U << 24) | ((c & 0x0f) << 12);
> > +		return 0;
> > +	case 0xf0:			// four bytes sequence
> > +		*esc = (3U << 24) | ((c & 0x07) << 18);
> > +		return 0;
> > +	case 0x80: case 0x90: case 0xa0: case 0xb0:	// continuation
> > +		shift = (*esc >> 24) - 1;
> > +		ucs = *esc & 0xffffff;
> > +		if (shift) {
> > +			*esc = (shift << 24) | ucs | (c & 0x3f) << (shift * 6);
> > +			return 0;
> > +		}
> > +		*esc = 0;
> > +		return convert_uc16_to_cp437(ucs | (c & 0x3f));
> > +	}
> > +
> > +	return 0;
> > +}
> > diff --git a/drivers/video/vidconsole-uclass.c b/drivers/video/vidconsole-uclass.c
> > index 420fd86f9ac..ca6e1a2620c 100644
> > --- a/drivers/video/vidconsole-uclass.c
> > +++ b/drivers/video/vidconsole-uclass.c
> > @@ -546,6 +546,7 @@ static int vidconsole_output_glyph(struct udevice *dev, char ch)
> >   int vidconsole_put_char(struct udevice *dev, char ch)
> >   {
> >   	struct vidconsole_priv *priv = dev_get_uclass_priv(dev);
> > +	uint8_t cp437;
> >   	int ret;
> >   
> >   	/*
> > @@ -587,7 +588,10 @@ int vidconsole_put_char(struct udevice *dev, char ch)
> >   		priv->last_ch = 0;
> >   		break;
> >   	default:
> > -		ret = vidconsole_output_glyph(dev, ch);
> > +		cp437 = convert_utf8_to_cp437(ch, &priv->ucs);
> > +		if (cp437 == 0)
> > +			return 0;
> > +		ret = vidconsole_output_glyph(dev, cp437);
> >   		if (ret < 0)
> >   			return ret;
> >   		break;
> > diff --git a/include/video_console.h b/include/video_console.h  
> 
> This should go to include/charset.h.
> 
> Best regards
> 
> Heinrich
> 
> > index a908f1412e8..f2d05e7f4e7 100644
> > --- a/include/video_console.h
> > +++ b/include/video_console.h
> > @@ -83,6 +83,7 @@ struct vidconsole_priv {
> >   	int escape_len;
> >   	int row_saved;
> >   	int col_saved;
> > +	u32 ucs;
> >   	bool cursor_visible;
> >   	char escape_buf[32];
> >   };
> > @@ -304,6 +305,14 @@ static inline int vidconsole_memmove(struct udevice *dev, void *dst,
> >   	return 0;
> >   }
> >   
> > +/*
> > + * Convert an UTF-8 byte into the corresponding character in the CP437
> > + * code page. Returns 0 if that character is part of a multi-byte sequence.
> > + * for which *esc holds the state of. Repeatedly feed in more bytes until
> > + * the return value returns a non-0 character.  
> 
> Please, follow the style described in 
> https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html#function-documentation. 
> This will allow inclusion in the HTML documentation (see 
> https://u-boot.readthedocs.io/en/latest/api/index.html).
> 

Sure, I love and use kerneldoc, just missed that in that three-year old
patch ;-)

Cheers,
Andre


> 
> > + */
> > +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc);
> > +
> >   #endif
> >   
> >   #endif  
>
Simon Glass March 13, 2022, 10:23 p.m. UTC | #3
Hi Andre,

On Sun, 9 Jan 2022 at 17:57, Andre Przywara <andre.przywara@arm.com> wrote:
>
> The bitmap fonts (VGA 8x16 and friends) we import from Linux use the
> 437 code page to map their glyphs. For U-Boot's own purposes this is
> probably fine, but UEFI applications output Unicode, which only matches
> in the very basic first 127 characters.
>
> Add a function that converts UTF-8 character sequences into the
> respective CP437 code point, as far as the characters defined in there
> allow this. This includes quite some international and box drawing
> characters, which are used by UEFI applications.
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>  drivers/video/Makefile            |   1 +
>  drivers/video/utf8_cp437.c        | 169 ++++++++++++++++++++++++++++++
>  drivers/video/vidconsole-uclass.c |   6 +-
>  include/video_console.h           |   9 ++
>  4 files changed, 184 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/video/utf8_cp437.c
>

[..]

This code should only be enabled if UEFI is being used, to reduce code size.

> diff --git a/include/video_console.h b/include/video_console.h
> index a908f1412e8..f2d05e7f4e7 100644
> --- a/include/video_console.h
> +++ b/include/video_console.h
> @@ -83,6 +83,7 @@ struct vidconsole_priv {
>         int escape_len;
>         int row_saved;
>         int col_saved;
> +       u32 ucs;
>         bool cursor_visible;
>         char escape_buf[32];
>  };
> @@ -304,6 +305,14 @@ static inline int vidconsole_memmove(struct udevice *dev, void *dst,
>         return 0;
>  }
>
> +/*
> + * Convert an UTF-8 byte into the corresponding character in the CP437
> + * code page. Returns 0 if that character is part of a multi-byte sequence.
> + * for which *esc holds the state of. Repeatedly feed in more bytes until
> + * the return value returns a non-0 character.
> + */
> +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc);

Just a note that this should be below the next #endif  :

> +
>  #endif

<here>

>
>  #endif
> --
> 2.17.6
>

Regards,
Simon
diff mbox series

Patch

diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index 8956b5f9b00..5f9823dff9e 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -14,6 +14,7 @@  obj-$(CONFIG_DISPLAY) += display-uclass.o
 obj-$(CONFIG_VIDEO_MIPI_DSI) += dsi-host-uclass.o
 obj-$(CONFIG_DM_VIDEO) += video-uclass.o vidconsole-uclass.o
 obj-$(CONFIG_DM_VIDEO) += video_bmp.o
+obj-$(CONFIG_DM_VIDEO) += utf8_cp437.o
 obj-$(CONFIG_PANEL) += panel-uclass.o
 obj-$(CONFIG_DM_PANEL_HX8238D) += hx8238d.o
 obj-$(CONFIG_SIMPLE_PANEL) += simple_panel.o
diff --git a/drivers/video/utf8_cp437.c b/drivers/video/utf8_cp437.c
new file mode 100644
index 00000000000..cab68b92b6e
--- /dev/null
+++ b/drivers/video/utf8_cp437.c
@@ -0,0 +1,169 @@ 
+/*
+ * Convert UTF-8 bytes into a code page 437 character.
+ * Based on the table in the Code_page_437 Wikipedia page.
+ */
+
+#include <linux/types.h>
+
+uint8_t code_points_00a0[] = {
+	255, 173, 155, 156,   7, 157,   7,  21,
+	  7,   7, 166, 174, 170,   7,   7,   7,
+	248, 241, 253,   7,   7, 230,  20, 250,
+	  7,   7, 167, 175, 172, 171,   7, 168,
+	  7,   7,   7,   7, 142, 143, 146, 128,
+	  7, 144,   7,   7,   7,   7,   7,   7,
+	  7, 165,   7,   7,   7,   7, 153,   7,
+	  7,   7,   7,   7, 154,   7,   7, 225,
+	133, 160, 131,   7, 132, 134, 145, 135,
+	138, 130, 136, 137, 141, 161, 140, 139,
+	  7, 164, 149, 162, 147,   7, 148, 246,
+	  7, 151, 163, 150, 129,   7,   7, 152,
+};
+
+uint8_t code_points_2550[] = {
+	205, 186, 213, 214, 201, 184, 183, 187,
+	212, 211, 200, 190, 189, 188, 198, 199,
+	204, 181, 182, 185, 209, 210, 203, 207,
+	208, 202, 216, 215, 206
+};
+
+static uint8_t utf8_convert_11bit(uint16_t code)
+{
+	switch (code) {
+	case 0x0192: return 159;
+	case 0x0393: return 226;
+	case 0x0398: return 233;
+	case 0x03A3: return 228;
+	case 0x03A6: return 232;
+	case 0x03A9: return 234;
+	case 0x03B1: return 224;
+	case 0x03B4: return 235;
+	case 0x03B5: return 238;
+	case 0x03C0: return 227;
+	case 0x03C3: return 229;
+	case 0x03C4: return 231;
+	case 0x03C6: return 237;
+	}
+
+	return 0;
+};
+
+static uint8_t utf8_convert_2xxx(uint16_t code)
+{
+	switch (code) {
+	case 0x2022: return 7;
+	case 0x203C: return 19;
+	case 0x207F: return 252;
+	case 0x20A7: return 158;
+	case 0x2190: return 27;
+	case 0x2191: return 24;
+	case 0x2192: return 26;
+	case 0x2193: return 25;
+	case 0x2194: return 29;
+	case 0x2195: return 18;
+	case 0x21A8: return 23;
+	case 0x2219: return 249;
+	case 0x221A: return 251;
+	case 0x221E: return 236;
+	case 0x221F: return 28;
+	case 0x2229: return 239;
+	case 0x2248: return 247;
+	case 0x2261: return 240;
+	case 0x2264: return 243;
+	case 0x2265: return 242;
+	case 0x2310: return 169;
+	case 0x2320: return 244;
+	case 0x2321: return 245;
+	case 0x2500: return 196;
+	case 0x2502: return 179;
+	case 0x250C: return 218;
+	case 0x2510: return 191;
+	case 0x2514: return 192;
+	case 0x2518: return 217;
+	case 0x251C: return 195;
+	case 0x2524: return 180;
+	case 0x252C: return 194;
+	case 0x2534: return 193;
+	case 0x253C: return 197;
+	case 0x2580: return 223;
+	case 0x2584: return 220;
+	case 0x2588: return 219;
+	case 0x258C: return 221;
+	case 0x2590: return 222;
+	case 0x2591: return 176;
+	case 0x2592: return 177;
+	case 0x2593: return 178;
+	case 0x25A0: return 254;
+	case 0x25AC: return 22;
+	case 0x25B2: return 30;
+	case 0x25BA: return 16;
+	case 0x25BC: return 31;
+	case 0x25C4: return 17;
+	case 0x25CB: return 9;
+	case 0x25D8: return 8;
+	case 0x25D9: return 10;
+	case 0x263A: return 1;
+	case 0x263B: return 2;
+	case 0x263C: return 15;
+	case 0x2640: return 12;
+	case 0x2642: return 11;
+	case 0x2660: return 6;
+	case 0x2663: return 5;
+	case 0x2665: return 3;
+	case 0x2666: return 4;
+	case 0x266A: return 13;
+	case 0x266B: return 14;
+	}
+
+	return 0;
+}
+
+uint8_t convert_uc16_to_cp437(uint16_t code)
+{
+	if (code < 0x7f)		// ASCII
+		return code;
+	if (code < 0xa0)		// high control characters
+		return code;
+	if (code < 0x100)		// international characters
+		return code_points_00a0[code - 0xa0];
+	if (code < 0x800)
+		return utf8_convert_11bit(code);
+	if (code >= 0x2550 && code < 0x256d)	// block graphics
+		return code_points_2550[code - 0x2550];
+
+	return utf8_convert_2xxx(code);
+}
+
+uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc)
+{
+	int shift;
+	uint16_t ucs;
+
+	if (c < 127)			// ASCII
+		return c;
+	if (c == 127)
+		return 8;		// DEL (?)
+
+	switch (c & 0xf0) {
+	case 0xc0: case 0xd0:		// two bytes sequence
+		*esc = (1U << 24) | ((c & 0x1f) << 6);
+		return 0;
+	case 0xe0:			// three bytes sequence
+		*esc = (2U << 24) | ((c & 0x0f) << 12);
+		return 0;
+	case 0xf0:			// four bytes sequence
+		*esc = (3U << 24) | ((c & 0x07) << 18);
+		return 0;
+	case 0x80: case 0x90: case 0xa0: case 0xb0:	// continuation
+		shift = (*esc >> 24) - 1;
+		ucs = *esc & 0xffffff;
+		if (shift) {
+			*esc = (shift << 24) | ucs | (c & 0x3f) << (shift * 6);
+			return 0;
+		}
+		*esc = 0;
+		return convert_uc16_to_cp437(ucs | (c & 0x3f));
+	}
+
+	return 0;
+}
diff --git a/drivers/video/vidconsole-uclass.c b/drivers/video/vidconsole-uclass.c
index 420fd86f9ac..ca6e1a2620c 100644
--- a/drivers/video/vidconsole-uclass.c
+++ b/drivers/video/vidconsole-uclass.c
@@ -546,6 +546,7 @@  static int vidconsole_output_glyph(struct udevice *dev, char ch)
 int vidconsole_put_char(struct udevice *dev, char ch)
 {
 	struct vidconsole_priv *priv = dev_get_uclass_priv(dev);
+	uint8_t cp437;
 	int ret;
 
 	/*
@@ -587,7 +588,10 @@  int vidconsole_put_char(struct udevice *dev, char ch)
 		priv->last_ch = 0;
 		break;
 	default:
-		ret = vidconsole_output_glyph(dev, ch);
+		cp437 = convert_utf8_to_cp437(ch, &priv->ucs);
+		if (cp437 == 0)
+			return 0;
+		ret = vidconsole_output_glyph(dev, cp437);
 		if (ret < 0)
 			return ret;
 		break;
diff --git a/include/video_console.h b/include/video_console.h
index a908f1412e8..f2d05e7f4e7 100644
--- a/include/video_console.h
+++ b/include/video_console.h
@@ -83,6 +83,7 @@  struct vidconsole_priv {
 	int escape_len;
 	int row_saved;
 	int col_saved;
+	u32 ucs;
 	bool cursor_visible;
 	char escape_buf[32];
 };
@@ -304,6 +305,14 @@  static inline int vidconsole_memmove(struct udevice *dev, void *dst,
 	return 0;
 }
 
+/*
+ * Convert an UTF-8 byte into the corresponding character in the CP437
+ * code page. Returns 0 if that character is part of a multi-byte sequence.
+ * for which *esc holds the state of. Repeatedly feed in more bytes until
+ * the return value returns a non-0 character.
+ */
+uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc);
+
 #endif
 
 #endif