From patchwork Tue Jul  3 17:06:54 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
X-Patchwork-Id: 938827
Return-Path: <linux-ext4-owner@vger.kernel.org>
X-Original-To: patchwork-incoming@ozlabs.org
Delivered-To: patchwork-incoming@ozlabs.org
Authentication-Results: ozlabs.org;
	spf=none (mailfrom) smtp.mailfrom=vger.kernel.org
	(client-ip=209.132.180.67; helo=vger.kernel.org;
	envelope-from=linux-ext4-owner@vger.kernel.org;
	receiver=<UNKNOWN>)
Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none)
	header.from=collabora.co.uk
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by ozlabs.org (Postfix) with ESMTP id 41KrD409KJz9s2g
	for <patchwork-incoming@ozlabs.org>;
	Wed,  4 Jul 2018 03:08:04 +1000 (AEST)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S934192AbeGCRID (ORCPT <rfc822;patchwork-incoming@ozlabs.org>);
	Tue, 3 Jul 2018 13:08:03 -0400
Received: from bhuna.collabora.co.uk ([46.235.227.227]:33424 "EHLO
	bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S933677AbeGCRIC (ORCPT
	<rfc822; linux-ext4@vger.kernel.org>); Tue, 3 Jul 2018 13:08:02 -0400
Received: from [127.0.0.1] (localhost [127.0.0.1])
	(Authenticated sender: krisman) with ESMTPSA id 5BCF8289317
From: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
To: tytso@mit.edu
Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com,
	kernel@collabora.com, Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Subject: [PATCH 14/20] nls: utf8norm: Integrate utf8norm code with NLS
	subsystem
Date: Tue,  3 Jul 2018 13:06:54 -0400
Message-Id: <20180703170700.9306-15-krisman@collabora.co.uk>
X-Mailer: git-send-email 2.18.0
In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk>
References: <20180703170700.9306-1-krisman@collabora.co.uk>
Sender: linux-ext4-owner@vger.kernel.org
Precedence: bulk
List-ID: <linux-ext4.vger.kernel.org>
X-Mailing-List: linux-ext4@vger.kernel.org

Changes since RFC v2:
  - Integrate with NLS

Changes since RFC v1:
  - Change error return code from EIO to EINVAL. (Olaf Weber)
  - Fix issues with strncmp/strcmp.  (Olaf Weber)
  - Remove stack buffer in normalization/casefold. (Olaf Weber)
  - Include length parameter for second string on comparison functions.
  - Change length type to size_t.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
---
 fs/nls/Makefile         |   2 +-
 fs/nls/nls_utf8n-core.c | 276 ++++++++++++++++++++++++++++++++++++++++
 fs/nls/nls_utf8n-norm.c |   6 +
 fs/nls/utf8n.h          |   1 +
 4 files changed, 284 insertions(+), 1 deletion(-)
 create mode 100644 fs/nls/nls_utf8n-core.c
diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index 6ff62c0fe436..3650bb58534b 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -56,7 +56,7 @@ obj-$(CONFIG_NLS_MAC_ROMANIAN)  += mac-romanian.o
 obj-$(CONFIG_NLS_MAC_ROMAN)     += mac-roman.o
 obj-$(CONFIG_NLS_MAC_TURKISH)   += mac-turkish.o
 
-nls_utf8n-y += nls_utf8n-norm.o
+nls_utf8n-y += nls_utf8n-norm.o nls_utf8n-core.o
 obj-$(CONFIG_NLS_UTF8_NORMALIZATION) += nls_utf8n.o
 
 $(obj)/nls_utf8n-norm.o: $(obj)/utf8data.h
diff --git a/fs/nls/nls_utf8n-core.c b/fs/nls/nls_utf8n-core.c
new file mode 100644
index 000000000000..d723e9327182
--- /dev/null
+++ b/fs/nls/nls_utf8n-core.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2017 Collabora Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/nls.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/string.h>
+
+#include "utf8n.h"
+
+static struct nls_charset utf8norm_info;
+
+static int utf8_strncmp(const struct nls_table *charset,
+			const unsigned char *str1, size_t len1,
+			const unsigned char *str2, size_t len2)
+{
+	const struct utf8data *data = utf8nfkdi(charset->version);
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+	int r;
+
+	r = utf8ncursor(&cur1, data, str1, len1);
+	if (r < 0)
+		return -EINVAL;
+	r = utf8ncursor(&cur2, data, str2, len2);
+	if (r < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+
+static int utf8_strncasecmp(const struct nls_table *charset,
+			    const unsigned char *str1, size_t len1,
+			    const unsigned char *str2, size_t len2)
+{
+	const struct utf8data *data = utf8nfkdicf(charset->version);
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+	int r;
+
+	r = utf8ncursor(&cur1, data, str1, len1);
+	if (r < 0)
+		return -EINVAL;
+
+	r = utf8ncursor(&cur2, data, str2, len2);
+	if (r < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+
+static int utf8_casefold(const struct nls_table *charset,
+			 const unsigned char *str, size_t len,
+			 unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfkdicf(charset->version);
+	struct utf8cursor cur;
+	size_t nlen = 0;
+
+	utf8ncursor(&cur, data, str, len);
+	for (nlen = 0; nlen < dlen; nlen++) {
+		dest[nlen] = utf8byte(&cur);
+		if (!dest[nlen])
+			return nlen;
+	}
+
+	return -EINVAL;
+}
+
+static int utf8_normalize(const struct nls_table *charset,
+			  const unsigned char *str,
+			  size_t len, unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfkdi(charset->version);
+	struct utf8cursor cur;
+	ssize_t nlen = 0;
+
+	utf8ncursor(&cur, data, str, len);
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		dest[nlen] = utf8byte(&cur);
+		if (!dest[nlen])
+			return nlen;
+	}
+
+	return -EINVAL;
+}
+
+static int utf8_uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(uni, out, boundlen);
+	if (n < 0) {
+		*out = '?';
+		return -EINVAL;
+	}
+	return n;
+}
+
+static int utf8_char2uni(const unsigned char *rawstring, int boundlen,
+			 wchar_t *uni)
+{
+	int n;
+	unicode_t u;
+
+	n = utf8_to_utf32(rawstring, boundlen, &u);
+	if (n < 0 || u > MAX_WCHAR_T) {
+		*uni = 0x003f;	/* ? */
+		return -EINVAL;
+	}
+	*uni = (wchar_t) u;
+	return n;
+}
+
+static unsigned char utf8_tolower(const struct nls_table *table,
+				  unsigned int c)
+{
+	return c; /* Identity */
+}
+
+static unsigned char utf8_toupper(const struct nls_table *table,
+				  unsigned int c)
+{
+	return c; /* Identity */
+}
+
+static const struct nls_ops utf8_ops = {
+	.strncmp = utf8_strncmp,
+	.strncasecmp = utf8_strncasecmp,
+	.casefold = utf8_casefold,
+	.normalize = utf8_normalize,
+	.lowercase = utf8_tolower,
+	.uppercase = utf8_toupper,
+	.uni2char = utf8_uni2char,
+	.char2uni = utf8_char2uni,
+};
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	substring_t args[3];
+	char *tmp;
+	const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+	int ret = 0;
+
+	tmp = kstrdup(version, GFP_KERNEL);
+	if (match_token(tmp, token, args) != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+	    match_int(&args[2], rev)) {
+		ret = -EINVAL;
+		goto out;
+	}
+out:
+	kfree(tmp);
+	return ret;
+}
+
+static struct nls_table *utf8_load_charset(const char *version)
+{
+	struct nls_table *tbl = NULL;
+	unsigned int nls_version;
+
+	if (version) {
+		unsigned int maj, min, rev;
+
+		if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+			return ERR_PTR(-EINVAL);
+
+		if (!utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+
+		nls_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		nls_version = utf8version_latest();
+		printk(KERN_WARNING"utf8norm version not specified. "
+		       "Assuming latest supported version (%d.%d.%d).",
+		       (nls_version >> 16) & 0xff, (nls_version >> 8) & 0xff,
+		       (nls_version & 0xff));
+	}
+
+	 /* Try an already loaded table first. */
+	for (tbl = utf8norm_info.tables; tbl; tbl = tbl->next) {
+		if (tbl->version == nls_version)
+			return tbl;
+	}
+
+	tbl = kmalloc(sizeof(struct nls_table), GFP_KERNEL);
+	if (!tbl)
+		return ERR_PTR(-ENOMEM);
+
+	tbl->charset = &utf8norm_info;
+	tbl->version = nls_version;
+	tbl->ops = &utf8_ops;
+
+	tbl->next = utf8norm_info.tables;
+	utf8norm_info.tables = tbl;
+
+	return tbl;
+}
+
+static void utf8_cleanup_tables(void)
+{
+	struct nls_table *tmp, *tbl = utf8norm_info.tables;
+
+	while (tbl) {
+		tmp = tbl;
+		tbl = tbl->next;
+		kfree(tmp);
+	}
+	utf8norm_info.tables = NULL;
+}
+
+static struct nls_charset utf8norm_info = {
+	.charset = "utf8n",
+	.load_table = utf8_load_charset,
+};
+
+static int __init init_utf8(void)
+{
+	register_nls(&utf8norm_info);
+	return 0;
+}
+
+static void __exit exit_utf8(void)
+{
+	unregister_nls(&utf8norm_info);
+	utf8_cleanup_tables();
+}
+
+module_init(init_utf8);
+module_exit(exit_utf8);
+MODULE_AUTHOR("SGI, Gabriel Krisman Bertazi");
+MODULE_DESCRIPTION("UTF-8 charset operations for filesystems");
+MODULE_LICENSE("GPL");
diff --git a/fs/nls/nls_utf8n-norm.c b/fs/nls/nls_utf8n-norm.c
index 64c3cc74a2ca..abee8b376a87 100644
--- a/fs/nls/nls_utf8n-norm.c
+++ b/fs/nls/nls_utf8n-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 }
 EXPORT_SYMBOL(utf8version_is_supported);
 
+int utf8version_latest()
+{
+	return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
+
 /*
  * UTF-8 valid ranges.
  *
diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h
index f60827663503..b4697f9bfbab 100644
--- a/fs/nls/utf8n.h
+++ b/fs/nls/utf8n.h
@@ -32,6 +32,7 @@
 
 /* Highest unicode version supported by the data tables. */
 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
 
 /*
  * Look for the correct const struct utf8data for a unicode version.