From patchwork Tue Jul 3 17:06:54 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gabriel Krisman Bertazi X-Patchwork-Id: 938827 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=linux-ext4-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=fail (p=none dis=none) header.from=collabora.co.uk Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 41KrD409KJz9s2g for ; Wed, 4 Jul 2018 03:08:04 +1000 (AEST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934192AbeGCRID (ORCPT ); Tue, 3 Jul 2018 13:08:03 -0400 Received: from bhuna.collabora.co.uk ([46.235.227.227]:33424 "EHLO bhuna.collabora.co.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933677AbeGCRIC (ORCPT ); Tue, 3 Jul 2018 13:08:02 -0400 Received: from [127.0.0.1] (localhost [127.0.0.1]) (Authenticated sender: krisman) with ESMTPSA id 5BCF8289317 From: Gabriel Krisman Bertazi To: tytso@mit.edu Cc: linux-ext4@vger.kernel.org, darrick.wong@oracle.com, kernel@collabora.com, Gabriel Krisman Bertazi Subject: [PATCH 14/20] nls: utf8norm: Integrate utf8norm code with NLS subsystem Date: Tue, 3 Jul 2018 13:06:54 -0400 Message-Id: <20180703170700.9306-15-krisman@collabora.co.uk> X-Mailer: git-send-email 2.18.0 In-Reply-To: <20180703170700.9306-1-krisman@collabora.co.uk> References: <20180703170700.9306-1-krisman@collabora.co.uk> Sender: linux-ext4-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-ext4@vger.kernel.org Changes since RFC v2: - Integrate with NLS Changes since RFC v1: - Change error return code from EIO to EINVAL. (Olaf Weber) - Fix issues with strncmp/strcmp. 
(Olaf Weber)
  - Remove stack buffer in normalization/casefold. (Olaf Weber)
  - Include length parameter for second string on comparison functions.
  - Change length type to size_t.

Signed-off-by: Gabriel Krisman Bertazi
---
Review notes (below the cut line, not part of the commit message):
 - casefold/normalize: check utf8ncursor() and test utf8byte()'s result
   before it is narrowed to unsigned char; share one helper, as the two
   compare functions now do.
 - utf8_parse_version: handle kstrdup() failure, parse through int as
   match_int() expects, reject components that do not fit in a u8.
 - utf8_load_charset: propagate the parse error, zero the new table,
   end the warning with a newline.
 - init_utf8: return register_nls()'s result.
 - utf8version_latest: define with (void) to match the declaration.
 - The #include targets in nls_utf8n-core.c are inferred, and the
   "index" blob ids below are still those of the original posting.

 fs/nls/Makefile         |   2 +-
 fs/nls/nls_utf8n-core.c | 349 ++++++++++++++++++++++++++++++++++++++++
 fs/nls/nls_utf8n-norm.c |   7 +
 fs/nls/utf8n.h          |   1 +
 4 files changed, 358 insertions(+), 1 deletion(-)
 create mode 100644 fs/nls/nls_utf8n-core.c

diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index 6ff62c0fe436..3650bb58534b 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -56,7 +56,7 @@ obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o
 obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o
 obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o
 
-nls_utf8n-y += nls_utf8n-norm.o
+nls_utf8n-y += nls_utf8n-norm.o nls_utf8n-core.o
 obj-$(CONFIG_NLS_UTF8_NORMALIZATION) += nls_utf8n.o
 
 $(obj)/nls_utf8n-norm.o: $(obj)/utf8data.h
diff --git a/fs/nls/nls_utf8n-core.c b/fs/nls/nls_utf8n-core.c
new file mode 100644
index 000000000000..d723e9327182
--- /dev/null
+++ b/fs/nls/nls_utf8n-core.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2017 Collabora Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/*
+ * NOTE(review): the header names on the four <...> includes below were
+ * missing from the copy of this patch under review. They are inferred
+ * from the symbols used in this file (module macros, NLS types and
+ * helpers, match_token(), kzalloc()); confirm against the posted patch.
+ */
+#include <linux/module.h>
+#include <linux/nls.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+
+#include "utf8n.h"
+
+static struct nls_charset utf8norm_info;
+
+/*
+ * Run both strings through the table @data and compare the byte streams
+ * that utf8byte() produces for them.
+ *
+ * Returns 0 when the streams are identical, 1 when they differ, and
+ * -EINVAL when a cursor cannot be set up or utf8byte() reports an error.
+ * This is an equality test only: a non-zero result carries no ordering.
+ */
+static int utf8_stream_cmp(const struct utf8data *data,
+			   const unsigned char *str1, size_t len1,
+			   const unsigned char *str2, size_t len2)
+{
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, str1, len1) < 0)
+		return -EINVAL;
+	if (utf8ncursor(&cur2, data, str2, len2) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+
+/*
+ * Copy the byte stream that utf8byte() produces for @str under the table
+ * @data into @dest, including the terminating NUL.
+ *
+ * Returns the number of bytes stored before the NUL, or -EINVAL when the
+ * cursor cannot be set up, utf8byte() reports an error, or the stream and
+ * its NUL do not fit in @dlen bytes.
+ */
+static int utf8_stream_copy(const struct utf8data *data,
+			    const unsigned char *str, size_t len,
+			    unsigned char *dest, size_t dlen)
+{
+	struct utf8cursor cur;
+	size_t nlen;
+
+	if (utf8ncursor(&cur, data, str, len) < 0)
+		return -EINVAL;
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		/*
+		 * Hold the result in an int until it is known to be a byte:
+		 * stored directly into an unsigned char, a negative error
+		 * value would be mistaken for data.
+		 */
+		int c = utf8byte(&cur);
+
+		if (c < 0)
+			return -EINVAL;
+
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+	}
+
+	return -EINVAL;
+}
+
+/* Equality test using the utf8nfkdi() table; see utf8_stream_cmp(). */
+static int utf8_strncmp(const struct nls_table *charset,
+			const unsigned char *str1, size_t len1,
+			const unsigned char *str2, size_t len2)
+{
+	return utf8_stream_cmp(utf8nfkdi(charset->version),
+			       str1, len1, str2, len2);
+}
+
+/* Equality test using the utf8nfkdicf() table; see utf8_stream_cmp(). */
+static int utf8_strncasecmp(const struct nls_table *charset,
+			    const unsigned char *str1, size_t len1,
+			    const unsigned char *str2, size_t len2)
+{
+	return utf8_stream_cmp(utf8nfkdicf(charset->version),
+			       str1, len1, str2, len2);
+}
+
+/* Fill @dest using the utf8nfkdicf() table; see utf8_stream_copy(). */
+static int utf8_casefold(const struct nls_table *charset,
+			 const unsigned char *str, size_t len,
+			 unsigned char *dest, size_t dlen)
+{
+	return utf8_stream_copy(utf8nfkdicf(charset->version),
+				str, len, dest, dlen);
+}
+
+/* Fill @dest using the utf8nfkdi() table; see utf8_stream_copy(). */
+static int utf8_normalize(const struct nls_table *charset,
+			  const unsigned char *str,
+			  size_t len, unsigned char *dest, size_t dlen)
+{
+	return utf8_stream_copy(utf8nfkdi(charset->version),
+				str, len, dest, dlen);
+}
+
+/*
+ * Encode @uni as UTF-8 into @out, which holds at most @boundlen bytes.
+ *
+ * Returns the value of utf32_to_utf8() on success, -ENAMETOOLONG when
+ * @boundlen is not positive, and -EINVAL (after storing '?' in @out)
+ * when utf32_to_utf8() fails.
+ */
+static int utf8_uni2char(wchar_t uni, unsigned char *out, int boundlen)
+{
+	int n;
+
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(uni, out, boundlen);
+	if (n < 0) {
+		*out = '?';
+		return -EINVAL;
+	}
+	return n;
+}
+
+/*
+ * Decode one character from the at most @boundlen bytes at @rawstring.
+ *
+ * Returns the value of utf8_to_utf32() on success. When decoding fails
+ * or the code point exceeds MAX_WCHAR_T, stores '?' in @uni and returns
+ * -EINVAL.
+ */
+static int utf8_char2uni(const unsigned char *rawstring, int boundlen,
+			 wchar_t *uni)
+{
+	int n;
+	unicode_t u;
+
+	n = utf8_to_utf32(rawstring, boundlen, &u);
+	if (n < 0 || u > MAX_WCHAR_T) {
+		*uni = 0x003f;	/* ? */
+		return -EINVAL;
+	}
+	*uni = (wchar_t) u;
+	return n;
+}
+
+/* Identity mapping: @c is returned as-is, narrowed to unsigned char. */
+static unsigned char utf8_tolower(const struct nls_table *table,
+				  unsigned int c)
+{
+	return c; /* Identity */
+}
+
+/* Identity mapping: @c is returned as-is, narrowed to unsigned char. */
+static unsigned char utf8_toupper(const struct nls_table *table,
+				  unsigned int c)
+{
+	return c; /* Identity */
+}
+
+static const struct nls_ops utf8_ops = {
+	.strncmp = utf8_strncmp,
+	.strncasecmp = utf8_strncasecmp,
+	.casefold = utf8_casefold,
+	.normalize = utf8_normalize,
+	.lowercase = utf8_tolower,
+	.uppercase = utf8_toupper,
+	.uni2char = utf8_uni2char,
+	.char2uni = utf8_char2uni,
+};
+
+/*
+ * Parse a "major.minor.revision" string into its three components.
+ *
+ * Each component must fit in a u8, because that is the type
+ * utf8version_is_supported() takes; accepting anything wider would let
+ * the value be truncated silently on the way in.
+ *
+ * Returns 0 on success (and only then writes @maj, @min and @rev),
+ * -ENOMEM when the working copy cannot be allocated, and -EINVAL when
+ * the string does not match or a component is out of range.
+ */
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	substring_t args[3];
+	int vals[3];	/* match_int() stores through an int pointer */
+	char *tmp;
+	const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+	int ret = 0;
+	int i;
+
+	/* match_token() takes a non-const string, so work on a copy. */
+	tmp = kstrdup(version, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	if (match_token(tmp, token, args) != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < 3; i++) {
+		if (match_int(&args[i], &vals[i]) ||
+		    vals[i] < 0 || vals[i] > 0xff) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	*maj = vals[0];
+	*min = vals[1];
+	*rev = vals[2];
+out:
+	kfree(tmp);
+	return ret;
+}
+
+/*
+ * Return the table for unicode @version, creating it on first use.
+ *
+ * @version is a "major.minor.revision" string; NULL selects
+ * utf8version_latest() and logs a warning. Tables are cached on
+ * utf8norm_info.tables, so repeated requests for one version share a
+ * single table.
+ *
+ * Returns the table, or an ERR_PTR(): -EINVAL for a malformed or
+ * unsupported version, -ENOMEM when an allocation fails.
+ *
+ * NOTE(review): the table list is read and updated without locking in
+ * this file; presumably callers serialize load_table -- confirm.
+ */
+static struct nls_table *utf8_load_charset(const char *version)
+{
+	struct nls_table *tbl;
+	unsigned int nls_version;
+
+	if (version) {
+		unsigned int maj, min, rev;
+		int ret;
+
+		ret = utf8_parse_version(version, &maj, &min, &rev);
+		if (ret < 0)
+			return ERR_PTR(ret);
+
+		if (!utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+
+		nls_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		nls_version = utf8version_latest();
+		pr_warn("utf8norm version not specified. Assuming latest supported version (%u.%u.%u).\n",
+			(nls_version >> 16) & 0xff, (nls_version >> 8) & 0xff,
+			(nls_version & 0xff));
+	}
+
+	/* Try an already loaded table first. */
+	for (tbl = utf8norm_info.tables; tbl; tbl = tbl->next) {
+		if (tbl->version == nls_version)
+			return tbl;
+	}
+
+	/* Zeroed so that any field not set below starts out cleared. */
+	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+	if (!tbl)
+		return ERR_PTR(-ENOMEM);
+
+	tbl->charset = &utf8norm_info;
+	tbl->version = nls_version;
+	tbl->ops = &utf8_ops;
+
+	tbl->next = utf8norm_info.tables;
+	utf8norm_info.tables = tbl;
+
+	return tbl;
+}
+
+/* Free every table cached on utf8norm_info.tables and empty the list. */
+static void utf8_cleanup_tables(void)
+{
+	struct nls_table *tmp, *tbl = utf8norm_info.tables;
+
+	while (tbl) {
+		tmp = tbl;
+		tbl = tbl->next;
+		kfree(tmp);
+	}
+	utf8norm_info.tables = NULL;
+}
+
+static struct nls_charset utf8norm_info = {
+	.charset = "utf8n",
+	.load_table = utf8_load_charset,
+};
+
+/* Register the charset and hand register_nls()'s result to the caller. */
+static int __init init_utf8(void)
+{
+	return register_nls(&utf8norm_info);
+}
+
+/* Unregister the charset, then free the cached tables. */
+static void __exit exit_utf8(void)
+{
+	unregister_nls(&utf8norm_info);
+	utf8_cleanup_tables();
+}
+
+module_init(init_utf8);
+module_exit(exit_utf8);
+MODULE_AUTHOR("SGI, Gabriel Krisman Bertazi");
+MODULE_DESCRIPTION("UTF-8 charset operations for filesystems");
+MODULE_LICENSE("GPL");
diff --git a/fs/nls/nls_utf8n-norm.c b/fs/nls/nls_utf8n-norm.c
index 64c3cc74a2ca..abee8b376a87 100644
--- a/fs/nls/nls_utf8n-norm.c
+++ b/fs/nls/nls_utf8n-norm.c
@@ -38,6 +38,13 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 }
 EXPORT_SYMBOL(utf8version_is_supported);
 
+/* Return utf8vers, the version used when none is requested explicitly. */
+int utf8version_latest(void)
+{
+	return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
+
 /*
  * UTF-8 valid ranges.
* diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h index f60827663503..b4697f9bfbab 100644 --- a/fs/nls/utf8n.h +++ b/fs/nls/utf8n.h @@ -32,6 +32,7 @@ /* Highest unicode version supported by the data tables. */ extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); +extern int utf8version_latest(void); /* * Look for the correct const struct utf8data for a unicode version.