libstdc++/64797 fix handling of incomplete multibyte characters

Message ID	20150304172001.GI8789@redhat.com
State	New
Headers	show Return-Path: <gcc-patches-return-392995-incoming=patchwork.ozlabs.org@gcc.gnu.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:subject:message-id:mime-version:content-type; q=dns; s= default; b=doch0z0m0WbH3jT3+QLAgkTo5OGdwcrWvJdGUtwFfOk2+wnhKuVK9 xB06A8dXHcCe4LSmiQUiBcALbp5t7kruF9vj3ojiABl9a5hV4POniljl8lcT1e3p 5rxB3QQE5qjBtghe8vYO7O2rGr5C/AI8AKptFjoL2cxKsrRTJhM3UA= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org Date: Wed, 4 Mar 2015 17:20:01 +0000 From: Jonathan Wakely <jwakely@redhat.com> To: libstdc++@gcc.gnu.org, gcc-patches@gcc.gnu.org Subject: [patch] libstdc++/64797 fix handling of incomplete multibyte characters Message-ID: <20150304172001.GI8789@redhat.com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="YD3LsXFS42OYHhNZ" Content-Disposition: inline User-Agent: Mutt/1.5.23 (2014-03-12)

commit ad00aa737cbaf61a4243a3ed46a5ed888811a1c5 Author: Jonathan Wakely <jwakely@redhat.com> Date: Thu Feb 19 11:44:58 2015 +0000 PR libstdc++/64797 * include/bits/locale_conv.h (wstring_convert::_M_conv): Handle incomplete multibyte sequences correctly. * include/std/codecvt (codecvt_utf8, codecvt_utf16, codecvt_utf8_utf16): Limit _Maxcode to maximum Unicode code point. * src/c++11/codecvt.cc (invalid_mb_sequence, incomplete_mb_character): Define constants. (is_high_surrogate, is_low_surrogate, surrogate_pair_to_code_point): Define convenience functions. (read_utf8_code_point): Return relevant constant to distinguish incomplete characters from invalid sequences. (read_utf16_code_point): Likewise. Check for invalid sequences. (ucs4_in, utf16_in): Use incomplete_mb_character constant. (utf16_out): Check for invalid sequences. (utf16_span): Fix condition. (ucs2_out): Use is_high_surrogate. (ucs2_in): Use incomplete_mb_character constant and fix condition. * testsuite/22_locale/codecvt/char16_t.cc: Fix whitespace. * testsuite/22_locale/conversions/buffer/1.cc: New. * testsuite/22_locale/conversions/string/2.cc: Use char16_t and char32_t instead of wchar_t. * testsuite/22_locale/conversions/string/3.cc: New. diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h index c8a44f4..b53754d 100644 --- a/libstdc++-v3/include/bits/locale_conv.h +++ b/libstdc++-v3/include/bits/locale_conv.h @@ -198,18 +198,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __outstr = __err ? _OutStr(__err->get_allocator()) : _OutStr(); size_t __outchars = 0; auto __next = __first; + const auto __maxlen = _M_cvt->max_length(); codecvt_base::result __result; do { - __outstr.resize(__outstr.size() + (__last - __next)); + __outstr.resize(__outstr.size() + (__last - __next) + __maxlen); auto __outnext = &__outstr.front() + __outchars; auto const __outlast = &__outstr.back() + 1; __result = ((*_M_cvt).*__memfn)(_M_state, __next, __last, __next, __outnext, __outlast, __outnext); __outchars = __outnext - &__outstr.front(); } - while (__result == codecvt_base::partial && __next != __last); + while (__result == codecvt_base::partial && __next != __last + && (__outstr.size() - __outchars) < __maxlen); __outstr.resize(__outchars); _M_count = __next - __first; @@ -428,7 +430,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return _M_put(__next, __pending); if (!_M_put(__outbuf, __outnext - __outbuf)) - return false; + return false; } while (__next != __last && __next != __start); diff --git a/libstdc++-v3/include/std/codecvt b/libstdc++-v3/include/std/codecvt index d58a0ec..e4a7d5b 100644 --- a/libstdc++-v3/include/std/codecvt +++ b/libstdc++-v3/include/std/codecvt @@ -148,7 +148,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION public: \ explicit \ _NAME(size_t __refs = 0) \ - : __ ## _NAME ## _base<_ELEM>(_Maxcode, _Mode, __refs) { } \ + : __ ## _NAME ## _base<_ELEM>(std::min(_Maxcode, 0x10fffful), \ + _Mode, __refs) \ + { } \ } template<typename _Elem> class __codecvt_utf8_base; diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc index aebd3f3..83ee6e0 100644 --- a/libstdc++-v3/src/c++11/codecvt.cc +++ b/libstdc++-v3/src/c++11/codecvt.cc @@ -35,8 +35,14 @@ namespace { // Largest code point that fits in a single UTF-16 code unit. const char32_t max_single_utf16_unit = 0xFFFF; + const char32_t max_code_point = 0x10FFFF; + // The functions below rely on maxcode < incomplete_mb_character + // (which is enforced by the codecvt_utf* classes on construction). + const char32_t incomplete_mb_character = char32_t(-2); + const char32_t invalid_mb_sequence = char32_t(-1); + template<typename Elem> struct range { @@ -131,13 +137,13 @@ namespace // Read a codepoint from a UTF-8 multibyte sequence. // Updates from.next if the codepoint is not greater than maxcode. - // Returns -1 if there is an invalid or incomplete multibyte character. + // Returns invalid_mb_sequence, incomplete_mb_character or the code point. char32_t read_utf8_code_point(range<const char>& from, unsigned long maxcode) { - size_t avail = from.size(); + const size_t avail = from.size(); if (avail == 0) - return -1; + return incomplete_mb_character; unsigned char c1 = from.next[0]; // https://en.wikipedia.org/wiki/UTF-8#Sample_code if (c1 < 0x80) @@ -146,14 +152,14 @@ namespace return c1; } else if (c1 < 0xC2) // continuation or overlong 2-byte sequence - return -1; + return invalid_mb_sequence; else if (c1 < 0xE0) // 2-byte sequence { if (avail < 2) - return -1; + return incomplete_mb_character; unsigned char c2 = from.next[1]; if ((c2 & 0xC0) != 0x80) - return -1; + return invalid_mb_sequence; char32_t c = (c1 << 6) + c2 - 0x3080; if (c <= maxcode) from.next += 2; @@ -162,15 +168,15 @@ namespace else if (c1 < 0xF0) // 3-byte sequence { if (avail < 3) - return -1; + return incomplete_mb_character; unsigned char c2 = from.next[1]; if ((c2 & 0xC0) != 0x80) - return -1; + return invalid_mb_sequence; if (c1 == 0xE0 && c2 < 0xA0) // overlong - return -1; + return invalid_mb_sequence; unsigned char c3 = from.next[2]; if ((c3 & 0xC0) != 0x80) - return -1; + return invalid_mb_sequence; char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080; if (c <= maxcode) from.next += 3; @@ -179,27 +185,27 @@ namespace else if (c1 < 0xF5) // 4-byte sequence { if (avail < 4) - return -1; + return incomplete_mb_character; unsigned char c2 = from.next[1]; if ((c2 & 0xC0) != 0x80) - return -1; + return invalid_mb_sequence; if (c1 == 0xF0 && c2 < 0x90) // overlong - return -1; + return invalid_mb_sequence; if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF - return -1; + return invalid_mb_sequence; unsigned char c3 = from.next[2]; if ((c3 & 0xC0) != 0x80) - return -1; + return invalid_mb_sequence; unsigned char c4 = from.next[3]; if ((c4 & 0xC0) != 0x80) - return -1; + return invalid_mb_sequence; char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080; if (c <= maxcode) from.next += 4; return c; } else // > U+10FFFF - return -1; + return invalid_mb_sequence; } bool @@ -250,27 +256,54 @@ namespace #endif } + // Return true if c is a high-surrogate (aka leading) code point. + inline bool + is_high_surrogate(char32_t c) + { + return c >= 0xD800 && c <= 0xDBFF; + } + + // Return true if c is a low-surrogate (aka trailing) code point. + inline bool + is_low_surrogate(char32_t c) + { + return c >= 0xDC00 && c <= 0xDFFF; + } + + inline char32_t + surrogate_pair_to_code_point(char32_t high, char32_t low) + { + return (high << 10) + low - 0x35FDC00; + } + // Read a codepoint from a UTF-16 multibyte sequence. // The sequence's endianness is indicated by (mode & little_endian). // Updates from.next if the codepoint is not greater than maxcode. - // Returns -1 if there is an incomplete multibyte character. + // Returns invalid_mb_sequence, incomplete_mb_character or the code point. char32_t read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode, codecvt_mode mode) { + const size_t avail = from.size(); + if (avail == 0) + return incomplete_mb_character; int inc = 1; char32_t c = adjust_byte_order(from.next[0], mode); - if (c >= 0xD800 && c <= 0xDBFF) + if (is_high_surrogate(c)) { - if (from.size() < 2) - return -1; + if (avail < 2) + return incomplete_mb_character; const char16_t c2 = adjust_byte_order(from.next[1], mode); - if (c2 >= 0xDC00 && c2 <= 0xDFFF) + if (is_low_surrogate(c2)) { - c = (c << 10) + c2 - 0x35FDC00; + c = surrogate_pair_to_code_point(c, c2); inc = 2; } + else + return invalid_mb_sequence; } + else if (is_low_surrogate(c)) + return invalid_mb_sequence; if (c <= maxcode) from.next += inc; return c; @@ -314,8 +347,8 @@ namespace while (from.size() && to.size()) { const char32_t codepoint = read_utf8_code_point(from, maxcode); - if (codepoint == char32_t(-1)) - break; + if (codepoint == incomplete_mb_character) + return codecvt_base::partial; if (codepoint > maxcode) return codecvt_base::error; *to.next++ = codepoint; @@ -352,8 +385,8 @@ namespace while (from.size() && to.size()) { const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); - if (codepoint == char32_t(-1)) - break; + if (codepoint == incomplete_mb_character) + return codecvt_base::partial; if (codepoint > maxcode) return codecvt_base::error; *to.next++ = codepoint; @@ -389,11 +422,9 @@ namespace read_utf8_bom(from, mode); while (from.size() && to.size()) { - const char* first = from.next; - if ((unsigned char)*first >= 0xF0 && to.size() < 2) - return codecvt_base::partial; + const char* const first = from.next; const char32_t codepoint = read_utf8_code_point(from, maxcode); - if (codepoint == char32_t(-1)) + if (codepoint == incomplete_mb_character) return codecvt_base::partial; if (codepoint > maxcode) return codecvt_base::error; @@ -418,20 +449,22 @@ namespace { char32_t c = from.next[0]; int inc = 1; - if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair + if (is_high_surrogate(c)) { if (from.size() < 2) return codecvt_base::ok; // stop converting at this point const char32_t c2 = from.next[1]; - if (c2 >= 0xDC00 && c2 <= 0xDFFF) + if (is_low_surrogate(c2)) { + c = surrogate_pair_to_code_point(c, c2); inc = 2; - c = (c << 10) + c2 - 0x35FDC00; } else return codecvt_base::error; } + else if (is_low_surrogate(c)) + return codecvt_base::error; if (c > maxcode) return codecvt_base::error; if (!write_utf8_code_point(to, c)) @@ -452,8 +485,8 @@ namespace while (count+1 < max) { char32_t c = read_utf8_code_point(from, maxcode); - if (c == char32_t(-1)) - break; + if (c > maxcode) + return from.next; else if (c > max_single_utf16_unit) ++count; ++count; @@ -489,7 +522,7 @@ namespace while (from.size() && to.size()) { char16_t c = from.next[0]; - if (c >= 0xD800 && c <= 0xDBFF) // start of surrogate pair + if (is_high_surrogate(c)) return codecvt_base::error; if (c > maxcode) return codecvt_base::error; @@ -510,9 +543,9 @@ namespace while (from.size() && to.size()) { const char32_t c = read_utf16_code_point(from, maxcode, mode); - if (c == char32_t(-1)) - break; - if (c >= maxcode) + if (c == incomplete_mb_character) + return codecvt_base::partial; + if (c > maxcode) return codecvt_base::error; *to.next++ = c; } diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc index 9271eca..a21a838 100644 --- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc +++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc @@ -79,8 +79,7 @@ test01() codecvt_c16::state_type state01; state01 = {}; - codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end, -from_next, + codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end, from_next, buffer, buffer_end, to_next); VERIFY(res == codecvt_base::ok); diff --git a/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc b/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc new file mode 100644 index 0000000..f008f5a --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/conversions/buffer/1.cc @@ -0,0 +1,78 @@ +// { dg-options "-std=gnu++11" } + +// Copyright (C) 2012 Free Software Foundation +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 22.3.3.2.3 Buffer conversions + +#include <locale> +#include <sstream> +#include <testsuite_hooks.h> + +template<typename Elem> +struct cvt : std::codecvt<Elem, char, std::mbstate_t> { }; + +template<typename Elem> +using buf_conv = std::wbuffer_convert<cvt<Elem>, Elem>; + +using std::string; +using std::stringstream; +using std::wstring; +using std::wstringstream; + +void test01() +{ + buf_conv<wchar_t> buf; + std::stringbuf sbuf; + VERIFY( buf.rdbuf() == nullptr ); + VERIFY( buf.rdbuf(&sbuf) == nullptr ); + VERIFY( buf.rdbuf() == &sbuf ); + VERIFY( buf.rdbuf(nullptr) == &sbuf ); +} + +void test02() +{ + std::stringbuf sbuf; + buf_conv<char> buf(&sbuf); // noconv + + stringstream ss; + ss.std::ios::rdbuf(&buf); + string input = "King for a day..."; + ss << input << std::flush; + string output = sbuf.str(); + VERIFY( input == output ); +} + +void test03() +{ + std::stringbuf sbuf; + buf_conv<wchar_t> buf(&sbuf); + + wstringstream ss; + ss.std::wios::rdbuf(&buf); + wstring input = L"Fool for a lifetime"; + ss << input << std::flush; + string output = sbuf.str(); + VERIFY( output == "Fool for a lifetime" ); +} + +int main() +{ + test01(); + test02(); + test03(); +} diff --git a/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc b/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc index 94eb75f..07d2b52 100644 --- a/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc +++ b/libstdc++-v3/testsuite/22_locale/conversions/string/2.cc @@ -30,26 +30,43 @@ template<typename Elem> using str_conv = std::wstring_convert<cvt<Elem>, Elem>; using std::string; -using std::wstring; +using std::u16string; +using std::u32string; // test conversion errors, with and without error strings void test01() { - typedef str_conv<wchar_t> sc; + typedef str_conv<char16_t> sc; const sc::byte_string berr = "invalid wide string"; - const sc::wide_string werr = L"invalid byte string"; + const sc::wide_string werr = u"invalid byte string"; sc c(berr, werr); string input = "Stop"; + input += char(0xFF); + u16string woutput = c.from_bytes(input); + VERIFY( werr == woutput ); + u16string winput = u"Stop"; + winput += char16_t(0xDC00); + string output = c.to_bytes(winput); + VERIFY( berr == output ); +} + +void test02() +{ + typedef str_conv<char32_t> sc; + + const sc::byte_string berr = "invalid wide string"; + const sc::wide_string werr = U"invalid byte string"; + + sc c(berr, werr); + string input = "Halt"; input += char(0xff); - input += char(0xff); - wstring woutput = c.from_bytes(input); + u32string woutput = c.from_bytes(input); VERIFY( werr == woutput ); - wstring winput = L"Stop"; - winput += wchar_t(0xff); - winput += wchar_t(0xff); + u32string winput = U"Halt"; + winput += char32_t(-1); string output = c.to_bytes(winput); VERIFY( berr == output ); } @@ -57,4 +74,5 @@ void test01() int main() { test01(); + test02(); } diff --git a/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc b/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc new file mode 100644 index 0000000..7c4ac20 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/conversions/string/3.cc @@ -0,0 +1,61 @@ +// { dg-options "-std=gnu++11" } + +// Copyright (C) 2012 Free Software Foundation +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 22.3.3.2.2 String conversions + +#include <locale> +#include <string> +#include <testsuite_hooks.h> + +template<typename Elem> +struct cvt : std::codecvt<Elem, char, std::mbstate_t> { }; + +template<typename Elem> +using str_conv = std::wstring_convert<cvt<Elem>, Elem>; + +using std::string; +using std::u32string; + +// test construction with state, for partial conversions + +void test01() +{ + typedef str_conv<char32_t> wsc; + + wsc c; + string input = u8"\u00a3 shillings pence"; + u32string woutput = c.from_bytes(input.substr(0, 1)); + auto partial_state = c.state(); + auto partial_count = c.converted(); + + auto woutput2 = c.from_bytes("state reset on next conversion"); + VERIFY( woutput2 == U"state reset on next conversion" ); + + wsc c2(new cvt<char32_t>, partial_state); + woutput += c2.from_bytes(input.substr(partial_count)); + VERIFY( U"\u00a3 shillings pence" == woutput ); + + string roundtrip = c2.to_bytes(woutput); + VERIFY( input == roundtrip ); +} + +int main() +{ + test01(); +}

libstdc++/64797 fix handling of incomplete multibyte characters

Commit Message

Patch