| Message ID | s9dd0vvrx9z.fsf_-_@taka.site |
|---|---|
| State | New |
| Series | Unicode 11.0.0 Support: Put the correct Unicode version number 11.0.0 into the generated files |
On 07/10/2018 10:16 AM, Mike FABIAN wrote:
> I forgot to update the Unicode version number in some of the comments
> and headers:
>
> Put the correct Unicode version number 11.0.0 into the generated files
>
> In some places there was still the old Unicode version 10.0.0 in the files.
>
> * localedata/charmaps/UTF-8: Use correct Unicode version 11.0.0 in comment.
> * localedata/locales/i18n_ctype: Use correct Unicode version in comments
>   and headers.
> * localedata/unicode-gen/utf8_gen.py: Add option to specify Unicode version
> * localedata/unicode-gen/Makefile: Use option to specify Unicode version
>   for utf8_gen.py
>
> Please commit.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>

>
> --
> Mike FABIAN <mfabian@redhat.com>
>
>
> 0001-Put-the-correct-Unicode-version-number-11.0.0-into-t.patch
>
>
> From db66553fdc621244330d713839b562eb632307a1 Mon Sep 17 00:00:00 2001
> From: Mike FABIAN <mfabian@redhat.com>
> Date: Tue, 10 Jul 2018 11:25:48 +0200
> Subject: [PATCH] Put the correct Unicode version number 11.0.0 into the
>  generated files
>
> In some places there was still the old Unicode version 10.0.0 in the files.
>
> * localedata/charmaps/UTF-8: Use correct Unicode version 11.0.0 in comment.
> * localedata/locales/i18n_ctype: Use correct Unicode version in comments
>   and headers.
> * localedata/unicode-gen/utf8_gen.py: Add option to specify Unicode version
> * localedata/unicode-gen/Makefile: Use option to specify Unicode version
>   for utf8_gen.py
> ---
>  ChangeLog                          |   9 +++
>  localedata/charmaps/UTF-8          |   2 +-
>  localedata/locales/i18n_ctype      |   6 +-
>  localedata/unicode-gen/Makefile    |   4 +-
>  localedata/unicode-gen/utf8_gen.py | 112 ++++++++++++++++++-----------
>  5 files changed, 88 insertions(+), 45 deletions(-)
>
> diff --git a/ChangeLog b/ChangeLog
> index d18c24453f..92d5677d54 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,12 @@
> +2018-07-10  Mike FABIAN  <mfabian@redhat.com>
> +
> +	* localedata/charmaps/UTF-8: Use correct Unicode version 11.0.0 in comment.
> +	* localedata/locales/i18n_ctype: Use correct Unicode version in comments
> +	and headers.
> +	* localedata/unicode-gen/utf8_gen.py: Add option to specify Unicode version
> +	* localedata/unicode-gen/Makefile: Use option to specify Unicode version
> +	for utf8_gen.py
> +
>  2018-07-10  Florian Weimer  <fweimer@redhat.com>
>
>  	[BZ #23036]
> diff --git a/localedata/charmaps/UTF-8 b/localedata/charmaps/UTF-8
> index 885c6ae7fc..1367aa46cf 100644
> --- a/localedata/charmaps/UTF-8
> +++ b/localedata/charmaps/UTF-8
> @@ -47069,7 +47069,7 @@ CHARMAP
>  <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
>  END CHARMAP
>
> -% Character width according to Unicode 10.0.0.
> +% Character width according to Unicode 11.0.0.
>  % - Default width is 1.
>  % - Double-width characters have width 2; generated from
>  %   "grep '^[^;]*;[WF]' EastAsianWidth.txt"
> diff --git a/localedata/locales/i18n_ctype b/localedata/locales/i18n_ctype
> index ed59aef947..26400cbff1 100644
> --- a/localedata/locales/i18n_ctype
> +++ b/localedata/locales/i18n_ctype
> @@ -13,10 +13,10 @@ comment_char %
>  % information, but with different transliterations, can include it
>  % directly.
>
> -% Generated automatically by gen_unicode_ctype.py for Unicode 10.0.0.
> +% Generated automatically by gen_unicode_ctype.py for Unicode 11.0.0.
>
>  LC_IDENTIFICATION
> -title "Unicode 10.0.0 FDCC-set"
> +title "Unicode 11.0.0 FDCC-set"
>  source "UnicodeData.txt, DerivedCoreProperties.txt"
>  address ""
>  contact ""
> @@ -25,7 +25,7 @@ tel ""
>  fax ""
>  language ""
>  territory "Earth"
> -revision "10.0.0"
> +revision "11.0.0"
>  date "2018-06-20"
>  category "i18n:2012";LC_CTYPE
>  END LC_IDENTIFICATION
> diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile
> index c0faae5e58..c2b5fa75e1 100644
> --- a/localedata/unicode-gen/Makefile
> +++ b/localedata/unicode-gen/Makefile
> @@ -92,7 +92,9 @@ tr_TR: gen_unicode_ctype.py
>
>  UTF-8: UnicodeData.txt EastAsianWidth.txt
>  UTF-8: utf8_gen.py
> -	$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt
> +	$(PYTHON3) utf8_gen.py -u UnicodeData.txt \
> +		-e EastAsianWidth.txt -p PropList.txt \
> +		--unicode_version $(UNICODE_VERSION)

OK.

>
>  UTF-8-report: UTF-8 ../charmaps/UTF-8
>  UTF-8-report: utf8_compatibility.py
> diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
> index 715b753ec1..2d8d631a96 100755
> --- a/localedata/unicode-gen/utf8_gen.py
> +++ b/localedata/unicode-gen/utf8_gen.py
> @@ -27,6 +27,7 @@ Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
>  It will output UTF-8 file
>  '''
>
> +import argparse

OK.

>  import sys
>  import re
>  import unicode_utils
> @@ -197,9 +198,10 @@ def write_header_charmap(outfile):
>      outfile.write("% alias ISO-10646/UTF-8\n")
>      outfile.write("CHARMAP\n")
>
> -def write_header_width(outfile):
> +def write_header_width(outfile, unicode_version):
>      '''Writes the header on top of the WIDTH section to the output file'''
> -    outfile.write('% Character width according to Unicode 10.0.0.\n')
> +    outfile.write('% Character width according to Unicode '
> +                  + '{:s}.\n'.format(unicode_version))

OK.

>      outfile.write('% - Default width is 1.\n')
>      outfile.write('% - Double-width characters have width 2; generated from\n')
>      outfile.write('%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
> @@ -292,41 +294,71 @@ def process_width(outfile, ulines, elines, plines):
>                        width_dict[same_width_list[0]]))
>
>  if __name__ == "__main__":
> -    if len(sys.argv) < 3:
> -        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
> -    else:
> -        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
> -            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
> -        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
> -            EAST_ASIAN_WIDTH_LINES = []
> -            for LINE in EAST_ASIAN_WIDTH_FILE:
> -                # If characters from EastAasianWidth.txt which are from
> -                # from reserved ranges (i.e. not yet assigned code points)
> -                # are added to the WIDTH section of the UTF-8 file, then
> -                # “make check” produces “Unknown Character” errors for
> -                # these code points because such unassigned code points
> -                # are not in the CHARMAP section of the UTF-8 file.
> -                #
> -                # Therefore, we skip all reserved code points when reading
> -                # the EastAsianWidth.txt file.
> -                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
> -                    continue
> -                if re.match(r'^[^;]*;[WF]', LINE):
> -                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
> -        with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
> -            PROP_LIST_LINES = []
> -            for LINE in PROP_LIST_FILE:
> -                if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
> -                    PROP_LIST_LINES.append(LINE.strip())
> -        with open('UTF-8', mode='w') as OUTFILE:
> -            # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
> -            write_header_charmap(OUTFILE)
> -            process_charmap(UNICODE_DATA_LINES, OUTFILE)
> -            OUTFILE.write("END CHARMAP\n\n")
> -            # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
> -            write_header_width(OUTFILE)
> -            process_width(OUTFILE,
> -                          UNICODE_DATA_LINES,
> -                          EAST_ASIAN_WIDTH_LINES,
> -                          PROP_LIST_LINES)
> -            OUTFILE.write("END WIDTH\n")
> +    PARSER = argparse.ArgumentParser(
> +        description='''
> +        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
> +        ''')

OK.

> +    PARSER.add_argument(
> +        '-u', '--unicode_data_file',
> +        nargs='?',
> +        type=str,
> +        default='UnicodeData.txt',
> +        help=('The UnicodeData.txt file to read, '
> +              + 'default: %(default)s'))

OK.

> +    PARSER.add_argument(
> +        '-e', '--east_asian_with_file',
> +        nargs='?',
> +        type=str,
> +        default='EastAsianWidth.txt',
> +        help=('The EastAsianWidth.txt file to read, '
> +              + 'default: %(default)s'))

OK.

> +    PARSER.add_argument(
> +        '-p', '--prop_list_file',
> +        nargs='?',
> +        type=str,
> +        default='PropList.txt',
> +        help=('The PropList.txt file to read, '
> +              + 'default: %(default)s'))

OK.

> +    PARSER.add_argument(
> +        '--unicode_version',
> +        nargs='?',
> +        required=True,
> +        type=str,
> +        help='The Unicode version of the input files used.')

OK.

> +    ARGS = PARSER.parse_args()
> +
> +    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
> +        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
> +    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
> +        EAST_ASIAN_WIDTH_LINES = []
> +        for LINE in EAST_ASIAN_WIDTH_FILE:
> +            # If characters from EastAasianWidth.txt which are from
> +            # from reserved ranges (i.e. not yet assigned code points)
> +            # are added to the WIDTH section of the UTF-8 file, then
> +            # “make check” produces “Unknown Character” errors for
> +            # these code points because such unassigned code points
> +            # are not in the CHARMAP section of the UTF-8 file.
> +            #
> +            # Therefore, we skip all reserved code points when reading
> +            # the EastAsianWidth.txt file.
> +            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
> +                continue
> +            if re.match(r'^[^;]*;[WF]', LINE):
> +                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
> +    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
> +        PROP_LIST_LINES = []
> +        for LINE in PROP_LIST_FILE:
> +            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
> +                PROP_LIST_LINES.append(LINE.strip())
> +    with open('UTF-8', mode='w') as OUTFILE:
> +        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
> +        write_header_charmap(OUTFILE)
> +        process_charmap(UNICODE_DATA_LINES, OUTFILE)
> +        OUTFILE.write("END CHARMAP\n\n")
> +        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
> +        write_header_width(OUTFILE, ARGS.unicode_version)
> +        process_width(OUTFILE,
> +                      UNICODE_DATA_LINES,
> +                      EAST_ASIAN_WIDTH_LINES,
> +                      PROP_LIST_LINES)
> +        OUTFILE.write("END WIDTH\n")

OK.

> --
> 2.17.1
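For readers skimming the diff, the functional change is small: utf8_gen.py no longer hard-codes "Unicode 10.0.0" in the WIDTH header, but takes the version string from a new, required --unicode_version option, which the Makefile passes as $(UNICODE_VERSION) (the patch assumes that variable is defined elsewhere in localedata/unicode-gen/Makefile). The following is a minimal standalone sketch of that flow; it is not part of the patch, and write_header_width is reduced here to the one header line the option actually changes:

```python
# Standalone sketch (simplified, not the actual glibc code): the value of
# the required --unicode_version option ends up in the generated
# "% Character width according to Unicode X.Y.Z." comment instead of a
# hard-coded 10.0.0.
import argparse
import io

def write_header_width(outfile, unicode_version):
    # Same formatting expression as in the patch, reduced to one line.
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))

parser = argparse.ArgumentParser()
parser.add_argument('--unicode_version', required=True, type=str)
# Simulate the Makefile invocation passing $(UNICODE_VERSION).
args = parser.parse_args(['--unicode_version', '11.0.0'])

buf = io.StringIO()
write_header_width(buf, args.unicode_version)
print(buf.getvalue(), end='')
# Prints: % Character width according to Unicode 11.0.0.
```

With this in place, bumping a single make variable updates the version string wherever the generated files mention it, which is exactly the class of stale "10.0.0" comments this patch cleans up.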