Skip to content

Commit

Permalink
8319993: Update Unicode Data Files to 16.0.0
Browse files Browse the repository at this point in the history
8319992: Update ICU4J to Version 76.1

Reviewed-by: jlu, joehw, iris
  • Loading branch information
naotoj committed Nov 25, 2024
1 parent a032de2 commit 15ae8d0
Show file tree
Hide file tree
Showing 27 changed files with 8,147 additions and 901 deletions.
467 changes: 366 additions & 101 deletions src/java.base/share/classes/java/lang/Character.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -382,6 +382,8 @@ class CharacterData00 extends CharacterData {
case 0xA7B3: mapChar = 0xAB53; break;
case 0xA7C5: mapChar = 0x0282; break;
case 0xA7C6: mapChar = 0x1D8E; break;
case 0xA7CB: mapChar = 0x0264; break;
case 0xA7DC: mapChar = 0x019B; break;
// default mapChar is already set, so no
// need to redo it here.
// default : mapChar = ch;
Expand All @@ -403,13 +405,15 @@ class CharacterData00 extends CharacterData {
if ((val & $$maskCaseOffset) == $$maskCaseOffset) {
switch(ch) {
case 0x017F: mapChar = 0x0053; break;
case 0x019B: mapChar = 0xA7DC; break;
case 0x023F: mapChar = 0x2C7E; break;
case 0x0240: mapChar = 0x2C7F; break;
case 0x0250: mapChar = 0x2C6F; break;
case 0x0251: mapChar = 0x2C6D; break;
case 0x0252: mapChar = 0x2C70; break;
case 0x025C: mapChar = 0xA7AB; break;
case 0x0261: mapChar = 0xA7AC; break;
case 0x0264: mapChar = 0xA7CB; break;
case 0x0265: mapChar = 0xA78D; break;
case 0x0266: mapChar = 0xA7AA; break;
case 0x026A: mapChar = 0xA7AE; break;
Expand Down Expand Up @@ -857,13 +861,15 @@ class CharacterData00 extends CharacterData {
else {
switch(ch) {
case 0x017F: mapChar = 0x0053; break;
case 0x019B: mapChar = 0xA7DC; break;
case 0x023F: mapChar = 0x2C7E; break;
case 0x0240: mapChar = 0x2C7F; break;
case 0x0250: mapChar = 0x2C6F; break;
case 0x0251: mapChar = 0x2C6D; break;
case 0x0252: mapChar = 0x2C70; break;
case 0x025C: mapChar = 0xA7AB; break;
case 0x0261: mapChar = 0xA7AC; break;
case 0x0264: mapChar = 0xA7CB; break;
case 0x0265: mapChar = 0xA78D; break;
case 0x0266: mapChar = 0xA7AA; break;
case 0x026A: mapChar = 0xA7AE; break;
Expand Down
178 changes: 108 additions & 70 deletions src/java.base/share/classes/jdk/internal/icu/impl/NormalizerImpl.java

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -40,6 +40,7 @@
import jdk.internal.icu.lang.UCharacter.NumericType;
import jdk.internal.icu.text.UTF16;
import jdk.internal.icu.text.UnicodeSet;
import jdk.internal.icu.util.CodePointTrie;
import jdk.internal.icu.util.VersionInfo;

/**
Expand Down Expand Up @@ -136,10 +137,8 @@ public int getAdditional(int codepoint, int column) {
*/
public VersionInfo getAge(int codepoint)
{
    // Column 0 of the additional properties holds the age bits above
    // AGE_SHIFT_. The unsigned shift keeps the extracted field non-negative
    // even if the top bit of the properties word is set.
    int version = getAdditional(codepoint, 0) >>> AGE_SHIFT_;
    // The extracted field carries the minor version in its two lowest bits
    // and the major version in the bits above them.
    return VersionInfo.getInstance(version >> 2, version & 3, 0, 0);
}

// int-value and enumerated properties --------------------------------- ***
Expand All @@ -150,7 +149,11 @@ public int getType(int c) {

/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type used to be fully redundant with a subset of Grapheme_Cluster_Break.
*
* Starting with Unicode 16, this is no longer true for HST=V vs. GCB=V in some cases:
* Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
* they are of course not related to Hangul syllables.
*/
private static final int /* UHangulSyllableType */ gcbToHst[]={
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
Expand Down Expand Up @@ -310,11 +313,16 @@ public int digit(int c) {
* 0
*/
int m_maxJTGValue_;
/** maximum values for other code values */
int m_maxValuesOther_;

/**
* Script_Extensions data
*/
public char[] m_scriptExtensions_;

CodePointTrie m_blockTrie_;

// private variables -------------------------------------------------

/**
Expand Down Expand Up @@ -534,12 +542,13 @@ private UCharacterProperty() throws IOException
int additionalVectorsOffset = bytes.getInt();
m_additionalColumnsCount_ = bytes.getInt();
int scriptExtensionsOffset = bytes.getInt();
int reservedOffset7 = bytes.getInt();
/* reservedOffset8 = */ bytes.getInt();
int blockTrieOffset = bytes.getInt();
int reservedOffset8 = bytes.getInt();
/* dataTopOffset = */ bytes.getInt();
m_maxBlockScriptValue_ = bytes.getInt();
m_maxJTGValue_ = bytes.getInt();
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
m_maxValuesOther_ = bytes.getInt();
ICUBinary.skipBytes(bytes, (16 - 13) << 2);

// read the main properties trie
m_trie_ = Trie2_16.createFromSerialized(bytes);
Expand Down Expand Up @@ -574,19 +583,29 @@ private UCharacterProperty() throws IOException
}

// Script_Extensions
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
int numChars = (blockTrieOffset - scriptExtensionsOffset) * 2;
if(numChars > 0) {
m_scriptExtensions_ = new char[numChars];
for(int i = 0; i < numChars; ++i) {
m_scriptExtensions_[i] = bytes.getChar();
}
}

// Read the blockTrie.
int partLength = (reservedOffset8 - blockTrieOffset) * 4;
int triePosition = bytes.position();
m_blockTrie_ = CodePointTrie.fromBinary(null, CodePointTrie.ValueWidth.BITS_16, bytes);
trieLength = bytes.position() - triePosition;
if (trieLength > partLength) {
throw new IOException("uprops.icu: not enough bytes for blockTrie");
}
ICUBinary.skipBytes(bytes, partLength - trieLength); // skip padding after trie bytes
}

private static final class IsAcceptable implements ICUBinary.Authenticate {
    // Accepts the data only when the first element of the supplied version
    // array is 9; every other value is rejected.
    // NOTE(review): this method implements the ICUBinary.Authenticate
    // callback, so @Override could be added -- confirm against the interface.
    public boolean isDataVersionAcceptable(byte version[]) {
        return version[0] == 9;
    }
}

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -36,7 +36,7 @@

package jdk.internal.icu.util;

import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

/**
* Class to store version numbers of the form major.minor.milli.micro.
Expand All @@ -48,13 +48,13 @@ public final class VersionInfo
// public data members -------------------------------------------------

/**
* Data version string for ICU's internal data.
* Used for appending to data path (e.g. icudt43b)
* Data version string for ICU's data file.
* Not used when loading from resources packaged in the .jar.
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public static final String ICU_DATA_VERSION_PATH = "74b";
public static final String ICU_DATA_VERSION_PATH = "76b";

// public methods ------------------------------------------------------

Expand Down Expand Up @@ -171,7 +171,7 @@ public int compareTo(VersionInfo other)
/**
* Map of singletons
*/
private static final HashMap<Integer, Object> MAP_ = new HashMap<>();
private static final ConcurrentHashMap<Integer, Object> MAP_ = new ConcurrentHashMap<>();
/**
* Error statement string
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -35,9 +35,9 @@ public final class Grapheme {
* <p>
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
* for the extended grapheme cluster boundary rules. The following implementation
* is based on the annex for Unicode version 15.1.
* is based on the annex for Unicode version 16.0.
*
* @spec http://www.unicode.org/reports/tr29/tr29-43.html
* @spec http://www.unicode.org/reports/tr29/tr29-45.html
* @param src the {@code CharSequence} to be scanned
* @param off offset to start looking for the next boundary in the src
* @param limit limit offset in the src (exclusive)
Expand Down Expand Up @@ -267,7 +267,10 @@ static int getType(int cp) {
if (cp >= 0xA960 && cp <= 0xA97C)
return L;
// hangul jamo_extended B
if (cp >= 0xD7B0 && cp <= 0xD7C6)
// Kirat Rai vowel sign
if (cp >= 0xD7B0 && cp <= 0xD7C6 ||
cp == 0x16D63 ||
cp >= 0x16D67 && cp <= 0x16D6A)
return V;
if (cp >= 0xD7CB && cp <= 0xD7FB)
return T;
Expand All @@ -277,6 +280,7 @@ static int getType(int cp) {
case 0x0D4E:
case 0x111C2:
case 0x111C3:
case 0x113D1:
case 0x1193F:
case 0x11941:
case 0x11A3A:
Expand Down
19 changes: 15 additions & 4 deletions src/java.base/share/data/unicodedata/Blocks.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Blocks-15.1.0.txt
# Date: 2023-07-28, 15:47:20 GMT
# Copyright (c) 2023 Unicode, Inc.
# For terms of use, see https://www.unicode.org/terms_of_use.html
# Blocks-16.0.0.txt
# Date: 2024-02-02
# Copyright (c) 2024 Unicode, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see https://www.unicode.org/reports/tr44/
Expand Down Expand Up @@ -217,6 +218,7 @@ FFF0..FFFF; Specials
10500..1052F; Elbasan
10530..1056F; Caucasian Albanian
10570..105BF; Vithkuqi
105C0..105FF; Todhri
10600..1077F; Linear A
10780..107BF; Latin Extended-F
10800..1083F; Cypriot Syllabary
Expand All @@ -239,6 +241,7 @@ FFF0..FFFF; Specials
10C00..10C4F; Old Turkic
10C80..10CFF; Old Hungarian
10D00..10D3F; Hanifi Rohingya
10D40..10D8F; Garay
10E60..10E7F; Rumi Numeral Symbols
10E80..10EBF; Yezidi
10EC0..10EFF; Arabic Extended-C
Expand All @@ -258,12 +261,14 @@ FFF0..FFFF; Specials
11280..112AF; Multani
112B0..112FF; Khudawadi
11300..1137F; Grantha
11380..113FF; Tulu-Tigalari
11400..1147F; Newa
11480..114DF; Tirhuta
11580..115FF; Siddham
11600..1165F; Modi
11660..1167F; Mongolian Supplement
11680..116CF; Takri
116D0..116FF; Myanmar Extended-C
11700..1174F; Ahom
11800..1184F; Dogra
118A0..118FF; Warang Citi
Expand All @@ -274,6 +279,7 @@ FFF0..FFFF; Specials
11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A
11AC0..11AFF; Pau Cin Hau
11B00..11B5F; Devanagari Extended-A
11BC0..11BFF; Sunuwar
11C00..11C6F; Bhaiksuki
11C70..11CBF; Marchen
11D00..11D5F; Masaram Gondi
Expand All @@ -288,12 +294,15 @@ FFF0..FFFF; Specials
12F90..12FFF; Cypro-Minoan
13000..1342F; Egyptian Hieroglyphs
13430..1345F; Egyptian Hieroglyph Format Controls
13460..143FF; Egyptian Hieroglyphs Extended-A
14400..1467F; Anatolian Hieroglyphs
16100..1613F; Gurung Khema
16800..16A3F; Bamum Supplement
16A40..16A6F; Mro
16A70..16ACF; Tangsa
16AD0..16AFF; Bassa Vah
16B00..16B8F; Pahawh Hmong
16D40..16D7F; Kirat Rai
16E40..16E9F; Medefaidrin
16F00..16F9F; Miao
16FE0..16FFF; Ideographic Symbols and Punctuation
Expand All @@ -308,6 +317,7 @@ FFF0..FFFF; Specials
1B170..1B2FF; Nushu
1BC00..1BC9F; Duployan
1BCA0..1BCAF; Shorthand Format Controls
1CC00..1CEBF; Symbols for Legacy Computing Supplement
1CF00..1CFCF; Znamenny Musical Notation
1D000..1D0FF; Byzantine Musical Symbols
1D100..1D1FF; Musical Symbols
Expand All @@ -325,6 +335,7 @@ FFF0..FFFF; Specials
1E290..1E2BF; Toto
1E2C0..1E2FF; Wancho
1E4D0..1E4FF; Nag Mundari
1E5D0..1E5FF; Ol Onal
1E7E0..1E7FF; Ethiopic Extended-B
1E800..1E8DF; Mende Kikakui
1E900..1E95F; Adlam
Expand Down
Loading

0 comments on commit 15ae8d0

Please sign in to comment.