From 4076a9bb3b7e5dfb023714bb6d858860a86d273a Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sun, 22 Feb 2009 15:05:45 +0100 Subject: [PATCH] Implement new clarified decomposition of Hangul syllables. --- ChangeLog | 11 ++++++++ lib/uninorm/canonical-decomposition.c | 39 +++++++++++++++++++++------- lib/uninorm/decomposition.c | 39 +++++++++++++++++++++------- tests/uninorm/test-canonical-decomposition.c | 8 ++++++ tests/uninorm/test-compat-decomposition.c | 8 ++++++ tests/uninorm/test-decomposition.c | 9 +++++++ 6 files changed, 96 insertions(+), 18 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4a07193d4..b6a4b02a8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,16 @@ 2009-02-22 Bruno Haible + Implement new clarified decomposition of Hangul syllables. + * lib/uninorm/decomposition.c (uc_decomposition): For Hangul syllables + of type LTV, return only a pairwise decomposition. + * lib/uninorm/canonical-decomposition.c (uc_canonical_decomposition): + Likewise. + * tests/uninorm/test-decomposition.c (main): Updated expected result. + * tests/uninorm/test-canonical-decomposition.c (main): Likewise. + * tests/uninorm/test-compat-decomposition.c (main): Likewise. + +2009-02-22 Bruno Haible + * lib/uninorm/u-normalize-internal.h (FUNC): At the end, handle zero-length results and shrink excess allocated memory. * tests/uninorm/test-u8-nfc.c (test_u8_nfc): Check empty string result. diff --git a/lib/uninorm/canonical-decomposition.c b/lib/uninorm/canonical-decomposition.c index 9afaead0a..210b74b9c 100644 --- a/lib/uninorm/canonical-decomposition.c +++ b/lib/uninorm/canonical-decomposition.c @@ -29,24 +29,45 @@ uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition) { if (uc >= 0xAC00 && uc < 0xD7A4) { - /* Hangul syllable. See Unicode standard, chapter 3, - section "Hangul Syllable Decomposition". */ - unsigned int t, v, l; + /* Hangul syllable. 
See Unicode standard, chapter 3, section + "Hangul Syllable Decomposition", See also the clarification at + <http://www.unicode.org/versions/Unicode5.1.0/>, section + "Clarification of Hangul Jamo Handling". */ + unsigned int t; uc -= 0xAC00; t = uc % 28; - uc = uc / 28; - v = uc % 21; - l = uc / 21; - decomposition[0] = 0x1100 + l; - decomposition[1] = 0x1161 + v; if (t == 0) - return 2; + { + unsigned int v, l; + + uc = uc / 28; + v = uc % 21; + l = uc / 21; + + decomposition[0] = 0x1100 + l; + decomposition[1] = 0x1161 + v; + return 2; + } else { +#if 1 /* Return the pairwise decomposition, not the full decomposition. */ + decomposition[0] = 0xAC00 + uc - t; /* = 0xAC00 + (l * 21 + v) * 28; */ + decomposition[1] = 0x11A7 + t; + return 2; +#else + unsigned int v, l; + + uc = uc / 28; + v = uc % 21; + l = uc / 21; + + decomposition[0] = 0x1100 + l; + decomposition[1] = 0x1161 + v; decomposition[2] = 0x11A7 + t; return 3; +#endif } } else if (uc < 0x110000) diff --git a/lib/uninorm/decomposition.c b/lib/uninorm/decomposition.c index d581bde2e..af0301822 100644 --- a/lib/uninorm/decomposition.c +++ b/lib/uninorm/decomposition.c @@ -27,25 +27,46 @@ uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition) { if (uc >= 0xAC00 && uc < 0xD7A4) { - /* Hangul syllable. See Unicode standard, chapter 3, - section "Hangul Syllable Decomposition". */ - unsigned int t, v, l; + /* Hangul syllable. See Unicode standard, chapter 3, section + "Hangul Syllable Decomposition", See also the clarification at + <http://www.unicode.org/versions/Unicode5.1.0/>, section + "Clarification of Hangul Jamo Handling". */ + unsigned int t; uc -= 0xAC00; t = uc % 28; - uc = uc / 28; - v = uc % 21; - l = uc / 21; *decomp_tag = UC_DECOMP_CANONICAL; - decomposition[0] = 0x1100 + l; - decomposition[1] = 0x1161 + v; if (t == 0) - return 2; + { + unsigned int v, l; + + uc = uc / 28; + v = uc % 21; + l = uc / 21; + + decomposition[0] = 0x1100 + l; + decomposition[1] = 0x1161 + v; + return 2; + } else { +#if 1 /* Return the pairwise decomposition, not the full decomposition. 
*/ + decomposition[0] = 0xAC00 + uc - t; /* = 0xAC00 + (l * 21 + v) * 28; */ + decomposition[1] = 0x11A7 + t; + return 2; +#else + unsigned int v, l; + + uc = uc / 28; + v = uc % 21; + l = uc / 21; + + decomposition[0] = 0x1100 + l; + decomposition[1] = 0x1161 + v; decomposition[2] = 0x11A7 + t; return 3; +#endif } } else if (uc < 0x110000) diff --git a/tests/uninorm/test-canonical-decomposition.c b/tests/uninorm/test-canonical-decomposition.c index 59061c94e..040011b55 100644 --- a/tests/uninorm/test-canonical-decomposition.c +++ b/tests/uninorm/test-canonical-decomposition.c @@ -133,10 +133,18 @@ main () /* HANGUL SYLLABLE GEUL */ ret = uc_canonical_decomposition (0xAE00, decomposed); + /* See the clarification at <http://www.unicode.org/versions/Unicode5.1.0/>, + section "Clarification of Hangul Jamo Handling". */ +#if 1 + ASSERT (ret == 2); + ASSERT (decomposed[0] == 0xADF8); + ASSERT (decomposed[1] == 0x11AF); +#else ASSERT (ret == 3); ASSERT (decomposed[0] == 0x1100); ASSERT (decomposed[1] == 0x1173); ASSERT (decomposed[2] == 0x11AF); +#endif /* HANGUL SYLLABLE GEU */ ret = uc_canonical_decomposition (0xADF8, decomposed); diff --git a/tests/uninorm/test-compat-decomposition.c b/tests/uninorm/test-compat-decomposition.c index d92ae365b..49a2b09f8 100644 --- a/tests/uninorm/test-compat-decomposition.c +++ b/tests/uninorm/test-compat-decomposition.c @@ -175,10 +175,18 @@ main () /* HANGUL SYLLABLE GEUL */ ret = uc_compat_decomposition (0xAE00, decomposed); + /* See the clarification at <http://www.unicode.org/versions/Unicode5.1.0/>, + section "Clarification of Hangul Jamo Handling". 
*/ +#if 1 + ASSERT (ret == 2); + ASSERT (decomposed[0] == 0xADF8); + ASSERT (decomposed[1] == 0x11AF); +#else ASSERT (ret == 3); ASSERT (decomposed[0] == 0x1100); ASSERT (decomposed[1] == 0x1173); ASSERT (decomposed[2] == 0x11AF); +#endif /* HANGUL SYLLABLE GEU */ ret = uc_compat_decomposition (0xADF8, decomposed); diff --git a/tests/uninorm/test-decomposition.c b/tests/uninorm/test-decomposition.c index b29e78378..684dee452 100644 --- a/tests/uninorm/test-decomposition.c +++ b/tests/uninorm/test-decomposition.c @@ -194,11 +194,20 @@ main () /* HANGUL SYLLABLE GEUL */ ret = uc_decomposition (0xAE00, &tag, decomposed); + /* See the clarification at <http://www.unicode.org/versions/Unicode5.1.0/>, + section "Clarification of Hangul Jamo Handling". */ +#if 1 + ASSERT (ret == 2); + ASSERT (tag == UC_DECOMP_CANONICAL); + ASSERT (decomposed[0] == 0xADF8); + ASSERT (decomposed[1] == 0x11AF); +#else ASSERT (ret == 3); ASSERT (tag == UC_DECOMP_CANONICAL); ASSERT (decomposed[0] == 0x1100); ASSERT (decomposed[1] == 0x1173); ASSERT (decomposed[2] == 0x11AF); +#endif /* HANGUL SYLLABLE GEU */ ret = uc_decomposition (0xADF8, &tag, decomposed); -- 2.11.0