From ef51234c7d0d7fb58f507b093f2f1d8aa4b8ea4b Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Sat, 26 Mar 2011 12:49:04 +0100 Subject: [PATCH] unictype/bidiclass-byname: Recognize long names as well. * lib/unictype.in.h (uc_bidi_class_byname): Allow argument to be a long name. * lib/unictype/bidi_byname.c: Include <string.h>, unictype/bidi_byname.h. (uc_bidi_class_byname): Use uc_bidi_class_lookup. * lib/unictype/bidi_byname.gperf: New file. * modules/unictype/bidiclass-byname (Files): Add lib/unictype/bidi_byname.gperf. (Depends-on): Add gperf. (Makefile.am): Add rule for generating unictype/bidi_byname.h. * tests/unictype/test-bidi_byname.c (main): Test the recognition of long names. --- ChangeLog | 14 ++++ lib/unictype.in.h | 3 +- lib/unictype/bidi_byname.c | 172 +++++++------------------------------- lib/unictype/bidi_byname.gperf | 70 ++++++++++++++++ modules/unictype/bidiclass-byname | 10 +++ tests/unictype/test-bidi_byname.c | 78 +++++++++++++++++ 6 files changed, 206 insertions(+), 141 deletions(-) create mode 100644 lib/unictype/bidi_byname.gperf diff --git a/ChangeLog b/ChangeLog index f8fcc15ac..98b416ea8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,19 @@ 2011-03-26 Bruno Haible + unictype/bidiclass-byname: Recognize long names as well. + * lib/unictype.in.h (uc_bidi_class_byname): Allow argument to be a long + name. + * lib/unictype/bidi_byname.c: Include <string.h>, + unictype/bidi_byname.h. + (uc_bidi_class_byname): Use uc_bidi_class_lookup. + * lib/unictype/bidi_byname.gperf: New file. + * modules/unictype/bidiclass-byname (Files): Add + lib/unictype/bidi_byname.gperf. + (Depends-on): Add gperf. + (Makefile.am): Add rule for generating unictype/bidi_byname.h. + * tests/unictype/test-bidi_byname.c (main): Test the recognition of + long names. + Tests for module 'unictype/bidiclass-longname'. * modules/unictype/bidiclass-longname-tests: New file. * tests/unictype/test-bidi_longname.c: New file. 
diff --git a/lib/unictype.in.h b/lib/unictype.in.h index 8d54b904d..726f465cf 100644 --- a/lib/unictype.in.h +++ b/lib/unictype.in.h @@ -316,7 +316,8 @@ extern const char * extern const char * uc_bidi_class_long_name (int bidi_class); -/* Return the bidi class given by name, e.g. "LRE". */ +/* Return the bidi class given by name, e.g. "LRE", or by long name, e.g. + "Left-to-Right Embedding". */ extern int uc_bidi_class_byname (const char *bidi_class_name); /* Same; obsolete function name. */ diff --git a/lib/unictype/bidi_byname.c b/lib/unictype/bidi_byname.c index 2aa31a9b8..4b3a92ae1 100644 --- a/lib/unictype/bidi_byname.c +++ b/lib/unictype/bidi_byname.c @@ -20,151 +20,43 @@ /* Specification. */ #include "unictype.h" +#include <string.h> + +#include "unictype/bidi_byname.h" + int uc_bidi_class_byname (const char *bidi_class_name) { - switch (bidi_class_name[0]) + size_t len; + + len = strlen (bidi_class_name); + if (len <= MAX_WORD_LENGTH) { - case 'A': - switch (bidi_class_name[1]) - { - case 'L': - if (bidi_class_name[2] == '\0') - return UC_BIDI_AL; - break; - case 'N': - if (bidi_class_name[2] == '\0') - return UC_BIDI_AN; - break; - } - break; - case 'B': - switch (bidi_class_name[1]) - { - case '\0': - return UC_BIDI_B; - case 'N': - if (bidi_class_name[2] == '\0') - return UC_BIDI_BN; - break; - } - break; - case 'C': - switch (bidi_class_name[1]) - { - case 'S': - if (bidi_class_name[2] == '\0') - return UC_BIDI_CS; - break; - } - break; - case 'E': - switch (bidi_class_name[1]) - { - case 'N': - if (bidi_class_name[2] == '\0') - return UC_BIDI_EN; - break; - case 'S': - if (bidi_class_name[2] == '\0') - return UC_BIDI_ES; - break; - case 'T': - if (bidi_class_name[2] == '\0') - return UC_BIDI_ET; - break; - } - break; - case 'L': - switch (bidi_class_name[1]) - { - case '\0': - return UC_BIDI_L; - case 'R': - switch (bidi_class_name[2]) - { - case 'E': - if (bidi_class_name[3] == '\0') - return UC_BIDI_LRE; - break; - case 'O': - if (bidi_class_name[3] == '\0') - 
return UC_BIDI_LRO; - break; - } - break; - } - break; - case 'N': - switch (bidi_class_name[1]) - { - case 'S': - switch (bidi_class_name[2]) - { - case 'M': - if (bidi_class_name[3] == '\0') - return UC_BIDI_NSM; - break; - } - break; - } - break; - case 'O': - switch (bidi_class_name[1]) - { - case 'N': - if (bidi_class_name[2] == '\0') - return UC_BIDI_ON; - break; - } - break; - case 'P': - switch (bidi_class_name[1]) - { - case 'D': - switch (bidi_class_name[2]) - { - case 'F': - if (bidi_class_name[3] == '\0') - return UC_BIDI_PDF; - break; - } - break; - } - break; - case 'R': - switch (bidi_class_name[1]) - { - case '\0': - return UC_BIDI_R; - case 'L': - switch (bidi_class_name[2]) - { - case 'E': - if (bidi_class_name[3] == '\0') - return UC_BIDI_RLE; - break; - case 'O': - if (bidi_class_name[3] == '\0') - return UC_BIDI_RLO; + char buf[MAX_WORD_LENGTH + 1]; + const struct named_bidi_class *found; + + /* Copy bidi_class_name into buf, converting '_' and '-' to ' '. */ + { + const char *p = bidi_class_name; + char *q = buf; + + for (;; p++, q++) + { + char c = *p; + + if (c == '_' || c == '-') + c = ' '; + *q = c; + if (c == '\0') break; - } - break; - } - break; - case 'S': - if (bidi_class_name[1] == '\0') - return UC_BIDI_S; - break; - case 'W': - switch (bidi_class_name[1]) - { - case 'S': - if (bidi_class_name[2] == '\0') - return UC_BIDI_WS; - break; - } - break; + } + } + /* Here q == buf + len. */ + + /* Do a hash table lookup, with case-insensitive comparison. */ + found = uc_bidi_class_lookup (buf, len); + if (found != NULL) + return found->bidi_class; } /* Invalid bidi class name. */ return -1; diff --git a/lib/unictype/bidi_byname.gperf b/lib/unictype/bidi_byname.gperf new file mode 100644 index 000000000..9cacacf0f --- /dev/null +++ b/lib/unictype/bidi_byname.gperf @@ -0,0 +1,70 @@ +/* Bidi classes of Unicode characters. 
*/ +struct named_bidi_class { int name; int bidi_class; }; +%struct-type +%ignore-case +%language=ANSI-C +%define hash-function-name bidi_class_hash +%define lookup-function-name uc_bidi_class_lookup +%readonly-tables +%global-table +%define word-array-name bidi_class_names +%pic +%define string-pool-name bidi_class_stringpool +%% +AL, UC_BIDI_AL +AN, UC_BIDI_AN +B, UC_BIDI_B +BN, UC_BIDI_BN +CS, UC_BIDI_CS +EN, UC_BIDI_EN +ES, UC_BIDI_ES +ET, UC_BIDI_ET +L, UC_BIDI_L +LRE, UC_BIDI_LRE +LRO, UC_BIDI_LRO +NSM, UC_BIDI_NSM +ON, UC_BIDI_ON +PDF, UC_BIDI_PDF +R, UC_BIDI_R +RLE, UC_BIDI_RLE +RLO, UC_BIDI_RLO +S, UC_BIDI_S +WS, UC_BIDI_WS +Arabic Letter, UC_BIDI_AL +ArabicLetter, UC_BIDI_AL +Arabic Number, UC_BIDI_AN +ArabicNumber, UC_BIDI_AN +Paragraph Separator, UC_BIDI_B +ParagraphSeparator, UC_BIDI_B +Boundary Neutral, UC_BIDI_BN +BoundaryNeutral, UC_BIDI_BN +Common Separator, UC_BIDI_CS +CommonSeparator, UC_BIDI_CS +European Number, UC_BIDI_EN +EuropeanNumber, UC_BIDI_EN +European Separator, UC_BIDI_ES +EuropeanSeparator, UC_BIDI_ES +European Terminator, UC_BIDI_ET +EuropeanTerminator, UC_BIDI_ET +Left To Right, UC_BIDI_L +LeftToRight, UC_BIDI_L +Left To Right Embedding, UC_BIDI_LRE +LeftToRightEmbedding, UC_BIDI_LRE +Left To Right Override, UC_BIDI_LRO +LeftToRightOverride, UC_BIDI_LRO +Nonspacing Mark, UC_BIDI_NSM +NonspacingMark, UC_BIDI_NSM +Other Neutral, UC_BIDI_ON +OtherNeutral, UC_BIDI_ON +Pop Directional Format, UC_BIDI_PDF +PopDirectionalFormat, UC_BIDI_PDF +Right To Left, UC_BIDI_R +RightToLeft, UC_BIDI_R +Right To Left Embedding, UC_BIDI_RLE +RightToLeftEmbedding, UC_BIDI_RLE +Right To Left Override, UC_BIDI_RLO +RightToLeftOverride, UC_BIDI_RLO +Segment Separator, UC_BIDI_S +SegmentSeparator, UC_BIDI_S +White Space, UC_BIDI_WS +WhiteSpace, UC_BIDI_WS diff --git a/modules/unictype/bidiclass-byname b/modules/unictype/bidiclass-byname index b96188337..e0a06e76d 100644 --- a/modules/unictype/bidiclass-byname +++ b/modules/unictype/bidiclass-byname @@ -3,9 
+3,11 @@ Find a Unicode character bidi class, given its name. Files: lib/unictype/bidi_byname.c +lib/unictype/bidi_byname.gperf Depends-on: unictype/base +gperf configure.ac: gl_LIBUNISTRING_MODULE([0.9.4], [unictype/bidiclass-byname]) @@ -15,6 +17,14 @@ if LIBUNISTRING_COMPILE_UNICTYPE_BIDICLASS_BYNAME lib_SOURCES += unictype/bidi_byname.c endif +unictype/bidi_byname.h: unictype/bidi_byname.gperf + $(GPERF) -m 10 $(srcdir)/unictype/bidi_byname.gperf > $(srcdir)/unictype/bidi_byname.h-t + mv $(srcdir)/unictype/bidi_byname.h-t $(srcdir)/unictype/bidi_byname.h +BUILT_SOURCES += unictype/bidi_byname.h +MOSTLYCLEANFILES += unictype/bidi_byname.h-t +MAINTAINERCLEANFILES += unictype/bidi_byname.h +EXTRA_DIST += unictype/bidi_byname.h + Include: "unictype.h" diff --git a/tests/unictype/test-bidi_byname.c b/tests/unictype/test-bidi_byname.c index ded352cf5..3449c5bf6 100644 --- a/tests/unictype/test-bidi_byname.c +++ b/tests/unictype/test-bidi_byname.c @@ -44,6 +44,84 @@ main () ASSERT (uc_bidi_class_byname ("S") == UC_BIDI_S); ASSERT (uc_bidi_class_byname ("WS") == UC_BIDI_WS); ASSERT (uc_bidi_class_byname ("ON") == UC_BIDI_ON); + + ASSERT (uc_bidi_class_byname ("ARABIC LETTER") == UC_BIDI_AL); + ASSERT (uc_bidi_class_byname ("Arabic Letter") == UC_BIDI_AL); + ASSERT (uc_bidi_class_byname ("Arabic_Letter") == UC_BIDI_AL); + ASSERT (uc_bidi_class_byname ("ArabicLetter") == UC_BIDI_AL); + ASSERT (uc_bidi_class_byname ("ARABIC NUMBER") == UC_BIDI_AN); + ASSERT (uc_bidi_class_byname ("Arabic Number") == UC_BIDI_AN); + ASSERT (uc_bidi_class_byname ("Arabic_Number") == UC_BIDI_AN); + ASSERT (uc_bidi_class_byname ("ArabicNumber") == UC_BIDI_AN); + ASSERT (uc_bidi_class_byname ("PARAGRAPH SEPARATOR") == UC_BIDI_B); + ASSERT (uc_bidi_class_byname ("Paragraph Separator") == UC_BIDI_B); + ASSERT (uc_bidi_class_byname ("Paragraph_Separator") == UC_BIDI_B); + ASSERT (uc_bidi_class_byname ("ParagraphSeparator") == UC_BIDI_B); + ASSERT (uc_bidi_class_byname ("BOUNDARY NEUTRAL") == 
UC_BIDI_BN); + ASSERT (uc_bidi_class_byname ("Boundary Neutral") == UC_BIDI_BN); + ASSERT (uc_bidi_class_byname ("Boundary_Neutral") == UC_BIDI_BN); + ASSERT (uc_bidi_class_byname ("BoundaryNeutral") == UC_BIDI_BN); + ASSERT (uc_bidi_class_byname ("COMMON SEPARATOR") == UC_BIDI_CS); + ASSERT (uc_bidi_class_byname ("Common Separator") == UC_BIDI_CS); + ASSERT (uc_bidi_class_byname ("Common_Separator") == UC_BIDI_CS); + ASSERT (uc_bidi_class_byname ("CommonSeparator") == UC_BIDI_CS); + ASSERT (uc_bidi_class_byname ("EUROPEAN NUMBER") == UC_BIDI_EN); + ASSERT (uc_bidi_class_byname ("European Number") == UC_BIDI_EN); + ASSERT (uc_bidi_class_byname ("European_Number") == UC_BIDI_EN); + ASSERT (uc_bidi_class_byname ("EuropeanNumber") == UC_BIDI_EN); + ASSERT (uc_bidi_class_byname ("EUROPEAN SEPARATOR") == UC_BIDI_ES); + ASSERT (uc_bidi_class_byname ("European Separator") == UC_BIDI_ES); + ASSERT (uc_bidi_class_byname ("European_Separator") == UC_BIDI_ES); + ASSERT (uc_bidi_class_byname ("EuropeanSeparator") == UC_BIDI_ES); + ASSERT (uc_bidi_class_byname ("EUROPEAN TERMINATOR") == UC_BIDI_ET); + ASSERT (uc_bidi_class_byname ("European Terminator") == UC_BIDI_ET); + ASSERT (uc_bidi_class_byname ("European_Terminator") == UC_BIDI_ET); + ASSERT (uc_bidi_class_byname ("EuropeanTerminator") == UC_BIDI_ET); + ASSERT (uc_bidi_class_byname ("LEFT TO RIGHT") == UC_BIDI_L); + ASSERT (uc_bidi_class_byname ("Left To Right") == UC_BIDI_L); + ASSERT (uc_bidi_class_byname ("Left_To_Right") == UC_BIDI_L); + ASSERT (uc_bidi_class_byname ("LeftToRight") == UC_BIDI_L); + ASSERT (uc_bidi_class_byname ("LEFT TO RIGHT EMBEDDING") == UC_BIDI_LRE); + ASSERT (uc_bidi_class_byname ("Left To Right Embedding") == UC_BIDI_LRE); + ASSERT (uc_bidi_class_byname ("Left_To_Right_Embedding") == UC_BIDI_LRE); + ASSERT (uc_bidi_class_byname ("LeftToRightEmbedding") == UC_BIDI_LRE); + ASSERT (uc_bidi_class_byname ("LEFT TO RIGHT OVERRIDE") == UC_BIDI_LRO); + ASSERT (uc_bidi_class_byname ("Left To Right 
Override") == UC_BIDI_LRO); + ASSERT (uc_bidi_class_byname ("Left_To_Right_Override") == UC_BIDI_LRO); + ASSERT (uc_bidi_class_byname ("LeftToRightOverride") == UC_BIDI_LRO); + ASSERT (uc_bidi_class_byname ("NONSPACING MARK") == UC_BIDI_NSM); + ASSERT (uc_bidi_class_byname ("Nonspacing Mark") == UC_BIDI_NSM); + ASSERT (uc_bidi_class_byname ("Nonspacing_Mark") == UC_BIDI_NSM); + ASSERT (uc_bidi_class_byname ("NonspacingMark") == UC_BIDI_NSM); + ASSERT (uc_bidi_class_byname ("OTHER NEUTRAL") == UC_BIDI_ON); + ASSERT (uc_bidi_class_byname ("Other Neutral") == UC_BIDI_ON); + ASSERT (uc_bidi_class_byname ("Other_Neutral") == UC_BIDI_ON); + ASSERT (uc_bidi_class_byname ("OtherNeutral") == UC_BIDI_ON); + ASSERT (uc_bidi_class_byname ("POP DIRECTIONAL FORMAT") == UC_BIDI_PDF); + ASSERT (uc_bidi_class_byname ("Pop Directional Format") == UC_BIDI_PDF); + ASSERT (uc_bidi_class_byname ("Pop_Directional_Format") == UC_BIDI_PDF); + ASSERT (uc_bidi_class_byname ("PopDirectionalFormat") == UC_BIDI_PDF); + ASSERT (uc_bidi_class_byname ("RIGHT TO LEFT") == UC_BIDI_R); + ASSERT (uc_bidi_class_byname ("Right To Left") == UC_BIDI_R); + ASSERT (uc_bidi_class_byname ("Right_To_Left") == UC_BIDI_R); + ASSERT (uc_bidi_class_byname ("RightToLeft") == UC_BIDI_R); + ASSERT (uc_bidi_class_byname ("RIGHT TO LEFT EMBEDDING") == UC_BIDI_RLE); + ASSERT (uc_bidi_class_byname ("Right To Left Embedding") == UC_BIDI_RLE); + ASSERT (uc_bidi_class_byname ("Right_To_Left_Embedding") == UC_BIDI_RLE); + ASSERT (uc_bidi_class_byname ("RightToLeftEmbedding") == UC_BIDI_RLE); + ASSERT (uc_bidi_class_byname ("RIGHT TO LEFT OVERRIDE") == UC_BIDI_RLO); + ASSERT (uc_bidi_class_byname ("Right To Left Override") == UC_BIDI_RLO); + ASSERT (uc_bidi_class_byname ("Right_To_Left_Override") == UC_BIDI_RLO); + ASSERT (uc_bidi_class_byname ("RightToLeftOverride") == UC_BIDI_RLO); + ASSERT (uc_bidi_class_byname ("SEGMENT SEPARATOR") == UC_BIDI_S); + ASSERT (uc_bidi_class_byname ("Segment Separator") == UC_BIDI_S); + ASSERT 
(uc_bidi_class_byname ("Segment_Separator") == UC_BIDI_S); + ASSERT (uc_bidi_class_byname ("SegmentSeparator") == UC_BIDI_S); + ASSERT (uc_bidi_class_byname ("WHITE SPACE") == UC_BIDI_WS); + ASSERT (uc_bidi_class_byname ("White Space") == UC_BIDI_WS); + ASSERT (uc_bidi_class_byname ("White_Space") == UC_BIDI_WS); + ASSERT (uc_bidi_class_byname ("WhiteSpace") == UC_BIDI_WS); + ASSERT (uc_bidi_class_byname ("X") < 0); ASSERT (uc_bidi_class_byname ("") < 0); -- 2.11.0