+/* When we pass a Unicode character to iconv(), we must pass it in a
+ suitable encoding. The standardized Unicode encodings are
+ UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
+ UCS-2 supports only characters up to \U0000FFFF.
+ UTF-16 and variants support only characters up to \U0010FFFF.
+ UTF-7 is way too complex and not supported by glibc-2.1.
+ UCS-4 specification leaves doubts about endianness and byte order
+ mark. glibc currently interprets it as big endian without byte order
+ mark, but this is not backed by an RFC.
+ So we use UTF-8. It supports characters up to \U7FFFFFFF and is
+ unambiguously defined. */
+
+/* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
+ Returns the number of bytes stored, or -1 if wc is out of range. */
+static int
+utf8_wctomb (unsigned char *r, unsigned int wc)
+{
+ int count;
+
+ if (wc < 0x80)
+ count = 1;
+ else if (wc < 0x800)
+ count = 2;
+ else if (wc < 0x10000)
+ count = 3;
+ else if (wc < 0x200000)
+ count = 4;
+ else if (wc < 0x4000000)
+ count = 5;
+ else if (wc <= 0x7fffffff)
+ count = 6;
+ else
+ return -1;
+
+ switch (count)
+ {
+ /* Note: code falls through cases! */
+ case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
+ case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
+ case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
+ case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
+ case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
+ case 1: r[0] = wc;
+ }
+
+ return count;
+}
+
+/* Luckily, the encoding's name is platform independent. */
+#define UTF8_NAME "UTF-8"