Ok, go ahead.
-- Jeff J.
On 01/10/09 01:04 PM, Corinna Vinschen wrote:
> Ping?
>
>
> Corinna
>
> On Sep 28 12:06, Corinna Vinschen wrote:
>> Hi,
>>
>> in a long winding discussion on the cygwin-developers list, it turned
>> out that we should allow CESU-8 encoding of lone UTF-16 surrogate halves
>> for at least three reasons:
>>
>> - POSIX filenames potentially contain byte values representing a UTF-16
>> surrogate half.
>> - Windows allows lone surrogate halves in filenames, which can only
>> be losslessly encoded to a UTF-8 POSIX filename by allowing the
>> corresponding CESU-8/UCS-2 values.
>> - Usually they are allowed in other C libraries, too.
>>
>> The below patch fixes that in __utf8_mbtowc and __utf8_wctomb.
>>
>> The patch to __utf8_mbtowc is very easy. Just allow these values again.
>>
>> The patch to __utf8_wctomb is a bit more tricky.
>>
>> Right now, if the function gets a high surrogate as input, it stores the
>> state and returns one byte, the lead byte of the corresponding UTF-8
>> value *iff* the next input value is a low surrogate. If the next value
>> is not a low surrogate, the function fails. Otherwise, it computes and
>> sends the trailing three bytes of the UTF-8 value and resets the state.
>>
>> If you want to allow CESU-8 encoding of a lone high surrogate, that
>> won't work, since the first byte already returned turns out to be wrong,
>> if the next wchar_t value is not a low surrogate.
>> The patch fixes this problem by returning 0 bytes when the input wchar_t
>> is a high surrogate. This is possible, because the return value 0 has no
>> special meaning in the POSIX standard, and the calling string functions
>> (wcsnrtombs) correctly don't assume anything (like "end of string") from
>> the return value 0.
>>
>> Ok, so, if the next wchar_t value is a low surrogate, the function
>> computes the entire resulting UTF-8 value and returns all 4 bytes.
>> If the next wchar_t value is *not* a low surrogate, the function
>> computes the original value of the high surrogate and stores the
>> corresponding 3 byte UTF-8 sequence in the buffer. Then it proceeds
>> to generate the UTF-8 sequence for the current wchar_t, and eventually
>> it returns with a UTF-8 sequence for the high surrogate, plus the
>> sequence for the just incoming wchar_t.
>>
>> The easier case is if the function gets a low surrogate without having
>> stored a high surrogate in the mbstate. It simply returns the 3 CESU-8
>> bytes representing the surrogate value.
>>
>> Patch tested on Cygwin with the following test application:
>>
>> === SNIP ===
>> /* foo.c */
>> #include<stdio.h>
>> #include<wchar.h>
>> #include<string.h>
>>
>> void
>> print_c (const unsigned char *c)
>> {
>> int i;
>> for (i = 0; c[i]; ++i)
>> printf ("%02x ", c[i]);
>> puts ("");
>> }
>>
>> void
>> doit (const char *in)
>> {
>> wchar_t w[64];
>> char c1[64], c2[64];
>> int i;
>>
>> strcpy (c1, in);
>> mbstowcs (w, c1, 64);
>> wcstombs (c2, w, 64);
>> print_c (c1);
>> for (i = 0; w[i]; ++i)
>> printf ("%04x ", w[i]);
>> puts ("");
>> print_c (c2);
>> puts ("");
>> }
>>
>> int
>> main ()
>> {
>> doit (" \xf0\x90\x80\x81 ");
>> doit (" \xed\xa0\x8d ");
>> doit (" \xed\xa0\x8d");
>> doit (" \xed\xb0\x8d ");
>> doit (" \xed\xb0\x8d");
>> doit (" \xed\xa0\x8d\xed\xb0\x8d ");
>> doit (" \xed\xb0\x8d\xed\xa0\x8d ");
>> doit (" \xed\xa0\x8d \xed\xb0\x8d ");
>> doit (" \xed\xb0\x8d \xed\xa0\x8d ");
>> doit (" \xed\xa0\x8d \xed\xb0\x8d ");
>> doit (" \xed\xb0\x8d \xed\xa0\x8d ");
>> }
>> === SNAP ===
>>
>> The output is as expected:
>>
>> $ gcc -g -o foo foo.c
>> $ ./foo
>> 20 f0 90 80 81 20
>> 0020 d800 dc01 0020
>> 20 f0 90 80 81 20
>>
>> 20 ed a0 8d 20
>> 0020 d80d 0020
>> 20 ed a0 8d 20
>>
>> 20 ed a0 8d
>> 0020 d80d
>> 20 ed a0 8d
>>
>> 20 ed b0 8d 20
>> 0020 dc0d 0020
>> 20 ed b0 8d 20
>>
>> 20 ed b0 8d
>> 0020 dc0d
>> 20 ed b0 8d
>>
>> 20 ed a0 8d ed b0 8d 20 <== Represents valid surrogate pair
>> 0020 d80d dc0d 0020
>> 20 f0 93 90 8d 20 <== so that's to be expected
>>
>> 20 ed b0 8d ed a0 8d 20
>> 0020 dc0d d80d 0020
>> 20 ed b0 8d ed a0 8d 20
>>
>> 20 ed a0 8d 20 ed b0 8d 20
>> 0020 d80d 0020 dc0d 0020
>> 20 ed a0 8d 20 ed b0 8d 20
>>
>> 20 ed b0 8d 20 ed a0 8d 20
>> 0020 dc0d 0020 d80d 0020
>> 20 ed b0 8d 20 ed a0 8d 20
>>
>> 20 ed a0 8d 20 20 ed b0 8d 20
>> 0020 d80d 0020 0020 dc0d 0020
>> 20 ed a0 8d 20 20 ed b0 8d 20
>>
>> 20 ed b0 8d 20 20 ed a0 8d 20
>> 0020 dc0d 0020 0020 d80d 0020
>> 20 ed b0 8d 20 20 ed a0 8d 20
>>
>> Ok to check in?
>>
>>
>> Thanks,
>> Corinna
>>
>>
>> * libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
>> value encoding.
>> * libc/stdlib/wctomb_r.c (__utf8_wctomb): Allow CESU-8 surrogate
>> value decoding.
>>
>>
>> Index: libc/stdlib/mbtowc_r.c
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/stdlib/mbtowc_r.c,v
>> retrieving revision 1.16
>> diff -u -p -r1.16 mbtowc_r.c
>> --- libc/stdlib/mbtowc_r.c 27 Sep 2009 12:21:16 -0000 1.16
>> +++ libc/stdlib/mbtowc_r.c 28 Sep 2009 09:46:08 -0000
>> @@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, ch
>> tmp = (wchar_t)((state->__value.__wchb[0]& 0x0f)<< 12)
>> | (wchar_t)((state->__value.__wchb[1]& 0x3f)<< 6)
>> | (wchar_t)(ch& 0x3f);
>> - /* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */
>> - if (tmp>= 0xd800&& tmp<= 0xdfff)
>> - {
>> - r->_errno = EILSEQ;
>> - return -1;
>> - }
>> *pwc = tmp;
>> return i;
>> }
>> Index: libc/stdlib/wctomb_r.c
>> ===================================================================
>> RCS file: /cvs/src/src/newlib/libc/stdlib/wctomb_r.c,v
>> retrieving revision 1.15
>> diff -u -p -r1.15 wctomb_r.c
>> --- libc/stdlib/wctomb_r.c 27 Sep 2009 12:21:16 -0000 1.15
>> +++ libc/stdlib/wctomb_r.c 28 Sep 2009 09:46:08 -0000
>> @@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, cha
>> mbstate_t *state)
>> {
>> wint_t wchar = _wchar;
>> + int ret = 0;
>>
>> if (s == NULL)
>> return 0; /* UTF-8 encoding is not state-dependent */
>>
>> - if (state->__count == -4&& (wchar< 0xdc00 || wchar>= 0xdfff))
>> + if (sizeof (wchar_t) == 2&& state->__count == -4
>> +&& (wchar< 0xdc00 || wchar>= 0xdfff))
>> {
>> - /* At this point only the second half of a surrogate pair is valid. */
>> - r->_errno = EILSEQ;
>> - return -1;
>> + /* There's a leftover lone high surrogate. Write out the CESU-8 value
>> + of the surrogate and proceed to convert the given character. Note
>> + to return extra 3 bytes. */
>> + wchar_t tmp;
>> + tmp = (state->__value.__wchb[0]<< 16 | state->__value.__wchb[1]<< 8)
>> + - 0x10000>> 10 | 0xd80d;
>> + *s++ = 0xe0 | ((tmp& 0xf000)>> 12);
>> + *s++ = 0x80 | ((tmp& 0xfc0)>> 6);
>> + *s++ = 0x80 | (tmp& 0x3f);
>> + state->__count = 0;
>> + ret = 3;
>> }
>> if (wchar<= 0x7f)
>> {
>> *s = wchar;
>> - return 1;
>> + return ret + 1;
>> }
>> if (wchar>= 0x80&& wchar<= 0x7ff)
>> {
>> *s++ = 0xc0 | ((wchar& 0x7c0)>> 6);
>> *s = 0x80 | (wchar& 0x3f);
>> - return 2;
>> + return ret + 2;
>> }
>> if (wchar>= 0x800&& wchar<= 0xffff)
>> {
>> - if (wchar>= 0xd800&& wchar<= 0xdfff)
>> + /* No UTF-16 surrogate handling in UCS-4 */
>> + if (sizeof (wchar_t) == 2&& wchar>= 0xd800&& wchar<= 0xdfff)
>> {
>> wint_t tmp;
>> - /* UTF-16 surrogates -- must not occur in normal UCS-4 data */
>> - if (sizeof (wchar_t) != 2)
>> + if (wchar<= 0xdbff)
>> {
>> - r->_errno = EILSEQ;
>> - return -1;
>> + /* First half of a surrogate pair. Store the state and
>> + return ret + 0. */
>> + tmp = ((wchar& 0x3ff)<< 10) + 0x10000;
>> + state->__value.__wchb[0] = (tmp>> 16)& 0xff;
>> + state->__value.__wchb[1] = (tmp>> 8)& 0xff;
>> + state->__count = -4;
>> + *s = (0xf0 | ((tmp& 0x1c0000)>> 18));
>> + return ret;
>> }
>> - if (wchar>= 0xdc00)
>> + if (state->__count == -4)
>> {
>> - /* Second half of a surrogate pair. It's not valid if
>> - we don't have already read a first half of a surrogate
>> - before. */
>> - if (state->__count != -4)
>> - {
>> - r->_errno = EILSEQ;
>> - return -1;
>> - }
>> - /* If it's valid, reconstruct the full Unicode value and
>> - return the trailing three bytes of the UTF-8 char. */
>> + /* Second half of a surrogate pair. Reconstruct the full
>> + Unicode value and return the trailing three bytes of the
>> + UTF-8 character. */
>> tmp = (state->__value.__wchb[0]<< 16)
>> | (state->__value.__wchb[1]<< 8)
>> | (wchar& 0x3ff);
>> state->__count = 0;
>> + *s++ = 0xf0 | ((tmp& 0x1c0000)>> 18);
>> *s++ = 0x80 | ((tmp& 0x3f000)>> 12);
>> *s++ = 0x80 | ((tmp& 0xfc0)>> 6);
>> *s = 0x80 | (tmp& 0x3f);
>> - return 3;
>> + return 4;
>> }
>> - /* First half of a surrogate pair. Store the state and return
>> - the first byte of the UTF-8 char. */
>> - tmp = ((wchar& 0x3ff)<< 10) + 0x10000;
>> - state->__value.__wchb[0] = (tmp>> 16)& 0xff;
>> - state->__value.__wchb[1] = (tmp>> 8)& 0xff;
>> - state->__count = -4;
>> - *s = (0xf0 | ((tmp& 0x1c0000)>> 18));
>> - return 1;
>> + /* Otherwise translate into CESU-8 value. */
>> }
>> *s++ = 0xe0 | ((wchar& 0xf000)>> 12);
>> *s++ = 0x80 | ((wchar& 0xfc0)>> 6);
>> *s = 0x80 | (wchar& 0x3f);
>> - return 3;
>> + return ret + 3;
>> }
>> if (wchar>= 0x10000&& wchar<= 0x10ffff)
>> {
>>
>>
>> --
>> Corinna Vinschen
>> Cygwin Project Co-Leader
>> Red Hat
>