| Deutsch English Français Italiano |
|
<vavsr7$14n11$1@raubtier-asyl.eternal-september.org> View for Bookmarking (what is this?) Look up another Usenet article |
Path: news.eternal-september.org!eternal-september.org!news.eternal-september.org!raubtier-asyl.eternal-september.org!.POSTED!not-for-mail
From: Bonita Montero <Bonita.Montero@gmail.com>
Newsgroups: comp.lang.c++
Subject: UTF16 <-> UTF32
Date: Sat, 31 Aug 2024 22:01:43 +0200
Organization: A noiseless patient Spider
Lines: 98
Message-ID: <vavsr7$14n11$1@raubtier-asyl.eternal-september.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 7bit
Injection-Date: Sat, 31 Aug 2024 22:01:43 +0200 (CEST)
Injection-Info: raubtier-asyl.eternal-september.org; posting-host="f3ac3daf96e4190349ad23da2cc878b7";
logging-data="1203233"; mail-complaints-to="abuse@eternal-september.org"; posting-account="U2FsdGVkX1/hX0msLSj+AphfevzKJq9EU2QBSuEKRZ4="
User-Agent: Mozilla Thunderbird
Cancel-Lock: sha1:EgpdMNRGlNldQxnwN0RMJGm4FH8=
Content-Language: de-DE
Today I needed conversion functions from UTF-32 to UTF-16 and in the
opposite direction. I wanted to allow the result string to be re-used,
so I decided to pass it as a reference parameter. This lets the result
string keep its capacity across calls.
I don't think there is a way to implement this code any faster.
bool u16ToU32( u16string_view str, u32string &u32Str )
{
auto iterate = [&]<bool Err>( bool_constant<Err>, auto fn ) -> bool
{
constexpr char16_t
SURR_HDR_MSK = 0xF800,
HIGH_SURR = 0xD800,
SURR_HDR = HIGH_SURR,
LOW_SURR = 0xDC00,
SURR_MASK = 0xFC00;
for( auto it = str.begin(), end = str.end(); it != end; )
if( (*it & SURR_HDR_MSK) != SURR_HDR ) [[likely]]
fn( (char32_t)*it++ );
else
{
if( Err && (*it & SURR_MASK) != HIGH_SURR ) [[unlikely]]
return false;
if( Err && it + 1 == end ) [[unlikely]]
return false;
if( Err && (it[1] & SURR_MASK) != LOW_SURR ) [[unlikely]]
return false;
fn( 0x10000 + ((char32_t)(*it & ~SURR_MASK) << 10 | (char32_t)(it[1]
& ~SURR_MASK)) );
it += 2;
}
return true;
};
size_t n = 0;
if( !iterate( true_type(), [&]( char32_t ) { ++n; } ) )
return false;
u32Str.resize_and_overwrite( n, [&]( char32_t *p, size_t n )
{
auto it = span( p, n ).begin();
iterate( false_type(), [&]( char32_t c ) { *it++ = c; } );
return n;
} );
return true;
}
pair<bool, u32string> u16ToU32( u16string_view str )
{
u32string u32Str;
if( !u16ToU32( str, u32Str ) ) [[unlikely]]
return { false, {} };
return { true, move( u32Str ) };
}
// Converts the UTF-32 text in `str` to UTF-16 and stores it in `u16Str`.
//
// The input is walked twice: the first walk validates every code point and
// counts the UTF-16 code units needed, the second walk writes them into
// `u16Str` after it has been sized with resize_and_overwrite().
//
// Returns true on success. Returns false if `str` contains a value above
// U+10FFFF or any surrogate code point (U+D800..U+DFFF); `u16Str` is not
// touched in that case.
bool u32ToU16( u32string_view str, u16string &u16Str )
{
	// Feeds the UTF-16 code units for every code point of `str` to `fn`.
	// With Err == true invalid code points end the walk with false; with
	// Err == false the input is trusted to have been validated already.
	auto iterate = [&]<bool Err>( bool_constant<Err>, auto fn ) -> bool
	{
		constexpr char32_t
			UNICODE_MAX = 0x10FFFF,
			BMP_MAX = 0xFFFF,
			SUPPLEMENTARY_BASE = 0x10000;
		constexpr char16_t
			HIGH_SURR = 0xD800,
			LOW_SURR = 0xDC00,
			END_SURR = 0xDFFF;
		for( char32_t c : str )
		{
			if constexpr( Err )
			{
				// The whole surrogate block, high surrogates included, is
				// reserved for UTF-16 and must not appear as a UTF-32 code
				// point; letting it through would emit a lone surrogate.
				if( c > UNICODE_MAX || (c >= HIGH_SURR && c <= END_SURR) ) [[unlikely]]
					return false;
			}
			if( c <= BMP_MAX ) [[likely]]
				fn( static_cast<char16_t>( c ) );
			else
			{
				// split the 20 bits above the BMP into two 10-bit halves
				c -= SUPPLEMENTARY_BASE;
				fn( static_cast<char16_t>( HIGH_SURR | (c >> 10) ) );
				fn( static_cast<char16_t>( LOW_SURR | (c & 0x3FF) ) );
			}
		}
		return true;
	};
	// pass 1: validate and determine the exact output length
	size_t n = 0;
	if( !iterate( true_type(), [&]( char16_t ) { ++n; } ) ) [[unlikely]]
		return false;
	// pass 2: write straight into the string's buffer, no validation needed
	u16Str.resize_and_overwrite( n, [&]( char16_t *p, size_t n )
		{
			auto it = span( p, n ).begin();
			iterate( false_type(), [&]( char16_t c ) { *it++ = c; } );
			return n;
		} );
	return true;
}
pair<bool, u16string> u32ToU16( u32string_view str )
{
u16string u16Str;
if( !u32ToU16( str, u16Str ) ) [[unlikely]]
return { false, {} };
return { true, move( u16Str ) };
}