Unicode support.

This commit is contained in:
Tijani Lawal 2024-08-16 05:51:33 -05:00
parent 63a8a2bc8a
commit 63adbf56d5
3 changed files with 190 additions and 5 deletions

View File

@ -38,6 +38,12 @@ typedef double f64;
#define Million(n) ((n) * 1000000) #define Million(n) ((n) * 1000000)
#define Billion(n) ((n) * 1000000000) #define Billion(n) ((n) * 1000000000)
// Constants
// Largest value of each unsigned integer width (every bit set).
global u8  maxu8  = 0xFF;
global u16 maxu16 = 0xFFFF;
global u32 maxu32 = 0xFFFFFFFF;
global u64 maxu64 = 0xFFFFFFFFFFFFFFFFULL;
// C++ linking shenanigans // C++ linking shenanigans
#if LANG_CPP #if LANG_CPP
#define C_LINK_BEGIN extern "C" { #define C_LINK_BEGIN extern "C" {

View File

@ -1,5 +1,151 @@
// String Constructor String8 str8(u8 *str, u64 size) {
internal String8 str8(u8 *str, u64 size) {
String8 result = {str, size}; String8 result = {str, size};
return (result); return (result);
} }
// Unicode
// UTF-8 Decoding/Encoding
// NOTE(tijani): Lookup table
// clang-format off
read_only global U8 utf8_class[32] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 5,
};
// clang-format on
// Encodes `codepoint` as UTF-8 into `str`.
//
// str:       destination; up to 4 bytes are written, so the caller must
//            provide at least that much room.
// codepoint: value to encode.
// returns:   number of bytes written (1..4).
//
// Codepoints above 0x10FFFF cannot be represented and are written as a
// single '?'.
// NOTE(review): surrogate values (0xD800..0xDFFF) are not rejected; they take
// the ordinary 3-byte path.
// NOTE(review): assumes bitmaskN masks the low N bits and bit8 == 0x80 (both
// are defined outside this view) - confirm.
internal u32 utf8_encode(u8 *str, u32 codepoint) {
  u32 inc = 0;
  if (codepoint <= 0x7F) {
    // 0xxxxxxx
    str[0] = (u8)codepoint;
    inc = 1;
  } else if (codepoint <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    str[0] = (bitmask2 << 6) | ((codepoint >> 6) & bitmask5);
    str[1] = bit8 | (codepoint & bitmask6);
    inc = 2;
  } else if (codepoint <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    str[0] = (bitmask3 << 5) | ((codepoint >> 12) & bitmask4);
    str[1] = bit8 | ((codepoint >> 6) & bitmask6);
    str[2] = bit8 | (codepoint & bitmask6);
    inc = 3;
  } else if (codepoint <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    str[0] = (bitmask4 << 4) | ((codepoint >> 18) & bitmask3);
    str[1] = bit8 | ((codepoint >> 12) & bitmask6);
    str[2] = bit8 | ((codepoint >> 6) & bitmask6);
    str[3] = bit8 | (codepoint & bitmask6);
    inc = 4;
  } else {
    // Outside the Unicode range: emit a replacement character.
    str[0] = '?';
    inc = 1;
  }
  return (inc);
}
// Decodes one codepoint from the start of a UTF-8 byte sequence.
//
// str:     first byte of the sequence.
// max:     number of bytes readable from `str`.
// returns: `codepoint` is the decoded value and `inc` the number of bytes
//          consumed. When the bytes do not form a complete sequence (bad lead
//          byte, bad continuation byte, or the sequence runs past `max`) the
//          result is {inc = 1, codepoint = maxu32}, so a caller advancing by
//          `inc` still makes progress.
//
// NOTE(review): overlong encodings and encoded surrogates are not rejected.
internal UnicodeDecode utf8_decode(u8 *str, u64 max) {
  UnicodeDecode result = {1, maxu32};
  if (max == 0) {
    // Nothing to read; do not touch str[0].
    return (result);
  }
  u8 byte = str[0];
  u8 byte_class = utf8_class[byte >> 3];
  switch (byte_class) {
    case 1: {
      result.codepoint = byte;
    } break;
    case 2: {
      // Needs 2 bytes in total.
      if (1 < max) {
        u8 cont_byte = str[1];
        if (utf8_class[cont_byte >> 3] == 0) {
          result.codepoint = (byte & bitmask5) << 6;
          result.codepoint |= (cont_byte & bitmask6);
          result.inc = 2;
        }
      }
    } break;
    case 3: {
      // Needs 3 bytes in total.
      if (2 < max) {
        u8 cont_byte[2] = {str[1], str[2]};
        if (utf8_class[cont_byte[0] >> 3] == 0 && utf8_class[cont_byte[1] >> 3] == 0) {
          result.codepoint = (byte & bitmask4) << 12;
          result.codepoint |= ((cont_byte[0] & bitmask6) << 6);
          result.codepoint |= (cont_byte[1] & bitmask6);
          result.inc = 3;
        }
      }
    } break;
    case 4: {
      // Needs 4 bytes in total.
      if (3 < max) {
        u8 cont_byte[3] = {str[1], str[2], str[3]};
        if (utf8_class[cont_byte[0] >> 3] == 0 && utf8_class[cont_byte[1] >> 3] == 0 &&
            utf8_class[cont_byte[2] >> 3] == 0) {
          result.codepoint = (byte & bitmask3) << 18;
          result.codepoint |= ((cont_byte[0] & bitmask6) << 12);
          result.codepoint |= ((cont_byte[1] & bitmask6) << 6);
          result.codepoint |= (cont_byte[2] & bitmask6);
          result.inc = 4;
        }
      }
    } break;
    default: {
      // Class 0 (stray continuation byte) or 5 (invalid lead byte): keep the
      // "invalid" result set up above.
    } break;
  }
  return (result);
}
// UTF-16 Decoding/Encoding
// Encodes `codepoint` as UTF-16 into `str`.
//
// str:       destination; up to 2 code units are written.
// codepoint: value to encode.
// returns:   number of u16 units written (1 or 2).
//
// Anything above 0x10FFFF is written as '?'. That range includes maxu32, the
// value the decoders report for undecodable input. Without this check such a
// value would produce a first unit outside the high-surrogate range.
internal u32 utf16_encode(u16 *str, u32 codepoint) {
  u32 inc = 1;
  if (codepoint > 0x10FFFF) {
    str[0] = (u16)'?';
  } else if (codepoint < 0x10000) {
    // Basic Multilingual Plane: one unit, stored as-is.
    str[0] = (u16)codepoint;
  } else {
    // Supplementary plane: split the 20-bit offset into a surrogate pair.
    u32 v = codepoint - 0x10000;
    str[0] = safe_cast_u16(0xD800 + (v >> 10));
    str[1] = safe_cast_u16(0xDC00 + (v & bitmask10));
    inc = 2;
  }
  return (inc);
}
// Decodes one codepoint from the start of a UTF-16 sequence.
//
// str:     first code unit of the sequence.
// max:     number of u16 units readable from `str`.
// returns: `codepoint` is the decoded value and `inc` the number of units
//          consumed (1 or 2). A high surrogate followed by a low surrogate is
//          combined into one supplementary-plane codepoint; any other unit,
//          including an unpaired surrogate, is returned unchanged with
//          inc = 1. With max == 0 nothing is read and the result is
//          {inc = 1, codepoint = maxu32}.
internal UnicodeDecode utf16_decode(u16 *str, u64 max) {
  UnicodeDecode result = {1, maxu32};
  if (max == 0) {
    return (result);
  }
  result.codepoint = str[0];
  result.inc = 1;
  if (max > 1 && 0xD800 <= str[0] && str[0] < 0xDC00 && 0xDC00 <= str[1] && str[1] < 0xE000) {
    u32 high = (u32)(str[0] - 0xD800);
    u32 low = (u32)(str[1] - 0xDC00);
    // The 0x10000 offset applies to the combined 20-bit value; adding it to
    // the low half alone would collide with bit 16 of the high half.
    result.codepoint = ((high << 10) | low) + 0x10000;
    result.inc = 2;
  }
  return (result);
}
// String Conversion
// Converts a UTF-16 string to a newly allocated UTF-8 string.
//
// arena:   allocator the result is pushed onto.
// input:   source string; `size` is counted in u16 units.
// returns: String8 over the converted bytes. A 0 byte is stored right after
//          the last converted byte and is not counted in `size`.
internal String8 str8_from_16(Arena *arena, String16 input) {
  // One u16 unit expands to at most 3 UTF-8 bytes (a surrogate pair is two
  // units for 4 bytes), so size * 3 is always enough; +1 for the terminator.
  u64 cap = input.size * 3;
  u8 *str = push_array_no_zero(arena, u8, cap + 1);
  u16 *ptr = input.str;
  u16 *opl = ptr + input.size;  // one past the last unit
  u64 size = 0;
  UnicodeDecode consume;
  for (; ptr < opl; ptr += consume.inc) {
    consume = utf16_decode(ptr, (u64)(opl - ptr));
    size += utf8_encode(str + size, consume.codepoint);
  }
  str[size] = 0;
  // NOTE(review): presumably hands the unused tail of the reservation back to
  // the arena - confirm against arena_pop_off.
  arena_pop_off(arena, (cap - size));
  return (str8(str, size));
}
// TODO: UTF-8 -> UTF-16 conversion is not implemented yet.
// Until it is, this returns an empty string (null pointer, size 0) instead of
// falling off the end of a non-void function.
// NOTE(review): the name suggests the result should be a String16; the
// String8 return type is kept here to match the existing declaration -
// confirm the intended signature before implementing.
internal String8 str16_from_8(Arena *arena, String8 input) {
  (void)arena;
  (void)input;
  String8 result = {0};
  return (result);
}
// String formatting & copying
// Copies a string onto `arena`.
//
// arena:          allocator the copy is pushed onto.
// string_to_copy: source bytes; not modified.
// returns:        String8 with the same size over the new storage. One extra
//                 byte is allocated and set to 0 after the copied bytes; it
//                 is not counted in `size`.
internal String8 push_str8_copy(Arena *arena, String8 string_to_copy) {
  String8 l_string;
  l_string.size = string_to_copy.size;
  l_string.str = push_array_no_zero(arena, u8, l_string.size + 1);
  MemoryCopy(l_string.str, string_to_copy.str, string_to_copy.size);
  // The memory was pushed without zeroing, so terminate explicitly.
  l_string.str[l_string.size] = 0;
  return (l_string);
}
internal String8 push_str8fv(Arena *arena, char *fmt, va_list)

View File

@ -1,14 +1,13 @@
#ifndef BASE_STRINGS_H #ifndef BASE_STRINGS_H
#define BASE_STRINGS_H #define BASE_STRINGS_H
// String types, list, & Array types
// 8-bit
typedef struct String8 String8; typedef struct String8 String8;
struct String8 { struct String8 {
u8 *str; u8 *str;
u64 size; u64 size;
}; };
// String list & Array types
typedef struct String8Node String8Node; typedef struct String8Node String8Node;
struct String8Node { struct String8Node {
String8Node *next; String8Node *next;
@ -23,4 +22,38 @@ struct String8List {
u64 total_size; u64 total_size;
}; };
// Builds a String8 from a byte pointer and a byte count.
internal String8 str8(u8 *str, u64 size);
// 16-bit
// Pointer + length view over 16-bit code units.
typedef struct String16 {
  u16 *str;  // first code unit
  u64 size;  // length in u16 units
} String16;

// Builds a String16 from a pointer and a unit count.
internal String16 str16(u16 *str, u64 size);
// Unicode
// UTF Decoding types
// Result of decoding one codepoint.
// Field order matters: the decoders initialise this positionally as
// {inc, codepoint}.
typedef struct UnicodeDecode {
  u32 inc;        // code units consumed from the input
  u32 codepoint;  // decoded value
} UnicodeDecode;
// UTF-8 Decoding/Encoding
// Writes `codepoint` to `str` as 1-4 UTF-8 bytes and returns the byte count;
// values above 0x10FFFF are written as '?'.
internal u32 utf8_encode(u8 *str, u32 codepoint);
// Decodes one codepoint from `str`, where `max` is the number of bytes
// available. The result starts as {inc = 1, codepoint = maxu32} and keeps
// those values when no complete sequence is decoded.
internal UnicodeDecode utf8_decode(u8 *str, u64 max);
// UTF-16 Decoding/Encoding
// Writes `codepoint` to `str` as one u16 unit or a surrogate pair and returns
// the unit count; maxu32 is written as '?'.
internal u32 utf16_encode(u16 *str, u32 codepoint);
// Decodes one codepoint from `str`, where `max` is the number of u16 units
// available. A surrogate pair consumes two units (inc = 2); otherwise str[0]
// is returned with inc = 1.
internal UnicodeDecode utf16_decode(u16 *str, u64 max);
// String Conversion
// Converts UTF-16 `input` to UTF-8 storage pushed onto `arena`; a 0 byte is
// written after the converted bytes.
internal String8 str8_from_16(Arena *arena, String16 input);
// NOTE(review): declared to return String8 although the name suggests a
// String16 result - confirm the intended signature.
internal String8 str16_from_8(Arena *arena, String8 input);
// String formatting & copying
// Copies `string_to_copy` into size + 1 bytes pushed onto `arena`.
internal String8 push_str8_copy(Arena *arena, String8 string_to_copy);
#endif // BASE_STRINGS_H #endif // BASE_STRINGS_H