diff --git a/src/base/base_core.h b/src/base/base_core.h
index 4c2973d..5bf9dca 100644
--- a/src/base/base_core.h
+++ b/src/base/base_core.h
@@ -38,6 +38,12 @@ typedef double f64;
 #define Million(n) ((n) * 1000000)
 #define Billion(n) ((n) * 1000000000)
 
+// Constants
+global u8 maxu8 = 0xff;
+global u16 maxu16 = 0xffff;
+global u32 maxu32 = 0xffffffff;
+global u64 maxu64 = 0xffffffffffffffffull;
+
 // C++ linking shenanigans
 #if LANG_CPP
 #define C_LINK_BEGIN extern "C" {
diff --git a/src/base/base_strings.c b/src/base/base_strings.c
index 45253b6..fbe507b 100644
--- a/src/base/base_strings.c
+++ b/src/base/base_strings.c
@@ -1,5 +1,221 @@
-// String Constructor
 internal String8 str8(u8 *str, u64 size) {
   String8 result = {str, size};
   return (result);
 }
+
+// Builds a String16 view over `size` UTF-16 code units starting at `str`.
+internal String16 str16(u16 *str, u64 size) {
+  String16 result = {str, size};
+  return (result);
+}
+
+// Unicode
+// UTF-8 Decoding/Encoding
+
+// NOTE(tijani): Lookup table
+// Indexed by the top five bits of a byte (byte >> 3):
+//   1 = single byte (ASCII), 0 = continuation byte,
+//   2/3/4 = lead byte of a 2/3/4-byte sequence, 5 = invalid lead byte.
+
+// clang-format off
+read_only global u8 utf8_class[32] = {
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 5,
+};
+// clang-format on
+
+// Encodes `codepoint` as UTF-8 into `str` and returns the number of bytes
+// written (1-4). Values above 0x10FFFF are written as a single '?'.
+// `str` must have room for 4 bytes.
+// NOTE(review): assumes bitmaskN masks the low N bits and bit8 is 0x80; both
+// are defined outside this patch - confirm.
+internal u32 utf8_encode(u8 *str, u32 codepoint) {
+  u32 inc = 0;
+  if (codepoint <= 0x7F) {
+    str[0] = (u8)codepoint;
+    inc = 1;
+  } else if (codepoint <= 0x7FF) {
+    str[0] = (bitmask2 << 6) | ((codepoint >> 6) & bitmask5);
+    str[1] = bit8 | (codepoint & bitmask6);
+    inc = 2;
+  } else if (codepoint <= 0xFFFF) {
+    str[0] = (bitmask3 << 5) | ((codepoint >> 12) & bitmask4);
+    str[1] = bit8 | ((codepoint >> 6) & bitmask6);
+    str[2] = bit8 | (codepoint & bitmask6);
+    inc = 3;
+  } else if (codepoint <= 0x10FFFF) {
+    str[0] = (bitmask4 << 4) | ((codepoint >> 18) & bitmask3);
+    str[1] = bit8 | ((codepoint >> 12) & bitmask6);
+    str[2] = bit8 | ((codepoint >> 6) & bitmask6);
+    str[3] = bit8 | (codepoint & bitmask6);
+    inc = 4;
+  } else {
+    str[0] = '?';
+    inc = 1;
+  }
+  return (inc);
+}
+
+// Decodes one UTF-8 sequence starting at `str`, reading at most `max` bytes.
+// Returns the code point and the number of bytes consumed. A malformed or
+// truncated sequence yields {inc = 1, codepoint = maxu32}, so the caller can
+// skip one byte and keep going.
+// `str` must point to at least one readable byte (max >= 1).
+internal UnicodeDecode utf8_decode(u8 *str, u64 max) {
+  UnicodeDecode result = {1, maxu32};
+  u8 byte = str[0];
+  u8 byte_class = utf8_class[byte >> 3];
+
+  switch (byte_class) {
+    case 1: {
+      result.codepoint = byte;
+    } break;
+
+    case 2: {
+      // Needs 2 bytes in total.
+      if (1 < max) {
+        u8 cont_byte = str[1];
+        if (utf8_class[cont_byte >> 3] == 0) {
+          result.codepoint = (byte & bitmask5) << 6;
+          result.codepoint |= (cont_byte & bitmask6);
+          result.inc = 2;
+        }
+      }
+    } break;
+
+    case 3: {
+      // Needs 3 bytes in total.
+      if (2 < max) {
+        u8 cont_byte[2] = {str[1], str[2]};
+        if (utf8_class[cont_byte[0] >> 3] == 0 && utf8_class[cont_byte[1] >> 3] == 0) {
+          result.codepoint = (byte & bitmask4) << 12;
+          result.codepoint |= ((cont_byte[0] & bitmask6) << 6);
+          result.codepoint |= (cont_byte[1] & bitmask6);
+          result.inc = 3;
+        }
+      }
+    } break;
+
+    case 4: {
+      // Needs 4 bytes in total.
+      if (3 < max) {
+        u8 cont_byte[3] = {str[1], str[2], str[3]};
+        if (utf8_class[cont_byte[0] >> 3] == 0 && utf8_class[cont_byte[1] >> 3] == 0 &&
+            utf8_class[cont_byte[2] >> 3] == 0) {
+          result.codepoint = (byte & bitmask3) << 18;
+          result.codepoint |= ((cont_byte[0] & bitmask6) << 12);
+          result.codepoint |= ((cont_byte[1] & bitmask6) << 6);
+          result.codepoint |= (cont_byte[2] & bitmask6);
+          result.inc = 4;
+        }
+      }
+    } break;
+
+    default: {
+      // Continuation byte (class 0) or invalid lead byte (class 5) in lead
+      // position: keep the error result.
+    } break;
+  }
+  return (result);
+}
+
+// UTF-16 Decoding/Encoding
+
+// Encodes `codepoint` as UTF-16 into `str` and returns the number of u16
+// units written (1 or 2). Values above 0x10FFFF, which includes the maxu32
+// decode-error marker, are written as '?'. `str` must have room for 2 units.
+internal u32 utf16_encode(u16 *str, u32 codepoint) {
+  u32 inc = 1;
+  if (codepoint > 0x10FFFF) {
+    str[0] = (u16)'?';
+  } else if (codepoint < 0x10000) {
+    str[0] = (u16)codepoint;
+  } else {
+    u32 v = codepoint - 0x10000;
+    str[0] = safe_cast_u16(0xD800 + (v >> 10));
+    str[1] = safe_cast_u16(0xDC00 + (v & bitmask10));
+    inc = 2;
+  }
+  return (inc);
+}
+
+// Decodes one UTF-16 code point starting at `str`, reading at most `max`
+// units. A valid high/low surrogate pair consumes 2 units; anything else,
+// including an unpaired surrogate, is returned as-is with inc = 1.
+// `str` must point to at least one readable unit (max >= 1).
+internal UnicodeDecode utf16_decode(u16 *str, u64 max) {
+  UnicodeDecode result = {1, maxu32};
+  result.codepoint = str[0];
+  result.inc = 1;
+  if (max > 1 && 0xD800 <= str[0] && str[0] < 0xDC00 && 0xDC00 <= str[1] && str[1] < 0xE000) {
+    // Combine the two 10-bit halves first, then add the 0x10000 offset.
+    u32 high = (u32)(str[0] - 0xD800);
+    u32 low = (u32)(str[1] - 0xDC00);
+    result.codepoint = ((high << 10) | low) + 0x10000;
+    result.inc = 2;
+  }
+  return (result);
+}
+
+// String Conversion
+
+// Converts UTF-16 `input` to UTF-8 in memory pushed onto `arena`. The result
+// is followed by a 0 byte that is not counted in `size`.
+internal String8 str8_from_16(Arena *arena, String16 input) {
+  // Worst case is 3 UTF-8 bytes per UTF-16 unit.
+  u64 cap = input.size * 3;
+  u8 *str = push_array_no_zero(arena, u8, cap + 1);
+  u16 *ptr = input.str;
+  u16 *opl = ptr + input.size;
+  u64 size = 0;
+  UnicodeDecode consume;
+
+  for (; ptr < opl; ptr += consume.inc) {
+    consume = utf16_decode(ptr, (u64)(opl - ptr));
+    size += utf8_encode(str + size, consume.codepoint);
+  }
+  str[size] = 0;
+  // NOTE(review): presumably hands the unused tail back to the arena; confirm
+  // that arena_pop_off takes a byte count.
+  arena_pop_off(arena, (cap - size));
+  return (str8(str, size));
+}
+
+// Converts UTF-8 `input` to UTF-16 in memory pushed onto `arena`. The result
+// is followed by a 0 unit that is not counted in `size`. Malformed bytes
+// become '?'.
+internal String16 str16_from_8(Arena *arena, String8 input) {
+  // UTF-16 never needs more units than the UTF-8 form has bytes.
+  u64 cap = input.size;
+  u16 *str = push_array_no_zero(arena, u16, cap + 1);
+  u8 *ptr = input.str;
+  u8 *opl = ptr + input.size;
+  u64 size = 0;
+  UnicodeDecode consume;
+
+  for (; ptr < opl; ptr += consume.inc) {
+    consume = utf8_decode(ptr, (u64)(opl - ptr));
+    size += utf16_encode(str + size, consume.codepoint);
+  }
+  str[size] = 0;
+  // NOTE(review): scaled to bytes on the assumption that arena_pop_off takes
+  // a byte count, as its use in str8_from_16 suggests; confirm.
+  arena_pop_off(arena, (cap - size) * sizeof(u16));
+  return (str16(str, size));
+}
+
+// String formatting & copying
+
+// Copies `string_to_copy` into memory pushed onto `arena` and appends a 0
+// byte that is not counted in `size`.
+internal String8 push_str8_copy(Arena *arena, String8 string_to_copy) {
+  String8 l_string;
+  l_string.size = string_to_copy.size;
+  l_string.str = push_array_no_zero(arena, u8, l_string.size + 1);
+  MemoryCopy(l_string.str, string_to_copy.str, string_to_copy.size);
+  l_string.str[l_string.size] = 0;
+  return (l_string);
+}
+
+// TODO: not implemented yet; declaration only.
+internal String8 push_str8fv(Arena *arena, char *fmt, va_list args);
diff --git a/src/base/base_strings.h b/src/base/base_strings.h
index 65e7991..98af209 100644
--- a/src/base/base_strings.h
+++ b/src/base/base_strings.h
@@ -1,14 +1,13 @@
 #ifndef BASE_STRINGS_H
 #define BASE_STRINGS_H
 
+// String types, list, & Array types
+// 8-bit
 typedef struct String8 String8;
 struct String8 {
   u8 *str;
   u64 size;
 };
-
-// String list & Array types
-
 typedef struct String8Node String8Node;
 struct String8Node {
   String8Node *next;
@@ -23,4 +22,38 @@ struct String8List {
   u64 total_size;
 };
 
+internal String8 str8(u8 *str, u64 size);
+
+// 16-bit
+typedef struct String16 String16;
+struct String16 {
+  u16 *str;
+  u64 size;
+};
+
+internal String16 str16(u16 *str, u64 size);
+
+// Unicode
+// UTF Decoding types
+typedef struct UnicodeDecode UnicodeDecode;
+struct UnicodeDecode {
+  u32 inc;
+  u32 codepoint;
+};
+
+// UTF-8 Decoding/Encoding
+internal u32 utf8_encode(u8 *str, u32 codepoint);
+internal UnicodeDecode utf8_decode(u8 *str, u64 max);
+
+// UTF-16 Decoding/Encoding
+internal u32 utf16_encode(u16 *str, u32 codepoint);
+internal UnicodeDecode utf16_decode(u16 *str, u64 max);
+
+// String Conversion
+internal String8 str8_from_16(Arena *arena, String16 input);
+internal String16 str16_from_8(Arena *arena, String8 input);
+
+// String formatting & copying
+internal String8 push_str8_copy(Arena *arena, String8 string_to_copy);
+
 #endif // BASE_STRINGS_H