Unicode support.

2024-08-16 05:51:33 -05:00 · 2024-08-16 05:51:33 -05:00 · 63adbf56d5
commit 63adbf56d5
parent 63a8a2bc8a
3 changed files with 190 additions and 5 deletions
--- a/src/base/base_core.h
+++ b/src/base/base_core.h
@@ -38,6 +38,12 @@ typedef double f64;
 // Scale helpers: Million(n) expands to n * 10^6 and Billion(n) to n * 10^9.
 // NOTE(review): the literals are plain int constants, so with a 32-bit int
 // argument Billion(n) overflows for n >= 3 — confirm callers pass a 64-bit n.
 #define Million(n)  ((n) * 1000000)
 #define Billion(n)  ((n) * 1000000000)
 // Constants
 // All-ones bit patterns for each unsigned width (presumably the maximum value of
 // u8/u16/u32/u64; the typedefs are outside this view).
 global u8  maxu8  = 0xff;
 global u16 maxu16 = 0xffff;
 global u32 maxu32 = 0xffffffff;
 global u64 maxu64 = 0xffffffffffffffffull;
 // C++ linking shenanigans
 #if LANG_CPP
 #define C_LINK_BEGIN extern "C" {
--- a/src/base/base_strings.c
+++ b/src/base/base_strings.c
@@ -1,5 +1,151 @@
-// String Constructor
+String8 str8(u8 *str, u64 size) {
 // Wraps a byte pointer and a byte count in a String8. The bytes are not copied.
 internal String8 str8(u8 *str, u64 size) {
  String8 view;
  view.str  = str;
  view.size = size;
  return view;
 }
 // Unicode
 // UTF-8 Decoding/Encoding
 // NOTE(tijani): Lookup table
 // clang-format off
 read_only global U8 utf8_class[32] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 5,
 };
 // clang-format on
 // Encodes `codepoint` as UTF-8 into `str` and returns the number of bytes
 // written (1 to 4). `str` must have room for up to 4 bytes.
 // Codepoints above 0x10FFFF are not encodable; a single '?' is written instead.
 // NOTE(review): assumes bitmaskN masks the N low bits and bit8 is 0x80 (both are
 // defined outside this file view) — confirm.
 internal u32 utf8_encode(u8 *str, u32 codepoint) {
  u32 inc = 0;
  if (codepoint <= 0x7F) {
    str[0] = (u8)codepoint;
    inc    = 1;
  } else if (codepoint <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    str[0] = (u8)((bitmask2 << 6) | ((codepoint >> 6) & bitmask5));
    str[1] = (u8)(bit8 | (codepoint & bitmask6));
    inc    = 2;
  } else if (codepoint <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    str[0] = (u8)((bitmask3 << 5) | ((codepoint >> 12) & bitmask4));
    str[1] = (u8)(bit8 | ((codepoint >> 6) & bitmask6));
    str[2] = (u8)(bit8 | (codepoint & bitmask6));
    inc    = 3;
  } else if (codepoint <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    str[0] = (u8)((bitmask4 << 4) | ((codepoint >> 18) & bitmask3));
    str[1] = (u8)(bit8 | ((codepoint >> 12) & bitmask6));
    str[2] = (u8)(bit8 | ((codepoint >> 6) & bitmask6));
    str[3] = (u8)(bit8 | (codepoint & bitmask6));
    inc    = 4;
  } else {
    str[0] = '?';
    inc    = 1;
  }
  return (inc);
 }
 // Decodes one codepoint from the UTF-8 bytes at `str`, reading at most `max` bytes.
 // `max` must be at least 1 because str[0] is always read.
 // Returns the decoded codepoint and the number of bytes consumed. For a stray
 // continuation byte, an invalid lead byte, a truncated sequence, or a malformed
 // continuation byte the result stays {inc = 1, codepoint = maxu32}, so a caller
 // advancing by `inc` skips one byte and keeps going.
 internal UnicodeDecode utf8_decode(u8 *str, u64 max) {
  UnicodeDecode result     = {1, maxu32};
  u8            byte       = str[0];
  u8            byte_class = utf8_class[byte >> 3];
  switch (byte_class) {
  case 1: {
    result.codepoint = byte;
  } break;
  case 2: {
    // A 2-byte sequence reads str[1], so at least 2 bytes must be available.
    if (1 < max) {
      u8 cont_byte = str[1];
      if (utf8_class[cont_byte >> 3] == 0) {
        result.codepoint  = (byte & bitmask5) << 6;
        result.codepoint |= (cont_byte & bitmask6);
        result.inc        = 2;
      }
    }
  } break;
  case 3: {
    if (2 < max) {
      u8 cont_byte[2] = {str[1], str[2]};
      if (utf8_class[cont_byte[0] >> 3] == 0 && utf8_class[cont_byte[1] >> 3] == 0) {
        result.codepoint  = (byte & bitmask4) << 12;
        result.codepoint |= ((cont_byte[0] & bitmask6) << 6);
        result.codepoint |= (cont_byte[1] & bitmask6);
        result.inc        = 3;
      }
    }
  } break;
  case 4: {
    if (3 < max) {
      u8 cont_byte[3] = {str[1], str[2], str[3]};
      if (utf8_class[cont_byte[0] >> 3] == 0 && utf8_class[cont_byte[1] >> 3] == 0 &&
          utf8_class[cont_byte[2] >> 3] == 0) {
        result.codepoint  = (byte & bitmask3) << 18;
        result.codepoint |= ((cont_byte[0] & bitmask6) << 12);
        result.codepoint |= ((cont_byte[1] & bitmask6) << 6);
        result.codepoint |= (cont_byte[2] & bitmask6);
        result.inc        = 4;
      }
    }
  } break;
  default: {
    // Class 0 (continuation byte) and class 5 (invalid lead) keep the error result.
  } break;
  }
  return (result);
 }
 // UTF-16 Decoding/Encoding
 // Encodes `codepoint` as UTF-16 into `str` and returns the number of u16 units
 // written: 1 for values below 0x10000 (and for the maxu32 error marker, which is
 // written as '?'), otherwise 2 for a surrogate pair.
 internal u32 utf16_encode(u16 *str, u32 codepoint) {
  if (codepoint == maxu32) {
    str[0] = (u16)'?';
    return 1;
  }
  if (codepoint < 0x10000) {
    str[0] = (u16)codepoint;
    return 1;
  }
  // Split the 20-bit offset into a high and a low surrogate.
  u32 offset = codepoint - 0x10000;
  str[0]     = safe_cast_u16(0xD800 + (offset >> 10));
  str[1]     = safe_cast_u16(0xDC00 + (offset & bitmask10));
  return 2;
 }
 // Decodes one codepoint from the UTF-16 units at `str`, reading at most `max` units.
 // `max` must be at least 1 because str[0] is always read.
 // A high surrogate followed by a low surrogate is combined into one codepoint
 // (inc = 2); any other unit, including an unpaired surrogate, is returned as-is
 // with inc = 1.
 internal UnicodeDecode utf16_decode(u16 *str, u64 max) {
  UnicodeDecode result = {1, maxu32};
  result.codepoint     = str[0];
  result.inc           = 1;
  if (max > 1 && 0xD800 <= str[0] && str[0] < 0xDC00 && 0xDC00 <= str[1] && str[1] < 0xE000) {
    // Rebuild the 20-bit offset first, then add 0x10000. OR-ing 0x10000 into the
    // low half would collide with bit 16 of the shifted high half.
    u32 high         = (u32)(str[0] - 0xD800);
    u32 low          = (u32)(str[1] - 0xDC00);
    result.codepoint = ((high << 10) | low) + 0x10000;
    result.inc       = 2;
  }
  return (result);
 }
 // String Conversion
 // Converts the UTF-16 string `input` to UTF-8 in memory pushed onto `arena`.
 // The result is NUL-terminated; the terminator is not counted in `size`.
 internal String8 str8_from_16(Arena *arena, String16 input) {
  // Worst case is 3 UTF-8 bytes per UTF-16 unit (a surrogate pair is 2 units -> 4 bytes).
  u64           cap     = input.size * 3;
  u8           *str     = push_array_no_zero(arena, u8, cap + 1);
  u16          *ptr     = input.str;
  u16          *opl     = ptr + input.size; // one past the last unit
  u64           size    = 0;
  UnicodeDecode consume = {0};
  for (; ptr < opl; ptr += consume.inc) {
    consume  = utf16_decode(ptr, (u64)(opl - ptr));
    size    += utf8_encode(str + size, consume.codepoint);
  }
  str[size] = 0;
  // Give back the part of the worst-case allocation that was not used.
  arena_pop_off(arena, (cap - size));
  return (str8(str, size));
 }
 // NOTE(review): unimplemented stub. It returns an empty string so that using the
 // result is well-defined (the previous empty body returned no value at all).
 // The name suggests this should produce a String16; the String8 return type is
 // kept here only to match the existing declaration — confirm and fix both together.
 internal String8 str16_from_8(Arena *arena, String8 input) {
  (void)arena;
  (void)input;
  String8 result = {0};
  return (result);
 }
 // String formatting & copying
 // Copies `string_to_copy` into memory pushed onto `arena` and returns the copy.
 // One extra byte is allocated for a NUL terminator that is not counted in `size`.
 internal String8 push_str8_copy(Arena *arena, String8 string_to_copy) {
  String8 l_string;
  l_string.size = string_to_copy.size;
  l_string.str  = push_array_no_zero(arena, u8, l_string.size + 1);
  MemoryCopy(l_string.str, string_to_copy.str, string_to_copy.size);
  // Terminate at index `size` (the last allocated byte).
  l_string.str[l_string.size] = 0;
  return (l_string);
 }
 internal String8 push_str8fv(Arena *arena, char *fmt, va_list)
--- a/src/base/base_strings.h
+++ b/src/base/base_strings.h
@@ -1,14 +1,13 @@
 #ifndef BASE_STRINGS_H
 #define BASE_STRINGS_H
 // String types, list, & Array types
 // 8-bit
 // Pointer + length view over 8-bit string data; `size` is a byte count.
 typedef struct String8 {
  u8 *str;
  u64 size;
 } String8;
 // String list & Array types
 typedef struct String8Node String8Node;
 struct String8Node {
  String8Node *next;
@@ -23,4 +22,38 @@ struct String8List {
  u64          total_size;
 };
 // Builds a String8 from a byte pointer and a byte count.
 internal String8 str8(u8 *str, u64 size);
 // 16-bit
 // Pointer + length view over 16-bit string data; `size` counts u16 units.
 typedef struct String16 {
  u16 *str;
  u64  size;
 } String16;
 // Presumably builds a String16 from a pointer and a count of u16 units, mirroring
 // str8 — the definition is not visible here; confirm.
 internal String16 str16(u16 *str, u64 size);
 // Unicode
 // UTF Decoding types
 // Result of decoding one codepoint: `inc` is how many input units to advance by,
 // `codepoint` is the decoded value (the decoders initialise it to maxu32).
 // Field order matters: the decoders initialise this positionally as {inc, codepoint}.
 typedef struct UnicodeDecode {
  u32 inc;
  u32 codepoint;
 } UnicodeDecode;
 // UTF-8 Decoding/Encoding
 // utf8_encode: writes the UTF-8 encoding of `codepoint` to `str` and returns the
 //   number of bytes written (1-4); values above 0x10FFFF are written as '?'.
 // utf8_decode: decodes one codepoint from `str`, given `max` available bytes;
 //   the result starts as {inc = 1, codepoint = maxu32} and is only overwritten
 //   when a well-formed sequence is found.
 internal u32           utf8_encode(u8 *str, u32 codepoint);
 internal UnicodeDecode utf8_decode(u8 *str, u64 max);
 // UTF-16 Decoding/Encoding
 // utf16_encode: writes `codepoint` to `str` as 1 or 2 u16 units and returns that count.
 // utf16_decode: decodes one codepoint from `str`, given `max` available u16 units.
 internal u32           utf16_encode(u16 *str, u32 codepoint);
 internal UnicodeDecode utf16_decode(u16 *str, u64 max);
 // String Conversion
 // str8_from_16: converts UTF-16 `input` to a NUL-terminated UTF-8 string allocated
 //   from `arena`.
 // NOTE(review): str16_from_8 is declared to return String8; the name suggests
 //   String16 was intended — confirm before implementing.
 internal String8 str8_from_16(Arena *arena, String16 input);
 internal String8 str16_from_8(Arena *arena, String8 input);
 // String formatting & copying
 // push_str8_copy: copies `string_to_copy` into memory allocated from `arena`,
 //   with one extra byte reserved for a NUL terminator.
 internal String8 push_str8_copy(Arena *arena, String8 string_to_copy);
 #endif // BASE_STRINGS_H