2019-01-13 20:15:32 -06:00
|
|
|
#include <stdint.h>
|
|
|
|
#include <stddef.h>
|
2022-03-20 15:59:17 -05:00
|
|
|
#include <string.h>
|
2019-01-13 20:15:32 -06:00
|
|
|
#include "unicode.h"
|
|
|
|
|
2022-03-20 15:59:17 -05:00
|
|
|
/**
 * Measure the trailing UTF-8 sequence of a NUL-terminated string.
 *
 * Scans backwards from the end of str, counting bytes, and stops at the
 * first byte that is not a continuation byte (top two bits not equal to 10).
 *
 * Returns the number of bytes from that byte to the end of the string, or 0
 * when the string is empty or consists solely of continuation bytes.
 */
int utf8_last_size(const char *str) {
	size_t total = strlen(str);

	for (size_t back = 1; back <= total; ++back) {
		unsigned char byte = (unsigned char)str[total - back];
		/* Anything other than 10xxxxxx begins (or is) a character. */
		if ((byte & 0xc0) != 0x80) {
			return (int)back;
		}
	}

	return 0;
}
|
|
|
|
|
2019-01-13 20:15:32 -06:00
|
|
|
/**
 * Number of bytes used to encode the code point ch as UTF-8.
 *
 * Returns 1 for values below 0x80, 2 below 0x800, 3 below 0x10000 and
 * 4 for everything else (no upper bound is checked).
 */
size_t utf8_chsize(uint32_t ch) {
	/* Smallest code point that needs one more byte than the previous tier. */
	static const uint32_t thresholds[] = { 0x80, 0x800, 0x10000 };
	const size_t tiers = sizeof(thresholds) / sizeof(thresholds[0]);

	size_t octets = 1;
	for (size_t t = 0; t < tiers && ch >= thresholds[t]; ++t) {
		++octets;
	}
	return octets;
}
|
|
|
|
|
|
|
|
/**
 * Encode the code point ch as UTF-8 into str.
 *
 * Writes between 1 and 4 bytes starting at str[0]; no NUL terminator is
 * appended and the code point is not range-checked. The caller must supply
 * a buffer with room for at least 4 bytes.
 *
 * Returns the number of bytes written.
 */
size_t utf8_encode(char *str, uint32_t ch) {
	uint8_t lead;
	size_t octets;

	/* Choose the sequence length and the marker bits of the lead byte. */
	if (ch >= 0x10000) {
		lead = 0xf0;
		octets = 4;
	} else if (ch >= 0x800) {
		lead = 0xe0;
		octets = 3;
	} else if (ch >= 0x80) {
		lead = 0xc0;
		octets = 2;
	} else {
		lead = 0;
		octets = 1;
	}

	/* Emit continuation bytes last-to-first, six payload bits at a time. */
	size_t idx = octets - 1;
	while (idx > 0) {
		str[idx] = (char)((ch & 0x3f) | 0x80);
		ch >>= 6;
		--idx;
	}

	/* The remaining high bits share the lead byte with the length marker. */
	str[0] = (char)(ch | lead);
	return octets;
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Lead-byte classification table used by utf8_size().
 *
 * A byte c matches a row when (c & mask) == result; rows are tried in order
 * and the first match wins. The 5- and 6-octet rows cover the long forms of
 * the original UTF-8 definition (lead bytes 111110xx and 1111110x). The last
 * row catches continuation bytes (10xxxxxx) and any other byte with the high
 * bit set, none of which can start a sequence.
 */
static const struct {
	uint8_t mask;
	uint8_t result;
	int octets;
} sizes[] = {
	{ 0x80, 0x00, 1 },  /* 0xxxxxxx */
	{ 0xE0, 0xC0, 2 },  /* 110xxxxx */
	{ 0xF0, 0xE0, 3 },  /* 1110xxxx */
	{ 0xF8, 0xF0, 4 },  /* 11110xxx */
	{ 0xFC, 0xF8, 5 },  /* 111110xx */
	{ 0xFE, 0xFC, 6 },  /* 1111110x (was 0xF8, which could never match) */
	{ 0x80, 0x80, -1 }, /* not a valid lead byte */
};

/**
 * Determine the length of a UTF-8 sequence from its first byte.
 *
 * Only s[0] is examined; the following bytes are not validated.
 *
 * Returns the number of octets (1-6) announced by the lead byte, or -1 if
 * the byte cannot start a sequence. A NUL byte yields 1.
 */
int utf8_size(const char *s) {
	/* Work on an unsigned byte so masking is independent of char signedness. */
	uint8_t c = (uint8_t)*s;
	for (size_t i = 0; i < sizeof(sizes) / sizeof(*sizes); ++i) {
		if ((c & sizes[i].mask) == sizes[i].result) {
			return sizes[i].octets;
		}
	}
	return -1;
}
|