| Line |
Branch |
Exec |
Source |
| 1 |
|
|
#include "unicode.h" |
| 2 |
|
|
|
| 3 |
|
248855 |
size_t utf8_val(int* codepoint, const char* inp) { |
| 4 |
|
248855 |
unsigned char c = *inp++; |
| 5 |
|
|
int val; |
| 6 |
2/2
✓ Branch 0 taken 248523 times.
✓ Branch 1 taken 332 times.
|
248855 |
if (c < 0x80) { |
| 7 |
|
|
// plain ascii |
| 8 |
|
248523 |
*codepoint = c; |
| 9 |
|
248523 |
return 1u; |
| 10 |
|
|
} |
| 11 |
|
|
// RPG Hacker: Byte sequences starting with 0xC0 or 0xC1 are invalid. |
| 12 |
|
|
// So are byte sequences starting with anything >= 0xF5. |
| 13 |
|
|
// And anything below 0xC0 indicates a follow-up byte and should never be at the start of a sequence. |
| 14 |
2/4
✓ Branch 0 taken 332 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 332 times.
✗ Branch 3 not taken.
|
332 |
else if (c > 0xC1 && c < 0xF5) { |
| 15 |
|
|
// 1, 2 or 3 continuation bytes |
| 16 |
4/4
✓ Branch 0 taken 240 times.
✓ Branch 1 taken 92 times.
✓ Branch 2 taken 112 times.
✓ Branch 3 taken 128 times.
|
332 |
int cont_byte_count = (c >= 0xF0) ? 3 : (c >= 0xE0) ? 2 : 1; |
| 17 |
|
|
// bit hack to extract the significant bits from the start byte |
| 18 |
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 157 times.
|
332 |
val = (c & ((1 << (6 - cont_byte_count)) - 1)); |
| 19 |
2/2
✓ Branch 0 taken 626 times.
✓ Branch 1 taken 330 times.
|
956 |
for (int i = 0; i < cont_byte_count; i++) { |
| 20 |
|
626 |
unsigned char next = *inp++; |
| 21 |
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 624 times.
|
626 |
if ((next & 0xC0) != 0x80) { |
| 22 |
|
2 |
*codepoint = -1; |
| 23 |
|
2 |
return 0u; |
| 24 |
|
|
} |
| 25 |
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 291 times.
|
624 |
val = (val << 6) | (next & 0x3F); |
| 26 |
|
|
} |
| 27 |
|
330 |
if (// too many cont.bytes |
| 28 |
4/5
✓ Branch 0 taken 174 times.
✓ Branch 1 taken 156 times.
✓ Branch 2 taken 174 times.
✓ Branch 3 taken 156 times.
✗ Branch 4 not taken.
|
330 |
(*inp & 0xC0) == 0x80 || |
| 29 |
|
|
|
| 30 |
|
|
// invalid codepoints |
| 31 |
2/2
✓ Branch 0 taken 92 times.
✓ Branch 1 taken 238 times.
|
330 |
val > 0x10FFFF || |
| 32 |
|
|
|
| 33 |
|
|
// check overlong encodings |
| 34 |
3/4
✓ Branch 0 taken 92 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 110 times.
✓ Branch 3 taken 220 times.
|
330 |
(cont_byte_count == 3 && val < 0x1000) || |
| 35 |
3/4
✓ Branch 0 taken 110 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 128 times.
✓ Branch 3 taken 202 times.
|
330 |
(cont_byte_count == 2 && val < 0x800) || |
| 36 |
3/4
✓ Branch 0 taken 128 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 94 times.
✓ Branch 3 taken 236 times.
|
330 |
(cont_byte_count == 1 && val < 0x80) || |
| 37 |
|
|
|
| 38 |
|
|
// UTF16 surrogates |
| 39 |
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 94 times.
|
94 |
(val >= 0xD800 && val <= 0xDFFF) |
| 40 |
|
|
) { |
| 41 |
|
✗ |
*codepoint = -1; |
| 42 |
|
✗ |
return 0u; |
| 43 |
|
|
}; |
| 44 |
|
330 |
*codepoint = val; |
| 45 |
|
330 |
return 1u + cont_byte_count; |
| 46 |
|
|
} |
| 47 |
|
|
|
| 48 |
|
|
// if none of the above, this couldn't possibly be a valid encoding |
| 49 |
|
✗ |
*codepoint = -1; |
| 50 |
|
✗ |
return 0u; |
| 51 |
|
|
} |
| 52 |
|
|
|
| 53 |
|
✗ |
bool codepoint_to_utf8(string* out, unsigned int codepoint) { |
| 54 |
|
✗ |
*out = ""; |
| 55 |
|
✗ |
if (codepoint < 0x80) { |
| 56 |
|
✗ |
*out += (unsigned char)codepoint; |
| 57 |
|
|
} |
| 58 |
|
✗ |
else if (codepoint < 0x800) { |
| 59 |
|
✗ |
*out += (unsigned char)(0xc0 | (codepoint >> 6)); |
| 60 |
|
✗ |
*out += (unsigned char)(0x80 | (codepoint & 0x3f)); |
| 61 |
|
|
} |
| 62 |
|
✗ |
else if (codepoint < 0x10000) { |
| 63 |
|
✗ |
*out += (unsigned char)(0xe0 | (codepoint >> 12)); |
| 64 |
|
✗ |
*out += (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f)); |
| 65 |
|
✗ |
*out += (unsigned char)(0x80 | (codepoint & 0x3f)); |
| 66 |
|
|
} |
| 67 |
|
✗ |
else if (codepoint < 0x110000) { |
| 68 |
|
✗ |
*out += (unsigned char)(0xf0 | (codepoint >> 18)); |
| 69 |
|
✗ |
*out += (unsigned char)(0x80 | ((codepoint >> 12) & 0x3f)); |
| 70 |
|
✗ |
*out += (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f)); |
| 71 |
|
✗ |
*out += (unsigned char)(0x80 | (codepoint & 0x3f)); |
| 72 |
|
|
} |
| 73 |
|
✗ |
else return false; |
| 74 |
|
|
|
| 75 |
|
✗ |
return true; |
| 76 |
|
|
} |
| 77 |
|
|
|
| 78 |
|
332 |
bool is_valid_utf8(const char* inp, size_t inp_len) { |
| 79 |
2/2
✓ Branch 0 taken 26313 times.
✓ Branch 1 taken 330 times.
|
26643 |
for(size_t i = 0; i < inp_len;) { |
| 80 |
|
|
// optimization: if next 8 bytes are ascii, skip them |
| 81 |
2/2
✓ Branch 0 taken 25125 times.
✓ Branch 1 taken 1188 times.
|
26313 |
if(i + 8 <= inp_len) { |
| 82 |
|
12646 |
uint64_t buf; |
| 83 |
|
25125 |
memcpy(&buf, inp+i, sizeof(buf)); |
| 84 |
2/2
✓ Branch 0 taken 24851 times.
✓ Branch 1 taken 274 times.
|
25125 |
if((buf & 0x8080808080808080ull) == 0) { |
| 85 |
|
24851 |
i += 8; continue; |
| 86 |
|
|
} |
| 87 |
|
|
} |
| 88 |
|
|
|
| 89 |
|
806 |
int codepoint; |
| 90 |
1/2
✓ Branch 0 taken 801 times.
✗ Branch 1 not taken.
|
1462 |
i += utf8_val(&codepoint, inp+i); |
| 91 |
|
|
|
| 92 |
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1460 times.
|
1462 |
if (codepoint == -1) return false; |
| 93 |
|
|
} |
| 94 |
|
|
|
| 95 |
|
330 |
return true; |
| 96 |
|
|
} |
| 97 |
|
|
|
| 98 |
|
✗ |
size_t utf16_val(int* codepoint, const wchar_t* inp) |
| 99 |
|
|
{ |
| 100 |
|
✗ |
wchar_t first_word = *inp; |
| 101 |
|
|
|
| 102 |
|
✗ |
if (first_word <= 0xD800 || first_word >= 0xDFFF) |
| 103 |
|
|
{ |
| 104 |
|
|
// Single word |
| 105 |
|
✗ |
*codepoint = first_word; |
| 106 |
|
✗ |
return 1u; |
| 107 |
|
|
} |
| 108 |
|
✗ |
else if (first_word >= 0xD800 && first_word <= 0xDBFF) |
| 109 |
|
|
{ |
| 110 |
|
|
// Start of a surrogate pair |
| 111 |
|
✗ |
wchar_t second_word = *(inp + 1); |
| 112 |
|
|
|
| 113 |
|
✗ |
if (second_word >= 0xDC00 && second_word <= 0xDFFF) |
| 114 |
|
|
{ |
| 115 |
|
✗ |
*codepoint = 0x10000 |
| 116 |
|
✗ |
+ ((int)(first_word - 0xD800) << 10u) |
| 117 |
|
✗ |
+ ((int)(second_word - 0xDC00)); |
| 118 |
|
✗ |
return 2u; |
| 119 |
|
|
} |
| 120 |
|
|
} |
| 121 |
|
|
|
| 122 |
|
|
// Everything not covered above is considered invalid. |
| 123 |
|
✗ |
*codepoint = -1; |
| 124 |
|
✗ |
return 0u; |
| 125 |
|
|
} |
| 126 |
|
|
|
| 127 |
|
244560 |
bool codepoint_to_utf16(std::wstring* out, unsigned int codepoint) |
| 128 |
|
|
{ |
| 129 |
3/4
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 244554 times.
✓ Branch 2 taken 6 times.
✗ Branch 3 not taken.
|
244560 |
if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) |
| 130 |
|
|
{ |
| 131 |
|
6 |
wchar_t high = (wchar_t)(((codepoint - 0x10000) >> 10) + 0xD800); |
| 132 |
|
6 |
wchar_t low = (wchar_t)(((codepoint - 0x10000) & 0b1111111111) + 0xDC00); |
| 133 |
|
|
|
| 134 |
2/7
✓ Branch 0 taken 6 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 6 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
|
6 |
*out = std::wstring() + high + low; |
| 135 |
|
6 |
return true; |
| 136 |
|
|
} |
| 137 |
1/4
✗ Branch 0 not taken.
✓ Branch 1 taken 244554 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
244554 |
else if (codepoint <= 0xD800 || codepoint >= 0xDFFF) |
| 138 |
|
|
{ |
| 139 |
1/4
✓ Branch 0 taken 244554 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
244554 |
*out = std::wstring() + (wchar_t)codepoint; |
| 140 |
|
244554 |
return true; |
| 141 |
|
|
} |
| 142 |
|
|
|
| 143 |
|
|
// Everything not covered above should be considered invalid. |
| 144 |
|
✗ |
return false; |
| 145 |
|
|
} |
| 146 |
|
|
|
| 147 |
|
|
|
| 148 |
|
✗ |
bool utf16_to_utf8(string* result, const wchar_t* u16_str) |
| 149 |
|
|
{ |
| 150 |
|
✗ |
*result = ""; |
| 151 |
|
|
|
| 152 |
|
✗ |
int codepoint; |
| 153 |
|
|
do |
| 154 |
|
|
{ |
| 155 |
|
✗ |
u16_str += utf16_val(&codepoint, u16_str); |
| 156 |
|
|
|
| 157 |
|
✗ |
string next; |
| 158 |
|
✗ |
if (codepoint == -1 || !codepoint_to_utf8(&next, codepoint)) return false; |
| 159 |
|
|
|
| 160 |
|
✗ |
*result += next; |
| 161 |
|
✗ |
} while (codepoint != 0); |
| 162 |
|
|
|
| 163 |
|
✗ |
return true; |
| 164 |
|
|
} |
| 165 |
|
|
|
| 166 |
|
5424 |
bool utf8_to_utf16(std::wstring* result, const char* u8_str) |
| 167 |
|
|
{ |
| 168 |
1/3
✓ Branch 0 taken 5424 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
|
5424 |
*result = L""; |
| 169 |
|
|
|
| 170 |
|
✗ |
int codepoint; |
| 171 |
|
|
do |
| 172 |
|
|
{ |
| 173 |
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
244560 |
u8_str += utf8_val(&codepoint, u8_str); |
| 174 |
|
|
|
| 175 |
|
244560 |
std::wstring next; |
| 176 |
4/8
✓ Branch 0 taken 244560 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 244560 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 244560 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 244560 times.
|
244560 |
if (codepoint == -1 || !codepoint_to_utf16(&next, codepoint)) return false; |
| 177 |
|
|
|
| 178 |
1/3
✓ Branch 0 taken 244560 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
|
244560 |
*result += next; |
| 179 |
3/4
✓ Branch 0 taken 244560 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 239136 times.
✓ Branch 3 taken 5424 times.
|
489120 |
} while (codepoint != 0); |
| 180 |
|
|
|
| 181 |
|
5424 |
return true; |
| 182 |
|
|
} |
| 183 |
|
|
|