Line |
Branch |
Exec |
Source |
1 |
|
|
#include "unicode.h" |
2 |
|
|
|
3 |
|
1504535 |
size_t utf8_val(int* codepoint, const char* inp) { |
4 |
|
1504535 |
unsigned char c = *inp++; |
5 |
|
|
int val; |
6 |
2/2
✓ Branch 0 taken 1503500 times.
✓ Branch 1 taken 1035 times.
|
1504535 |
if (c < 0x80) { |
7 |
|
|
// plain ascii |
8 |
|
1503500 |
*codepoint = c; |
9 |
|
1503500 |
return 1u; |
10 |
|
|
} |
11 |
|
|
// RPG Hacker: Byte sequences starting with 0xC0 or 0xC1 are invalid. |
12 |
|
|
// So are byte sequences starting with anything >= 0xF5. |
13 |
|
|
// And anything below 0xC0 indicates a follow-up byte and should never be at the start of a sequence. |
14 |
2/4
✓ Branch 0 taken 1035 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 471 times.
✗ Branch 3 not taken.
|
1035 |
else if (c > 0xC1 && c < 0xF5) { |
15 |
|
|
// 1, 2 or 3 continuation bytes |
16 |
4/4
✓ Branch 0 taken 738 times.
✓ Branch 1 taken 297 times.
✓ Branch 2 taken 342 times.
✓ Branch 3 taken 396 times.
|
1035 |
int cont_byte_count = (c >= 0xF0) ? 3 : (c >= 0xE0) ? 2 : 1; |
17 |
|
|
// bit hack to extract the significant bits from the start byte |
18 |
|
1035 |
val = (c & ((1 << (6 - cont_byte_count)) - 1)); |
19 |
2/2
✓ Branch 0 taken 1977 times.
✓ Branch 1 taken 1029 times.
|
3006 |
for (int i = 0; i < cont_byte_count; i++) { |
20 |
|
1977 |
unsigned char next = *inp++; |
21 |
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 1971 times.
|
1977 |
if ((next & 0xC0) != 0x80) { |
22 |
|
6 |
*codepoint = -1; |
23 |
|
6 |
return 0u; |
24 |
|
|
} |
25 |
|
1971 |
val = (val << 6) | (next & 0x3F); |
26 |
|
|
} |
27 |
|
1029 |
if (// too many cont.bytes |
28 |
2/4
✓ Branch 0 taken 1029 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1029 times.
✗ Branch 3 not taken.
|
1029 |
(*inp & 0xC0) == 0x80 || |
29 |
|
|
|
30 |
|
|
// invalid codepoints |
31 |
2/2
✓ Branch 0 taken 129 times.
✓ Branch 1 taken 339 times.
|
1029 |
val > 0x10FFFF || |
32 |
|
|
|
33 |
|
|
// check overlong encodings |
34 |
3/4
✓ Branch 0 taken 690 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 147 times.
✓ Branch 3 taken 321 times.
|
1029 |
(cont_byte_count == 3 && val < 0x1000) || |
35 |
3/4
✓ Branch 0 taken 708 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 192 times.
✓ Branch 3 taken 276 times.
|
1029 |
(cont_byte_count == 2 && val < 0x800) || |
36 |
3/4
✓ Branch 0 taken 753 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 132 times.
✓ Branch 3 taken 336 times.
|
1029 |
(cont_byte_count == 1 && val < 0x80) || |
37 |
|
|
|
38 |
|
|
// UTF16 surrogates |
39 |
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 693 times.
|
693 |
(val >= 0xD800 && val <= 0xDFFF) |
40 |
|
|
) { |
41 |
|
✗ |
*codepoint = -1; |
42 |
|
✗ |
return 0u; |
43 |
|
|
}; |
44 |
|
1029 |
*codepoint = val; |
45 |
|
1029 |
return 1u + cont_byte_count; |
46 |
|
|
} |
47 |
|
|
|
48 |
|
|
// if none of the above, this couldn't possibly be a valid encoding |
49 |
|
✗ |
*codepoint = -1; |
50 |
|
✗ |
return 0u; |
51 |
|
|
} |
52 |
|
|
|
53 |
|
31750 |
bool codepoint_to_utf8(string* out, unsigned int codepoint) { |
54 |
|
✗ |
*out = ""; |
55 |
2/2
✓ Branch 0 taken 31128 times.
✓ Branch 1 taken 622 times.
|
31750 |
if (codepoint < 0x80) { |
56 |
|
31128 |
*out += (unsigned char)codepoint; |
57 |
|
|
} |
58 |
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 622 times.
|
622 |
else if (codepoint < 0x800) { |
59 |
|
✗ |
*out += (unsigned char)(0xc0 | (codepoint >> 6)); |
60 |
|
✗ |
*out += (unsigned char)(0x80 | (codepoint & 0x3f)); |
61 |
|
|
} |
62 |
2/2
✓ Branch 0 taken 372 times.
✓ Branch 1 taken 250 times.
|
622 |
else if (codepoint < 0x10000) { |
63 |
|
372 |
*out += (unsigned char)(0xe0 | (codepoint >> 12)); |
64 |
|
372 |
*out += (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f)); |
65 |
|
372 |
*out += (unsigned char)(0x80 | (codepoint & 0x3f)); |
66 |
|
|
} |
67 |
1/2
✓ Branch 0 taken 250 times.
✗ Branch 1 not taken.
|
250 |
else if (codepoint < 0x110000) { |
68 |
|
250 |
*out += (unsigned char)(0xf0 | (codepoint >> 18)); |
69 |
|
250 |
*out += (unsigned char)(0x80 | ((codepoint >> 12) & 0x3f)); |
70 |
|
250 |
*out += (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f)); |
71 |
|
250 |
*out += (unsigned char)(0x80 | (codepoint & 0x3f)); |
72 |
|
|
} |
73 |
|
✗ |
else return false; |
74 |
|
|
|
75 |
|
✗ |
return true; |
76 |
|
|
} |
77 |
|
|
|
78 |
|
1890 |
bool is_valid_utf8(const char* inp) { |
79 |
2/2
✓ Branch 0 taken 654378 times.
✓ Branch 1 taken 1884 times.
|
656262 |
while (*inp != '\0') { |
80 |
|
|
int codepoint; |
81 |
1/2
✓ Branch 0 taken 213964 times.
✗ Branch 1 not taken.
|
654378 |
inp += utf8_val(&codepoint, inp); |
82 |
|
|
|
83 |
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 654372 times.
|
654378 |
if (codepoint == -1) return false; |
84 |
|
|
} |
85 |
|
|
|
86 |
|
964 |
return true; |
87 |
|
|
} |
88 |
|
|
|
89 |
|
31750 |
size_t utf16_val(int* codepoint, const wchar_t* inp) |
90 |
|
|
{ |
91 |
|
31750 |
wchar_t first_word = *inp; |
92 |
|
|
|
93 |
2/4
✓ Branch 0 taken 31500 times.
✓ Branch 1 taken 250 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
31750 |
if (first_word <= 0xD800 || first_word >= 0xDFFF) |
94 |
|
|
{ |
95 |
|
|
// Single word |
96 |
|
31500 |
*codepoint = first_word; |
97 |
|
31500 |
return 1u; |
98 |
|
|
} |
99 |
1/4
✓ Branch 0 taken 250 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
250 |
else if (first_word >= 0xD800 && first_word <= 0xDBFF) |
100 |
|
|
{ |
101 |
|
|
// Start of a surrogate pair |
102 |
|
250 |
wchar_t second_word = *(inp + 1); |
103 |
|
|
|
104 |
1/4
✓ Branch 0 taken 250 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
250 |
if (second_word >= 0xDC00 && second_word <= 0xDFFF) |
105 |
|
|
{ |
106 |
|
250 |
*codepoint = 0x10000 |
107 |
|
250 |
+ ((int)(first_word - 0xD800) << 10u) |
108 |
|
250 |
+ ((int)(second_word - 0xDC00)); |
109 |
|
250 |
return 2u; |
110 |
|
|
} |
111 |
|
|
} |
112 |
|
|
|
113 |
|
|
// Everything not covered above is considered invalid. |
114 |
|
✗ |
*codepoint = -1; |
115 |
|
✗ |
return 0u; |
116 |
|
|
} |
117 |
|
|
|
118 |
|
841724 |
bool codepoint_to_utf16(std::wstring* out, unsigned int codepoint) |
119 |
|
|
{ |
120 |
2/4
✓ Branch 0 taken 39 times.
✓ Branch 1 taken 841685 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
841724 |
if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) |
121 |
|
|
{ |
122 |
|
39 |
wchar_t high = (wchar_t)(((codepoint - 0x10000) >> 10) + 0xD800); |
123 |
|
39 |
wchar_t low = (wchar_t)(((codepoint - 0x10000) & 0b1111111111) + 0xDC00); |
124 |
|
|
|
125 |
2/4
✓ Branch 0 taken 39 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 39 times.
✗ Branch 3 not taken.
|
39 |
*out = std::wstring() + high + low; |
126 |
|
39 |
return true; |
127 |
|
|
} |
128 |
1/4
✓ Branch 0 taken 841685 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
841685 |
else if (codepoint <= 0xD800 || codepoint >= 0xDFFF) |
129 |
|
|
{ |
130 |
1/2
✓ Branch 0 taken 841685 times.
✗ Branch 1 not taken.
|
841685 |
*out = std::wstring() + (wchar_t)codepoint; |
131 |
|
841685 |
return true; |
132 |
|
|
} |
133 |
|
|
|
134 |
|
|
// Everything not covered above should be considered invalid. |
135 |
|
✗ |
return false; |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
|
139 |
|
1240 |
bool utf16_to_utf8(string* result, const wchar_t* u16_str) |
140 |
|
|
{ |
141 |
|
✗ |
*result = ""; |
142 |
|
|
|
143 |
|
|
int codepoint; |
144 |
|
|
do |
145 |
|
|
{ |
146 |
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
31750 |
u16_str += utf16_val(&codepoint, u16_str); |
147 |
|
|
|
148 |
|
✗ |
string next; |
149 |
2/8
✓ Branch 0 taken 31750 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 31750 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
31750 |
if (codepoint == -1 || !codepoint_to_utf8(&next, codepoint)) return false; |
150 |
|
|
|
151 |
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
31750 |
*result += next; |
152 |
2/4
✓ Branch 0 taken 30510 times.
✓ Branch 1 taken 1240 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
31750 |
} while (codepoint != 0); |
153 |
|
|
|
154 |
|
✗ |
return true; |
155 |
|
|
} |
156 |
|
|
|
157 |
|
18669 |
bool utf8_to_utf16(std::wstring* result, const char* u8_str) |
158 |
|
|
{ |
159 |
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
18669 |
*result = L""; |
160 |
|
|
|
161 |
|
|
int codepoint; |
162 |
|
|
do |
163 |
|
|
{ |
164 |
1/2
✓ Branch 0 taken 841724 times.
✗ Branch 1 not taken.
|
841724 |
u8_str += utf8_val(&codepoint, u8_str); |
165 |
|
|
|
166 |
|
✗ |
std::wstring next; |
167 |
3/8
✓ Branch 0 taken 841724 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 841724 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 841724 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
|
841724 |
if (codepoint == -1 || !codepoint_to_utf16(&next, codepoint)) return false; |
168 |
|
|
|
169 |
1/2
✓ Branch 0 taken 841724 times.
✗ Branch 1 not taken.
|
841724 |
*result += next; |
170 |
2/4
✓ Branch 0 taken 823055 times.
✓ Branch 1 taken 18669 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
|
841724 |
} while (codepoint != 0); |
171 |
|
|
|
172 |
|
✗ |
return true; |
173 |
|
|
} |
174 |
|
|
|