asar coverage - build #323 - src/asar/unicode.cpp

Line	Branch	Exec	Source
1			#include "unicode.h"
2
// Decodes a single UTF-8 encoded code point starting at inp.
//
// codepoint: receives the decoded code point, or -1 if the sequence is invalid.
// inp:       pointer to the first byte of the sequence. Up to three continuation
//            bytes plus one look-ahead byte may be read after the start byte, so
//            the data must be readable that far (e.g. a NUL-terminated string).
//
// Returns the number of bytes consumed (1 to 4), or 0 if the sequence is invalid.
size_t utf8_val(int* codepoint, const char* inp) {
	unsigned char c = *inp++;
	if (c < 0x80) {
		// plain ascii
		*codepoint = c;
		return 1u;
	}

	// RPG Hacker: Byte sequences starting with 0xC0 or 0xC1 are invalid.
	// So are byte sequences starting with anything >= 0xF5.
	// And anything below 0xC0 indicates a follow-up byte and should never be at the start of a sequence.
	if (c > 0xC1 && c < 0xF5) {
		// 1, 2 or 3 continuation bytes
		const int cont_byte_count = (c >= 0xF0) ? 3 : (c >= 0xE0) ? 2 : 1;
		// bit hack to extract the significant bits from the start byte
		int val = (c & ((1 << (6 - cont_byte_count)) - 1));
		for (int i = 0; i < cont_byte_count; i++) {
			const unsigned char next = *inp++;
			if ((next & 0xC0) != 0x80) {
				// not a continuation byte: the sequence is cut short
				*codepoint = -1;
				return 0u;
			}
			val = (val << 6) | (next & 0x3F);
		}

		// too many cont.bytes
		const bool trailing_cont_byte = (*inp & 0xC0) == 0x80;

		// check overlong encodings: each sequence length has a smallest code
		// point that actually requires that many bytes
		// (2 bytes: 0x80, 3 bytes: 0x800, 4 bytes: 0x10000)
		const bool overlong =
			(cont_byte_count == 3 && val < 0x10000) ||
			(cont_byte_count == 2 && val < 0x800) ||
			(cont_byte_count == 1 && val < 0x80);

		// UTF16 surrogates
		const bool surrogate = (val >= 0xD800 && val <= 0xDFFF);

		if (trailing_cont_byte || val > 0x10FFFF || overlong || surrogate) {
			*codepoint = -1;
			return 0u;
		}

		*codepoint = val;
		return 1u + cont_byte_count;
	}

	// if none of the above, this couldn't possibly be a valid encoding
	*codepoint = -1;
	return 0u;
}
52
// Encodes a single Unicode code point as UTF-8.
//
// out:       always reset to an empty string first; on success it holds the
//            1 to 4 byte encoding of the code point.
// codepoint: the code point to encode.
//
// Returns true on success. Returns false (leaving *out empty) if codepoint is
// greater than 0x10FFFF or lies in the UTF-16 surrogate range 0xD800-0xDFFF,
// since neither can be represented in well-formed UTF-8.
bool codepoint_to_utf8(string* out, unsigned int codepoint) {
	*out = "";

	// Surrogates are reserved for UTF-16 and must never be encoded as UTF-8.
	if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return false;

	if (codepoint < 0x80) {
		// 1 byte: 0xxxxxxx
		*out += static_cast<unsigned char>(codepoint);
	}
	else if (codepoint < 0x800) {
		// 2 bytes: 110xxxxx 10xxxxxx
		*out += static_cast<unsigned char>(0xc0 | (codepoint >> 6));
		*out += static_cast<unsigned char>(0x80 | (codepoint & 0x3f));
	}
	else if (codepoint < 0x10000) {
		// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
		*out += static_cast<unsigned char>(0xe0 | (codepoint >> 12));
		*out += static_cast<unsigned char>(0x80 | ((codepoint >> 6) & 0x3f));
		*out += static_cast<unsigned char>(0x80 | (codepoint & 0x3f));
	}
	else if (codepoint < 0x110000) {
		// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		*out += static_cast<unsigned char>(0xf0 | (codepoint >> 18));
		*out += static_cast<unsigned char>(0x80 | ((codepoint >> 12) & 0x3f));
		*out += static_cast<unsigned char>(0x80 | ((codepoint >> 6) & 0x3f));
		*out += static_cast<unsigned char>(0x80 | (codepoint & 0x3f));
	}
	else return false;

	return true;
}
77
78		332	bool is_valid_utf8(const char* inp, size_t inp_len) {
79	2/2 ✓ Branch 0 taken 26313 times. ✓ Branch 1 taken 330 times.	26643	for(size_t i = 0; i < inp_len;) {
80			// optimization: if next 8 bytes are ascii, skip them
81	2/2 ✓ Branch 0 taken 25125 times. ✓ Branch 1 taken 1188 times.	26313	if(i + 8 <= inp_len) {
82		12646	uint64_t buf;
83		25125	memcpy(&buf, inp+i, sizeof(buf));
84	2/2 ✓ Branch 0 taken 24851 times. ✓ Branch 1 taken 274 times.	25125	if((buf & 0x8080808080808080ull) == 0) {
85		24851	i += 8; continue;
86			}
87			}
88
89		806	int codepoint;
90	1/2 ✓ Branch 0 taken 801 times. ✗ Branch 1 not taken.	1462	i += utf8_val(&codepoint, inp+i);
91
92	2/2 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 1460 times.	1462	if (codepoint == -1) return false;
93			}
94
95		330	return true;
96			}
97
// Decodes a single UTF-16 encoded code point starting at inp.
//
// codepoint: receives the decoded code point, or -1 if the sequence is invalid.
// inp:       pointer to the first code unit. A second code unit is read only
//            when the first one is a high surrogate (0xD800-0xDBFF).
//
// Returns the number of code units consumed (1 or 2), or 0 if the sequence is
// invalid (a low surrogate on its own, or a high surrogate that is not followed
// by a low surrogate).
size_t utf16_val(int* codepoint, const wchar_t* inp)
{
	const wchar_t first_word = *inp;

	if (first_word < 0xD800 || first_word > 0xDFFF)
	{
		// Single word (anything outside the surrogate range 0xD800-0xDFFF)
		*codepoint = first_word;
		return 1u;
	}

	if (first_word <= 0xDBFF)
	{
		// Start of a surrogate pair
		const wchar_t second_word = *(inp + 1);

		if (second_word >= 0xDC00 && second_word <= 0xDFFF)
		{
			// Each surrogate carries 10 bits of the value above 0x10000.
			*codepoint = 0x10000
				+ ((int)(first_word - 0xD800) << 10)
				+ ((int)(second_word - 0xDC00));
			return 2u;
		}
	}

	// Everything not covered above is considered invalid.
	*codepoint = -1;
	return 0u;
}
126
// Encodes a single Unicode code point as UTF-16.
//
// out:       on success, replaced with the one or two code units that encode
//            the code point; left untouched on failure.
// codepoint: the code point to encode.
//
// Returns true on success. Returns false if codepoint lies in the surrogate
// range 0xD800-0xDFFF or is greater than 0x10FFFF.
bool codepoint_to_utf16(std::wstring* out, unsigned int codepoint)
{
	if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
	{
		// Split the 20-bit offset above 0x10000 into a high and a low surrogate.
		const unsigned int offset = codepoint - 0x10000;
		const wchar_t high = (wchar_t)((offset >> 10) + 0xD800);
		const wchar_t low = (wchar_t)((offset & 0x3FF) + 0xDC00);

		*out = std::wstring() + high + low;
		return true;
	}

	if (codepoint < 0xD800 || (codepoint > 0xDFFF && codepoint <= 0xFFFF))
	{
		// Fits into a single code unit and is not a surrogate.
		*out = std::wstring() + (wchar_t)codepoint;
		return true;
	}

	// Everything not covered above should be considered invalid.
	return false;
}
146
147
148		✗	bool utf16_to_utf8(string* result, const wchar_t* u16_str)
149			{
150		✗	*result = "";
151
152		✗	int codepoint;
153			do
154			{
155		✗	u16_str += utf16_val(&codepoint, u16_str);
156
157		✗	string next;
158		✗	if (codepoint == -1 \|\| !codepoint_to_utf8(&next, codepoint)) return false;
159
160		✗	*result += next;
161		✗	} while (codepoint != 0);
162
163		✗	return true;
164			}
165
166		5424	bool utf8_to_utf16(std::wstring* result, const char* u8_str)
167			{
168	1/3 ✓ Branch 0 taken 5424 times. ✗ Branch 1 not taken. ✗ Branch 2 not taken.	5424	*result = L"";
169
170		✗	int codepoint;
171			do
172			{
173	0/2 ✗ Branch 0 not taken. ✗ Branch 1 not taken.	244560	u8_str += utf8_val(&codepoint, u8_str);
174
175		244560	std::wstring next;
176	4/8 ✓ Branch 0 taken 244560 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 244560 times. ✗ Branch 3 not taken. ✗ Branch 4 not taken. ✓ Branch 5 taken 244560 times. ✗ Branch 6 not taken. ✓ Branch 7 taken 244560 times.	244560	if (codepoint == -1 \|\| !codepoint_to_utf16(&next, codepoint)) return false;
177
178	1/3 ✓ Branch 0 taken 244560 times. ✗ Branch 1 not taken. ✗ Branch 2 not taken.	244560	*result += next;
179	3/4 ✓ Branch 0 taken 244560 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 239136 times. ✓ Branch 3 taken 5424 times.	489120	} while (codepoint != 0);
180
181		5424	return true;
182			}
183