asar coverage - build #262


src/asar/
File: src/asar/unicode.cpp
Date: 2025-02-27 19:01:43
Lines:
73/90
81.1%
Functions:
7/7
100.0%
Branches:
61/110
55.5%

Line Branch Exec Source
1 #include "unicode.h"
2
3 1525449 size_t utf8_val(int* codepoint, const char* inp) {
4 1525449 unsigned char c = *inp++;
5 int val;
6
2/2
✓ Branch 0 taken 1524234 times.
✓ Branch 1 taken 1215 times.
1525449 if (c < 0x80) {
7 // plain ascii
8 1524234 *codepoint = c;
9 1524234 return 1u;
10 }
11 // RPG Hacker: Byte sequences starting with 0xC0 or 0xC1 are invalid.
12 // So are byte sequences starting with anything >= 0xF5.
13 // And anything below 0xC0 indicates a follow-up byte and should never be at the start of a sequence.
14
2/4
✓ Branch 0 taken 1215 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 561 times.
✗ Branch 3 not taken.
1215 else if (c > 0xC1 && c < 0xF5) {
15 // 1, 2 or 3 continuation bytes
16
4/4
✓ Branch 0 taken 882 times.
✓ Branch 1 taken 333 times.
✓ Branch 2 taken 414 times.
✓ Branch 3 taken 468 times.
1215 int cont_byte_count = (c >= 0xF0) ? 3 : (c >= 0xE0) ? 2 : 1;
17 // bit hack to extract the significant bits from the start byte
18 1215 val = (c & ((1 << (6 - cont_byte_count)) - 1));
19
2/2
✓ Branch 0 taken 2283 times.
✓ Branch 1 taken 1209 times.
3492 for (int i = 0; i < cont_byte_count; i++) {
20 2283 unsigned char next = *inp++;
21
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 2277 times.
2283 if ((next & 0xC0) != 0x80) {
22 6 *codepoint = -1;
23 6 return 0u;
24 }
25 2277 val = (val << 6) | (next & 0x3F);
26 }
27 1209 if (// too many cont.bytes
28
2/4
✓ Branch 0 taken 1209 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1209 times.
✗ Branch 3 not taken.
1209 (*inp & 0xC0) == 0x80 ||
29
30 // invalid codepoints
31
2/2
✓ Branch 0 taken 147 times.
✓ Branch 1 taken 411 times.
1209 val > 0x10FFFF ||
32
33 // check overlong encodings
34
3/4
✓ Branch 0 taken 798 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 174 times.
✓ Branch 3 taken 384 times.
1209 (cont_byte_count == 3 && val < 0x1000) ||
35
3/4
✓ Branch 0 taken 825 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 237 times.
✓ Branch 3 taken 321 times.
1209 (cont_byte_count == 2 && val < 0x800) ||
36
3/4
✓ Branch 0 taken 888 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 150 times.
✓ Branch 3 taken 408 times.
1209 (cont_byte_count == 1 && val < 0x80) ||
37
38 // UTF16 surrogates
39
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 801 times.
801 (val >= 0xD800 && val <= 0xDFFF)
40 ) {
41 *codepoint = -1;
42 return 0u;
43 };
44 1209 *codepoint = val;
45 1209 return 1u + cont_byte_count;
46 }
47
48 // if none of the above, this couldn't possibly be a valid encoding
49 *codepoint = -1;
50 return 0u;
51 }
52
53 32501 bool codepoint_to_utf8(string* out, unsigned int codepoint) {
54 *out = "";
55
2/2
✓ Branch 0 taken 31864 times.
✓ Branch 1 taken 637 times.
32501 if (codepoint < 0x80) {
56 31864 *out += (unsigned char)codepoint;
57 }
58
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 637 times.
637 else if (codepoint < 0x800) {
59 *out += (unsigned char)(0xc0 | (codepoint >> 6));
60 *out += (unsigned char)(0x80 | (codepoint & 0x3f));
61 }
62
2/2
✓ Branch 0 taken 381 times.
✓ Branch 1 taken 256 times.
637 else if (codepoint < 0x10000) {
63 381 *out += (unsigned char)(0xe0 | (codepoint >> 12));
64 381 *out += (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f));
65 381 *out += (unsigned char)(0x80 | (codepoint & 0x3f));
66 }
67
1/2
✓ Branch 0 taken 256 times.
✗ Branch 1 not taken.
256 else if (codepoint < 0x110000) {
68 256 *out += (unsigned char)(0xf0 | (codepoint >> 18));
69 256 *out += (unsigned char)(0x80 | ((codepoint >> 12) & 0x3f));
70 256 *out += (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f));
71 256 *out += (unsigned char)(0x80 | (codepoint & 0x3f));
72 }
73 else return false;
74
75 return true;
76 }
77
78 1932 bool is_valid_utf8(const char* inp) {
79
2/2
✓ Branch 0 taken 669036 times.
✓ Branch 1 taken 1926 times.
670962 while (*inp != '\0') {
80 int codepoint;
81
1/2
✓ Branch 0 taken 218740 times.
✗ Branch 1 not taken.
669036 inp += utf8_val(&codepoint, inp);
82
83
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 669030 times.
669036 if (codepoint == -1) return false;
84 }
85
86 985 return true;
87 }
88
89 32501 size_t utf16_val(int* codepoint, const wchar_t* inp)
90 {
91 32501 wchar_t first_word = *inp;
92
93
2/4
✓ Branch 0 taken 32245 times.
✓ Branch 1 taken 256 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
32501 if (first_word <= 0xD800 || first_word >= 0xDFFF)
94 {
95 // Single word
96 32245 *codepoint = first_word;
97 32245 return 1u;
98 }
99
1/4
✓ Branch 0 taken 256 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
256 else if (first_word >= 0xD800 && first_word <= 0xDBFF)
100 {
101 // Start of a surrogate pair
102 256 wchar_t second_word = *(inp + 1);
103
104
1/4
✓ Branch 0 taken 256 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
256 if (second_word >= 0xDC00 && second_word <= 0xDFFF)
105 {
106 256 *codepoint = 0x10000
107 256 + ((int)(first_word - 0xD800) << 10u)
108 256 + ((int)(second_word - 0xDC00));
109 256 return 2u;
110 }
111 }
112
113 // Everything not covered above is considered invalid.
114 *codepoint = -1;
115 return 0u;
116 }
117
118 845724 bool codepoint_to_utf16(std::wstring* out, unsigned int codepoint)
119 {
120
2/4
✓ Branch 0 taken 39 times.
✓ Branch 1 taken 845685 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
845724 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
121 {
122 39 wchar_t high = (wchar_t)(((codepoint - 0x10000) >> 10) + 0xD800);
123 39 wchar_t low = (wchar_t)(((codepoint - 0x10000) & 0b1111111111) + 0xDC00);
124
125
2/4
✓ Branch 0 taken 39 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 39 times.
✗ Branch 3 not taken.
39 *out = std::wstring() + high + low;
126 39 return true;
127 }
128
1/4
✓ Branch 0 taken 845685 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
845685 else if (codepoint <= 0xD800 || codepoint >= 0xDFFF)
129 {
130
1/2
✓ Branch 0 taken 845685 times.
✗ Branch 1 not taken.
845685 *out = std::wstring() + (wchar_t)codepoint;
131 845685 return true;
132 }
133
134 // Everything not covered above should be considered invalid.
135 return false;
136 }
137
138
139 1270 bool utf16_to_utf8(string* result, const wchar_t* u16_str)
140 {
141 *result = "";
142
143 int codepoint;
144 do
145 {
146
1/2
✓ Branch 0 taken 32501 times.
✗ Branch 1 not taken.
32501 u16_str += utf16_val(&codepoint, u16_str);
147
148 string next;
149
3/8
✓ Branch 0 taken 32501 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 32501 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 32501 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
32501 if (codepoint == -1 || !codepoint_to_utf8(&next, codepoint)) return false;
150
151
1/2
✓ Branch 0 taken 32501 times.
✗ Branch 1 not taken.
32501 *result += next;
152
2/4
✓ Branch 0 taken 31231 times.
✓ Branch 1 taken 1270 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
32501 } while (codepoint != 0);
153
154 return true;
155 }
156
157 18753 bool utf8_to_utf16(std::wstring* result, const char* u8_str)
158 {
159
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
18753 *result = L"";
160
161 int codepoint;
162 do
163 {
164
1/2
✓ Branch 0 taken 845724 times.
✗ Branch 1 not taken.
845724 u8_str += utf8_val(&codepoint, u8_str);
165
166 std::wstring next;
167
3/8
✓ Branch 0 taken 845724 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 845724 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 845724 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
845724 if (codepoint == -1 || !codepoint_to_utf16(&next, codepoint)) return false;
168
169
1/2
✓ Branch 0 taken 845724 times.
✗ Branch 1 not taken.
845724 *result += next;
170
2/4
✓ Branch 0 taken 826971 times.
✓ Branch 1 taken 18753 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
845724 } while (codepoint != 0);
171
172 return true;
173 }
174