Line data Source code
1 : #include "imap_util.h"
2 : #include <stdlib.h>
3 : #include <string.h>
4 : #include <stdint.h>
5 :
6 : /**
7 : * @file imap_util.c
8 : * @brief IMAP Modified UTF-7 decoder and encoder (RFC 3501 §5.1.3).
9 : */
10 :
/*
 * Map a character of the modified base64 alphabet to its 6-bit value:
 *   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, ',' -> 63.
 * Any other character (including '-', '/' and NUL) yields -1.
 */
static int mod64_value(char c) {
    int value = -1;

    if ('A' <= c && c <= 'Z') {
        value = c - 'A';
    } else if ('a' <= c && c <= 'z') {
        value = 26 + (c - 'a');
    } else if ('0' <= c && c <= '9') {
        value = 52 + (c - '0');
    } else if (c == '+') {
        value = 62;
    } else if (c == ',') {
        value = 63;
    }

    return value;
}
20 :
/*
 * Write the UTF-8 encoding of code point `cp` to `out` and return the number
 * of bytes written (1-4). `out` must have room for 4 bytes. No validation is
 * performed on `cp`; values of 0x10000 and above always use the 4-byte form.
 */
static int utf8_encode(uint32_t cp, char *out) {
    /* Lead-byte marker for sequences of length 1..4 (index = length - 1). */
    static const uint32_t lead_marker[4] = { 0x00u, 0xC0u, 0xE0u, 0xF0u };

    int nbytes = (cp < 0x80) ? 1
               : (cp < 0x800) ? 2
               : (cp < 0x10000) ? 3
               : 4;

    if (nbytes == 1) {
        out[0] = (char)cp;
        return 1;
    }

    /* Fill continuation bytes from the last one backwards, 6 bits at a time. */
    uint32_t rest = cp;
    for (int k = nbytes - 1; k > 0; k--) {
        out[k] = (char)(0x80 | (rest & 0x3F));
        rest >>= 6;
    }

    /* Whatever remains goes into the lead byte under its length marker. */
    out[0] = (char)(lead_marker[nbytes - 1] | rest);
    return nbytes;
}
44 :
/**
 * Decode an IMAP modified UTF-7 string (RFC 3501 §5.1.3) into UTF-8.
 *
 * Bytes outside a shift sequence are copied unchanged. "&-" produces a
 * literal '&'. Inside "&...-" the modified-base64 payload is read as a
 * UTF-16BE bit stream and each code point is re-encoded as UTF-8.
 *
 * Shift sequences of any length are supported: 16-bit units are decoded
 * directly from the bit stream instead of being staged in a fixed-size
 * buffer, so long runs are never truncated.
 *
 * Lenient handling of malformed input:
 *  - a character outside the modified-base64 alphabet ends the run and is
 *    itself consumed (not copied to the output);
 *  - trailing bits that do not complete a 16-bit unit are dropped;
 *  - a surrogate without its partner is passed through and encoded as a
 *    three-byte sequence;
 *  - a run that is not terminated by '-' simply ends at the end of input.
 *
 * @param s NUL-terminated input; may be NULL.
 * @return Newly malloc()ed NUL-terminated string that the caller must
 *         free(), or NULL if @p s is NULL, the size computation would
 *         overflow, or allocation fails.
 */
char *imap_utf7_decode(const char *s) {
    if (!s) return NULL;
    size_t len = strlen(s);

    /* Refuse inputs whose output-size computation would wrap around. */
    if (len > (SIZE_MAX - 1) / 4) return NULL;

    /* Generous bound: no input byte expands to more than 4 UTF-8 bytes. */
    char *out = malloc(len * 4 + 1);
    if (!out) return NULL;
    char *dst = out;

    const char *p = s;
    while (*p) {
        if (*p != '&') {
            *dst++ = *p++;
            continue;
        }
        p++; /* skip '&' */

        if (*p == '-') {
            /* "&-" is a literal '&' */
            *dst++ = '&';
            p++;
            continue;
        }

        uint32_t bits = 0;         /* bit accumulator, never holds > 21 bits */
        int bit_cnt = 0;           /* number of valid low bits in `bits` */
        uint32_t pending_high = 0; /* high surrogate awaiting its pair; 0 = none */

        while (*p && *p != '-') {
            int v = mod64_value(*p++);
            if (v < 0) break; /* invalid character terminates the run */

            bits = (bits << 6) | (uint32_t)v;
            bit_cnt += 6;
            if (bit_cnt < 16) continue;

            /* A full UTF-16BE code unit is available: peel it off the top. */
            bit_cnt -= 16;
            uint32_t unit = (bits >> bit_cnt) & 0xFFFFu;
            bits &= (1u << bit_cnt) - 1u;

            if (pending_high) {
                if (unit >= 0xDC00u && unit <= 0xDFFFu) {
                    /* Valid surrogate pair: combine into one code point. */
                    uint32_t cp = 0x10000u
                                + ((pending_high - 0xD800u) << 10)
                                + (unit - 0xDC00u);
                    dst += utf8_encode(cp, dst);
                    pending_high = 0;
                    continue;
                }
                /* Unpaired high surrogate: pass through, then handle `unit`
                 * on its own (it may itself start a new pair). */
                dst += utf8_encode(pending_high, dst);
                pending_high = 0;
            }

            if (unit >= 0xD800u && unit <= 0xDBFFu) {
                pending_high = unit;
            } else {
                dst += utf8_encode(unit, dst);
            }
        }

        /* Run ended while a high surrogate was still waiting: pass through. */
        if (pending_high) {
            dst += utf8_encode(pending_high, dst);
        }

        if (*p == '-') p++;
    }

    *dst = '\0';
    return out;
}
117 :
/**
 * Encode a UTF-8 string as IMAP modified UTF-7 (RFC 3501 §5.1.3).
 *
 * Printable ASCII (0x20-0x7E) other than '&' is copied unchanged, '&' is
 * written as "&-", and every maximal run of other bytes is converted to
 * UTF-16BE and emitted as "&<modified-base64>-".
 *
 * Ill-formed UTF-8 is never turned into bogus UTF-16: stray continuation
 * bytes, invalid lead bytes (0xC0, 0xC1, 0xF5-0xFF), truncated sequences,
 * overlong forms, encoded surrogates (U+D800-U+DFFF) and values above
 * U+10FFFF are each replaced by U+FFFD.
 *
 * @param s NUL-terminated UTF-8 input; may be NULL.
 * @return Newly malloc()ed NUL-terminated string that the caller must
 *         free(), or NULL if @p s is NULL, the size computation would
 *         overflow, or allocation fails.
 */
char *imap_utf7_encode(const char *s) {
    if (!s) return NULL;
    size_t len = strlen(s);

    /* Refuse inputs whose output-size computation would wrap around. */
    if (len > (SIZE_MAX - 4) / 8) return NULL;

    /* Upper bound: every input byte can expand to at most 8 output chars. */
    char *out = malloc(len * 8 + 4);
    if (!out) return NULL;
    char *dst = out;

    static const char mod64[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

    const unsigned char *p = (const unsigned char *)s;
    while (*p) {
        if (*p == '&') {
            /* '&' is escaped as "&-" */
            *dst++ = '&';
            *dst++ = '-';
            p++;
            continue;
        }
        if (*p >= 0x20 && *p <= 0x7E) {
            /* Printable ASCII: pass through */
            *dst++ = (char)*p++;
            continue;
        }

        /* Non-printable / non-ASCII run: UTF-16BE in modified base64. */
        *dst++ = '&';
        unsigned int bits = 0;
        int bit_cnt = 0;

        while (*p && !(*p >= 0x20 && *p <= 0x7E)) {
            /* Decode one UTF-8 code point. `min_cp` is the smallest value a
             * sequence of this length may legally encode (overlong check). */
            uint32_t cp;
            uint32_t min_cp = 0;
            int seqlen;
            if (*p < 0x80)      { cp = *p;         seqlen = 1; }
            else if (*p < 0xC2) { cp = 0xFFFD;     seqlen = 1; }
            else if (*p < 0xE0) { cp = *p & 0x1Fu; seqlen = 2; min_cp = 0x80u; }
            else if (*p < 0xF0) { cp = *p & 0x0Fu; seqlen = 3; min_cp = 0x800u; }
            else if (*p < 0xF5) { cp = *p & 0x07u; seqlen = 4; min_cp = 0x10000u; }
            else                { cp = 0xFFFD;     seqlen = 1; }

            int well_formed = 1;
            for (int i = 1; i < seqlen; i++) {
                /* A NUL terminator also fails this test, so a truncated
                 * sequence never reads past the end of the string. */
                if ((p[i] & 0xC0) != 0x80) {
                    seqlen = i;
                    well_formed = 0;
                    break;
                }
                cp = (cp << 6) | (p[i] & 0x3Fu);
            }
            p += seqlen;

            if (!well_formed
                || cp < min_cp
                || cp > 0x10FFFFu
                || (cp >= 0xD800u && cp <= 0xDFFFu)) {
                cp = 0xFFFD;
            }

            /* Split into UTF-16 code units (BMP char or surrogate pair). */
            uint16_t units[2];
            int nunit;
            if (cp <= 0xFFFFu) {
                units[0] = (uint16_t)cp;
                nunit = 1;
            } else {
                cp -= 0x10000u;
                units[0] = (uint16_t)(0xD800u | (cp >> 10));
                units[1] = (uint16_t)(0xDC00u | (cp & 0x3FFu));
                nunit = 2;
            }

            /* Feed each big-endian byte into the base64 bit stream. */
            for (int j = 0; j < nunit; j++) {
                const uint8_t be[2] = {
                    (uint8_t)(units[j] >> 8),
                    (uint8_t)(units[j] & 0xFF)
                };
                for (int k = 0; k < 2; k++) {
                    bits = (bits << 8) | be[k];
                    bit_cnt += 8;
                    while (bit_cnt >= 6) {
                        bit_cnt -= 6;
                        *dst++ = mod64[(bits >> bit_cnt) & 0x3F];
                        bits &= (1u << bit_cnt) - 1u;
                    }
                }
            }
        }

        /* Flush remaining bits (zero-padded to the next 6-bit boundary). */
        if (bit_cnt > 0)
            *dst++ = mod64[(bits << (6 - bit_cnt)) & 0x3F];

        *dst++ = '-';
    }

    *dst = '\0';
    return out;
}
|