1 |
/* |
2 |
encoding.c - character encoding |
3 |
Copyright (C) 2008 siliconforks.com |
4 |
|
5 |
This program is free software; you can redistribute it and/or modify |
6 |
it under the terms of the GNU General Public License as published by |
7 |
the Free Software Foundation; either version 2 of the License, or |
8 |
(at your option) any later version. |
9 |
|
10 |
This program is distributed in the hope that it will be useful, |
11 |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
GNU General Public License for more details. |
14 |
|
15 |
You should have received a copy of the GNU General Public License along |
16 |
with this program; if not, write to the Free Software Foundation, Inc., |
17 |
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
18 |
*/ |
19 |
|
20 |
#include <config.h> |
21 |
|
22 |
#include "encoding.h" |
23 |
|
24 |
#include <assert.h> |
25 |
#include <string.h> |
26 |
|
27 |
#ifdef HAVE_ICONV_H |
28 |
#include <iconv.h> |
29 |
#elif defined HAVE_WINDOWS_H |
30 |
#include <windows.h> |
31 |
#endif |
32 |
|
33 |
#include "util.h" |
34 |
|
35 |
/*
 * Remove every leading U+FEFF (byte order mark) from a character buffer.
 *
 * characters      in/out: address of the pointer to the buffer.  The buffer
 *                 is edited in place, so the pointer itself is not changed.
 * num_characters  in/out: number of characters in the buffer; on return it
 *                 is reduced by the number of marks that were removed.
 *
 * The surviving characters are shifted to the front of the existing buffer
 * rather than copied into a fresh allocation.  This avoids an extra
 * allocate/copy/free cycle, and avoids requesting a zero-length allocation
 * when the buffer consists solely of marks.
 */
static void skip_bom(jschar ** characters, size_t * num_characters) {
  jschar * c = *characters;
  size_t nc = *num_characters;

  /* Count the leading byte order marks. */
  size_t i = 0;
  while (i < nc && c[i] == 0xfeff) {
    i++;
  }

  if (i == 0) {
    /* Nothing to strip; leave the buffer and its length untouched. */
    return;
  }

  nc -= i;

  /* Source and destination overlap, so memmove (not memcpy) is required. */
  memmove(c, c + i, nc * sizeof(jschar));

  *num_characters = nc;
}
59 |
|
60 |
#ifdef HAVE_ICONV |
61 |
|
62 |
/*
 * Name of the UCS-2 variant handed to iconv_open() as the target encoding:
 * big-endian when WORDS_BIGENDIAN is defined, little-endian otherwise.
 */
#if !defined(WORDS_BIGENDIAN)
#define UCS_2_INTERNAL "UCS-2LE"
#else
#define UCS_2_INTERNAL "UCS-2BE"
#endif
67 |
|
68 |
/*
 * Decode a byte buffer in the named encoding into UCS-2 characters using
 * iconv.
 *
 * encoding        name of the source encoding, passed to iconv_open(); must
 *                 not be NULL.
 * bytes           input bytes.
 * num_bytes       number of input bytes.
 * characters      out: newly allocated buffer holding the decoded characters
 *                 with leading byte order marks removed.  This function does
 *                 not keep a reference to it; ownership passes to the caller.
 * num_characters  out: number of characters stored in *characters.
 *
 * Returns 0 on success, JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED when
 * iconv_open() cannot set up a conversion from the encoding, or
 * JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE when iconv() reports a failure.
 * The output parameters are written only on success.
 */
int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
  assert(encoding != NULL);

  iconv_t state = iconv_open(UCS_2_INTERNAL, encoding);
  if (state == (iconv_t) -1) {
    return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
  }

  ICONV_CONST char * input = (char *) bytes;
  size_t input_bytes_left = num_bytes;

  /* The output buffer has room for one character per input byte. */
  jschar * c = xnew(jschar, num_bytes);
  char * output = (char *) c;
  size_t output_bytes_left = sizeof(jschar) * num_bytes;

  size_t result = iconv(state, &input, &input_bytes_left, &output, &output_bytes_left);

  /* The conversion descriptor is not needed past this point whether or not
     the conversion worked; release it here so no return path leaks it. */
  iconv_close(state);

  if (result == (size_t) -1) {
    free(c);
    return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE;
  }

  assert(input_bytes_left == 0);

  /* iconv advanced `output` past the last byte written; the distance from
     the start of the buffer is the number of characters produced. */
  size_t nc = ((jschar *) output) - c;

  skip_bom(&c, &nc);

  *characters = c;
  *num_characters = nc;
  return 0;
}
99 |
|
100 |
#elif HAVE_MULTIBYTETOWIDECHAR |
101 |
|
102 |
/* http://msdn.microsoft.com/en-us/library/ms776446(VS.85).aspx */ |
103 |
static struct CodePage { |
104 |
UINT value; |
105 |
LPCSTR string; |
106 |
} code_pages[] = { |
107 |
{37, "IBM037"}, /* IBM EBCDIC US-Canada */ |
108 |
{437, "IBM437"}, /* OEM United States */ |
109 |
{500, "IBM500"}, /* IBM EBCDIC International */ |
110 |
{708, "ASMO-708"}, /* Arabic (ASMO 708) */ |
111 |
{720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */ |
112 |
{737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */ |
113 |
{775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */ |
114 |
{850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ |
115 |
{852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */ |
116 |
{855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ |
117 |
{857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */ |
118 |
{858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */ |
119 |
{860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ |
120 |
{861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */ |
121 |
{862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */ |
122 |
{863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ |
123 |
{864, "IBM864"}, /* OEM Arabic; Arabic (864) */ |
124 |
{865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ |
125 |
{866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */ |
126 |
{869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ |
127 |
{870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ |
128 |
{874, "windows-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ |
129 |
{875, "cp875"}, /* IBM EBCDIC Greek Modern */ |
130 |
{932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ |
131 |
{936, "gb2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ |
132 |
{949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */ |
133 |
{950, "big5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ |
134 |
{1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */ |
135 |
{1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */ |
136 |
{1140, "IBM01140"}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ |
137 |
{1141, "IBM01141"}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ |
138 |
{1142, "IBM01142"}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ |
139 |
{1143, "IBM01143"}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ |
140 |
{1144, "IBM01144"}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ |
141 |
{1145, "IBM01145"}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ |
142 |
{1146, "IBM01146"}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ |
143 |
{1147, "IBM01147"}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ |
144 |
{1148, "IBM01148"}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ |
145 |
{1149, "IBM01149"}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ |
146 |
{1200, "utf-16"}, /* Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications */ |
147 |
{1201, "unicodeFFFE"}, /* Unicode UTF-16, big endian byte order; available only to managed applications */ |
148 |
{1250, "windows-1250"}, /* ANSI Central European; Central European (Windows) */ |
149 |
{1251, "windows-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */ |
150 |
{1252, "windows-1252"}, /* ANSI Latin 1; Western European (Windows) */ |
151 |
{1253, "windows-1253"}, /* ANSI Greek; Greek (Windows) */ |
152 |
{1254, "windows-1254"}, /* ANSI Turkish; Turkish (Windows) */ |
153 |
{1255, "windows-1255"}, /* ANSI Hebrew; Hebrew (Windows) */ |
154 |
{1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */ |
155 |
{1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */ |
156 |
{1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ |
157 |
{1361, "Johab"}, /* Korean (Johab) */ |
158 |
{10000, "macintosh"}, /* MAC Roman; Western European (Mac) */ |
159 |
{10001, "x-mac-japanese"}, /* Japanese (Mac) */ |
160 |
{10002, "x-mac-chinesetrad"}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ |
161 |
{10003, "x-mac-korean"}, /* Korean (Mac) */ |
162 |
{10004, "x-mac-arabic"}, /* Arabic (Mac) */ |
163 |
{10005, "x-mac-hebrew"}, /* Hebrew (Mac) */ |
164 |
{10006, "x-mac-greek"}, /* Greek (Mac) */ |
165 |
{10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */ |
166 |
{10008, "x-mac-chinesesimp"}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ |
167 |
{10010, "x-mac-romanian"}, /* Romanian (Mac) */ |
168 |
{10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */ |
169 |
{10021, "x-mac-thai"}, /* Thai (Mac) */ |
170 |
{10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */ |
171 |
{10079, "x-mac-icelandic"}, /* Icelandic (Mac) */ |
172 |
{10081, "x-mac-turkish"}, /* Turkish (Mac) */ |
173 |
{10082, "x-mac-croatian"}, /* Croatian (Mac) */ |
174 |
{12000, "utf-32"}, /* Unicode UTF-32, little endian byte order; available only to managed applications */ |
175 |
{12001, "utf-32BE"}, /* Unicode UTF-32, big endian byte order; available only to managed applications */ |
176 |
{20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */ |
177 |
{20001, "x-cp20001"}, /* TCA Taiwan */ |
178 |
{20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */ |
179 |
{20003, "x-cp20003"}, /* IBM5550 Taiwan */ |
180 |
{20004, "x-cp20004"}, /* TeleText Taiwan */ |
181 |
{20005, "x-cp20005"}, /* Wang Taiwan */ |
182 |
{20105, "x-IA5"}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ |
183 |
{20106, "x-IA5-German"}, /* IA5 German (7-bit) */ |
184 |
{20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */ |
185 |
{20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */ |
186 |
{20127, "us-ascii"}, /* US-ASCII (7-bit) */ |
187 |
{20261, "x-cp20261"}, /* T.61 */ |
188 |
{20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */ |
189 |
{20273, "IBM273"}, /* IBM EBCDIC Germany */ |
190 |
{20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ |
191 |
{20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ |
192 |
{20280, "IBM280"}, /* IBM EBCDIC Italy */ |
193 |
{20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ |
194 |
{20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ |
195 |
{20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ |
196 |
{20297, "IBM297"}, /* IBM EBCDIC France */ |
197 |
{20420, "IBM420"}, /* IBM EBCDIC Arabic */ |
198 |
{20423, "IBM423"}, /* IBM EBCDIC Greek */ |
199 |
{20424, "IBM424"}, /* IBM EBCDIC Hebrew */ |
200 |
{20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */ |
201 |
{20838, "IBM-Thai"}, /* IBM EBCDIC Thai */ |
202 |
{20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ |
203 |
{20871, "IBM871"}, /* IBM EBCDIC Icelandic */ |
204 |
{20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ |
205 |
{20905, "IBM905"}, /* IBM EBCDIC Turkish */ |
206 |
{20924, "IBM00924"}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ |
207 |
{20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */ |
208 |
{20936, "x-cp20936"}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ |
209 |
{20949, "x-cp20949"}, /* Korean Wansung */ |
210 |
{21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ |
211 |
{21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ |
212 |
{28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ |
213 |
{28592, "iso-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */ |
214 |
{28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */ |
215 |
{28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */ |
216 |
{28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */ |
217 |
{28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */ |
218 |
{28597, "iso-8859-7"}, /* ISO 8859-7 Greek */ |
219 |
{28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ |
220 |
{28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */ |
221 |
{28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */ |
222 |
{28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */ |
223 |
{29001, "x-Europa"}, /* Europa 3 */ |
224 |
{38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ |
225 |
{50220, "iso-2022-jp"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */ |
226 |
{50221, "csISO2022JP"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */ |
227 |
{50222, "iso-2022-jp"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */ |
228 |
{50225, "iso-2022-kr"}, /* ISO 2022 Korean */ |
229 |
{50227, "x-cp50227"}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ |
230 |
{51932, "euc-jp"}, /* EUC Japanese */ |
231 |
{51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ |
232 |
{51949, "euc-kr"}, /* EUC Korean */ |
233 |
{52936, "hz-gb-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ |
234 |
{54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ |
235 |
{57002, "x-iscii-de"}, /* ISCII Devanagari */ |
236 |
{57003, "x-iscii-be"}, /* ISCII Bengali */ |
237 |
{57004, "x-iscii-ta"}, /* ISCII Tamil */ |
238 |
{57005, "x-iscii-te"}, /* ISCII Telugu */ |
239 |
{57006, "x-iscii-as"}, /* ISCII Assamese */ |
240 |
{57007, "x-iscii-or"}, /* ISCII Oriya */ |
241 |
{57008, "x-iscii-ka"}, /* ISCII Kannada */ |
242 |
{57009, "x-iscii-ma"}, /* ISCII Malayalam */ |
243 |
{57010, "x-iscii-gu"}, /* ISCII Gujarati */ |
244 |
{57011, "x-iscii-pa"}, /* ISCII Punjabi */ |
245 |
{65000, "utf-7"}, /* Unicode (UTF-7) */ |
246 |
{65001, "utf-8"}, /* Unicode (UTF-8) */ |
247 |
}; |
248 |
|
249 |
/*
 * Look up the Windows code page identifier for an encoding name.
 *
 * encoding   name to search for; compared case-insensitively against the
 *            names in code_pages.
 * code_page  out: receives the identifier of the first matching entry; left
 *            untouched when no entry matches.
 *
 * Returns 0 when a match is found and -1 otherwise.
 */
int find_code_page(const char * encoding, UINT * code_page) {
  const struct CodePage * const end = code_pages + sizeof(code_pages) / sizeof(code_pages[0]);
  const struct CodePage * entry;

  for (entry = code_pages; entry != end; entry++) {
    if (strcasecmp(encoding, entry->string) != 0) {
      continue;
    }
    *code_page = entry->value;
    return 0;
  }

  return -1;
}
258 |
|
259 |
/*
 * Decode a byte buffer in the named encoding into wide characters using
 * MultiByteToWideChar.
 *
 * encoding        name of the source encoding, resolved to a code page with
 *                 find_code_page(); must not be NULL.
 * bytes           input bytes.
 * num_bytes       number of input bytes; values above INT_MAX are fatal.
 * characters      out: newly allocated buffer holding the decoded characters
 *                 with leading byte order marks removed.  This function does
 *                 not keep a reference to it; ownership passes to the caller.
 * num_characters  out: number of characters stored in *characters.
 *
 * Returns 0 on success, JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED when the
 * encoding name is not in the code page table, or
 * JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE when MultiByteToWideChar fails.
 * The output parameters are written only on success, so a failed call never
 * leaves *characters pointing at freed memory.
 */
int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
  assert(encoding != NULL);

  /* Validate the encoding before looking at the input so that an unknown
     name is reported even for empty input, matching the other
     implementations of this function in this file. */
  UINT code_page;
  if (find_code_page(encoding, &code_page) != 0) {
    return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
  }

  /* MultiByteToWideChar fails when given a zero-length input, so the empty
     case is answered directly. */
  if (num_bytes == 0) {
    *characters = xnew(jschar, 0);
    *num_characters = 0;
    return 0;
  }

  /* The API takes its lengths as int. */
  if (num_bytes > INT_MAX) {
    fatal("overflow");
  }

  /* The output buffer has room for one character per input byte. */
  jschar * c = xnew(jschar, num_bytes);

  int result = MultiByteToWideChar(code_page, 0, (LPCSTR) bytes, (int) num_bytes, c, (int) num_bytes);
  if (result == 0) {
    free(c);
    return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE;
  }

  size_t nc = (size_t) result;
  skip_bom(&c, &nc);

  *characters = c;
  *num_characters = nc;
  return 0;
}
289 |
|
290 |
#else |
291 |
|
292 |
/*
 * Minimal decoder compiled when neither iconv nor MultiByteToWideChar is
 * available.
 *
 * Only the encoding names "us-ascii", "iso-8859-1" and "utf-8" are accepted
 * (compared case-insensitively), and only input made up entirely of bytes
 * in the range 0..127 can be decoded; each such byte becomes one character
 * with the same value.
 *
 * encoding        name of the source encoding; must not be NULL.
 * bytes           input bytes.
 * num_bytes       number of input bytes.
 * characters      out: newly allocated buffer of num_bytes characters.  This
 *                 function does not keep a reference to it; ownership passes
 *                 to the caller.
 * num_characters  out: number of characters stored in *characters.
 *
 * Returns 0 on success, or JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED when the
 * encoding name is not one of the three above or when any input byte is
 * greater than 127.  The output parameters are written only on success.
 */
int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
  assert(encoding != NULL);

  if (strcasecmp(encoding, "us-ascii") != 0 && strcasecmp(encoding, "iso-8859-1") != 0 && strcasecmp(encoding, "utf-8") != 0) {
    return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
  }

  jschar * c = xnew(jschar, num_bytes);
  for (size_t i = 0; i < num_bytes; i++) {
    /* Bytes above 127 would need real decoding, which this fallback does
       not implement. */
    if (bytes[i] > 127) {
      free(c);
      return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
    }
    c[i] = bytes[i];
  }

  *characters = c;
  *num_characters = num_bytes;
  return 0;
}
312 |
|
313 |
#endif |