/[jscoverage]/trunk/encoding.c
ViewVC logotype

Annotation of /trunk/encoding.c

Parent Directory | Revision Log


Revision 313 - (hide annotations)
Mon Oct 13 17:52:20 2008 UTC (10 years, 5 months ago) by siliconforks
File MIME type: text/plain
File size: 13560 byte(s)
Typo fix.
1 siliconforks 174 /*
2     encoding.c - character encoding
3     Copyright (C) 2008 siliconforks.com
4    
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9    
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13     GNU General Public License for more details.
14    
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18     */
19    
20     #include <config.h>
21    
22     #include "encoding.h"
23    
24     #include <assert.h>
25 siliconforks 297 #include <limits.h>
26 siliconforks 174 #include <string.h>
27    
28     #ifdef HAVE_ICONV_H
29     #include <iconv.h>
30     #elif defined HAVE_WINDOWS_H
31     #include <windows.h>
32     #endif
33    
34     #include "util.h"
35    
36     static void skip_bom(jschar ** characters, size_t * num_characters) {
37     jschar * c = *characters;
38     size_t nc = *num_characters;
39    
40     size_t i;
41     for (i = 0; i < nc; i++) {
42     if (c[i] != 0xfeff) {
43     break;
44     }
45     }
46    
47     if (i == 0) {
48     return;
49     }
50    
51     nc -= i;
52     jschar * old = c;
53     c = xnew(jschar, nc);
54     memcpy(c, old + i, nc * sizeof(jschar));
55     free(old);
56    
57     *characters = c;
58     *num_characters = nc;
59     }
60    
61     #ifdef HAVE_ICONV
62    
63     #ifdef WORDS_BIGENDIAN
64 siliconforks 188 #define UTF_16_INTERNAL "UTF-16BE"
65 siliconforks 174 #else
66 siliconforks 188 #define UTF_16_INTERNAL "UTF-16LE"
67 siliconforks 174 #endif
68    
69     int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
70     assert(encoding != NULL);
71    
72 siliconforks 188 iconv_t state = iconv_open(UTF_16_INTERNAL, encoding);
73 siliconforks 174 if (state == (iconv_t) -1) {
74     return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
75     }
76    
77     ICONV_CONST char * input = (char *) bytes;
78     size_t input_bytes_left = num_bytes;
79    
80     jschar * c = xnew(jschar, num_bytes);
81     char * output = (char *) c;
82     size_t output_bytes_left = sizeof(jschar) * num_bytes;
83    
84     size_t result = iconv(state, &input, &input_bytes_left, &output, &output_bytes_left);
85 siliconforks 178 iconv_close(state);
86 siliconforks 174 if (result == (size_t) -1) {
87     free(c);
88     return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE;
89     }
90    
91     assert(input_bytes_left == 0);
92    
93     size_t nc = ((jschar *) output) - c;
94    
95     skip_bom(&c, &nc);
96    
97     *characters = c;
98     *num_characters = nc;
99     return 0;
100     }
101    
102     #elif HAVE_MULTIBYTETOWIDECHAR
103    
104     /* http://msdn.microsoft.com/en-us/library/ms776446(VS.85).aspx */
105     static struct CodePage {
106     UINT value;
107     LPCSTR string;
108     } code_pages[] = {
109     {37, "IBM037"}, /* IBM EBCDIC US-Canada */
110     {437, "IBM437"}, /* OEM United States */
111     {500, "IBM500"}, /* IBM EBCDIC International */
112     {708, "ASMO-708"}, /* Arabic (ASMO 708) */
113     {720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */
114     {737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */
115     {775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */
116     {850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */
117     {852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */
118     {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */
119     {857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */
120     {858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */
121     {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */
122     {861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */
123     {862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */
124     {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */
125     {864, "IBM864"}, /* OEM Arabic; Arabic (864) */
126     {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */
127     {866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */
128     {869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */
129     {870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */
130     {874, "windows-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */
131     {875, "cp875"}, /* IBM EBCDIC Greek Modern */
132     {932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */
133     {936, "gb2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */
134     {949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */
135     {950, "big5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */
136     {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */
137     {1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */
138     {1140, "IBM01140"}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */
139     {1141, "IBM01141"}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */
140     {1142, "IBM01142"}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */
141     {1143, "IBM01143"}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */
142     {1144, "IBM01144"}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */
143     {1145, "IBM01145"}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */
144     {1146, "IBM01146"}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */
145     {1147, "IBM01147"}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */
146     {1148, "IBM01148"}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */
147     {1149, "IBM01149"}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */
148     {1200, "utf-16"}, /* Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications */
149     {1201, "unicodeFFFE"}, /* Unicode UTF-16, big endian byte order; available only to managed applications */
150     {1250, "windows-1250"}, /* ANSI Central European; Central European (Windows) */
151     {1251, "windows-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */
152     {1252, "windows-1252"}, /* ANSI Latin 1; Western European (Windows) */
153     {1253, "windows-1253"}, /* ANSI Greek; Greek (Windows) */
154     {1254, "windows-1254"}, /* ANSI Turkish; Turkish (Windows) */
155     {1255, "windows-1255"}, /* ANSI Hebrew; Hebrew (Windows) */
156     {1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */
157     {1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */
158     {1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */
159     {1361, "Johab"}, /* Korean (Johab) */
160     {10000, "macintosh"}, /* MAC Roman; Western European (Mac) */
161     {10001, "x-mac-japanese"}, /* Japanese (Mac) */
162     {10002, "x-mac-chinesetrad"}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */
163     {10003, "x-mac-korean"}, /* Korean (Mac) */
164     {10004, "x-mac-arabic"}, /* Arabic (Mac) */
165     {10005, "x-mac-hebrew"}, /* Hebrew (Mac) */
166     {10006, "x-mac-greek"}, /* Greek (Mac) */
167     {10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */
168     {10008, "x-mac-chinesesimp"}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */
169     {10010, "x-mac-romanian"}, /* Romanian (Mac) */
170     {10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */
171     {10021, "x-mac-thai"}, /* Thai (Mac) */
172     {10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */
173     {10079, "x-mac-icelandic"}, /* Icelandic (Mac) */
174     {10081, "x-mac-turkish"}, /* Turkish (Mac) */
175     {10082, "x-mac-croatian"}, /* Croatian (Mac) */
176     {12000, "utf-32"}, /* Unicode UTF-32, little endian byte order; available only to managed applications */
177     {12001, "utf-32BE"}, /* Unicode UTF-32, big endian byte order; available only to managed applications */
178     {20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */
179     {20001, "x-cp20001"}, /* TCA Taiwan */
180     {20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */
181     {20003, "x-cp20003"}, /* IBM5550 Taiwan */
182     {20004, "x-cp20004"}, /* TeleText Taiwan */
183     {20005, "x-cp20005"}, /* Wang Taiwan */
184     {20105, "x-IA5"}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */
185     {20106, "x-IA5-German"}, /* IA5 German (7-bit) */
186     {20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */
187     {20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */
188     {20127, "us-ascii"}, /* US-ASCII (7-bit) */
189     {20261, "x-cp20261"}, /* T.61 */
190     {20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */
191     {20273, "IBM273"}, /* IBM EBCDIC Germany */
192     {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */
193     {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */
194     {20280, "IBM280"}, /* IBM EBCDIC Italy */
195     {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */
196     {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */
197     {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */
198     {20297, "IBM297"}, /* IBM EBCDIC France */
199     {20420, "IBM420"}, /* IBM EBCDIC Arabic */
200     {20423, "IBM423"}, /* IBM EBCDIC Greek */
201     {20424, "IBM424"}, /* IBM EBCDIC Hebrew */
202     {20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */
203     {20838, "IBM-Thai"}, /* IBM EBCDIC Thai */
204     {20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */
205     {20871, "IBM871"}, /* IBM EBCDIC Icelandic */
206     {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */
207     {20905, "IBM905"}, /* IBM EBCDIC Turkish */
208     {20924, "IBM00924"}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */
209     {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */
210     {20936, "x-cp20936"}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */
211     {20949, "x-cp20949"}, /* Korean Wansung */
212     {21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */
213     {21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */
214     {28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */
215     {28592, "iso-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */
216     {28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */
217     {28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */
218     {28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */
219     {28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */
220     {28597, "iso-8859-7"}, /* ISO 8859-7 Greek */
221     {28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
222     {28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */
223     {28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */
224     {28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */
225     {29001, "x-Europa"}, /* Europa 3 */
226     {38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
227     {50220, "iso-2022-jp"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */
228     {50221, "csISO2022JP"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */
229     {50222, "iso-2022-jp"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */
230     {50225, "iso-2022-kr"}, /* ISO 2022 Korean */
231     {50227, "x-cp50227"}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */
232     {51932, "euc-jp"}, /* EUC Japanese */
233     {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */
234     {51949, "euc-kr"}, /* EUC Korean */
235     {52936, "hz-gb-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */
236     {54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */
237     {57002, "x-iscii-de"}, /* ISCII Devanagari */
238     {57003, "x-iscii-be"}, /* ISCII Bengali */
239     {57004, "x-iscii-ta"}, /* ISCII Tamil */
240     {57005, "x-iscii-te"}, /* ISCII Telugu */
241     {57006, "x-iscii-as"}, /* ISCII Assamese */
242     {57007, "x-iscii-or"}, /* ISCII Oriya */
243     {57008, "x-iscii-ka"}, /* ISCII Kannada */
244     {57009, "x-iscii-ma"}, /* ISCII Malayalam */
245     {57010, "x-iscii-gu"}, /* ISCII Gujarati */
246     {57011, "x-iscii-pa"}, /* ISCII Punjabi */
247     {65000, "utf-7"}, /* Unicode (UTF-7) */
248     {65001, "utf-8"}, /* Unicode (UTF-8) */
249     };
250    
251     int find_code_page(const char * encoding, UINT * code_page) {
252     for (size_t i = 0; i < sizeof(code_pages) / sizeof(code_pages[0]); i++) {
253     if (strcasecmp(encoding, code_pages[i].string) == 0) {
254     *code_page = code_pages[i].value;
255     return 0;
256     }
257     }
258     return -1;
259     }
260    
261     int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
262     assert(encoding != NULL);
263    
264     if (num_bytes == 0) {
265     *characters = xnew(jschar, 0);
266     *num_characters = 0;
267     return 0;
268     }
269    
270     UINT code_page;
271     if (find_code_page(encoding, &code_page) != 0) {
272     return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
273     }
274    
275     if (num_bytes > INT_MAX) {
276     fatal("overflow");
277     }
278    
279     *characters = xnew(jschar, num_bytes);
280    
281 siliconforks 308 int result = MultiByteToWideChar(code_page, MB_ERR_INVALID_CHARS, bytes, num_bytes, *characters, num_bytes);
282 siliconforks 174 if (result == 0) {
283     free(*characters);
284     return JSCOVERAGE_ERROR_INVALID_BYTE_SEQUENCE;
285     }
286    
287     *num_characters = result;
288     skip_bom(characters, num_characters);
289     return 0;
290     }
291    
292     #else
293    
294     int jscoverage_bytes_to_characters(const char * encoding, const uint8_t * bytes, size_t num_bytes, jschar ** characters, size_t * num_characters) {
295     assert(encoding != NULL);
296    
297     if (strcasecmp(encoding, "us-ascii") != 0 && strcasecmp(encoding, "iso-8859-1") != 0 && strcasecmp(encoding, "utf-8") != 0) {
298     return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
299     }
300    
301     jschar * c = xnew(jschar, num_bytes);
302     for (size_t i = 0; i < num_bytes; i++) {
303 siliconforks 313 if (bytes[i] > 127) {
304 siliconforks 174 free(c);
305     return JSCOVERAGE_ERROR_ENCODING_NOT_SUPPORTED;
306     }
307     c[i] = bytes[i];
308     }
309    
310     *characters = c;
311     *num_characters = num_bytes;
312     return 0;
313     }
314    
315     #endif

  ViewVC Help
Powered by ViewVC 1.1.24