Annotation of XML/encoding.c, revision 1.3
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
15: * Original code from "Martin J. Duerst" <duerst@w3.org>
16: *
17: * See Copyright for the status of this software.
18: *
1.3 ! daniel 19: * $Id: encoding.c,v 1.4 1998/11/01 19:34:25 veillard Exp $
1.1 daniel 20: *
21: * Daniel.Veillard@w3.org
22: */
23:
24: #include "encoding.h"
1.3 ! daniel 25:
! 26: /*
! 27: * From rfc2044: encoding of the Unicode values on UTF-8:
! 28: *
! 29: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
! 30: * 0000 0000-0000 007F 0xxxxxxx
! 31: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
! 32: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
! 33: *
! 34: * I hope we won't use values > 0xFFFF anytime soon !
! 35: */
1.1 daniel 36:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged; bytes in the
 * range 0x80-0xFF are expanded to the two-byte form 110000xx 10xxxxxx.
 * A two-byte sequence is written only when both bytes fit, so @out never
 * ends with a truncated sequence when -1 is returned.
 * return values: number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /* need room for both bytes before writing either of them */
            if (outend - out < 2) return -1;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (c & 0x3F));
        }
    }
    return (int) (out - outstart);
}
71:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only one-byte sequences and two-byte sequences
 * starting with 0xC2 or 0xC3 (i.e. U+0080..U+00FF) can be represented;
 * anything else, including a truncated or malformed two-byte sequence,
 * makes the transcoding fail.
 * TODO: need a fallback mechanism ...
 * return values: the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c, d;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if ((c & 0xFE) == 0xC2) {
            /* lead byte 0xC2 or 0xC3: exactly one continuation byte follows */
            if (in >= inend) return -2;        /* sequence cut short */
            d = *in++;
            if ((d & 0xC0) != 0x80) return -2; /* not of the form 10xxxxxx */
            if (out >= outend) return -1;
            *out++ = (unsigned char) (((c & 0x03) << 6) | (d & 0x3F));
        }
        else return -2;
    }
    return (int) (out - outstart);
}
107:
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate (0xD800-0xDBFF) must be followed
 * by a low surrogate (0xDC00-0xDFFF); the pair is combined into a single
 * UCS-4 value and emitted as a four-byte sequence. An unpaired low
 * surrogate is not detected and is encoded like any other 16-bit value.
 * A multi-byte sequence is written only when all of its bytes fit.
 * return values: number of bytes written, or -1 by lack of space or when
 *     a high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned short* inend = in + inlen;
    unsigned int c, d;
    unsigned char lead;
    int trailing;

    while (in < inend) {
        c = *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in < inend) && (((d = *in++) & 0xFC00) == 0xDC00)) {
                c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /* pick the lead byte and the number of 10xxxxxx bytes that follow */
        if (c < 0x80) {
            lead = (unsigned char) c;
            trailing = 0;
        }
        else if (c < 0x800) {
            lead = (unsigned char) ((c >> 6) | 0xC0);
            trailing = 1;
        }
        else if (c < 0x10000) {
            lead = (unsigned char) ((c >> 12) | 0xE0);
            trailing = 2;
        }
        else {
            lead = (unsigned char) ((c >> 18) | 0xF0);
            trailing = 3;
        }

        /* refuse to emit a partial sequence */
        if (outend - out < trailing + 1) return -1;

        *out++ = lead;
        /* continuation bytes, most significant 6-bit group first */
        while (trailing > 0) {
            trailing--;
            *out++ = (unsigned char) (0x80 | ((c >> (6 * trailing)) & 0x3F));
        }
    }
    return (int) (out - outstart);
}
155:
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values below 0x10000 are stored as one short,
 * values up to 0x10FFFF as a high/low surrogate pair.
 * The transcoding fails on a continuation byte in leading position, a
 * lead byte >= 0xF8, a missing or malformed continuation byte, or a
 * decoded value above 0x10FFFF.
 * TODO: need a fallback mechanism ...
 * return values: the number of shorts written, or -1 by lack of space, or
 *     -2 if the transcoding failed.
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart = out;
    unsigned short* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) return -2;    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* bad input is a transcoding failure, not a lack of space */
            if (in >= inend) return -2;
            d = *in++;
            if ((d & 0xC0) != 0x80) return -2;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free slots */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond the range UTF-16 can represent */
    }
    return (int) (out - outstart);
}
207:
208:
Webmaster