Annotation of XML/SGMLparser.c, revision 1.3
1.1 veillard 1: /*
2: * SGMLparser.c : an attempt to parse Docbook documents
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
10: #include "win32config.h"
11: #else
12: #include "config.h"
13: #endif
14:
15: #include "xmlversion.h"
16: #ifdef LIBXML_SGML_ENABLED
17:
18: #include <stdio.h>
19: #include <string.h>
20: #ifdef HAVE_CTYPE_H
21: #include <ctype.h>
22: #endif
23: #ifdef HAVE_STDLIB_H
24: #include <stdlib.h>
25: #endif
26: #ifdef HAVE_SYS_STAT_H
27: #include <sys/stat.h>
28: #endif
29: #ifdef HAVE_FCNTL_H
30: #include <fcntl.h>
31: #endif
32: #ifdef HAVE_UNISTD_H
33: #include <unistd.h>
34: #endif
35: #ifdef HAVE_ZLIB_H
36: #include <zlib.h>
37: #endif
38:
39: #include <libxml/xmlmemory.h>
40: #include <libxml/tree.h>
41: #include <libxml/SGMLparser.h>
42: #include <libxml/entities.h>
43: #include <libxml/encoding.h>
44: #include <libxml/parser.h>
45: #include <libxml/valid.h>
46: #include <libxml/parserInternals.h>
47: #include <libxml/xmlIO.h>
48: #include <libxml/SAX.h>
1.3 ! veillard 49: #include <libxml/uri.h>
1.1 veillard 50: #include "xml-error.h"
51:
52: #define SGML_MAX_NAMELEN 1000
53: #define INPUT_CHUNK 50
54: #define SGML_PARSER_BIG_BUFFER_SIZE 1000
55: #define SGML_PARSER_BUFFER_SIZE 100
56:
57: /* #define DEBUG */
58: /* #define DEBUG_PUSH */
59:
60: /************************************************************************
61: * *
62: * Parser stacks related functions and macros *
63: * *
64: ************************************************************************/
65:
66: /*
67: * Generic function for accessing stacks in the Parser Context
68: */
69:
70: #define PUSH_AND_POP(scope, type, name) \
71: scope int sgml##name##Push(sgmlParserCtxtPtr ctxt, type value) { \
72: if (ctxt->name##Nr >= ctxt->name##Max) { \
73: ctxt->name##Max *= 2; \
74: ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
75: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
76: if (ctxt->name##Tab == NULL) { \
77: fprintf(stderr, "realloc failed !\n"); \
78: return(0); \
79: } \
80: } \
81: ctxt->name##Tab[ctxt->name##Nr] = value; \
82: ctxt->name = value; \
83: return(ctxt->name##Nr++); \
84: } \
85: scope type sgml##name##Pop(sgmlParserCtxtPtr ctxt) { \
86: type ret; \
87: if (ctxt->name##Nr < 0) return(0); \
88: ctxt->name##Nr--; \
89: if (ctxt->name##Nr < 0) return(0); \
90: if (ctxt->name##Nr > 0) \
91: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
92: else \
93: ctxt->name = NULL; \
94: ret = ctxt->name##Tab[ctxt->name##Nr]; \
95: ctxt->name##Tab[ctxt->name##Nr] = 0; \
96: return(ret); \
97: } \
98:
99: PUSH_AND_POP(extern, xmlNodePtr, node)
100: PUSH_AND_POP(extern, xmlChar*, name)
101:
/*
 * Macros for accessing the input. They are meant for the parser only and
 * must not be exported. Every one of them expects a variable named "ctxt"
 * (the parser context) to be in scope where it is used.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to
 * use them:
 *
 *   CUR_PTR   the current pointer to the xmlChar to be parsed.
 *   CUR       the xmlChar at the current position, converted to int. It
 *             should only be compared to ASCII values, otherwise it would
 *             break when running with UTF-8 encoding.
 *   RAW       -1 when ctxt->token is set, the current xmlChar otherwise.
 *   NXT(n)    the n'th next xmlChar. Same as CUR, it should be used only
 *             to compare against ASCII based substrings.
 *   UPPER     the current xmlChar converted to uppercase.
 *   UPP(n)    the n'th next xmlChar converted to uppercase. Same as CUR,
 *             it should be used only to compare against ASCII substrings.
 *   SKIP(n)   skip n xmlChar; must only be used to skip ASCII defined
 *             strings within the parser.
 *   SHRINK    calls xmlParserInputShrink() on the current input.
 *   GROW      calls xmlParserInputGrow() on the current input, asking for
 *             INPUT_CHUNK more.
 *
 * Clean macros, not dependent on an ASCII context:
 *
 *   CURRENT      the current char value as an int.
 *   NEXT         skip to the next character through xmlNextChar() and
 *                count it in ctxt->nbChars.
 *   NEXTL(l)     advance by l bytes, keeping line/col up to date and
 *                clearing ctxt->token.
 *   CUR_CHAR(l)  sgmlCurrentChar() on the context, length stored in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on string s, length in l.
 *   COPY_BUF(l,b,i,v)  store char v of length l in buffer b at index i and
 *                advance i.
 *   SKIP_BLANKS  calls sgmlSkipBlankChars().
 *
 * Caution: SKIP_BLANKS, NEXT, CUR_CHAR and CUR_SCHAR already end with a
 * semicolon, and NEXT, NEXTL and COPY_BUF expand to more than one
 * statement. None of them can safely be the unbraced body of an if, else
 * or loop.
 */

/* Position and look-ahead. */
#define CUR_PTR ctxt->input->cur
#define NXT(val) ctxt->input->cur[(val)]
#define CURRENT ((int) (*ctxt->input->cur))

/* Case-insensitive look-ahead. */
#define UPPER (toupper(*ctxt->input->cur))
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Moving forward. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
#define SKIP_BLANKS sgmlSkipBlankChars(ctxt);

/* Input buffer management. */
#define SHRINK  xmlParserInputShrink(ctxt->input)
#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#if 0
/* Disabled variant relying on the local sgmlNextChar(). */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT sgmlNextChar(ctxt);
#else
/*
 * Imported from XML.
 * NXT and CUR_PTR were defined a second time here with exactly the same
 * text; the definitions above are the ones in effect.
 * A token-aware CUR was considered and left disabled:
 *   (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur))
 */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt);ctxt->nbChars++;

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l)							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++;						\
	ctxt->input->col = 1;						\
    } else								\
	ctxt->input->col++;						\
    ctxt->token = 0;							\
    ctxt->input->cur += l;						\
    ctxt->nbChars++;

/*
 * Disabled tail of NEXTL, kept for reference:
 *   if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
 *   if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 */

#define CUR_CHAR(l) sgmlCurrentChar(ctxt, &l);
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l);

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v);
#endif
182:
183: /**
184: * sgmlCurrentChar:
185: * @ctxt: the SGML parser context
186: * @len: pointer to the length of the char read
187: *
188: * The current char value, if using UTF-8 this may actaully span multiple
189: * bytes in the input buffer. Implement the end of line normalization:
190: * 2.11 End-of-Line Handling
191: * If the encoding is unspecified, in the case we find an ISO-Latin-1
192: * char, then the encoding converter is plugged in automatically.
193: *
194: * Returns the current char value and its lenght
195: */
196:
197: int
198: sgmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
199: if (ctxt->instate == XML_PARSER_EOF)
200: return(0);
201:
202: if (ctxt->token != 0) {
203: *len = 0;
204: return(ctxt->token);
205: }
206: if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
207: /*
208: * We are supposed to handle UTF8, check it's valid
209: * From rfc2044: encoding of the Unicode values on UTF-8:
210: *
211: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
212: * 0000 0000-0000 007F 0xxxxxxx
213: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
214: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
215: *
216: * Check for the 0x110000 limit too
217: */
218: const unsigned char *cur = ctxt->input->cur;
219: unsigned char c;
220: unsigned int val;
221:
222: c = *cur;
223: if (c & 0x80) {
224: if (cur[1] == 0)
225: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
226: if ((cur[1] & 0xc0) != 0x80)
227: goto encoding_error;
228: if ((c & 0xe0) == 0xe0) {
229:
230: if (cur[2] == 0)
231: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
232: if ((cur[2] & 0xc0) != 0x80)
233: goto encoding_error;
234: if ((c & 0xf0) == 0xf0) {
235: if (cur[3] == 0)
236: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
237: if (((c & 0xf8) != 0xf0) ||
238: ((cur[3] & 0xc0) != 0x80))
239: goto encoding_error;
240: /* 4-byte code */
241: *len = 4;
242: val = (cur[0] & 0x7) << 18;
243: val |= (cur[1] & 0x3f) << 12;
244: val |= (cur[2] & 0x3f) << 6;
245: val |= cur[3] & 0x3f;
246: } else {
247: /* 3-byte code */
248: *len = 3;
249: val = (cur[0] & 0xf) << 12;
250: val |= (cur[1] & 0x3f) << 6;
251: val |= cur[2] & 0x3f;
252: }
253: } else {
254: /* 2-byte code */
255: *len = 2;
256: val = (cur[0] & 0x1f) << 6;
257: val |= cur[1] & 0x3f;
258: }
259: if (!IS_CHAR(val)) {
260: if ((ctxt->sax != NULL) &&
261: (ctxt->sax->error != NULL))
262: ctxt->sax->error(ctxt->userData,
263: "Char 0x%X out of allowed range\n", val);
264: ctxt->errNo = XML_ERR_INVALID_ENCODING;
265: ctxt->wellFormed = 0;
266: ctxt->disableSAX = 1;
267: }
268: return(val);
269: } else {
270: /* 1-byte code */
271: *len = 1;
272: return((int) *ctxt->input->cur);
273: }
274: }
275: /*
276: * Assume it's a fixed lenght encoding (1) with
277: * a compatibke encoding for the ASCII set, since
278: * XML constructs only use < 128 chars
279: */
280: *len = 1;
281: if ((int) *ctxt->input->cur < 0x80)
282: return((int) *ctxt->input->cur);
283:
284: /*
285: * Humm this is bad, do an automatic flow conversion
286: */
287: xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
288: ctxt->charset = XML_CHAR_ENCODING_UTF8;
289: return(xmlCurrentChar(ctxt, len));
290:
291: encoding_error:
292: /*
293: * If we detect an UTF8 error that probably mean that the
294: * input encoding didn't get properly advertized in the
295: * declaration header. Report the error and switch the encoding
296: * to ISO-Latin-1 (if you don't like this policy, just declare the
297: * encoding !)
298: */
299: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
300: ctxt->sax->error(ctxt->userData,
301: "Input is not proper UTF-8, indicate encoding !\n");
302: ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
303: ctxt->input->cur[0], ctxt->input->cur[1],
304: ctxt->input->cur[2], ctxt->input->cur[3]);
305: }
306: ctxt->errNo = XML_ERR_INVALID_ENCODING;
307:
308: ctxt->charset = XML_CHAR_ENCODING_8859_1;
309: *len = 1;
310: return((int) *ctxt->input->cur);
311: }
312:
313: /**
314: * sgmlNextChar:
315: * @ctxt: the SGML parser context
316: *
317: * Skip to the next char input char.
318: */
319:
320: void
321: sgmlNextChar(sgmlParserCtxtPtr ctxt) {
322: if (ctxt->instate == XML_PARSER_EOF)
323: return;
324: if ((*ctxt->input->cur == 0) &&
325: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
326: xmlPopInput(ctxt);
327: } else {
328: if (*(ctxt->input->cur) == '\n') {
329: ctxt->input->line++; ctxt->input->col = 1;
330: } else ctxt->input->col++;
331: ctxt->input->cur++;
332: ctxt->nbChars++;
333: if (*ctxt->input->cur == 0)
334: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
335: }
336: }
337:
338: /**
339: * sgmlSkipBlankChars:
340: * @ctxt: the SGML parser context
341: *
342: * skip all blanks character found at that point in the input streams.
343: *
344: * Returns the number of space chars skipped
345: */
346:
347: int
348: sgmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
349: int res = 0;
350:
351: while (IS_BLANK(*(ctxt->input->cur))) {
352: if ((*ctxt->input->cur == 0) &&
353: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
354: xmlPopInput(ctxt);
355: } else {
356: if (*(ctxt->input->cur) == '\n') {
357: ctxt->input->line++; ctxt->input->col = 1;
358: } else ctxt->input->col++;
359: ctxt->input->cur++;
360: ctxt->nbChars++;
361: if (*ctxt->input->cur == 0)
362: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
363: }
364: res++;
365: }
366: return(res);
367: }
368:
369:
370:
371: /************************************************************************
372: * *
373: * The list of SGML elements and their properties *
374: * *
375: ************************************************************************/
376:
377: /*
378: * Start Tag: 1 means the start tag can be ommited
379: * End Tag: 1 means the end tag can be ommited
380: * 2 means it's forbidden (empty elements)
381: * Depr: this element is deprecated
382: * DTD: 1 means that this element is valid only in the Loose DTD
383: * 2 means that this element is valid only in the Frameset DTD
384: *
385: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
386: */
387: sgmlElemDesc docbookElementTable[] = {
388: { "abbrev", 0, 0, 0, 3, 0, "" }, /* word */
389: { "abstract", 0, 0, 0, 9, 0, "" }, /* title */
390: { "accel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
391: { "ackno", 0, 0, 0, 4, 0, "" }, /* docinfo */
392: { "acronym", 0, 0, 0, 3, 0, "" }, /* word */
393: { "action", 0, 0, 0, 7, 0, "" }, /* smallcptr */
394: { "address", 0, 0, 0, 1, 0, "" },
395: { "affiliation",0, 0, 0, 9, 0, "" }, /* shortaffil */
396: { "alt", 0, 0, 0, 1, 0, "" },
397: { "anchor", 0, 2, 1, 0, 0, "" },
398: { "answer", 0, 0, 0, 9, 0, "" }, /* label */
399: { "appendix", 0, 0, 0, 9, 0, "" }, /* appendixinfo */
400: { "appendixinfo",0, 0, 0, 9, 0, "" }, /* graphic */
401: { "application",0, 0, 0, 2, 0, "" }, /* para */
402: { "area", 0, 2, 1, 0, 0, "" },
403: { "areaset", 0, 0, 0, 9, 0, "" }, /* area */
404: { "areaspec", 0, 0, 0, 9, 0, "" }, /* area */
405: { "arg", 0, 0, 0, 1, 0, "" },
406: { "article", 0, 0, 0, 9, 0, "" }, /* div.title.content */
407: { "articleinfo",0, 0, 0, 9, 0, "" }, /* graphic */
408: { "artpagenums",0, 0, 0, 4, 0, "" }, /* docinfo */
409: { "attribution",0, 0, 0, 2, 0, "" }, /* para */
410: { "audiodata", 0, 2, 1, 0, 0, "" },
411: { "audioobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
412: { "authorblurb",0, 0, 0, 9, 0, "" }, /* title */
413: { "authorgroup",0, 0, 0, 9, 0, "" }, /* author */
414: { "authorinitials",0, 0, 0, 4, 0, "" }, /* docinfo */
415: { "author", 0, 0, 0, 9, 0, "" }, /* person.ident.mix */
416: { "beginpage", 0, 2, 1, 0, 0, "" },
417: { "bibliodiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
418: { "biblioentry",0, 0, 0, 9, 0, "" }, /* articleinfo */
419: { "bibliography",0, 0, 0, 9, 0, "" }, /* bibliographyinfo */
420: { "bibliographyinfo",0, 0, 0, 9, 0, "" }, /* graphic */
421: { "bibliomisc", 0, 0, 0, 2, 0, "" }, /* para */
422: { "bibliomixed",0, 0, 0, 1, 0, "" }, /* %bibliocomponent.mix, bibliomset) */
423: { "bibliomset", 0, 0, 0, 1, 0, "" }, /* %bibliocomponent.mix; | bibliomset) */
424: { "biblioset", 0, 0, 0, 9, 0, "" }, /* bibliocomponent.mix */
425: { "blockquote", 0, 0, 0, 9, 0, "" }, /* title */
426: { "book", 0, 0, 0, 9, 0, "" }, /* div.title.content */
427: { "bookinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
428: { "bridgehead", 0, 0, 0, 8, 0, "" }, /* title */
429: { "callout", 0, 0, 0, 9, 0, "" }, /* component.mix */
430: { "calloutlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
431: { "caption", 0, 0, 0, 9, 0, "" }, /* textobject.mix */
432: { "caution", 0, 0, 0, 9, 0, "" }, /* title */
433: { "chapter", 0, 0, 0, 9, 0, "" }, /* chapterinfo */
434: { "chapterinfo",0, 0, 0, 9, 0, "" }, /* graphic */
435: { "citation", 0, 0, 0, 2, 0, "" }, /* para */
436: { "citerefentry",0, 0, 0, 9, 0, "" }, /* refentrytitle */
437: { "citetitle", 0, 0, 0, 2, 0, "" }, /* para */
438: { "city", 0, 0, 0, 4, 0, "" }, /* docinfo */
439: { "classname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
440: { "classsynopsisinfo",0,0, 0, 9, 0, "" }, /* cptr */
441: { "classsynopsis",0, 0, 0, 9, 0, "" }, /* ooclass */
442: { "cmdsynopsis",0, 0, 0, 9, 0, "" }, /* command */
443: { "co", 0, 2, 1, 0, 0, "" },
444: { "collab", 0, 0, 0, 9, 0, "" }, /* collabname */
445: { "collabname", 0, 0, 0, 4, 0, "" }, /* docinfo */
446: { "colophon", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
447: { "colspec", 0, 2, 1, 0, 0, "" },
448: { "colspec", 0, 2, 1, 0, 0, "" },
449: { "command", 0, 0, 0, 9, 0, "" }, /* cptr */
450: { "computeroutput",0, 0, 0, 9, 0, "" }, /* cptr */
451: { "confdates", 0, 0, 0, 4, 0, "" }, /* docinfo */
452: { "confgroup", 0, 0, 0, 9, 0, "" }, /* confdates */
453: { "confnum", 0, 0, 0, 4, 0, "" }, /* docinfo */
454: { "confsponsor",0, 0, 0, 4, 0, "" }, /* docinfo */
455: { "conftitle", 0, 0, 0, 4, 0, "" }, /* docinfo */
456: { "constant", 0, 0, 0, 7, 0, "" }, /* smallcptr */
457: { "constructorsynopsis",0,0, 0, 9, 0, "" }, /* modifier */
458: { "contractnum",0, 0, 0, 4, 0, "" }, /* docinfo */
459: { "contractsponsor",0, 0, 0, 4, 0, "" }, /* docinfo */
460: { "contrib", 0, 0, 0, 4, 0, "" }, /* docinfo */
461: { "copyright", 0, 0, 0, 9, 0, "" }, /* year */
462: { "corpauthor", 0, 0, 0, 4, 0, "" }, /* docinfo */
463: { "corpname", 0, 0, 0, 4, 0, "" }, /* docinfo */
464: { "country", 0, 0, 0, 4, 0, "" }, /* docinfo */
465: { "database", 0, 0, 0, 7, 0, "" }, /* smallcptr */
466: { "date", 0, 0, 0, 4, 0, "" }, /* docinfo */
467: { "dedication", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
468: { "destructorsynopsis",0,0, 0, 9, 0, "" }, /* modifier */
469: { "edition", 0, 0, 0, 4, 0, "" }, /* docinfo */
470: { "editor", 0, 0, 0, 9, 0, "" }, /* person.ident.mix */
471: { "email", 0, 0, 0, 4, 0, "" }, /* docinfo */
472: { "emphasis", 0, 0, 0, 2, 0, "" }, /* para */
473: { "entry", 0, 0, 0, 9, 0, "" }, /* tbl.entry.mdl */
474: { "entrytbl", 0, 0, 0, 9, 0, "" }, /* tbl.entrytbl.mdl */
475: { "envar", 0, 0, 0, 7, 0, "" }, /* smallcptr */
476: { "epigraph", 0, 0, 0, 9, 0, "" }, /* attribution */
477: { "equation", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
478: { "errorcode", 0, 0, 0, 7, 0, "" }, /* smallcptr */
479: { "errorname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
480: { "errortype", 0, 0, 0, 7, 0, "" }, /* smallcptr */
481: { "example", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
482: { "exceptionname",0, 0, 0, 7, 0, "" }, /* smallcptr */
483: { "fax", 0, 0, 0, 4, 0, "" }, /* docinfo */
484: { "fieldsynopsis", 0, 0, 0, 9, 0, "" }, /* modifier */
485: { "figure", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
486: { "filename", 0, 0, 0, 7, 0, "" }, /* smallcptr */
487: { "firstname", 0, 0, 0, 4, 0, "" }, /* docinfo */
488: { "firstterm", 0, 0, 0, 3, 0, "" }, /* word */
489: { "footnote", 0, 0, 0, 9, 0, "" }, /* footnote.mix */
490: { "footnoteref",0, 2, 1, 0, 0, "" },
491: { "foreignphrase",0, 0, 0, 2, 0, "" }, /* para */
492: { "formalpara", 0, 0, 0, 9, 0, "" }, /* title */
493: { "funcdef", 0, 0, 0, 1, 0, "" },
494: { "funcparams", 0, 0, 0, 9, 0, "" }, /* cptr */
495: { "funcprototype",0, 0, 0, 9, 0, "" }, /* funcdef */
496: { "funcsynopsis",0, 0, 0, 9, 0, "" }, /* funcsynopsisinfo */
497: { "funcsynopsisinfo", 0, 0, 0, 9, 0, "" }, /* cptr */
498: { "function", 0, 0, 0, 9, 0, "" }, /* cptr */
499: { "glossary", 0, 0, 0, 9, 0, "" }, /* glossaryinfo */
500: { "glossaryinfo",0, 0, 0, 9, 0, "" }, /* graphic */
501: { "glossdef", 0, 0, 0, 9, 0, "" }, /* glossdef.mix */
502: { "glossdiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
503: { "glossentry", 0, 0, 0, 9, 0, "" }, /* glossterm */
504: { "glosslist", 0, 0, 0, 9, 0, "" }, /* glossentry */
505: { "glossseealso",0, 0, 0, 2, 0, "" }, /* para */
506: { "glosssee", 0, 0, 0, 2, 0, "" }, /* para */
507: { "glossterm", 0, 0, 0, 2, 0, "" }, /* para */
508: { "graphic", 0, 2, 1, 0, 0, "" },
509: { "graphicco", 0, 0, 0, 9, 0, "" }, /* areaspec */
510: { "group", 0, 0, 0, 9, 0, "" }, /* arg */
511: { "guibutton", 0, 0, 0, 7, 0, "" }, /* smallcptr */
512: { "guiicon", 0, 0, 0, 7, 0, "" }, /* smallcptr */
513: { "guilabel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
514: { "guimenuitem",0, 0, 0, 7, 0, "" }, /* smallcptr */
515: { "guimenu", 0, 0, 0, 7, 0, "" }, /* smallcptr */
516: { "guisubmenu", 0, 0, 0, 7, 0, "" }, /* smallcptr */
517: { "hardware", 0, 0, 0, 7, 0, "" }, /* smallcptr */
518: { "highlights", 0, 0, 0, 9, 0, "" }, /* highlights.mix */
519: { "holder", 0, 0, 0, 4, 0, "" }, /* docinfo */
520: { "honorific", 0, 0, 0, 4, 0, "" }, /* docinfo */
521: { "imagedata", 0, 2, 1, 0, 0, "" },
522: { "imageobjectco",0, 0, 0, 9, 0, "" }, /* areaspec */
523: { "imageobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
524: { "important", 0, 0, 0, 9, 0, "" }, /* title */
525: { "indexdiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
526: { "indexentry", 0, 0, 0, 9, 0, "" }, /* primaryie */
527: { "index", 0, 0, 0, 9, 0, "" }, /* indexinfo */
528: { "indexinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
529: { "indexterm", 0, 0, 0, 9, 0, "" }, /* primary */
530: { "informalequation",0, 0, 0, 9, 0, "" }, /* equation.content */
531: { "informalexample",0, 0, 0, 9, 0, "" }, /* example.mix */
532: { "informalfigure",0, 0, 0, 9, 0, "" }, /* figure.mix */
533: { "informaltable",0, 0, 0, 9, 0, "" }, /* graphic */
534: { "initializer",0, 0, 0, 7, 0, "" }, /* smallcptr */
535: { "inlineequation",0, 0, 0, 9, 0, "" }, /* inlineequation.content */
536: { "inlinegraphic",0, 2, 1, 0, 0, "" },
537: { "inlinemediaobject",0,0, 0, 9, 0, "" }, /* objectinfo */
538: { "interfacename",0, 0, 0, 7, 0, "" }, /* smallcptr */
539: { "interface", 0, 0, 0, 7, 0, "" }, /* smallcptr */
540: { "invpartnumber",0, 0, 0, 4, 0, "" }, /* docinfo */
541: { "isbn", 0, 0, 0, 4, 0, "" }, /* docinfo */
542: { "issn", 0, 0, 0, 4, 0, "" }, /* docinfo */
543: { "issuenum", 0, 0, 0, 4, 0, "" }, /* docinfo */
544: { "itemizedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
545: { "itermset", 0, 0, 0, 9, 0, "" }, /* indexterm */
546: { "jobtitle", 0, 0, 0, 4, 0, "" }, /* docinfo */
547: { "keycap", 0, 0, 0, 7, 0, "" }, /* smallcptr */
548: { "keycode", 0, 0, 0, 7, 0, "" }, /* smallcptr */
549: { "keycombo", 0, 0, 0, 9, 0, "" }, /* keycap */
550: { "keysym", 0, 0, 0, 7, 0, "" }, /* smallcptr */
551: { "keyword", 0, 0, 0, 1, 0, "" },
552: { "keywordset", 0, 0, 0, 9, 0, "" }, /* keyword */
553: { "label", 0, 0, 0, 3, 0, "" }, /* word */
554: { "legalnotice",0, 0, 0, 9, 0, "" }, /* title */
555: { "lineage", 0, 0, 0, 4, 0, "" }, /* docinfo */
556: { "lineannotation",0, 0, 0, 2, 0, "" }, /* para */
557: { "link", 0, 0, 0, 2, 0, "" }, /* para */
558: { "listitem", 0, 0, 0, 9, 0, "" }, /* component.mix */
559: { "literal", 0, 0, 0, 9, 0, "" }, /* cptr */
560: { "literallayout",0, 0, 0, 2, 0, "" }, /* para */
561: { "lot", 0, 0, 0, 9, 0, "" }, /* bookcomponent.title.content */
562: { "lotentry", 0, 0, 0, 2, 0, "" }, /* para */
563: { "manvolnum", 0, 0, 0, 3, 0, "" }, /* word */
564: { "markup", 0, 0, 0, 7, 0, "" }, /* smallcptr */
565: { "medialabel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
566: { "mediaobjectco",0, 0, 0, 9, 0, "" }, /* objectinfo */
567: { "mediaobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
568: { "member", 0, 0, 0, 2, 0, "" }, /* para */
569: { "menuchoice", 0, 0, 0, 9, 0, "" }, /* shortcut */
570: { "methodname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
571: { "methodparam",0, 0, 0, 9, 0, "" }, /* modifier */
572: { "methodsynopsis",0, 0, 0, 9, 0, "" }, /* modifier */
573: { "modespec", 0, 0, 0, 4, 0, "" }, /* docinfo */
574: { "modifier", 0, 0, 0, 7, 0, "" }, /* smallcptr */
575: { "mousebutton",0, 0, 0, 7, 0, "" }, /* smallcptr */
576: { "msgaud", 0, 0, 0, 2, 0, "" }, /* para */
577: { "msgentry", 0, 0, 0, 9, 0, "" }, /* msg */
578: { "msgexplan", 0, 0, 0, 9, 0, "" }, /* title */
579: { "msginfo", 0, 0, 0, 9, 0, "" }, /* msglevel */
580: { "msglevel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
581: { "msgmain", 0, 0, 0, 9, 0, "" }, /* title */
582: { "msgorig", 0, 0, 0, 7, 0, "" }, /* smallcptr */
583: { "msgrel", 0, 0, 0, 9, 0, "" }, /* title */
584: { "msgset", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
585: { "msgsub", 0, 0, 0, 9, 0, "" }, /* title */
586: { "msgtext", 0, 0, 0, 9, 0, "" }, /* component.mix */
587: { "msg", 0, 0, 0, 9, 0, "" }, /* title */
588: { "note", 0, 0, 0, 9, 0, "" }, /* title */
589: { "objectinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
590: { "olink", 0, 0, 0, 2, 0, "" }, /* para */
591: { "ooclass", 0, 0, 0, 9, 0, "" }, /* modifier */
592: { "ooexception",0, 0, 0, 9, 0, "" }, /* modifier */
593: { "oointerface",0, 0, 0, 9, 0, "" }, /* modifier */
594: { "optional", 0, 0, 0, 9, 0, "" }, /* cptr */
595: { "option", 0, 0, 0, 7, 0, "" }, /* smallcptr */
596: { "orderedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
597: { "orgdiv", 0, 0, 0, 4, 0, "" }, /* docinfo */
598: { "orgname", 0, 0, 0, 4, 0, "" }, /* docinfo */
599: { "otheraddr", 0, 0, 0, 4, 0, "" }, /* docinfo */
600: { "othercredit",0, 0, 0, 9, 0, "" }, /* person.ident.mix */
601: { "othername", 0, 0, 0, 4, 0, "" }, /* docinfo */
602: { "pagenums", 0, 0, 0, 4, 0, "" }, /* docinfo */
603: { "paramdef", 0, 0, 0, 1, 0, "" },
604: { "parameter", 0, 0, 0, 7, 0, "" }, /* smallcptr */
605: { "para", 0, 0, 0, 2, 0, "" }, /* para */
606: { "partinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
607: { "partintro", 0, 0, 0, 9, 0, "" }, /* div.title.content */
608: { "part", 0, 0, 0, 9, 0, "" }, /* partinfo */
609: { "phone", 0, 0, 0, 4, 0, "" }, /* docinfo */
610: { "phrase", 0, 0, 0, 2, 0, "" }, /* para */
611: { "pob", 0, 0, 0, 4, 0, "" }, /* docinfo */
612: { "postcode", 0, 0, 0, 4, 0, "" }, /* docinfo */
613: { "prefaceinfo",0, 0, 0, 9, 0, "" }, /* graphic */
614: { "preface", 0, 0, 0, 9, 0, "" }, /* prefaceinfo */
615: { "primaryie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
616: { "primary ", 0, 0, 0, 4, 0, "" }, /* ndxterm */
617: { "printhistory",0, 0, 0, 9, 0, "" }, /* para.class */
618: { "procedure", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
619: { "productname",0, 0, 0, 2, 0, "" }, /* para */
620: { "productnumber",0, 0, 0, 4, 0, "" }, /* docinfo */
621: { "programlistingco",0, 0, 0, 9, 0, "" }, /* areaspec */
622: { "programlisting",0, 0, 0, 2, 0, "" }, /* para */
623: { "prompt", 0, 0, 0, 7, 0, "" }, /* smallcptr */
624: { "property", 0, 0, 0, 7, 0, "" }, /* smallcptr */
625: { "pubdate", 0, 0, 0, 4, 0, "" }, /* docinfo */
626: { "publishername",0, 0, 0, 4, 0, "" }, /* docinfo */
627: { "publisher", 0, 0, 0, 9, 0, "" }, /* publishername */
628: { "pubsnumber", 0, 0, 0, 4, 0, "" }, /* docinfo */
629: { "qandadiv", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
630: { "qandaentry", 0, 0, 0, 9, 0, "" }, /* revhistory */
631: { "qandaset", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
632: { "question", 0, 0, 0, 9, 0, "" }, /* label */
633: { "quote", 0, 0, 0, 2, 0, "" }, /* para */
634: { "refclass", 0, 0, 0, 9, 0, "" }, /* refclass.char.mix */
635: { "refdescriptor",0, 0, 0, 9, 0, "" }, /* refname.char.mix */
636: { "refentryinfo",0, 0, 0, 9, 0, "" }, /* graphic */
637: { "refentry", 0, 0, 0, 9, 0, "" }, /* ndxterm.class */
638: { "refentrytitle",0, 0, 0, 2, 0, "" }, /* para */
639: { "referenceinfo",0, 0, 0, 9, 0, "" }, /* graphic */
640: { "reference", 0, 0, 0, 9, 0, "" }, /* referenceinfo */
641: { "refmeta", 0, 0, 0, 9, 0, "" }, /* ndxterm.class */
642: { "refmiscinfo",0, 0, 0, 4, 0, "" }, /* docinfo */
643: { "refnamediv", 0, 0, 0, 9, 0, "" }, /* refdescriptor */
644: { "refname", 0, 0, 0, 9, 0, "" }, /* refname.char.mix */
645: { "refpurpose", 0, 0, 0, 9, 0, "" }, /* refinline.char.mix */
646: { "refsect1info",0, 0, 0, 9, 0, "" }, /* graphic */
647: { "refsect1", 0, 0, 0, 9, 0, "" }, /* refsect */
648: { "refsect2info",0, 0, 0, 9, 0, "" }, /* graphic */
649: { "refsect2", 0, 0, 0, 9, 0, "" }, /* refsect */
650: { "refsect3info",0, 0, 0, 9, 0, "" }, /* graphic */
651: { "refsect3", 0, 0, 0, 9, 0, "" }, /* refsect */
652: { "refsynopsisdivinfo",0,0, 0, 9, 0, "" }, /* graphic */
653: { "refsynopsisdiv",0, 0, 0, 9, 0, "" }, /* refsynopsisdivinfo */
654: { "releaseinfo",0, 0, 0, 4, 0, "" }, /* docinfo */
655: { "remark", 0, 0, 0, 2, 0, "" }, /* para */
656: { "replaceable",0, 0, 0, 1, 0, "" },
657: { "returnvalue",0, 0, 0, 7, 0, "" }, /* smallcptr */
658: { "revdescription",0, 0, 0, 9, 0, "" }, /* revdescription.mix */
659: { "revhistory", 0, 0, 0, 9, 0, "" }, /* revision */
660: { "revision", 0, 0, 0, 9, 0, "" }, /* revnumber */
661: { "revnumber", 0, 0, 0, 4, 0, "" }, /* docinfo */
662: { "revremark", 0, 0, 0, 4, 0, "" }, /* docinfo */
663: { "row", 0, 0, 0, 9, 0, "" }, /* tbl.row.mdl */
664: { "row", 0, 0, 0, 9, 0, "" }, /* tbl.row.mdl */
665: { "sbr", 0, 2, 1, 0, 0, "" },
666: { "screenco", 0, 0, 0, 9, 0, "" }, /* areaspec */
667: { "screeninfo", 0, 0, 0, 2, 0, "" }, /* para */
668: { "screen", 0, 0, 0, 2, 0, "" }, /* para */
669: { "screenshot", 0, 0, 0, 9, 0, "" }, /* screeninfo */
670: { "secondaryie",0, 0, 0, 4, 0, "" }, /* ndxterm */
671: { "secondary", 0, 0, 0, 4, 0, "" }, /* ndxterm */
672: { "sect1info", 0, 0, 0, 9, 0, "" }, /* graphic */
673: { "sect1", 0, 0, 0, 9, 0, "" }, /* sect */
674: { "sect2info", 0, 0, 0, 9, 0, "" }, /* graphic */
675: { "sect2", 0, 0, 0, 9, 0, "" }, /* sect */
676: { "sect3info", 0, 0, 0, 9, 0, "" }, /* graphic */
677: { "sect3", 0, 0, 0, 9, 0, "" }, /* sect */
678: { "sect4info", 0, 0, 0, 9, 0, "" }, /* graphic */
679: { "sect4", 0, 0, 0, 9, 0, "" }, /* sect */
680: { "sect5info", 0, 0, 0, 9, 0, "" }, /* graphic */
681: { "sect5", 0, 0, 0, 9, 0, "" }, /* sect */
682: { "sectioninfo",0, 0, 0, 9, 0, "" }, /* graphic */
683: { "section", 0, 0, 0, 9, 0, "" }, /* sectioninfo */
684: { "seealsoie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
685: { "seealso", 0, 0, 0, 4, 0, "" }, /* ndxterm */
686: { "seeie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
687: { "see", 0, 0, 0, 4, 0, "" }, /* ndxterm */
688: { "seglistitem",0, 0, 0, 9, 0, "" }, /* seg */
689: { "segmentedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
690: { "seg", 0, 0, 0, 2, 0, "" }, /* para */
691: { "segtitle", 0, 0, 0, 8, 0, "" }, /* title */
692: { "seriesvolnums", 0, 0, 0, 4, 0, "" }, /* docinfo */
693: { "set", 0, 0, 0, 9, 0, "" }, /* div.title.content */
694: { "setindexinfo",0, 0, 0, 9, 0, "" }, /* graphic */
695: { "setindex", 0, 0, 0, 9, 0, "" }, /* setindexinfo */
696: { "setinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
697: { "sgmltag", 0, 0, 0, 7, 0, "" }, /* smallcptr */
698: { "shortaffil", 0, 0, 0, 4, 0, "" }, /* docinfo */
699: { "shortcut", 0, 0, 0, 9, 0, "" }, /* keycap */
700: { "sidebarinfo",0, 0, 0, 9, 0, "" }, /* graphic */
701: { "sidebar", 0, 0, 0, 9, 0, "" }, /* sidebarinfo */
702: { "simpara", 0, 0, 0, 2, 0, "" }, /* para */
703: { "simplelist", 0, 0, 0, 9, 0, "" }, /* member */
704: { "simplemsgentry", 0, 0, 0, 9, 0, "" }, /* msgtext */
705: { "simplesect", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
706: { "spanspec", 0, 2, 1, 0, 0, "" },
707: { "state", 0, 0, 0, 4, 0, "" }, /* docinfo */
708: { "step", 0, 0, 0, 9, 0, "" }, /* title */
709: { "street", 0, 0, 0, 4, 0, "" }, /* docinfo */
710: { "structfield",0, 0, 0, 7, 0, "" }, /* smallcptr */
711: { "structname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
712: { "subjectset", 0, 0, 0, 9, 0, "" }, /* subject */
713: { "subject", 0, 0, 0, 9, 0, "" }, /* subjectterm */
714: { "subjectterm",0, 0, 0, 1, 0, "" },
715: { "subscript", 0, 0, 0, 1, 0, "" },
716: { "substeps", 0, 0, 0, 9, 0, "" }, /* step */
717: { "subtitle", 0, 0, 0, 8, 0, "" }, /* title */
718: { "superscript", 0, 0, 0, 1, 0, "" },
719: { "surname", 0, 0, 0, 4, 0, "" }, /* docinfo */
720: { "symbol", 0, 0, 0, 7, 0, "" }, /* smallcptr */
721: { "synopfragment", 0, 0, 0, 9, 0, "" }, /* arg */
722: { "synopfragmentref", 0, 0, 0, 1, 0, "" },
723: { "synopsis", 0, 0, 0, 2, 0, "" }, /* para */
724: { "systemitem", 0, 0, 0, 7, 0, "" }, /* smallcptr */
725: { "table", 0, 0, 0, 9, 0, "" }, /* tbl.table.mdl */
726: /* { "%tbl.table.name;", 0, 0, 0, 9, 0, "" },*/ /* tbl.table.mdl */
727: { "tbody", 0, 0, 0, 9, 0, "" }, /* row */
728: { "tbody", 0, 0, 0, 9, 0, "" }, /* row */
729: { "term", 0, 0, 0, 2, 0, "" }, /* para */
730: { "tertiaryie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
731: { "tertiary ", 0, 0, 0, 4, 0, "" }, /* ndxterm */
732: { "textobject", 0, 0, 0, 9, 0, "" }, /* objectinfo */
733: { "tfoot", 0, 0, 0, 9, 0, "" }, /* tbl.hdft.mdl */
734: { "tgroup", 0, 0, 0, 9, 0, "" }, /* tbl.tgroup.mdl */
735: { "tgroup", 0, 0, 0, 9, 0, "" }, /* tbl.tgroup.mdl */
736: { "thead", 0, 0, 0, 9, 0, "" }, /* row */
737: { "thead", 0, 0, 0, 9, 0, "" }, /* tbl.hdft.mdl */
738: { "tip", 0, 0, 0, 9, 0, "" }, /* title */
739: { "titleabbrev",0, 0, 0, 8, 0, "" }, /* title */
740: { "title", 0, 0, 0, 8, 0, "" }, /* title */
741: { "tocback", 0, 0, 0, 2, 0, "" }, /* para */
742: { "toc", 0, 0, 0, 9, 0, "" }, /* bookcomponent.title.content */
743: { "tocchap", 0, 0, 0, 9, 0, "" }, /* tocentry */
744: { "tocentry", 0, 0, 0, 2, 0, "" }, /* para */
745: { "tocfront", 0, 0, 0, 2, 0, "" }, /* para */
746: { "toclevel1", 0, 0, 0, 9, 0, "" }, /* tocentry */
747: { "toclevel2", 0, 0, 0, 9, 0, "" }, /* tocentry */
748: { "toclevel3", 0, 0, 0, 9, 0, "" }, /* tocentry */
749: { "toclevel4", 0, 0, 0, 9, 0, "" }, /* tocentry */
750: { "toclevel5", 0, 0, 0, 9, 0, "" }, /* tocentry */
751: { "tocpart", 0, 0, 0, 9, 0, "" }, /* tocentry */
752: { "token", 0, 0, 0, 7, 0, "" }, /* smallcptr */
753: { "trademark", 0, 0, 0, 1, 0, "" },
754: { "type", 0, 0, 0, 7, 0, "" }, /* smallcptr */
755: { "ulink", 0, 0, 0, 2, 0, "" }, /* para */
756: { "userinput", 0, 0, 0, 9, 0, "" }, /* cptr */
757: { "varargs", 0, 2, 1, 0, 0, "" },
758: { "variablelist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
759: { "varlistentry",0, 0, 0, 9, 0, "" }, /* term */
760: { "varname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
761: { "videodata", 0, 2, 1, 0, 0, "" },
762: { "videoobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
763: { "void", 0, 2, 1, 0, 0, "" },
764: { "volumenum", 0, 0, 0, 4, 0, "" }, /* docinfo */
765: { "warning", 0, 0, 0, 9, 0, "" }, /* title */
766: { "wordasword", 0, 0, 0, 3, 0, "" }, /* word */
767: { "xref", 0, 2, 1, 0, 0, "" },
768: { "year", 0, 0, 0, 4, 0, "" }, /* docinfo */
769: };
770:
/*
 * Start tags that imply the end of the current element.
 *
 * Each row below is one group, closed by a NULL entry; one extra NULL
 * closes the whole table.  A start tag listed in a row implies the end
 * of the current element when the name of that element is listed in the
 * same row.
 */
char *sgmlEquEnd[] = {
    "dt", "dd", "li", "option",                                 NULL,
    "h1", "h2", "h3", "h4", "h5", "h6",                         NULL,
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",    NULL,
    NULL        /* end of table */
};
/*
 * According to the SGML DTD, HR should be added to the 2nd line above, as
 * it is not allowed within an H1, H2, H3, etc. But we should tolerate that
 * case because many documents contain rules in headings...
 */
787:
/*
 * Start tags that imply the end of the current element.
 *
 * Layout, as read by sgmlInitAutoClose() and sgmlCheckAutoClose(): a
 * sequence of groups, each closed by a NULL entry, followed by one final
 * NULL.  The first name of a group is the new start tag; the remaining
 * names of the group are the open elements that this start tag closes.
 *
 * The table currently holds no group at all.
 */
char *sgmlStartClose[] = { NULL };
794:
/*
 * NULL-terminated list of the SGML elements which are not supposed to
 * hold CDATA content directly and inside which a p element gets implied
 * (see sgmlCheckParagraph()).  The list is currently empty.
 *
 * TODO: extend that list by reading the SGML DTD on implied paragraphs.
 */
static char *sgmlNoContentElements[] = { NULL };
805:
806:
/*
 * Lookup index over sgmlStartClose: each used slot points at the first
 * entry of one group, unused slots are NULL.  Filled in by
 * sgmlInitAutoClose().
 */
static char **sgmlStartCloseIndex[100];

/* Non-zero makes sgmlInitAutoClose() return without touching the index. */
static int sgmlStartCloseIndexinitialized = 0;
809:
810: /************************************************************************
811: * *
812: * functions to handle SGML specific data *
813: * *
814: ************************************************************************/
815:
816: /**
817: * sgmlInitAutoClose:
818: *
819: * Initialize the sgmlStartCloseIndex for fast lookup of closing tags names.
820: *
821: */
822: void
823: sgmlInitAutoClose(void) {
824: int index, i = 0;
825:
826: if (sgmlStartCloseIndexinitialized) return;
827:
828: for (index = 0;index < 100;index ++) sgmlStartCloseIndex[index] = NULL;
829: index = 0;
830: while ((sgmlStartClose[i] != NULL) && (index < 100 - 1)) {
831: sgmlStartCloseIndex[index++] = &sgmlStartClose[i];
832: while (sgmlStartClose[i] != NULL) i++;
833: i++;
834: }
835: }
836:
837: /**
838: * sgmlTagLookup:
839: * @tag: The tag name
840: *
841: * Lookup the SGML tag in the ElementTable
842: *
843: * Returns the related sgmlElemDescPtr or NULL if not found.
844: */
845: sgmlElemDescPtr
846: sgmlTagLookup(const xmlChar *tag) {
847: int i;
848:
849: for (i = 0; i < (sizeof(docbookElementTable) /
850: sizeof(docbookElementTable[0]));i++) {
851: if (!xmlStrcmp(tag, BAD_CAST docbookElementTable[i].name))
852: return(&docbookElementTable[i]);
853: }
854: return(NULL);
855: }
856:
857: /**
858: * sgmlCheckAutoClose:
859: * @newtag: The new tag name
860: * @oldtag: The old tag name
861: *
862: * Checks wether the new tag is one of the registered valid tags for closing old.
863: * Initialize the sgmlStartCloseIndex for fast lookup of closing tags names.
864: *
865: * Returns 0 if no, 1 if yes.
866: */
867: int
868: sgmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
869: int i, index;
870: char **close;
871:
872: if (sgmlStartCloseIndexinitialized == 0) sgmlInitAutoClose();
873:
874: /* inefficient, but not a big deal */
875: for (index = 0; index < 100;index++) {
876: close = sgmlStartCloseIndex[index];
877: if (close == NULL) return(0);
878: if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
879: }
880:
881: i = close - sgmlStartClose;
882: i++;
883: while (sgmlStartClose[i] != NULL) {
884: if (!xmlStrcmp(BAD_CAST sgmlStartClose[i], oldtag)) {
885: return(1);
886: }
887: i++;
888: }
889: return(0);
890: }
891:
892: /**
893: * sgmlAutoCloseOnClose:
894: * @ctxt: an SGML parser context
895: * @newtag: The new tag name
896: *
897: * The HTmL DtD allows an ending tag to implicitely close other tags.
898: */
899: void
900: sgmlAutoCloseOnClose(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901: sgmlElemDescPtr info;
902: xmlChar *oldname;
903: int i;
904:
905: if ((newtag[0] == '/') && (newtag[1] == 0))
906: return;
907:
908: #ifdef DEBUG
909: fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
910: for (i = 0;i < ctxt->nameNr;i++)
911: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
912: #endif
913:
914: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
915: if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
916: }
917: if (i < 0) return;
918:
919: while (xmlStrcmp(newtag, ctxt->name)) {
920: info = sgmlTagLookup(ctxt->name);
921: if ((info == NULL) || (info->endTag == 1)) {
922: #ifdef DEBUG
923: fprintf(stderr,"sgmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
924: #endif
925: } else {
926: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
927: ctxt->sax->error(ctxt->userData,
928: "Opening and ending tag mismatch: %s and %s\n",
929: newtag, ctxt->name);
930: ctxt->wellFormed = 0;
931: }
932: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
933: ctxt->sax->endElement(ctxt->userData, ctxt->name);
934: oldname = sgmlnamePop(ctxt);
935: if (oldname != NULL) {
936: #ifdef DEBUG
937: fprintf(stderr,"sgmlAutoCloseOnClose: popped %s\n", oldname);
938: #endif
939: xmlFree(oldname);
940: }
941: }
942: }
943:
944: /**
945: * sgmlAutoClose:
946: * @ctxt: an SGML parser context
947: * @newtag: The new tag name or NULL
948: *
949: * The HTmL DtD allows a tag to implicitely close other tags.
950: * The list is kept in sgmlStartClose array. This function is
951: * called when a new tag has been detected and generates the
952: * appropriates closes if possible/needed.
953: * If newtag is NULL this mean we are at the end of the resource
954: * and we should check
955: */
956: void
957: sgmlAutoClose(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
958: xmlChar *oldname;
959: while ((newtag != NULL) && (ctxt->name != NULL) &&
960: (sgmlCheckAutoClose(newtag, ctxt->name))) {
961: #ifdef DEBUG
962: fprintf(stderr,"sgmlAutoClose: %s closes %s\n", newtag, ctxt->name);
963: #endif
964: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
965: ctxt->sax->endElement(ctxt->userData, ctxt->name);
966: oldname = sgmlnamePop(ctxt);
967: if (oldname != NULL) {
968: #ifdef DEBUG
969: fprintf(stderr,"sgmlAutoClose: popped %s\n", oldname);
970: #endif
971: xmlFree(oldname);
972: }
973: }
974: #if 0
975: if (newtag == NULL) {
976: sgmlAutoCloseOnClose(ctxt, BAD_CAST"head");
977: sgmlAutoCloseOnClose(ctxt, BAD_CAST"body");
978: sgmlAutoCloseOnClose(ctxt, BAD_CAST"sgml");
979: }
980: while ((newtag == NULL) && (ctxt->name != NULL) &&
981: ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
982: (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
983: (!xmlStrcmp(ctxt->name, BAD_CAST"sgml")))) {
984: #ifdef DEBUG
985: fprintf(stderr,"sgmlAutoClose: EOF closes %s\n", ctxt->name);
986: #endif
987: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
988: ctxt->sax->endElement(ctxt->userData, ctxt->name);
989: oldname = sgmlnamePop(ctxt);
990: if (oldname != NULL) {
991: #ifdef DEBUG
992: fprintf(stderr,"sgmlAutoClose: popped %s\n", oldname);
993: #endif
994: xmlFree(oldname);
995: }
996: }
997: #endif
998: }
999:
1000: /**
1001: * sgmlAutoCloseTag:
1002: * @doc: the SGML document
1003: * @name: The tag name
1004: * @elem: the SGML element
1005: *
1006: * The HTmL DtD allows a tag to implicitely close other tags.
1007: * The list is kept in sgmlStartClose array. This function checks
1008: * if the element or one of it's children would autoclose the
1009: * given tag.
1010: *
1011: * Returns 1 if autoclose, 0 otherwise
1012: */
1013: int
1014: sgmlAutoCloseTag(sgmlDocPtr doc, const xmlChar *name, sgmlNodePtr elem) {
1015: sgmlNodePtr child;
1016:
1017: if (elem == NULL) return(1);
1018: if (!xmlStrcmp(name, elem->name)) return(0);
1019: if (sgmlCheckAutoClose(elem->name, name)) return(1);
1020: child = elem->children;
1021: while (child != NULL) {
1022: if (sgmlAutoCloseTag(doc, name, child)) return(1);
1023: child = child->next;
1024: }
1025: return(0);
1026: }
1027:
1028: /**
1029: * sgmlIsAutoClosed:
1030: * @doc: the SGML document
1031: * @elem: the SGML element
1032: *
1033: * The HTmL DtD allows a tag to implicitely close other tags.
1034: * The list is kept in sgmlStartClose array. This function checks
1035: * if a tag is autoclosed by one of it's child
1036: *
1037: * Returns 1 if autoclosed, 0 otherwise
1038: */
1039: int
1040: sgmlIsAutoClosed(sgmlDocPtr doc, sgmlNodePtr elem) {
1041: sgmlNodePtr child;
1042:
1043: if (elem == NULL) return(1);
1044: child = elem->children;
1045: while (child != NULL) {
1046: if (sgmlAutoCloseTag(doc, elem->name, child)) return(1);
1047: child = child->next;
1048: }
1049: return(0);
1050: }
1051:
1052: /**
1053: * sgmlCheckImplied:
1054: * @ctxt: an SGML parser context
1055: * @newtag: The new tag name
1056: *
1057: * The HTmL DtD allows a tag to exists only implicitely
1058: * called when a new tag has been detected and generates the
1059: * appropriates implicit tags if missing
1060: */
1061: void
1062: sgmlCheckImplied(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1063: #if 0
1064: if (!xmlStrcmp(newtag, BAD_CAST"sgml"))
1065: return;
1066: if (ctxt->nameNr <= 0) {
1067: #ifdef DEBUG
1068: fprintf(stderr,"Implied element sgml: pushed sgml\n");
1069: #endif
1070: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"sgml"));
1071: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1072: ctxt->sax->startElement(ctxt->userData, BAD_CAST"sgml", NULL);
1073: }
1074: if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
1075: return;
1076: if (ctxt->nameNr <= 1) {
1077: if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
1078: (!xmlStrcmp(newtag, BAD_CAST"style")) ||
1079: (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
1080: (!xmlStrcmp(newtag, BAD_CAST"link")) ||
1081: (!xmlStrcmp(newtag, BAD_CAST"title")) ||
1082: (!xmlStrcmp(newtag, BAD_CAST"base"))) {
1083: /*
1084: * dropped OBJECT ... i you put it first BODY will be
1085: * assumed !
1086: */
1087: #ifdef DEBUG
1088: fprintf(stderr,"Implied element head: pushed head\n");
1089: #endif
1090: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
1091: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1092: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1093: } else {
1094: #ifdef DEBUG
1095: fprintf(stderr,"Implied element body: pushed body\n");
1096: #endif
1097: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
1098: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1099: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1100: }
1101: }
1102: #endif
1103: }
1104:
1105: /**
1106: * sgmlCheckParagraph
1107: * @ctxt: an SGML parser context
1108: *
1109: * Check whether a p element need to be implied before inserting
1110: * characters in the current element.
1111: *
1112: * Returns 1 if a paragraph has been inserted, 0 if not and -1
1113: * in case of error.
1114: */
1115:
1116: int
1117: sgmlCheckParagraph(sgmlParserCtxtPtr ctxt) {
1118: const xmlChar *tag;
1119: int i;
1120:
1121: if (ctxt == NULL)
1122: return(-1);
1123: tag = ctxt->name;
1124: if (tag == NULL) {
1125: sgmlAutoClose(ctxt, BAD_CAST"p");
1126: sgmlCheckImplied(ctxt, BAD_CAST"p");
1127: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1128: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1129: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1130: return(1);
1131: }
1132: for (i = 0; sgmlNoContentElements[i] != NULL; i++) {
1133: if (!xmlStrcmp(tag, BAD_CAST sgmlNoContentElements[i])) {
1134: #ifdef DEBUG
1135: fprintf(stderr,"Implied element paragraph\n");
1136: #endif
1137: sgmlAutoClose(ctxt, BAD_CAST"p");
1138: sgmlCheckImplied(ctxt, BAD_CAST"p");
1139: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1140: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1141: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1142: return(1);
1143: }
1144: }
1145: return(0);
1146: }
1147:
1148: /************************************************************************
1149: * *
1150: * The list of SGML predefined entities *
1151: * *
1152: ************************************************************************/
1153:
1154:
1155: sgmlEntityDesc docbookEntitiesTable[] = {
1156: /*
1157: * the 4 absolute ones, plus apostrophe.
1158: */
1159: { 0x0026, "amp", "AMPERSAND" },
1160: { 0x003C, "lt", "LESS-THAN SIGN" },
1161:
1162: /*
1163: * Converted with VI macros from docbook ent files
1164: */
1165: { 0x0021, "excl", "EXCLAMATION MARK" },
1166: { 0x0022, "quot", "QUOTATION MARK" },
1167: { 0x0023, "num", "NUMBER SIGN" },
1168: { 0x0024, "dollar", "DOLLAR SIGN" },
1169: { 0x0025, "percnt", "PERCENT SIGN" },
1170: { 0x0027, "apos", "APOSTROPHE" },
1171: { 0x0028, "lpar", "LEFT PARENTHESIS" },
1172: { 0x0029, "rpar", "RIGHT PARENTHESIS" },
1173: { 0x002A, "ast", "ASTERISK OPERATOR" },
1174: { 0x002B, "plus", "PLUS SIGN" },
1175: { 0x002C, "comma", "COMMA" },
1176: { 0x002D, "hyphen", "HYPHEN-MINUS" },
1177: { 0x002E, "period", "FULL STOP" },
1178: { 0x002F, "sol", "SOLIDUS" },
1179: { 0x003A, "colon", "COLON" },
1180: { 0x003B, "semi", "SEMICOLON" },
1181: { 0x003D, "equals", "EQUALS SIGN" },
1182: { 0x003E, "gt", "GREATER-THAN SIGN" },
1183: { 0x003F, "quest", "QUESTION MARK" },
1184: { 0x0040, "commat", "COMMERCIAL AT" },
1185: { 0x005B, "lsqb", "LEFT SQUARE BRACKET" },
1186: { 0x005C, "bsol", "REVERSE SOLIDUS" },
1187: { 0x005D, "rsqb", "RIGHT SQUARE BRACKET" },
1188: { 0x005E, "circ", "RING OPERATOR" },
1189: { 0x005F, "lowbar", "LOW LINE" },
1190: { 0x0060, "grave", "GRAVE ACCENT" },
1191: { 0x007B, "lcub", "LEFT CURLY BRACKET" },
1192: { 0x007C, "verbar", "VERTICAL LINE" },
1193: { 0x007D, "rcub", "RIGHT CURLY BRACKET" },
1194: { 0x00A0, "nbsp", "NO-BREAK SPACE" },
1195: { 0x00A1, "iexcl", "INVERTED EXCLAMATION MARK" },
1196: { 0x00A2, "cent", "CENT SIGN" },
1197: { 0x00A3, "pound", "POUND SIGN" },
1198: { 0x00A4, "curren", "CURRENCY SIGN" },
1199: { 0x00A5, "yen", "YEN SIGN" },
1200: { 0x00A6, "brvbar", "BROKEN BAR" },
1201: { 0x00A7, "sect", "SECTION SIGN" },
1202: { 0x00A8, "die", "" },
1203: { 0x00A8, "Dot", "" },
1204: { 0x00A8, "uml", "" },
1205: { 0x00A9, "copy", "COPYRIGHT SIGN" },
1206: { 0x00AA, "ordf", "FEMININE ORDINAL INDICATOR" },
1207: { 0x00AB, "laquo", "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" },
1208: { 0x00AC, "not", "NOT SIGN" },
1209: { 0x00AD, "shy", "SOFT HYPHEN" },
1210: { 0x00AE, "reg", "REG TRADE MARK SIGN" },
1211: { 0x00AF, "macr", "MACRON" },
1212: { 0x00B0, "deg", "DEGREE SIGN" },
1213: { 0x00B1, "plusmn", "PLUS-MINUS SIGN" },
1214: { 0x00B2, "sup2", "SUPERSCRIPT TWO" },
1215: { 0x00B3, "sup3", "SUPERSCRIPT THREE" },
1216: { 0x00B4, "acute", "ACUTE ACCENT" },
1217: { 0x00B5, "micro", "MICRO SIGN" },
1218: { 0x00B6, "para", "PILCROW SIGN" },
1219: { 0x00B7, "middot", "MIDDLE DOT" },
1220: { 0x00B8, "cedil", "CEDILLA" },
1221: { 0x00B9, "sup1", "SUPERSCRIPT ONE" },
1222: { 0x00BA, "ordm", "MASCULINE ORDINAL INDICATOR" },
1223: { 0x00BB, "raquo", "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" },
1224: { 0x00BC, "frac14", "VULGAR FRACTION ONE QUARTER" },
1225: { 0x00BD, "frac12", "VULGAR FRACTION ONE HALF" },
1226: { 0x00BD, "half", "VULGAR FRACTION ONE HALF" },
1227: { 0x00BE, "frac34", "VULGAR FRACTION THREE QUARTERS" },
1228: { 0x00BF, "iquest", "INVERTED QUESTION MARK" },
1229: { 0x00C0, "Agrave", "LATIN CAPITAL LETTER A WITH GRAVE" },
1230: { 0x00C1, "Aacute", "LATIN CAPITAL LETTER A WITH ACUTE" },
1231: { 0x00C2, "Acirc", "LATIN CAPITAL LETTER A WITH CIRCUMFLEX" },
1232: { 0x00C3, "Atilde", "LATIN CAPITAL LETTER A WITH TILDE" },
1233: { 0x00C4, "Auml", "LATIN CAPITAL LETTER A WITH DIAERESIS" },
1234: { 0x00C5, "Aring", "LATIN CAPITAL LETTER A WITH RING ABOVE" },
1235: { 0x00C6, "AElig", "LATIN CAPITAL LETTER AE" },
1236: { 0x00C7, "Ccedil", "LATIN CAPITAL LETTER C WITH CEDILLA" },
1237: { 0x00C8, "Egrave", "LATIN CAPITAL LETTER E WITH GRAVE" },
1238: { 0x00C9, "Eacute", "LATIN CAPITAL LETTER E WITH ACUTE" },
1239: { 0x00CA, "Ecirc", "LATIN CAPITAL LETTER E WITH CIRCUMFLEX" },
1240: { 0x00CB, "Euml", "LATIN CAPITAL LETTER E WITH DIAERESIS" },
1241: { 0x00CC, "Igrave", "LATIN CAPITAL LETTER I WITH GRAVE" },
1242: { 0x00CD, "Iacute", "LATIN CAPITAL LETTER I WITH ACUTE" },
1243: { 0x00CE, "Icirc", "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" },
1244: { 0x00CF, "Iuml", "LATIN CAPITAL LETTER I WITH DIAERESIS" },
1245: { 0x00D0, "ETH", "LATIN CAPITAL LETTER ETH" },
1246: { 0x00D1, "Ntilde", "LATIN CAPITAL LETTER N WITH TILDE" },
1247: { 0x00D2, "Ograve", "LATIN CAPITAL LETTER O WITH GRAVE" },
1248: { 0x00D3, "Oacute", "LATIN CAPITAL LETTER O WITH ACUTE" },
1249: { 0x00D4, "Ocirc", "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" },
1250: { 0x00D5, "Otilde", "LATIN CAPITAL LETTER O WITH TILDE" },
1251: { 0x00D6, "Ouml", "LATIN CAPITAL LETTER O WITH DIAERESIS" },
1252: { 0x00D7, "times", "MULTIPLICATION SIGN" },
1253: { 0x00D8, "Oslash", "LATIN CAPITAL LETTER O WITH STROKE" },
1254: { 0x00D9, "Ugrave", "LATIN CAPITAL LETTER U WITH GRAVE" },
1255: { 0x00DA, "Uacute", "LATIN CAPITAL LETTER U WITH ACUTE" },
1256: { 0x00DB, "Ucirc", "LATIN CAPITAL LETTER U WITH CIRCUMFLEX" },
1257: { 0x00DC, "Uuml", "LATIN CAPITAL LETTER U WITH DIAERESIS" },
1258: { 0x00DD, "Yacute", "LATIN CAPITAL LETTER Y WITH ACUTE" },
1259: { 0x00DE, "THORN", "LATIN CAPITAL LETTER THORN" },
1260: { 0x00DF, "szlig", "LATIN SMALL LETTER SHARP S" },
1261: { 0x00E0, "agrave", "LATIN SMALL LETTER A WITH GRAVE" },
1262: { 0x00E1, "aacute", "LATIN SMALL LETTER A WITH ACUTE" },
1263: { 0x00E2, "acirc", "LATIN SMALL LETTER A WITH CIRCUMFLEX" },
1264: { 0x00E3, "atilde", "LATIN SMALL LETTER A WITH TILDE" },
1265: { 0x00E4, "auml", "LATIN SMALL LETTER A WITH DIAERESIS" },
1266: { 0x00E5, "aring", "LATIN SMALL LETTER A WITH RING ABOVE" },
1267: { 0x00E6, "aelig", "LATIN SMALL LETTER AE" },
1268: { 0x00E7, "ccedil", "LATIN SMALL LETTER C WITH CEDILLA" },
1269: { 0x00E8, "egrave", "LATIN SMALL LETTER E WITH GRAVE" },
1270: { 0x00E9, "eacute", "LATIN SMALL LETTER E WITH ACUTE" },
1271: { 0x00EA, "ecirc", "LATIN SMALL LETTER E WITH CIRCUMFLEX" },
1272: { 0x00EB, "euml", "LATIN SMALL LETTER E WITH DIAERESIS" },
1273: { 0x00EC, "igrave", "LATIN SMALL LETTER I WITH GRAVE" },
1274: { 0x00ED, "iacute", "LATIN SMALL LETTER I WITH ACUTE" },
1275: { 0x00EE, "icirc", "LATIN SMALL LETTER I WITH CIRCUMFLEX" },
1276: { 0x00EF, "iuml", "LATIN SMALL LETTER I WITH DIAERESIS" },
1277: { 0x00F0, "eth", "LATIN SMALL LETTER ETH" },
1278: { 0x00F1, "ntilde", "LATIN SMALL LETTER N WITH TILDE" },
1279: { 0x00F2, "ograve", "LATIN SMALL LETTER O WITH GRAVE" },
1280: { 0x00F3, "oacute", "LATIN SMALL LETTER O WITH ACUTE" },
1281: { 0x00F4, "ocirc", "LATIN SMALL LETTER O WITH CIRCUMFLEX" },
1282: { 0x00F5, "otilde", "LATIN SMALL LETTER O WITH TILDE" },
1283: { 0x00F6, "ouml", "LATIN SMALL LETTER O WITH DIAERESIS" },
1284: { 0x00F7, "divide", "DIVISION SIGN" },
1285: { 0x00F8, "oslash", "CIRCLED DIVISION SLASH" },
1286: { 0x00F9, "ugrave", "LATIN SMALL LETTER U WITH GRAVE" },
1287: { 0x00FA, "uacute", "LATIN SMALL LETTER U WITH ACUTE" },
1288: { 0x00FB, "ucirc", "LATIN SMALL LETTER U WITH CIRCUMFLEX" },
1289: { 0x00FC, "uuml", "LATIN SMALL LETTER U WITH DIAERESIS" },
1290: { 0x00FD, "yacute", "LATIN SMALL LETTER Y WITH ACUTE" },
1291: { 0x00FE, "thorn", "LATIN SMALL LETTER THORN" },
1292: { 0x00FF, "yuml", "LATIN SMALL LETTER Y WITH DIAERESIS" },
1293: { 0x0100, "Amacr", "LATIN CAPITAL LETTER A WITH MACRON" },
1294: { 0x0101, "amacr", "LATIN SMALL LETTER A WITH MACRON" },
1295: { 0x0102, "Abreve", "LATIN CAPITAL LETTER A WITH BREVE" },
1296: { 0x0103, "abreve", "LATIN SMALL LETTER A WITH BREVE" },
1297: { 0x0104, "Aogon", "LATIN CAPITAL LETTER A WITH OGONEK" },
1298: { 0x0105, "aogon", "LATIN SMALL LETTER A WITH OGONEK" },
1299: { 0x0106, "Cacute", "LATIN CAPITAL LETTER C WITH ACUTE" },
1300: { 0x0107, "cacute", "LATIN SMALL LETTER C WITH ACUTE" },
1301: { 0x0108, "Ccirc", "LATIN CAPITAL LETTER C WITH CIRCUMFLEX" },
1302: { 0x0109, "ccirc", "LATIN SMALL LETTER C WITH CIRCUMFLEX" },
1303: { 0x010A, "Cdot", "LATIN CAPITAL LETTER C WITH DOT ABOVE" },
1304: { 0x010B, "cdot", "DOT OPERATOR" },
1305: { 0x010C, "Ccaron", "LATIN CAPITAL LETTER C WITH CARON" },
1306: { 0x010D, "ccaron", "LATIN SMALL LETTER C WITH CARON" },
1307: { 0x010E, "Dcaron", "LATIN CAPITAL LETTER D WITH CARON" },
1308: { 0x010F, "dcaron", "LATIN SMALL LETTER D WITH CARON" },
1309: { 0x0110, "Dstrok", "LATIN CAPITAL LETTER D WITH STROKE" },
1310: { 0x0111, "dstrok", "LATIN SMALL LETTER D WITH STROKE" },
1311: { 0x0112, "Emacr", "LATIN CAPITAL LETTER E WITH MACRON" },
1312: { 0x0113, "emacr", "LATIN SMALL LETTER E WITH MACRON" },
1313: { 0x0116, "Edot", "LATIN CAPITAL LETTER E WITH DOT ABOVE" },
1314: { 0x0117, "edot", "LATIN SMALL LETTER E WITH DOT ABOVE" },
1315: { 0x0118, "Eogon", "LATIN CAPITAL LETTER E WITH OGONEK" },
1316: { 0x0119, "eogon", "LATIN SMALL LETTER E WITH OGONEK" },
1317: { 0x011A, "Ecaron", "LATIN CAPITAL LETTER E WITH CARON" },
1318: { 0x011B, "ecaron", "LATIN SMALL LETTER E WITH CARON" },
1319: { 0x011C, "Gcirc", "LATIN CAPITAL LETTER G WITH CIRCUMFLEX" },
1320: { 0x011D, "gcirc", "LATIN SMALL LETTER G WITH CIRCUMFLEX" },
1321: { 0x011E, "Gbreve", "LATIN CAPITAL LETTER G WITH BREVE" },
1322: { 0x011F, "gbreve", "LATIN SMALL LETTER G WITH BREVE" },
1323: { 0x0120, "Gdot", "LATIN CAPITAL LETTER G WITH DOT ABOVE" },
1324: { 0x0121, "gdot", "LATIN SMALL LETTER G WITH DOT ABOVE" },
1325: { 0x0122, "Gcedil", "LATIN CAPITAL LETTER G WITH CEDILLA" },
1326: { 0x0124, "Hcirc", "LATIN CAPITAL LETTER H WITH CIRCUMFLEX" },
1327: { 0x0125, "hcirc", "LATIN SMALL LETTER H WITH CIRCUMFLEX" },
1328: { 0x0126, "Hstrok", "LATIN CAPITAL LETTER H WITH STROKE" },
1329: { 0x0127, "hstrok", "LATIN SMALL LETTER H WITH STROKE" },
1330: { 0x0128, "Itilde", "LATIN CAPITAL LETTER I WITH TILDE" },
1331: { 0x0129, "itilde", "LATIN SMALL LETTER I WITH TILDE" },
1332: { 0x012A, "Imacr", "LATIN CAPITAL LETTER I WITH MACRON" },
1333: { 0x012B, "imacr", "LATIN SMALL LETTER I WITH MACRON" },
1334: { 0x012E, "Iogon", "LATIN CAPITAL LETTER I WITH OGONEK" },
1335: { 0x012F, "iogon", "LATIN SMALL LETTER I WITH OGONEK" },
1336: { 0x0130, "Idot", "LATIN CAPITAL LETTER I WITH DOT ABOVE" },
1337: { 0x0131, "inodot", "LATIN SMALL LETTER DOTLESS I" },
1338: { 0x0131, "inodot", "LATIN SMALL LETTER DOTLESS I" },
1339: { 0x0132, "IJlig", "LATIN CAPITAL LIGATURE IJ" },
1340: { 0x0133, "ijlig", "LATIN SMALL LIGATURE IJ" },
1341: { 0x0134, "Jcirc", "LATIN CAPITAL LETTER J WITH CIRCUMFLEX" },
1342: { 0x0135, "jcirc", "LATIN SMALL LETTER J WITH CIRCUMFLEX" },
1343: { 0x0136, "Kcedil", "LATIN CAPITAL LETTER K WITH CEDILLA" },
1344: { 0x0137, "kcedil", "LATIN SMALL LETTER K WITH CEDILLA" },
1345: { 0x0138, "kgreen", "LATIN SMALL LETTER KRA" },
1346: { 0x0139, "Lacute", "LATIN CAPITAL LETTER L WITH ACUTE" },
1347: { 0x013A, "lacute", "LATIN SMALL LETTER L WITH ACUTE" },
1348: { 0x013B, "Lcedil", "LATIN CAPITAL LETTER L WITH CEDILLA" },
1349: { 0x013C, "lcedil", "LATIN SMALL LETTER L WITH CEDILLA" },
1350: { 0x013D, "Lcaron", "LATIN CAPITAL LETTER L WITH CARON" },
1351: { 0x013E, "lcaron", "LATIN SMALL LETTER L WITH CARON" },
1352: { 0x013F, "Lmidot", "LATIN CAPITAL LETTER L WITH MIDDLE DOT" },
1353: { 0x0140, "lmidot", "LATIN SMALL LETTER L WITH MIDDLE DOT" },
1354: { 0x0141, "Lstrok", "LATIN CAPITAL LETTER L WITH STROKE" },
1355: { 0x0142, "lstrok", "LATIN SMALL LETTER L WITH STROKE" },
1356: { 0x0143, "Nacute", "LATIN CAPITAL LETTER N WITH ACUTE" },
1357: { 0x0144, "nacute", "LATIN SMALL LETTER N WITH ACUTE" },
1358: { 0x0145, "Ncedil", "LATIN CAPITAL LETTER N WITH CEDILLA" },
1359: { 0x0146, "ncedil", "LATIN SMALL LETTER N WITH CEDILLA" },
1360: { 0x0147, "Ncaron", "LATIN CAPITAL LETTER N WITH CARON" },
1361: { 0x0148, "ncaron", "LATIN SMALL LETTER N WITH CARON" },
1362: { 0x0149, "napos", "LATIN SMALL LETTER N PRECEDED BY APOSTROPHE" },
1363: { 0x014A, "ENG", "LATIN CAPITAL LETTER ENG" },
1364: { 0x014B, "eng", "LATIN SMALL LETTER ENG" },
1365: { 0x014C, "Omacr", "LATIN CAPITAL LETTER O WITH MACRON" },
1366: { 0x014D, "omacr", "LATIN SMALL LETTER O WITH MACRON" },
1367: { 0x0150, "Odblac", "LATIN CAPITAL LETTER O WITH DOUBLE ACUTE" },
1368: { 0x0151, "odblac", "LATIN SMALL LETTER O WITH DOUBLE ACUTE" },
1369: { 0x0152, "OElig", "LATIN CAPITAL LIGATURE OE" },
1370: { 0x0153, "oelig", "LATIN SMALL LIGATURE OE" },
1371: { 0x0154, "Racute", "LATIN CAPITAL LETTER R WITH ACUTE" },
1372: { 0x0155, "racute", "LATIN SMALL LETTER R WITH ACUTE" },
1373: { 0x0156, "Rcedil", "LATIN CAPITAL LETTER R WITH CEDILLA" },
1374: { 0x0157, "rcedil", "LATIN SMALL LETTER R WITH CEDILLA" },
1375: { 0x0158, "Rcaron", "LATIN CAPITAL LETTER R WITH CARON" },
1376: { 0x0159, "rcaron", "LATIN SMALL LETTER R WITH CARON" },
1377: { 0x015A, "Sacute", "LATIN CAPITAL LETTER S WITH ACUTE" },
1378: { 0x015B, "sacute", "LATIN SMALL LETTER S WITH ACUTE" },
1379: { 0x015C, "Scirc", "LATIN CAPITAL LETTER S WITH CIRCUMFLEX" },
1380: { 0x015D, "scirc", "LATIN SMALL LETTER S WITH CIRCUMFLEX" },
1381: { 0x015E, "Scedil", "LATIN CAPITAL LETTER S WITH CEDILLA" },
1382: { 0x015F, "scedil", "LATIN SMALL LETTER S WITH CEDILLA" },
1383: { 0x0160, "Scaron", "LATIN CAPITAL LETTER S WITH CARON" },
1384: { 0x0161, "scaron", "LATIN SMALL LETTER S WITH CARON" },
1385: { 0x0162, "Tcedil", "LATIN CAPITAL LETTER T WITH CEDILLA" },
1386: { 0x0163, "tcedil", "LATIN SMALL LETTER T WITH CEDILLA" },
1387: { 0x0164, "Tcaron", "LATIN CAPITAL LETTER T WITH CARON" },
1388: { 0x0165, "tcaron", "LATIN SMALL LETTER T WITH CARON" },
1389: { 0x0166, "Tstrok", "LATIN CAPITAL LETTER T WITH STROKE" },
1390: { 0x0167, "tstrok", "LATIN SMALL LETTER T WITH STROKE" },
1391: { 0x0168, "Utilde", "LATIN CAPITAL LETTER U WITH TILDE" },
1392: { 0x0169, "utilde", "LATIN SMALL LETTER U WITH TILDE" },
1393: { 0x016A, "Umacr", "LATIN CAPITAL LETTER U WITH MACRON" },
1394: { 0x016B, "umacr", "LATIN SMALL LETTER U WITH MACRON" },
1395: { 0x016C, "Ubreve", "LATIN CAPITAL LETTER U WITH BREVE" },
1396: { 0x016D, "ubreve", "LATIN SMALL LETTER U WITH BREVE" },
1397: { 0x016E, "Uring", "LATIN CAPITAL LETTER U WITH RING ABOVE" },
1398: { 0x016F, "uring", "LATIN SMALL LETTER U WITH RING ABOVE" },
1399: { 0x0170, "Udblac", "LATIN CAPITAL LETTER U WITH DOUBLE ACUTE" },
1400: { 0x0171, "udblac", "LATIN SMALL LETTER U WITH DOUBLE ACUTE" },
1401: { 0x0172, "Uogon", "LATIN CAPITAL LETTER U WITH OGONEK" },
1402: { 0x0173, "uogon", "LATIN SMALL LETTER U WITH OGONEK" },
1403: { 0x0174, "Wcirc", "LATIN CAPITAL LETTER W WITH CIRCUMFLEX" },
1404: { 0x0175, "wcirc", "LATIN SMALL LETTER W WITH CIRCUMFLEX" },
1405: { 0x0176, "Ycirc", "LATIN CAPITAL LETTER Y WITH CIRCUMFLEX" },
1406: { 0x0177, "ycirc", "LATIN SMALL LETTER Y WITH CIRCUMFLEX" },
1407: { 0x0178, "Yuml", "LATIN CAPITAL LETTER Y WITH DIAERESIS" },
1408: { 0x0179, "Zacute", "LATIN CAPITAL LETTER Z WITH ACUTE" },
1409: { 0x017A, "zacute", "LATIN SMALL LETTER Z WITH ACUTE" },
1410: { 0x017B, "Zdot", "LATIN CAPITAL LETTER Z WITH DOT ABOVE" },
1411: { 0x017C, "zdot", "LATIN SMALL LETTER Z WITH DOT ABOVE" },
1412: { 0x017D, "Zcaron", "LATIN CAPITAL LETTER Z WITH CARON" },
1413: { 0x017E, "zcaron", "LATIN SMALL LETTER Z WITH CARON" },
1414: { 0x0192, "fnof", "LATIN SMALL LETTER F WITH HOOK" },
1415: { 0x01F5, "gacute", "LATIN SMALL LETTER G WITH ACUTE" },
1416: { 0x02C7, "caron", "CARON" },
1417: { 0x02D8, "breve", "BREVE" },
1418: { 0x02D9, "dot", "DOT ABOVE" },
1419: { 0x02DA, "ring", "RING ABOVE" },
1420: { 0x02DB, "ogon", "OGONEK" },
1421: { 0x02DC, "tilde", "TILDE" },
1422: { 0x02DD, "dblac", "DOUBLE ACUTE ACCENT" },
1423: { 0x0386, "Aacgr", "GREEK CAPITAL LETTER ALPHA WITH TONOS" },
1424: { 0x0388, "Eacgr", "GREEK CAPITAL LETTER EPSILON WITH TONOS" },
1425: { 0x0389, "EEacgr", "GREEK CAPITAL LETTER ETA WITH TONOS" },
1426: { 0x038A, "Iacgr", "GREEK CAPITAL LETTER IOTA WITH TONOS" },
1427: { 0x038C, "Oacgr", "GREEK CAPITAL LETTER OMICRON WITH TONOS" },
1428: { 0x038E, "Uacgr", "GREEK CAPITAL LETTER UPSILON WITH TONOS" },
1429: { 0x038F, "OHacgr", "GREEK CAPITAL LETTER OMEGA WITH TONOS" },
1430: { 0x0390, "idiagr", "GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS" },
1431: { 0x0391, "Agr", "GREEK CAPITAL LETTER ALPHA" },
1432: { 0x0392, "Bgr", "GREEK CAPITAL LETTER BETA" },
1433: { 0x0393, "b.Gamma", "GREEK CAPITAL LETTER GAMMA" },
1434: { 0x0393, "Gamma", "GREEK CAPITAL LETTER GAMMA" },
1435: { 0x0393, "Ggr", "GREEK CAPITAL LETTER GAMMA" },
1436: { 0x0394, "b.Delta", "GREEK CAPITAL LETTER DELTA" },
1437: { 0x0394, "Delta", "GREEK CAPITAL LETTER DELTA" },
1438: { 0x0394, "Dgr", "GREEK CAPITAL LETTER DELTA" },
1439: { 0x0395, "Egr", "GREEK CAPITAL LETTER EPSILON" },
1440: { 0x0396, "Zgr", "GREEK CAPITAL LETTER ZETA" },
1441: { 0x0397, "EEgr", "GREEK CAPITAL LETTER ETA" },
1442: { 0x0398, "b.Theta", "GREEK CAPITAL LETTER THETA" },
1443: { 0x0398, "Theta", "GREEK CAPITAL LETTER THETA" },
1444: { 0x0398, "THgr", "GREEK CAPITAL LETTER THETA" },
1445: { 0x0399, "Igr", "GREEK CAPITAL LETTER IOTA" },
1446: { 0x039A, "Kgr", "GREEK CAPITAL LETTER KAPPA" },
1447: { 0x039B, "b.Lambda", "GREEK CAPITAL LETTER LAMDA" },
1448: { 0x039B, "Lambda", "GREEK CAPITAL LETTER LAMDA" },
1449: { 0x039B, "Lgr", "GREEK CAPITAL LETTER LAMDA" },
1450: { 0x039C, "Mgr", "GREEK CAPITAL LETTER MU" },
1451: { 0x039D, "Ngr", "GREEK CAPITAL LETTER NU" },
1452: { 0x039E, "b.Xi", "GREEK CAPITAL LETTER XI" },
1453: { 0x039E, "Xgr", "GREEK CAPITAL LETTER XI" },
1454: { 0x039E, "Xi", "GREEK CAPITAL LETTER XI" },
1455: { 0x039F, "Ogr", "GREEK CAPITAL LETTER OMICRON" },
1456: { 0x03A0, "b.Pi", "GREEK CAPITAL LETTER PI" },
1457: { 0x03A0, "Pgr", "GREEK CAPITAL LETTER PI" },
1458: { 0x03A0, "Pi", "GREEK CAPITAL LETTER PI" },
1459: { 0x03A1, "Rgr", "GREEK CAPITAL LETTER RHO" },
1460: { 0x03A3, "b.Sigma", "GREEK CAPITAL LETTER SIGMA" },
1461: { 0x03A3, "Sgr", "GREEK CAPITAL LETTER SIGMA" },
1462: { 0x03A3, "Sigma", "GREEK CAPITAL LETTER SIGMA" },
1463: { 0x03A4, "Tgr", "GREEK CAPITAL LETTER TAU" },
1464: { 0x03A5, "Ugr", "" },
1465: { 0x03A6, "b.Phi", "GREEK CAPITAL LETTER PHI" },
1466: { 0x03A6, "PHgr", "GREEK CAPITAL LETTER PHI" },
1467: { 0x03A6, "Phi", "GREEK CAPITAL LETTER PHI" },
1468: { 0x03A7, "KHgr", "GREEK CAPITAL LETTER CHI" },
1469: { 0x03A8, "b.Psi", "GREEK CAPITAL LETTER PSI" },
1470: { 0x03A8, "PSgr", "GREEK CAPITAL LETTER PSI" },
1471: { 0x03A8, "Psi", "GREEK CAPITAL LETTER PSI" },
1472: { 0x03A9, "b.Omega", "GREEK CAPITAL LETTER OMEGA" },
1473: { 0x03A9, "OHgr", "GREEK CAPITAL LETTER OMEGA" },
1474: { 0x03A9, "Omega", "GREEK CAPITAL LETTER OMEGA" },
1475: { 0x03AA, "Idigr", "GREEK CAPITAL LETTER IOTA WITH DIALYTIKA" },
1476: { 0x03AB, "Udigr", "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" },
1477: { 0x03AC, "aacgr", "GREEK SMALL LETTER ALPHA WITH TONOS" },
1478: { 0x03AD, "eacgr", "GREEK SMALL LETTER EPSILON WITH TONOS" },
1479: { 0x03AE, "eeacgr", "GREEK SMALL LETTER ETA WITH TONOS" },
1480: { 0x03AF, "iacgr", "GREEK SMALL LETTER IOTA WITH TONOS" },
1481: { 0x03B0, "udiagr", "GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS" },
1482: { 0x03B1, "agr", "" },
1483: { 0x03B1, "alpha", "" },
1484: { 0x03B1, "b.alpha", "" },
1485: { 0x03B2, "b.beta", "GREEK SMALL LETTER BETA" },
1486: { 0x03B2, "beta", "GREEK SMALL LETTER BETA" },
1487: { 0x03B2, "bgr", "GREEK SMALL LETTER BETA" },
1488: { 0x03B3, "b.gamma", "GREEK SMALL LETTER GAMMA" },
1489: { 0x03B3, "gamma", "GREEK SMALL LETTER GAMMA" },
1490: { 0x03B3, "ggr", "GREEK SMALL LETTER GAMMA" },
1491: { 0x03B4, "b.delta", "GREEK SMALL LETTER DELTA" },
1492: { 0x03B4, "delta", "GREEK SMALL LETTER DELTA" },
1493: { 0x03B4, "dgr", "GREEK SMALL LETTER DELTA" },
1494: { 0x03B5, "b.epsi", "" },
1495: { 0x03B5, "b.epsis", "" },
1496: { 0x03B5, "b.epsiv", "" },
1497: { 0x03B5, "egr", "" },
1498: { 0x03B5, "epsiv", "" },
1499: { 0x03B6, "b.zeta", "GREEK SMALL LETTER ZETA" },
1500: { 0x03B6, "zeta", "GREEK SMALL LETTER ZETA" },
1501: { 0x03B6, "zgr", "GREEK SMALL LETTER ZETA" },
1502: { 0x03B7, "b.eta", "GREEK SMALL LETTER ETA" },
1503: { 0x03B7, "eegr", "GREEK SMALL LETTER ETA" },
1504: { 0x03B7, "eta", "GREEK SMALL LETTER ETA" },
1505: { 0x03B8, "b.thetas", "" },
1506: { 0x03B8, "thetas", "" },
1507: { 0x03B8, "thgr", "" },
1508: { 0x03B9, "b.iota", "GREEK SMALL LETTER IOTA" },
1509: { 0x03B9, "igr", "GREEK SMALL LETTER IOTA" },
1510: { 0x03B9, "iota", "GREEK SMALL LETTER IOTA" },
1511: { 0x03BA, "b.kappa", "GREEK SMALL LETTER KAPPA" },
1512: { 0x03BA, "kappa", "GREEK SMALL LETTER KAPPA" },
1513: { 0x03BA, "kgr", "GREEK SMALL LETTER KAPPA" },
1514: { 0x03BB, "b.lambda", "GREEK SMALL LETTER LAMDA" },
1515: { 0x03BB, "lambda", "GREEK SMALL LETTER LAMDA" },
1516: { 0x03BB, "lgr", "GREEK SMALL LETTER LAMDA" },
1517: { 0x03BC, "b.mu", "GREEK SMALL LETTER MU" },
1518: { 0x03BC, "mgr", "GREEK SMALL LETTER MU" },
1519: { 0x03BC, "mu", "GREEK SMALL LETTER MU" },
1520: { 0x03BD, "b.nu", "GREEK SMALL LETTER NU" },
1521: { 0x03BD, "ngr", "GREEK SMALL LETTER NU" },
1522: { 0x03BD, "nu", "GREEK SMALL LETTER NU" },
1523: { 0x03BE, "b.xi", "GREEK SMALL LETTER XI" },
1524: { 0x03BE, "xgr", "GREEK SMALL LETTER XI" },
1525: { 0x03BE, "xi", "GREEK SMALL LETTER XI" },
1526: { 0x03BF, "ogr", "GREEK SMALL LETTER OMICRON" },
1527: { 0x03C0, "b.pi", "GREEK SMALL LETTER PI" },
1528: { 0x03C0, "pgr", "GREEK SMALL LETTER PI" },
1529: { 0x03C0, "pi", "GREEK SMALL LETTER PI" },
1530: { 0x03C1, "b.rho", "GREEK SMALL LETTER RHO" },
1531: { 0x03C1, "rgr", "GREEK SMALL LETTER RHO" },
1532: { 0x03C1, "rho", "GREEK SMALL LETTER RHO" },
1533: { 0x03C2, "b.sigmav", "" },
1534: { 0x03C2, "sfgr", "" },
1535: { 0x03C2, "sigmav", "" },
1536: { 0x03C3, "b.sigma", "GREEK SMALL LETTER SIGMA" },
1537: { 0x03C3, "sgr", "GREEK SMALL LETTER SIGMA" },
1538: { 0x03C3, "sigma", "GREEK SMALL LETTER SIGMA" },
1539: { 0x03C4, "b.tau", "GREEK SMALL LETTER TAU" },
1540: { 0x03C4, "tau", "GREEK SMALL LETTER TAU" },
1541: { 0x03C4, "tgr", "GREEK SMALL LETTER TAU" },
1542: { 0x03C5, "b.upsi", "GREEK SMALL LETTER UPSILON" },
1543: { 0x03C5, "ugr", "GREEK SMALL LETTER UPSILON" },
1544: { 0x03C5, "upsi", "GREEK SMALL LETTER UPSILON" },
1545: { 0x03C6, "b.phis", "GREEK SMALL LETTER PHI" },
1546: { 0x03C6, "phgr", "GREEK SMALL LETTER PHI" },
1547: { 0x03C6, "phis", "GREEK SMALL LETTER PHI" },
1548: { 0x03C7, "b.chi", "GREEK SMALL LETTER CHI" },
1549: { 0x03C7, "chi", "GREEK SMALL LETTER CHI" },
1550: { 0x03C7, "khgr", "GREEK SMALL LETTER CHI" },
1551: { 0x03C8, "b.psi", "GREEK SMALL LETTER PSI" },
1552: { 0x03C8, "psgr", "GREEK SMALL LETTER PSI" },
1553: { 0x03C8, "psi", "GREEK SMALL LETTER PSI" },
1554: { 0x03C9, "b.omega", "GREEK SMALL LETTER OMEGA" },
1555: { 0x03C9, "ohgr", "GREEK SMALL LETTER OMEGA" },
1556: { 0x03C9, "omega", "GREEK SMALL LETTER OMEGA" },
1557: { 0x03CA, "idigr", "GREEK SMALL LETTER IOTA WITH DIALYTIKA" },
1558: { 0x03CB, "udigr", "GREEK SMALL LETTER UPSILON WITH DIALYTIKA" },
1559: { 0x03CC, "oacgr", "GREEK SMALL LETTER OMICRON WITH TONOS" },
1560: { 0x03CD, "uacgr", "GREEK SMALL LETTER UPSILON WITH TONOS" },
1561: { 0x03CE, "ohacgr", "GREEK SMALL LETTER OMEGA WITH TONOS" },
1562: { 0x03D1, "b.thetav", "" },
1563: { 0x03D1, "thetav", "" },
1564: { 0x03D2, "b.Upsi", "" },
1565: { 0x03D2, "Upsi", "" },
1566: { 0x03D5, "b.phiv", "GREEK PHI SYMBOL" },
1567: { 0x03D5, "phiv", "GREEK PHI SYMBOL" },
1568: { 0x03D6, "b.piv", "GREEK PI SYMBOL" },
1569: { 0x03D6, "piv", "GREEK PI SYMBOL" },
1570: { 0x03DC, "b.gammad", "GREEK LETTER DIGAMMA" },
1571: { 0x03DC, "gammad", "GREEK LETTER DIGAMMA" },
1572: { 0x03F0, "b.kappav", "GREEK KAPPA SYMBOL" },
1573: { 0x03F0, "kappav", "GREEK KAPPA SYMBOL" },
1574: { 0x03F1, "b.rhov", "GREEK RHO SYMBOL" },
1575: { 0x03F1, "rhov", "GREEK RHO SYMBOL" },
1576: { 0x0401, "IOcy", "CYRILLIC CAPITAL LETTER IO" },
1577: { 0x0402, "DJcy", "CYRILLIC CAPITAL LETTER DJE" },
1578: { 0x0403, "GJcy", "CYRILLIC CAPITAL LETTER GJE" },
1579: { 0x0404, "Jukcy", "CYRILLIC CAPITAL LETTER UKRAINIAN IE" },
1580: { 0x0405, "DScy", "CYRILLIC CAPITAL LETTER DZE" },
1581: { 0x0406, "Iukcy", "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I" },
1582: { 0x0407, "YIcy", "CYRILLIC CAPITAL LETTER YI" },
1583: { 0x0408, "Jsercy", "CYRILLIC CAPITAL LETTER JE" },
1584: { 0x0409, "LJcy", "CYRILLIC CAPITAL LETTER LJE" },
1585: { 0x040A, "NJcy", "CYRILLIC CAPITAL LETTER NJE" },
1586: { 0x040B, "TSHcy", "CYRILLIC CAPITAL LETTER TSHE" },
1587: { 0x040C, "KJcy", "CYRILLIC CAPITAL LETTER KJE" },
1588: { 0x040E, "Ubrcy", "CYRILLIC CAPITAL LETTER SHORT U" },
1589: { 0x040F, "DZcy", "CYRILLIC CAPITAL LETTER DZHE" },
1590: { 0x0410, "Acy", "CYRILLIC CAPITAL LETTER A" },
1591: { 0x0411, "Bcy", "CYRILLIC CAPITAL LETTER BE" },
1592: { 0x0412, "Vcy", "CYRILLIC CAPITAL LETTER VE" },
1593: { 0x0413, "Gcy", "CYRILLIC CAPITAL LETTER GHE" },
1594: { 0x0414, "Dcy", "CYRILLIC CAPITAL LETTER DE" },
1595: { 0x0415, "IEcy", "CYRILLIC CAPITAL LETTER IE" },
1596: { 0x0416, "ZHcy", "CYRILLIC CAPITAL LETTER ZHE" },
1597: { 0x0417, "Zcy", "CYRILLIC CAPITAL LETTER ZE" },
1598: { 0x0418, "Icy", "CYRILLIC CAPITAL LETTER I" },
1599: { 0x0419, "Jcy", "CYRILLIC CAPITAL LETTER SHORT I" },
1600: { 0x041A, "Kcy", "CYRILLIC CAPITAL LETTER KA" },
1601: { 0x041B, "Lcy", "CYRILLIC CAPITAL LETTER EL" },
1602: { 0x041C, "Mcy", "CYRILLIC CAPITAL LETTER EM" },
1603: { 0x041D, "Ncy", "CYRILLIC CAPITAL LETTER EN" },
1604: { 0x041E, "Ocy", "CYRILLIC CAPITAL LETTER O" },
1605: { 0x041F, "Pcy", "CYRILLIC CAPITAL LETTER PE" },
1606: { 0x0420, "Rcy", "CYRILLIC CAPITAL LETTER ER" },
1607: { 0x0421, "Scy", "CYRILLIC CAPITAL LETTER ES" },
1608: { 0x0422, "Tcy", "CYRILLIC CAPITAL LETTER TE" },
1609: { 0x0423, "Ucy", "CYRILLIC CAPITAL LETTER U" },
1610: { 0x0424, "Fcy", "CYRILLIC CAPITAL LETTER EF" },
1611: { 0x0425, "KHcy", "CYRILLIC CAPITAL LETTER HA" },
1612: { 0x0426, "TScy", "CYRILLIC CAPITAL LETTER TSE" },
1613: { 0x0427, "CHcy", "CYRILLIC CAPITAL LETTER CHE" },
1614: { 0x0428, "SHcy", "CYRILLIC CAPITAL LETTER SHA" },
1615: { 0x0429, "SHCHcy", "CYRILLIC CAPITAL LETTER SHCHA" },
1616: { 0x042A, "HARDcy", "CYRILLIC CAPITAL LETTER HARD SIGN" },
1617: { 0x042B, "Ycy", "CYRILLIC CAPITAL LETTER YERU" },
1618: { 0x042C, "SOFTcy", "CYRILLIC CAPITAL LETTER SOFT SIGN" },
1619: { 0x042D, "Ecy", "CYRILLIC CAPITAL LETTER E" },
1620: { 0x042E, "YUcy", "CYRILLIC CAPITAL LETTER YU" },
1621: { 0x042F, "YAcy", "CYRILLIC CAPITAL LETTER YA" },
1622: { 0x0430, "acy", "CYRILLIC SMALL LETTER A" },
1623: { 0x0431, "bcy", "CYRILLIC SMALL LETTER BE" },
1624: { 0x0432, "vcy", "CYRILLIC SMALL LETTER VE" },
1625: { 0x0433, "gcy", "CYRILLIC SMALL LETTER GHE" },
1626: { 0x0434, "dcy", "CYRILLIC SMALL LETTER DE" },
1627: { 0x0435, "iecy", "CYRILLIC SMALL LETTER IE" },
1628: { 0x0436, "zhcy", "CYRILLIC SMALL LETTER ZHE" },
1629: { 0x0437, "zcy", "CYRILLIC SMALL LETTER ZE" },
1630: { 0x0438, "icy", "CYRILLIC SMALL LETTER I" },
1631: { 0x0439, "jcy", "CYRILLIC SMALL LETTER SHORT I" },
1632: { 0x043A, "kcy", "CYRILLIC SMALL LETTER KA" },
1633: { 0x043B, "lcy", "CYRILLIC SMALL LETTER EL" },
1634: { 0x043C, "mcy", "CYRILLIC SMALL LETTER EM" },
1635: { 0x043D, "ncy", "CYRILLIC SMALL LETTER EN" },
1636: { 0x043E, "ocy", "CYRILLIC SMALL LETTER O" },
1637: { 0x043F, "pcy", "CYRILLIC SMALL LETTER PE" },
1638: { 0x0440, "rcy", "CYRILLIC SMALL LETTER ER" },
1639: { 0x0441, "scy", "CYRILLIC SMALL LETTER ES" },
1640: { 0x0442, "tcy", "CYRILLIC SMALL LETTER TE" },
1641: { 0x0443, "ucy", "CYRILLIC SMALL LETTER U" },
1642: { 0x0444, "fcy", "CYRILLIC SMALL LETTER EF" },
1643: { 0x0445, "khcy", "CYRILLIC SMALL LETTER HA" },
1644: { 0x0446, "tscy", "CYRILLIC SMALL LETTER TSE" },
1645: { 0x0447, "chcy", "CYRILLIC SMALL LETTER CHE" },
1646: { 0x0448, "shcy", "CYRILLIC SMALL LETTER SHA" },
1647: { 0x0449, "shchcy", "CYRILLIC SMALL LETTER SHCHA" },
1648: { 0x044A, "hardcy", "CYRILLIC SMALL LETTER HARD SIGN" },
1649: { 0x044B, "ycy", "CYRILLIC SMALL LETTER YERU" },
1650: { 0x044C, "softcy", "CYRILLIC SMALL LETTER SOFT SIGN" },
1651: { 0x044D, "ecy", "CYRILLIC SMALL LETTER E" },
1652: { 0x044E, "yucy", "CYRILLIC SMALL LETTER YU" },
1653: { 0x044F, "yacy", "CYRILLIC SMALL LETTER YA" },
1654: { 0x0451, "iocy", "CYRILLIC SMALL LETTER IO" },
1655: { 0x0452, "djcy", "CYRILLIC SMALL LETTER DJE" },
1656: { 0x0453, "gjcy", "CYRILLIC SMALL LETTER GJE" },
1657: { 0x0454, "jukcy", "CYRILLIC SMALL LETTER UKRAINIAN IE" },
1658: { 0x0455, "dscy", "CYRILLIC SMALL LETTER DZE" },
1659: { 0x0456, "iukcy", "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I" },
1660: { 0x0457, "yicy", "CYRILLIC SMALL LETTER YI" },
1661: { 0x0458, "jsercy", "CYRILLIC SMALL LETTER JE" },
1662: { 0x0459, "ljcy", "CYRILLIC SMALL LETTER LJE" },
1663: { 0x045A, "njcy", "CYRILLIC SMALL LETTER NJE" },
1664: { 0x045B, "tshcy", "CYRILLIC SMALL LETTER TSHE" },
1665: { 0x045C, "kjcy", "CYRILLIC SMALL LETTER KJE" },
1666: { 0x045E, "ubrcy", "CYRILLIC SMALL LETTER SHORT U" },
1667: { 0x045F, "dzcy", "CYRILLIC SMALL LETTER DZHE" },
1668: { 0x2002, "ensp", "EN SPACE" },
1669: { 0x2003, "emsp", "EM SPACE" },
1670: { 0x2004, "emsp13", "THREE-PER-EM SPACE" },
1671: { 0x2005, "emsp14", "FOUR-PER-EM SPACE" },
1672: { 0x2007, "numsp", "FIGURE SPACE" },
1673: { 0x2008, "puncsp", "PUNCTUATION SPACE" },
1674: { 0x2009, "thinsp", "THIN SPACE" },
1675: { 0x200A, "hairsp", "HAIR SPACE" },
1676: { 0x2010, "dash", "HYPHEN" },
1677: { 0x2013, "ndash", "EN DASH" },
1678: { 0x2014, "mdash", "EM DASH" },
1679: { 0x2015, "horbar", "HORIZONTAL BAR" },
1680: { 0x2016, "Verbar", "DOUBLE VERTICAL LINE" },
1681: { 0x2018, "lsquo", "" },
1682: { 0x2018, "rsquor", "" },
1683: { 0x2019, "rsquo", "RIGHT SINGLE QUOTATION MARK" },
1684: { 0x201A, "lsquor", "SINGLE LOW-9 QUOTATION MARK" },
1685: { 0x201C, "ldquo", "" },
1686: { 0x201C, "rdquor", "" },
1687: { 0x201D, "rdquo", "RIGHT DOUBLE QUOTATION MARK" },
1688: { 0x201E, "ldquor", "DOUBLE LOW-9 QUOTATION MARK" },
1689: { 0x2020, "dagger", "DAGGER" },
1690: { 0x2021, "Dagger", "DOUBLE DAGGER" },
1691: { 0x2022, "bull", "BULLET" },
1692: { 0x2025, "nldr", "TWO DOT LEADER" },
1693: { 0x2026, "hellip", "HORIZONTAL ELLIPSIS" },
1694: { 0x2026, "mldr", "HORIZONTAL ELLIPSIS" },
1695: { 0x2030, "permil", "PER MILLE SIGN" },
1696: { 0x2032, "prime", "PRIME" },
1697: { 0x2032, "vprime", "PRIME" },
1698: { 0x2033, "Prime", "DOUBLE PRIME" },
1699: { 0x2034, "tprime", "TRIPLE PRIME" },
1700: { 0x2035, "bprime", "REVERSED PRIME" },
1701: { 0x2041, "caret", "CARET" },
1702: { 0x2043, "hybull", "HYPHEN BULLET" },
1703: { 0x20DB, "tdot", "COMBINING THREE DOTS ABOVE" },
1704: { 0x20DC, "DotDot", "COMBINING FOUR DOTS ABOVE" },
1705: { 0x2105, "incare", "CARE OF" },
1706: { 0x210B, "hamilt", "SCRIPT CAPITAL H" },
1707: { 0x210F, "planck", "PLANCK CONSTANT OVER TWO PI" },
1708: { 0x2111, "image", "BLACK-LETTER CAPITAL I" },
1709: { 0x2112, "lagran", "SCRIPT CAPITAL L" },
1710: { 0x2113, "ell", "SCRIPT SMALL L" },
1711: { 0x2116, "numero", "NUMERO SIGN" },
1712: { 0x2117, "copysr", "SOUND RECORDING COPYRIGHT" },
1713: { 0x2118, "weierp", "SCRIPT CAPITAL P" },
1714: { 0x211C, "real", "BLACK-LETTER CAPITAL R" },
1715: { 0x211E, "rx", "PRESCRIPTION TAKE" },
1716: { 0x2122, "trade", "TRADE MARK SIGN" },
1717: { 0x2126, "ohm", "OHM SIGN" },
1718: { 0x212B, "angst", "ANGSTROM SIGN" },
1719: { 0x212C, "bernou", "SCRIPT CAPITAL B" },
1720: { 0x2133, "phmmat", "SCRIPT CAPITAL M" },
1721: { 0x2134, "order", "SCRIPT SMALL O" },
1722: { 0x2135, "aleph", "ALEF SYMBOL" },
1723: { 0x2136, "beth", "BET SYMBOL" },
1724: { 0x2137, "gimel", "GIMEL SYMBOL" },
1725: { 0x2138, "daleth", "DALET SYMBOL" },
1726: { 0x2153, "frac13", "VULGAR FRACTION ONE THIRD" },
1727: { 0x2154, "frac23", "VULGAR FRACTION TWO THIRDS" },
1728: { 0x2155, "frac15", "VULGAR FRACTION ONE FIFTH" },
1729: { 0x2156, "frac25", "VULGAR FRACTION TWO FIFTHS" },
1730: { 0x2157, "frac35", "VULGAR FRACTION THREE FIFTHS" },
1731: { 0x2158, "frac45", "VULGAR FRACTION FOUR FIFTHS" },
1732: { 0x2159, "frac16", "VULGAR FRACTION ONE SIXTH" },
1733: { 0x215A, "frac56", "VULGAR FRACTION FIVE SIXTHS" },
1734: { 0x215B, "frac18", "" },
1735: { 0x215C, "frac38", "" },
1736: { 0x215D, "frac58", "" },
1737: { 0x215E, "frac78", "" },
1738: { 0x2190, "larr", "LEFTWARDS DOUBLE ARROW" },
1739: { 0x2191, "uarr", "UPWARDS ARROW" },
1740: { 0x2192, "rarr", "RIGHTWARDS DOUBLE ARROW" },
1741: { 0x2193, "darr", "DOWNWARDS ARROW" },
1742: { 0x2194, "harr", "LEFT RIGHT ARROW" },
1743: { 0x2194, "xhArr", "LEFT RIGHT ARROW" },
1744: { 0x2194, "xharr", "LEFT RIGHT ARROW" },
1745: { 0x2195, "varr", "UP DOWN ARROW" },
1746: { 0x2196, "nwarr", "NORTH WEST ARROW" },
1747: { 0x2197, "nearr", "NORTH EAST ARROW" },
1748: { 0x2198, "drarr", "SOUTH EAST ARROW" },
1749: { 0x2199, "dlarr", "SOUTH WEST ARROW" },
1750: { 0x219A, "nlarr", "LEFTWARDS ARROW WITH STROKE" },
1751: { 0x219B, "nrarr", "RIGHTWARDS ARROW WITH STROKE" },
1752: { 0x219D, "rarrw", "RIGHTWARDS SQUIGGLE ARROW" },
1753: { 0x219E, "Larr", "LEFTWARDS TWO HEADED ARROW" },
1754: { 0x21A0, "Rarr", "RIGHTWARDS TWO HEADED ARROW" },
1755: { 0x21A2, "larrtl", "LEFTWARDS ARROW WITH TAIL" },
1756: { 0x21A3, "rarrtl", "RIGHTWARDS ARROW WITH TAIL" },
1757: { 0x21A6, "map", "RIGHTWARDS ARROW FROM BAR" },
1758: { 0x21A9, "larrhk", "LEFTWARDS ARROW WITH HOOK" },
1759: { 0x21AA, "rarrhk", "RIGHTWARDS ARROW WITH HOOK" },
1760: { 0x21AB, "larrlp", "LEFTWARDS ARROW WITH LOOP" },
1761: { 0x21AC, "rarrlp", "RIGHTWARDS ARROW WITH LOOP" },
1762: { 0x21AD, "harrw", "LEFT RIGHT WAVE ARROW" },
1763: { 0x21AE, "nharr", "LEFT RIGHT ARROW WITH STROKE" },
1764: { 0x21B0, "lsh", "UPWARDS ARROW WITH TIP LEFTWARDS" },
1765: { 0x21B1, "rsh", "UPWARDS ARROW WITH TIP RIGHTWARDS" },
1766: { 0x21B6, "cularr", "ANTICLOCKWISE TOP SEMICIRCLE ARROW" },
1767: { 0x21B7, "curarr", "CLOCKWISE TOP SEMICIRCLE ARROW" },
1768: { 0x21BA, "olarr", "ANTICLOCKWISE OPEN CIRCLE ARROW" },
1769: { 0x21BB, "orarr", "CLOCKWISE OPEN CIRCLE ARROW" },
1770: { 0x21BC, "lharu", "LEFTWARDS HARPOON WITH BARB UPWARDS" },
1771: { 0x21BD, "lhard", "LEFTWARDS HARPOON WITH BARB DOWNWARDS" },
1772: { 0x21BE, "uharr", "UPWARDS HARPOON WITH BARB RIGHTWARDS" },
1773: { 0x21BF, "uharl", "UPWARDS HARPOON WITH BARB LEFTWARDS" },
1774: { 0x21C0, "rharu", "RIGHTWARDS HARPOON WITH BARB UPWARDS" },
1775: { 0x21C1, "rhard", "RIGHTWARDS HARPOON WITH BARB DOWNWARDS" },
1776: { 0x21C2, "dharr", "DOWNWARDS HARPOON WITH BARB RIGHTWARDS" },
1777: { 0x21C3, "dharl", "DOWNWARDS HARPOON WITH BARB LEFTWARDS" },
1778: { 0x21C4, "rlarr2", "RIGHTWARDS ARROW OVER LEFTWARDS ARROW" },
1779: { 0x21C6, "lrarr2", "LEFTWARDS ARROW OVER RIGHTWARDS ARROW" },
1780: { 0x21C7, "larr2", "LEFTWARDS PAIRED ARROWS" },
1781: { 0x21C8, "uarr2", "UPWARDS PAIRED ARROWS" },
1782: { 0x21C9, "rarr2", "RIGHTWARDS PAIRED ARROWS" },
1783: { 0x21CA, "darr2", "DOWNWARDS PAIRED ARROWS" },
1784: { 0x21CB, "lrhar2", "LEFTWARDS HARPOON OVER RIGHTWARDS HARPOON" },
1785: { 0x21CC, "rlhar2", "RIGHTWARDS HARPOON OVER LEFTWARDS HARPOON" },
1786: { 0x21CD, "nlArr", "LEFTWARDS DOUBLE ARROW WITH STROKE" },
1787: { 0x21CE, "nhArr", "LEFT RIGHT DOUBLE ARROW WITH STROKE" },
1788: { 0x21CF, "nrArr", "RIGHTWARDS DOUBLE ARROW WITH STROKE" },
1789: { 0x21D0, "lArr", "LEFTWARDS ARROW" },
1790: { 0x21D0, "xlArr", "LEFTWARDS DOUBLE ARROW" },
1791: { 0x21D1, "uArr", "UPWARDS DOUBLE ARROW" },
1792: { 0x21D2, "rArr", "RIGHTWARDS ARROW" },
1793: { 0x21D2, "xrArr", "RIGHTWARDS DOUBLE ARROW" },
1794: { 0x21D3, "dArr", "DOWNWARDS DOUBLE ARROW" },
1795: { 0x21D4, "hArr", "" },
1796: { 0x21D4, "iff", "LEFT RIGHT DOUBLE ARROW" },
1797: { 0x21D5, "vArr", "UP DOWN DOUBLE ARROW" },
1798: { 0x21DA, "lAarr", "LEFTWARDS TRIPLE ARROW" },
1799: { 0x21DB, "rAarr", "RIGHTWARDS TRIPLE ARROW" },
1800: { 0x2200, "forall", "" },
1801: { 0x2201, "comp", "COMPLEMENT" },
1802: { 0x2202, "part", "" },
1803: { 0x2203, "exist", "" },
1804: { 0x2204, "nexist", "THERE DOES NOT EXIST" },
1805: { 0x2205, "empty", "" },
1806: { 0x2207, "nabla", "NABLA" },
1807: { 0x2209, "notin", "" },
1808: { 0x220A, "epsi", "" },
1809: { 0x220A, "epsis", "" },
1810: { 0x220A, "isin", "" },
1811: { 0x220D, "bepsi", "SMALL CONTAINS AS MEMBER" },
1812: { 0x220D, "ni", "" },
1813: { 0x220F, "prod", "N-ARY PRODUCT" },
1814: { 0x2210, "amalg", "N-ARY COPRODUCT" },
1815: { 0x2210, "coprod", "N-ARY COPRODUCT" },
1816: { 0x2210, "samalg", "" },
1817: { 0x2211, "sum", "N-ARY SUMMATION" },
1818: { 0x2212, "minus", "MINUS SIGN" },
1819: { 0x2213, "mnplus", "" },
1820: { 0x2214, "plusdo", "DOT PLUS" },
1821: { 0x2216, "setmn", "SET MINUS" },
1822: { 0x2216, "ssetmn", "SET MINUS" },
1823: { 0x2217, "lowast", "ASTERISK OPERATOR" },
1824: { 0x2218, "compfn", "RING OPERATOR" },
1825: { 0x221A, "radic", "" },
1826: { 0x221D, "prop", "" },
1827: { 0x221D, "vprop", "" },
1828: { 0x221E, "infin", "" },
1829: { 0x221F, "ang90", "RIGHT ANGLE" },
1830: { 0x2220, "ang", "ANGLE" },
1831: { 0x2221, "angmsd", "MEASURED ANGLE" },
1832: { 0x2222, "angsph", "" },
1833: { 0x2223, "mid", "" },
1834: { 0x2224, "nmid", "DOES NOT DIVIDE" },
1835: { 0x2225, "par", "PARALLEL TO" },
1836: { 0x2225, "spar", "PARALLEL TO" },
1837: { 0x2226, "npar", "NOT PARALLEL TO" },
1838: { 0x2226, "nspar", "NOT PARALLEL TO" },
1839: { 0x2227, "and", "" },
1840: { 0x2228, "or", "" },
1841: { 0x2229, "cap", "" },
1842: { 0x222A, "cup", "" },
1843: { 0x222B, "int", "" },
1844: { 0x222E, "conint", "" },
1845: { 0x2234, "there4", "" },
1846: { 0x2235, "becaus", "BECAUSE" },
1847: { 0x223C, "sim", "" },
1848: { 0x223C, "thksim", "TILDE OPERATOR" },
1849: { 0x223D, "bsim", "" },
1850: { 0x2240, "wreath", "WREATH PRODUCT" },
1851: { 0x2241, "nsim", "" },
1852: { 0x2243, "sime", "" },
1853: { 0x2244, "nsime", "" },
1854: { 0x2245, "cong", "" },
1855: { 0x2247, "ncong", "NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO" },
1856: { 0x2248, "ap", "" },
1857: { 0x2248, "thkap", "ALMOST EQUAL TO" },
1858: { 0x2249, "nap", "NOT ALMOST EQUAL TO" },
1859: { 0x224A, "ape", "" },
1860: { 0x224C, "bcong", "ALL EQUAL TO" },
1861: { 0x224D, "asymp", "EQUIVALENT TO" },
1862: { 0x224E, "bump", "" },
1863: { 0x224F, "bumpe", "" },
1864: { 0x2250, "esdot", "" },
1865: { 0x2251, "eDot", "" },
1866: { 0x2252, "efDot", "" },
1867: { 0x2253, "erDot", "" },
1868: { 0x2254, "colone", "" },
1869: { 0x2255, "ecolon", "" },
1870: { 0x2256, "ecir", "" },
1871: { 0x2257, "cire", "" },
1872: { 0x2259, "wedgeq", "ESTIMATES" },
1873: { 0x225C, "trie", "" },
1874: { 0x2260, "ne", "" },
1875: { 0x2261, "equiv", "" },
1876: { 0x2262, "nequiv", "NOT IDENTICAL TO" },
1877: { 0x2264, "le", "" },
1878: { 0x2264, "les", "LESS-THAN OR EQUAL TO" },
1879: { 0x2265, "ge", "GREATER-THAN OR EQUAL TO" },
1880: { 0x2265, "ges", "GREATER-THAN OR EQUAL TO" },
1881: { 0x2266, "lE", "" },
1882: { 0x2267, "gE", "" },
1883: { 0x2268, "lnE", "" },
1884: { 0x2268, "lne", "" },
1885: { 0x2268, "lvnE", "LESS-THAN BUT NOT EQUAL TO" },
1886: { 0x2269, "gnE", "" },
1887: { 0x2269, "gne", "" },
1888: { 0x2269, "gvnE", "GREATER-THAN BUT NOT EQUAL TO" },
1889: { 0x226A, "Lt", "MUCH LESS-THAN" },
1890: { 0x226B, "Gt", "MUCH GREATER-THAN" },
1891: { 0x226C, "twixt", "BETWEEN" },
1892: { 0x226E, "nlt", "NOT LESS-THAN" },
1893: { 0x226F, "ngt", "NOT GREATER-THAN" },
1894: { 0x2270, "nlE", "" },
1895: { 0x2270, "nle", "NEITHER LESS-THAN NOR EQUAL TO" },
1896: { 0x2270, "nles", "" },
1897: { 0x2271, "ngE", "" },
1898: { 0x2271, "nge", "NEITHER GREATER-THAN NOR EQUAL TO" },
1899: { 0x2271, "nges", "" },
1900: { 0x2272, "lap", "LESS-THAN OR EQUIVALENT TO" },
1901: { 0x2272, "lsim", "LESS-THAN OR EQUIVALENT TO" },
1902: { 0x2273, "gap", "GREATER-THAN OR EQUIVALENT TO" },
1903: { 0x2273, "gsim", "GREATER-THAN OR EQUIVALENT TO" },
1904: { 0x2276, "lg", "LESS-THAN OR GREATER-THAN" },
1905: { 0x2277, "gl", "" },
1906: { 0x227A, "pr", "" },
1907: { 0x227B, "sc", "" },
1908: { 0x227C, "cupre", "" },
1909: { 0x227C, "pre", "" },
1910: { 0x227D, "sccue", "" },
1911: { 0x227D, "sce", "" },
1912: { 0x227E, "prap", "" },
1913: { 0x227E, "prsim", "" },
1914: { 0x227F, "scap", "" },
1915: { 0x227F, "scsim", "" },
1916: { 0x2280, "npr", "DOES NOT PRECEDE" },
1917: { 0x2281, "nsc", "DOES NOT SUCCEED" },
1918: { 0x2282, "sub", "" },
1919: { 0x2283, "sup", "" },
1920: { 0x2284, "nsub", "NOT A SUBSET OF" },
1921: { 0x2285, "nsup", "NOT A SUPERSET OF" },
1922: { 0x2286, "subE", "" },
1923: { 0x2286, "sube", "" },
1924: { 0x2287, "supE", "" },
1925: { 0x2287, "supe", "" },
1926: { 0x2288, "nsubE", "" },
1927: { 0x2288, "nsube", "" },
1928: { 0x2289, "nsupE", "" },
1929: { 0x2289, "nsupe", "" },
1930: { 0x228A, "subne", "" },
1931: { 0x228A, "subnE", "SUBSET OF WITH NOT EQUAL TO" },
1932: { 0x228A, "vsubne", "SUBSET OF WITH NOT EQUAL TO" },
1933: { 0x228B, "supnE", "" },
1934: { 0x228B, "supne", "" },
1935: { 0x228B, "vsupnE", "SUPERSET OF WITH NOT EQUAL TO" },
1936: { 0x228B, "vsupne", "SUPERSET OF WITH NOT EQUAL TO" },
1937: { 0x228E, "uplus", "MULTISET UNION" },
1938: { 0x228F, "sqsub", "" },
1939: { 0x2290, "sqsup", "" },
1940: { 0x2291, "sqsube", "" },
1941: { 0x2292, "sqsupe", "" },
1942: { 0x2293, "sqcap", "SQUARE CAP" },
1943: { 0x2294, "sqcup", "SQUARE CUP" },
1944: { 0x2295, "oplus", "CIRCLED PLUS" },
1945: { 0x2296, "ominus", "CIRCLED MINUS" },
1946: { 0x2297, "otimes", "CIRCLED TIMES" },
1947: { 0x2298, "osol", "CIRCLED DIVISION SLASH" },
1948: { 0x2299, "odot", "CIRCLED DOT OPERATOR" },
1949: { 0x229A, "ocir", "CIRCLED RING OPERATOR" },
1950: { 0x229B, "oast", "CIRCLED ASTERISK OPERATOR" },
1951: { 0x229D, "odash", "CIRCLED DASH" },
1952: { 0x229E, "plusb", "SQUARED PLUS" },
1953: { 0x229F, "minusb", "SQUARED MINUS" },
1954: { 0x22A0, "timesb", "SQUARED TIMES" },
1955: { 0x22A1, "sdotb", "SQUARED DOT OPERATOR" },
1956: { 0x22A2, "vdash", "" },
1957: { 0x22A3, "dashv", "" },
1958: { 0x22A4, "top", "DOWN TACK" },
1959: { 0x22A5, "bottom", "" },
1960: { 0x22A5, "perp", "" },
1961: { 0x22A7, "models", "MODELS" },
1962: { 0x22A8, "vDash", "" },
1963: { 0x22A9, "Vdash", "" },
1964: { 0x22AA, "Vvdash", "" },
1965: { 0x22AC, "nvdash", "DOES NOT PROVE" },
1966: { 0x22AD, "nvDash", "NOT TRUE" },
1967: { 0x22AE, "nVdash", "DOES NOT FORCE" },
1968: { 0x22AF, "nVDash", "NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE" },
1969: { 0x22B2, "vltri", "" },
1970: { 0x22B3, "vrtri", "" },
1971: { 0x22B4, "ltrie", "" },
1972: { 0x22B5, "rtrie", "" },
1973: { 0x22B8, "mumap", "MULTIMAP" },
1974: { 0x22BA, "intcal", "INTERCALATE" },
1975: { 0x22BB, "veebar", "" },
1976: { 0x22BC, "barwed", "NAND" },
1977: { 0x22C4, "diam", "DIAMOND OPERATOR" },
1978: { 0x22C5, "sdot", "DOT OPERATOR" },
1979: { 0x22C6, "sstarf", "STAR OPERATOR" },
1980: { 0x22C6, "star", "STAR OPERATOR" },
1981: { 0x22C7, "divonx", "DIVISION TIMES" },
1982: { 0x22C8, "bowtie", "" },
1983: { 0x22C9, "ltimes", "LEFT NORMAL FACTOR SEMIDIRECT PRODUCT" },
1984: { 0x22CA, "rtimes", "RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT" },
1985: { 0x22CB, "lthree", "LEFT SEMIDIRECT PRODUCT" },
1986: { 0x22CC, "rthree", "RIGHT SEMIDIRECT PRODUCT" },
1987: { 0x22CD, "bsime", "" },
1988: { 0x22CE, "cuvee", "CURLY LOGICAL OR" },
1989: { 0x22CF, "cuwed", "CURLY LOGICAL AND" },
1990: { 0x22D0, "Sub", "" },
1991: { 0x22D1, "Sup", "" },
1992: { 0x22D2, "Cap", "DOUBLE INTERSECTION" },
1993: { 0x22D3, "Cup", "DOUBLE UNION" },
1994: { 0x22D4, "fork", "" },
1995: { 0x22D6, "ldot", "" },
1996: { 0x22D7, "gsdot", "" },
1997: { 0x22D8, "Ll", "" },
1998: { 0x22D9, "Gg", "VERY MUCH GREATER-THAN" },
1999: { 0x22DA, "lEg", "" },
2000: { 0x22DA, "leg", "" },
2001: { 0x22DB, "gEl", "" },
2002: { 0x22DB, "gel", "" },
2003: { 0x22DC, "els", "" },
2004: { 0x22DD, "egs", "" },
2005: { 0x22DE, "cuepr", "" },
2006: { 0x22DF, "cuesc", "" },
2007: { 0x22E0, "npre", "DOES NOT PRECEDE OR EQUAL" },
2008: { 0x22E1, "nsce", "DOES NOT SUCCEED OR EQUAL" },
2009: { 0x22E6, "lnsim", "" },
2010: { 0x22E7, "gnsim", "GREATER-THAN BUT NOT EQUIVALENT TO" },
2011: { 0x22E8, "prnap", "" },
2012: { 0x22E8, "prnsim", "" },
2013: { 0x22E9, "scnap", "" },
2014: { 0x22E9, "scnsim", "" },
2015: { 0x22EA, "nltri", "NOT NORMAL SUBGROUP OF" },
2016: { 0x22EB, "nrtri", "DOES NOT CONTAIN AS NORMAL SUBGROUP" },
2017: { 0x22EC, "nltrie", "NOT NORMAL SUBGROUP OF OR EQUAL TO" },
2018: { 0x22ED, "nrtrie", "DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL" },
2019: { 0x22EE, "vellip", "" },
2020: { 0x2306, "Barwed", "PERSPECTIVE" },
2021: { 0x2308, "lceil", "LEFT CEILING" },
2022: { 0x2309, "rceil", "RIGHT CEILING" },
2023: { 0x230A, "lfloor", "LEFT FLOOR" },
2024: { 0x230B, "rfloor", "RIGHT FLOOR" },
2025: { 0x230C, "drcrop", "BOTTOM RIGHT CROP" },
2026: { 0x230D, "dlcrop", "BOTTOM LEFT CROP" },
2027: { 0x230E, "urcrop", "TOP RIGHT CROP" },
2028: { 0x230F, "ulcrop", "TOP LEFT CROP" },
2029: { 0x2315, "telrec", "TELEPHONE RECORDER" },
2030: { 0x2316, "target", "POSITION INDICATOR" },
2031: { 0x231C, "ulcorn", "TOP LEFT CORNER" },
2032: { 0x231D, "urcorn", "TOP RIGHT CORNER" },
2033: { 0x231E, "dlcorn", "BOTTOM LEFT CORNER" },
2034: { 0x231F, "drcorn", "BOTTOM RIGHT CORNER" },
2035: { 0x2322, "frown", "" },
2036: { 0x2322, "sfrown", "FROWN" },
2037: { 0x2323, "smile", "" },
2038: { 0x2323, "ssmile", "SMILE" },
2039: { 0x2423, "blank", "OPEN BOX" },
2040: { 0x24C8, "oS", "CIRCLED LATIN CAPITAL LETTER S" },
2041: { 0x2500, "boxh", "BOX DRAWINGS LIGHT HORIZONTAL" },
2042: { 0x2502, "boxv", "BOX DRAWINGS LIGHT VERTICAL" },
2043: { 0x250C, "boxdr", "BOX DRAWINGS LIGHT DOWN AND RIGHT" },
2044: { 0x2510, "boxdl", "BOX DRAWINGS LIGHT DOWN AND LEFT" },
2045: { 0x2514, "boxur", "BOX DRAWINGS LIGHT UP AND RIGHT" },
2046: { 0x2518, "boxul", "BOX DRAWINGS LIGHT UP AND LEFT" },
2047: { 0x251C, "boxvr", "BOX DRAWINGS LIGHT VERTICAL AND RIGHT" },
2048: { 0x2524, "boxvl", "BOX DRAWINGS LIGHT VERTICAL AND LEFT" },
2049: { 0x252C, "boxhd", "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL" },
2050: { 0x2534, "boxhu", "BOX DRAWINGS LIGHT UP AND HORIZONTAL" },
2051: { 0x253C, "boxvh", "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL" },
2052: { 0x2550, "boxH", "BOX DRAWINGS DOUBLE HORIZONTAL" },
2053: { 0x2551, "boxV", "BOX DRAWINGS DOUBLE VERTICAL" },
2054: { 0x2552, "boxDR", "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE" },
2055: { 0x2553, "boxDr", "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE" },
2056: { 0x2554, "boxdR", "BOX DRAWINGS DOUBLE DOWN AND RIGHT" },
2057: { 0x2555, "boxDL", "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE" },
2058: { 0x2556, "boxdL", "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE" },
2059: { 0x2557, "boxDl", "BOX DRAWINGS DOUBLE DOWN AND LEFT" },
2060: { 0x2558, "boxUR", "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE" },
2061: { 0x2559, "boxuR", "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE" },
2062: { 0x255A, "boxUr", "BOX DRAWINGS DOUBLE UP AND RIGHT" },
2063: { 0x255B, "boxUL", "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE" },
2064: { 0x255C, "boxUl", "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE" },
2065: { 0x255D, "boxuL", "BOX DRAWINGS DOUBLE UP AND LEFT" },
2066: { 0x255E, "boxvR", "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE" },
2067: { 0x255F, "boxVR", "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE" },
2068: { 0x2560, "boxVr", "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT" },
2069: { 0x2561, "boxvL", "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE" },
2070: { 0x2562, "boxVL", "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE" },
2071: { 0x2563, "boxVl", "BOX DRAWINGS DOUBLE VERTICAL AND LEFT" },
2072: { 0x2564, "boxhD", "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE" },
2073: { 0x2565, "boxHD", "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE" },
2074: { 0x2566, "boxHd", "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL" },
2075: { 0x2567, "boxhU", "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE" },
2076: { 0x2568, "boxHU", "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE" },
2077: { 0x2569, "boxHu", "BOX DRAWINGS DOUBLE UP AND HORIZONTAL" },
2078: { 0x256A, "boxvH", "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE" },
2079: { 0x256B, "boxVH", "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE" },
2080: { 0x256C, "boxVh", "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL" },
2081: { 0x2580, "uhblk", "UPPER HALF BLOCK" },
2082: { 0x2584, "lhblk", "LOWER HALF BLOCK" },
2083: { 0x2588, "block", "FULL BLOCK" },
2084: { 0x2591, "blk14", "LIGHT SHADE" },
2085: { 0x2592, "blk12", "MEDIUM SHADE" },
2086: { 0x2593, "blk34", "DARK SHADE" },
2087: { 0x25A1, "square", "WHITE SQUARE" },
2088: { 0x25A1, "squ", "WHITE SQUARE" },
2089: { 0x25AA, "squf", "" },
2090: { 0x25AD, "rect", "WHITE RECTANGLE" },
2091: { 0x25AE, "marker", "BLACK VERTICAL RECTANGLE" },
2092: { 0x25B3, "xutri", "WHITE UP-POINTING TRIANGLE" },
2093: { 0x25B4, "utrif", "BLACK UP-POINTING TRIANGLE" },
2094: { 0x25B5, "utri", "WHITE UP-POINTING TRIANGLE" },
2095: { 0x25B8, "rtrif", "BLACK RIGHT-POINTING TRIANGLE" },
2096: { 0x25B9, "rtri", "WHITE RIGHT-POINTING TRIANGLE" },
2097: { 0x25BD, "xdtri", "WHITE DOWN-POINTING TRIANGLE" },
2098: { 0x25BE, "dtrif", "BLACK DOWN-POINTING TRIANGLE" },
2099: { 0x25BF, "dtri", "WHITE DOWN-POINTING TRIANGLE" },
2100: { 0x25C2, "ltrif", "BLACK LEFT-POINTING TRIANGLE" },
2101: { 0x25C3, "ltri", "WHITE LEFT-POINTING TRIANGLE" },
2102: { 0x25CA, "loz", "LOZENGE" },
2103: { 0x25CB, "cir", "WHITE CIRCLE" },
2104: { 0x25CB, "xcirc", "WHITE CIRCLE" },
2105: { 0x2605, "starf", "BLACK STAR" },
2106: { 0x260E, "phone", "TELEPHONE SIGN" },
2107: { 0x2640, "female", "" },
2108: { 0x2642, "male", "MALE SIGN" },
2109: { 0x2660, "spades", "BLACK SPADE SUIT" },
2110: { 0x2663, "clubs", "BLACK CLUB SUIT" },
2111: { 0x2665, "hearts", "BLACK HEART SUIT" },
2112: { 0x2666, "diams", "BLACK DIAMOND SUIT" },
2113: { 0x2669, "sung", "" },
2114: { 0x266D, "flat", "MUSIC FLAT SIGN" },
2115: { 0x266E, "natur", "MUSIC NATURAL SIGN" },
2116: { 0x266F, "sharp", "MUSIC SHARP SIGN" },
2117: { 0x2713, "check", "CHECK MARK" },
2118: { 0x2717, "cross", "BALLOT X" },
2119: { 0x2720, "malt", "MALTESE CROSS" },
2120: { 0x2726, "lozf", "" },
2121: { 0x2736, "sext", "SIX POINTED BLACK STAR" },
2122: { 0x3008, "lang", "" },
2123: { 0x3009, "rang", "" },
2124: { 0xE291, "rpargt", "" },
2125: { 0xE2A2, "lnap", "" },
2126: { 0xE2AA, "nsmid", "" },
2127: { 0xE2B3, "prnE", "" },
2128: { 0xE2B5, "scnE", "" },
2129: { 0xE2B8, "vsubnE", "" },
2130: { 0xE301, "smid", "" },
2131: { 0xE411, "gnap", "" },
2132: { 0xFB00, "fflig", "" },
2133: { 0xFB01, "filig", "" },
2134: { 0xFB02, "fllig", "" },
2135: { 0xFB03, "ffilig", "" },
2136: { 0xFB04, "ffllig", "" },
2137: { 0xFE68, "sbsol", "SMALL REVERSE SOLIDUS" },
2138: };
2139:
2140: /************************************************************************
2141: * *
2142: * Commodity functions to handle entities *
2143: * *
2144: ************************************************************************/
2145:
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable to enlarge; a companion
 *           variable named <buffer>_size must also be in scope
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * @buffer is reallocated to the new size with xmlRealloc().
 *
 * If the reallocation fails the error is reported with perror(), the
 * previous allocation is released and return(NULL) is executed in the
 * function that expanded the macro, so that function must be able to
 * return NULL.
 *
 * The result of xmlRealloc() is stored in a temporary first: assigning it
 * directly to @buffer would overwrite the only reference to the old block
 * and leak it on failure, since a failed reallocation leaves the old block
 * allocated.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *growBufferTmp;                                             \
    buffer##_size *= 2;                                                 \
    growBufferTmp = (xmlChar *) xmlRealloc(buffer,                      \
                                 buffer##_size * sizeof(xmlChar));     \
    if (growBufferTmp == NULL) {                                        \
        perror("realloc failed");                                       \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
    buffer = growBufferTmp;                                             \
}
2157:
2158: /**
2159: * sgmlEntityLookup:
2160: * @name: the entity name
2161: *
2162: * Lookup the given entity in EntitiesTable
2163: *
2164: * TODO: the linear scan is really ugly, an hash table is really needed.
2165: *
2166: * Returns the associated sgmlEntityDescPtr if found, NULL otherwise.
2167: */
2168: sgmlEntityDescPtr
2169: sgmlEntityLookup(const xmlChar *name) {
2170: int i;
2171:
2172: for (i = 0;i < (sizeof(docbookEntitiesTable)/
2173: sizeof(docbookEntitiesTable[0]));i++) {
2174: if (!xmlStrcmp(name, BAD_CAST docbookEntitiesTable[i].name)) {
2175: #ifdef DEBUG
2176: fprintf(stderr,"Found entity %s\n", name);
2177: #endif
2178: return(&docbookEntitiesTable[i]);
2179: }
2180: }
2181: return(NULL);
2182: }
2183:
2184: /**
2185: * sgmlEntityValueLookup:
2186: * @value: the entity's unicode value
2187: *
2188: * Lookup the given entity in EntitiesTable
2189: *
2190: * TODO: the linear scan is really ugly, an hash table is really needed.
2191: *
2192: * Returns the associated sgmlEntityDescPtr if found, NULL otherwise.
2193: */
2194: sgmlEntityDescPtr
2195: sgmlEntityValueLookup(int value) {
2196: int i;
2197: #ifdef DEBUG
2198: int lv = 0;
2199: #endif
2200:
2201: for (i = 0;i < (sizeof(docbookEntitiesTable)/
2202: sizeof(docbookEntitiesTable[0]));i++) {
2203: if (docbookEntitiesTable[i].value >= value) {
2204: if (docbookEntitiesTable[i].value > value)
2205: break;
2206: #ifdef DEBUG
2207: fprintf(stderr,"Found entity %s\n", docbookEntitiesTable[i].name);
2208: #endif
2209: return(&docbookEntitiesTable[i]);
2210: }
2211: #ifdef DEBUG
2212: if (lv > docbookEntitiesTable[i].value) {
2213: fprintf(stderr, "docbookEntitiesTable[] is not sorted (%d > %d)!\n",
2214: lv, docbookEntitiesTable[i].value);
2215: }
2216: lv = docbookEntitiesTable[i].value;
2217: #endif
2218: }
2219: return(NULL);
2220: }
2221:
2222: /**
2223: * UTF8ToSgml:
2224: * @out: a pointer to an array of bytes to store the result
2225: * @outlen: the length of @out
2226: * @in: a pointer to an array of UTF-8 chars
2227: * @inlen: the length of @in
2228: *
2229: * Take a block of UTF-8 chars in and try to convert it to an ASCII
2230: * plus SGML entities block of chars out.
2231: *
2232: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2233: * The value of @inlen after return is the number of octets consumed
2234: * as the return value is positive, else unpredictiable.
2235: * The value of @outlen after return is the number of octets consumed.
2236: */
2237: int
2238: UTF8ToSgml(unsigned char* out, int *outlen,
2239: const unsigned char* in, int *inlen) {
2240: const unsigned char* processed = in;
2241: const unsigned char* outend;
2242: const unsigned char* outstart = out;
2243: const unsigned char* instart = in;
2244: const unsigned char* inend;
2245: unsigned int c, d;
2246: int trailing;
2247:
2248: if (in == NULL) {
2249: /*
2250: * initialization nothing to do
2251: */
2252: *outlen = 0;
2253: *inlen = 0;
2254: return(0);
2255: }
2256: inend = in + (*inlen);
2257: outend = out + (*outlen);
2258: while (in < inend) {
2259: d = *in++;
2260: if (d < 0x80) { c= d; trailing= 0; }
2261: else if (d < 0xC0) {
2262: /* trailing byte in leading position */
2263: *outlen = out - outstart;
2264: *inlen = processed - instart;
2265: return(-2);
2266: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2267: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2268: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2269: else {
2270: /* no chance for this in Ascii */
2271: *outlen = out - outstart;
2272: *inlen = processed - instart;
2273: return(-2);
2274: }
2275:
2276: if (inend - in < trailing) {
2277: break;
2278: }
2279:
2280: for ( ; trailing; trailing--) {
2281: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2282: break;
2283: c <<= 6;
2284: c |= d & 0x3F;
2285: }
2286:
2287: /* assertion: c is a single UTF-4 value */
2288: if (c < 0x80) {
2289: if (out + 1 >= outend)
2290: break;
2291: *out++ = c;
2292: } else {
2293: int len;
2294: sgmlEntityDescPtr ent;
2295:
2296: /*
2297: * Try to lookup a predefined SGML entity for it
2298: */
2299:
2300: ent = sgmlEntityValueLookup(c);
2301: if (ent == NULL) {
2302: /* no chance for this in Ascii */
2303: *outlen = out - outstart;
2304: *inlen = processed - instart;
2305: return(-2);
2306: }
2307: len = strlen(ent->name);
2308: if (out + 2 + len >= outend)
2309: break;
2310: *out++ = '&';
2311: memcpy(out, ent->name, len);
2312: out += len;
2313: *out++ = ';';
2314: }
2315: processed = in;
2316: }
2317: *outlen = out - outstart;
2318: *inlen = processed - instart;
2319: return(0);
2320: }
2321:
2322: /**
2323: * sgmlEncodeEntities:
2324: * @out: a pointer to an array of bytes to store the result
2325: * @outlen: the length of @out
2326: * @in: a pointer to an array of UTF-8 chars
2327: * @inlen: the length of @in
2328: * @quoteChar: the quote character to escape (' or ") or zero.
2329: *
2330: * Take a block of UTF-8 chars in and try to convert it to an ASCII
2331: * plus SGML entities block of chars out.
2332: *
2333: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2334: * The value of @inlen after return is the number of octets consumed
2335: * as the return value is positive, else unpredictiable.
2336: * The value of @outlen after return is the number of octets consumed.
2337: */
2338: int
2339: sgmlEncodeEntities(unsigned char* out, int *outlen,
2340: const unsigned char* in, int *inlen, int quoteChar) {
2341: const unsigned char* processed = in;
2342: const unsigned char* outend = out + (*outlen);
2343: const unsigned char* outstart = out;
2344: const unsigned char* instart = in;
2345: const unsigned char* inend = in + (*inlen);
2346: unsigned int c, d;
2347: int trailing;
2348:
2349: while (in < inend) {
2350: d = *in++;
2351: if (d < 0x80) { c= d; trailing= 0; }
2352: else if (d < 0xC0) {
2353: /* trailing byte in leading position */
2354: *outlen = out - outstart;
2355: *inlen = processed - instart;
2356: return(-2);
2357: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2358: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2359: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2360: else {
2361: /* no chance for this in Ascii */
2362: *outlen = out - outstart;
2363: *inlen = processed - instart;
2364: return(-2);
2365: }
2366:
2367: if (inend - in < trailing)
2368: break;
2369:
2370: while (trailing--) {
2371: if (((d= *in++) & 0xC0) != 0x80) {
2372: *outlen = out - outstart;
2373: *inlen = processed - instart;
2374: return(-2);
2375: }
2376: c <<= 6;
2377: c |= d & 0x3F;
2378: }
2379:
2380: /* assertion: c is a single UTF-4 value */
2381: if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
2382: if (out >= outend)
2383: break;
2384: *out++ = c;
2385: } else {
2386: sgmlEntityDescPtr ent;
2387: const char *cp;
2388: char nbuf[16];
2389: int len;
2390:
2391: /*
2392: * Try to lookup a predefined SGML entity for it
2393: */
2394: ent = sgmlEntityValueLookup(c);
2395: if (ent == NULL) {
2396: sprintf(nbuf, "#%u", c);
2397: cp = nbuf;
2398: }
2399: else
2400: cp = ent->name;
2401: len = strlen(cp);
2402: if (out + 2 + len > outend)
2403: break;
2404: *out++ = '&';
2405: memcpy(out, cp, len);
2406: out += len;
2407: *out++ = ';';
2408: }
2409: processed = in;
2410: }
2411: *outlen = out - outstart;
2412: *inlen = processed - instart;
2413: return(0);
2414: }
2415:
2416: /**
2417: * sgmlDecodeEntities:
2418: * @ctxt: the parser context
2419: * @len: the len to decode (in bytes !), -1 for no size limit
2420: * @end: an end marker xmlChar, 0 if none
2421: * @end2: an end marker xmlChar, 0 if none
2422: * @end3: an end marker xmlChar, 0 if none
2423: *
2424: * Subtitute the SGML entities by their value
2425: *
2426: * DEPRECATED !!!!
2427: *
2428: * Returns A newly allocated string with the substitution done. The caller
2429: * must deallocate it !
2430: */
2431: xmlChar *
2432: sgmlDecodeEntities(sgmlParserCtxtPtr ctxt, int len,
2433: xmlChar end, xmlChar end2, xmlChar end3) {
2434: xmlChar *name = NULL;
2435: xmlChar *buffer = NULL;
2436: unsigned int buffer_size = 0;
2437: unsigned int nbchars = 0;
2438: sgmlEntityDescPtr ent;
2439: unsigned int max = (unsigned int) len;
2440: int c,l;
2441:
2442: if (ctxt->depth > 40) {
2443: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2444: ctxt->sax->error(ctxt->userData,
2445: "Detected entity reference loop\n");
2446: ctxt->wellFormed = 0;
2447: ctxt->disableSAX = 1;
2448: ctxt->errNo = XML_ERR_ENTITY_LOOP;
2449: return(NULL);
2450: }
2451:
2452: /*
2453: * allocate a translation buffer.
2454: */
2455: buffer_size = SGML_PARSER_BIG_BUFFER_SIZE;
2456: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2457: if (buffer == NULL) {
2458: perror("xmlDecodeEntities: malloc failed");
2459: return(NULL);
2460: }
2461:
2462: /*
2463: * Ok loop until we reach one of the ending char or a size limit.
2464: */
2465: c = CUR_CHAR(l);
2466: while ((nbchars < max) && (c != end) &&
2467: (c != end2) && (c != end3)) {
2468:
2469: if (c == 0) break;
2470: if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
2471: int val = sgmlParseCharRef(ctxt);
2472: COPY_BUF(0,buffer,nbchars,val);
2473: NEXTL(l);
2474: } else if ((c == '&') && (ctxt->token != '&')) {
2475: ent = sgmlParseEntityRef(ctxt, &name);
2476: if (name != NULL) {
2477: if (ent != NULL) {
2478: int val = ent->value;
2479: COPY_BUF(0,buffer,nbchars,val);
2480: NEXTL(l);
2481: } else {
2482: const xmlChar *cur = name;
2483:
2484: buffer[nbchars++] = '&';
2485: if (nbchars > buffer_size - SGML_PARSER_BUFFER_SIZE) {
2486: growBuffer(buffer);
2487: }
2488: while (*cur != 0) {
2489: buffer[nbchars++] = *cur++;
2490: }
2491: buffer[nbchars++] = ';';
2492: }
2493: }
2494: } else {
2495: COPY_BUF(l,buffer,nbchars,c);
2496: NEXTL(l);
2497: if (nbchars > buffer_size - SGML_PARSER_BUFFER_SIZE) {
2498: growBuffer(buffer);
2499: }
2500: }
2501: c = CUR_CHAR(l);
2502: }
2503: buffer[nbchars++] = 0;
2504: return(buffer);
2505: }
2506:
2507: /************************************************************************
2508: * *
2509: * Commodity functions to handle streams *
2510: * *
2511: ************************************************************************/
2512:
2513: /**
2514: * sgmlFreeInputStream:
2515: * @input: an sgmlParserInputPtr
2516: *
2517: * Free up an input stream.
2518: */
2519: void
2520: sgmlFreeInputStream(sgmlParserInputPtr input) {
2521: if (input == NULL) return;
2522:
2523: if (input->filename != NULL) xmlFree((char *) input->filename);
2524: if (input->directory != NULL) xmlFree((char *) input->directory);
2525: if ((input->free != NULL) && (input->base != NULL))
2526: input->free((xmlChar *) input->base);
2527: if (input->buf != NULL)
2528: xmlFreeParserInputBuffer(input->buf);
2529: memset(input, -1, sizeof(sgmlParserInput));
2530: xmlFree(input);
2531: }
2532:
2533: /**
2534: * sgmlNewInputStream:
2535: * @ctxt: an SGML parser context
2536: *
2537: * Create a new input stream structure
2538: * Returns the new input stream or NULL
2539: */
2540: sgmlParserInputPtr
2541: sgmlNewInputStream(sgmlParserCtxtPtr ctxt) {
2542: sgmlParserInputPtr input;
2543:
2544: input = (xmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
2545: if (input == NULL) {
2546: ctxt->errNo = XML_ERR_NO_MEMORY;
2547: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2548: ctxt->sax->error(ctxt->userData,
2549: "malloc: couldn't allocate a new input stream\n");
2550: ctxt->errNo = XML_ERR_NO_MEMORY;
2551: return(NULL);
2552: }
2553: memset(input, 0, sizeof(sgmlParserInput));
2554: input->filename = NULL;
2555: input->directory = NULL;
2556: input->base = NULL;
2557: input->cur = NULL;
2558: input->buf = NULL;
2559: input->line = 1;
2560: input->col = 1;
2561: input->buf = NULL;
2562: input->free = NULL;
2563: input->version = NULL;
2564: input->consumed = 0;
2565: input->length = 0;
2566: return(input);
2567: }
2568:
2569:
2570: /************************************************************************
2571: * *
2572: * Commodity functions, cleanup needed ? *
2573: * *
2574: ************************************************************************/
2575:
2576: /**
2577: * areBlanks:
2578: * @ctxt: an SGML parser context
2579: * @str: a xmlChar *
2580: * @len: the size of @str
2581: *
2582: * Is this a sequence of blank chars that one can ignore ?
2583: *
2584: * Returns 1 if ignorable 0 otherwise.
2585: */
2586:
2587: static int areBlanks(sgmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2588: int i;
2589: xmlNodePtr lastChild;
2590:
2591: for (i = 0;i < len;i++)
2592: if (!(IS_BLANK(str[i]))) return(0);
2593:
2594: if (CUR == 0) return(1);
2595: if (CUR != '<') return(0);
2596: if (ctxt->name == NULL)
2597: return(1);
2598: #if 0
2599: if (!xmlStrcmp(ctxt->name, BAD_CAST"sgml"))
2600: return(1);
2601: if (!xmlStrcmp(ctxt->name, BAD_CAST"head"))
2602: return(1);
2603: if (!xmlStrcmp(ctxt->name, BAD_CAST"body"))
2604: return(1);
2605: #endif
2606: if (ctxt->node == NULL) return(0);
2607: lastChild = xmlGetLastChild(ctxt->node);
2608: if (lastChild == NULL) {
2609: if (ctxt->node->content != NULL) return(0);
2610: } else if (xmlNodeIsText(lastChild))
2611: return(0);
2612: return(1);
2613: }
2614:
2615: /**
2616: * sgmlHandleEntity:
2617: * @ctxt: an SGML parser context
2618: * @entity: an XML entity pointer.
2619: *
2620: * Default handling of an SGML entity, call the parser with the
2621: * substitution string
2622: */
2623:
2624: void
2625: sgmlHandleEntity(sgmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
2626: int len;
2627:
2628: if (entity->content == NULL) {
2629: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2630: ctxt->sax->error(ctxt->userData, "sgmlHandleEntity %s: content == NULL\n",
2631: entity->name);
2632: ctxt->wellFormed = 0;
2633: return;
2634: }
2635: len = xmlStrlen(entity->content);
2636:
2637: /*
2638: * Just handle the content as a set of chars.
2639: */
2640: sgmlCheckParagraph(ctxt);
2641: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2642: ctxt->sax->characters(ctxt->userData, entity->content, len);
2643:
2644: }
2645:
2646: /**
2647: * sgmlNewDocNoDtD:
2648: * @URI: URI for the dtd, or NULL
2649: * @ExternalID: the external ID of the DTD, or NULL
2650: *
2651: * Returns a new document, do not intialize the DTD if not provided
2652: */
2653: sgmlDocPtr
2654: sgmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2655: xmlDocPtr cur;
2656:
2657: /*
2658: * Allocate a new document and fill the fields.
2659: */
2660: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2661: if (cur == NULL) {
2662: fprintf(stderr, "xmlNewDoc : malloc failed\n");
2663: return(NULL);
2664: }
2665: memset(cur, 0, sizeof(xmlDoc));
2666:
2667: cur->type = XML_SGML_DOCUMENT_NODE;
2668: cur->version = NULL;
2669: cur->intSubset = NULL;
2670: if ((ExternalID != NULL) ||
2671: (URI != NULL))
2672: xmlCreateIntSubset(cur, BAD_CAST "SGML", ExternalID, URI);
2673: cur->doc = cur;
2674: cur->name = NULL;
2675: cur->children = NULL;
2676: cur->extSubset = NULL;
2677: cur->oldNs = NULL;
2678: cur->encoding = NULL;
2679: cur->standalone = 1;
2680: cur->compression = 0;
2681: cur->ids = NULL;
2682: cur->refs = NULL;
2683: #ifndef XML_WITHOUT_CORBA
2684: cur->_private = NULL;
2685: #endif
2686: return(cur);
2687: }
2688:
2689: /**
2690: * sgmlNewDoc:
2691: * @URI: URI for the dtd, or NULL
2692: * @ExternalID: the external ID of the DTD, or NULL
2693: *
2694: * Returns a new document
2695: */
2696: sgmlDocPtr
2697: sgmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2698: if ((URI == NULL) && (ExternalID == NULL))
2699: return(sgmlNewDocNoDtD(
2700: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
2701: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd"));
2702:
2703: return(sgmlNewDocNoDtD(URI, ExternalID));
2704: }
2705:
2706:
2707: /************************************************************************
2708: * *
2709: * The parser itself *
2710: * Relates to http://www.w3.org/TR/docbook *
2711: * *
2712: ************************************************************************/
2713:
2714: /************************************************************************
2715: * *
2716: * The parser itself *
2717: * *
2718: ************************************************************************/
2719:
2720: /**
2721: * sgmlParseSGMLName:
2722: * @ctxt: an SGML parser context
2723: *
2724: * parse an SGML tag or attribute name, note that we convert it to lowercase
2725: * since SGML names are not case-sensitive.
2726: *
2727: * Returns the Tag Name parsed or NULL
2728: */
2729:
2730: xmlChar *
2731: sgmlParseSGMLName(sgmlParserCtxtPtr ctxt) {
2732: xmlChar *ret = NULL;
2733: int i = 0;
2734: xmlChar loc[SGML_PARSER_BUFFER_SIZE];
2735:
2736: if (!IS_LETTER(CUR) && (CUR != '_') &&
2737: (CUR != ':')) return(NULL);
2738:
2739: while ((i < SGML_PARSER_BUFFER_SIZE) &&
2740: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2741: (CUR == ':') || (CUR == '_'))) {
2742: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2743: else loc[i] = CUR;
2744: i++;
2745:
2746: NEXT;
2747: }
2748:
2749: ret = xmlStrndup(loc, i);
2750:
2751: return(ret);
2752: }
2753:
2754: /**
2755: * sgmlParseName:
2756: * @ctxt: an SGML parser context
2757: *
2758: * parse an SGML name, this routine is case sensistive.
2759: *
2760: * Returns the Name parsed or NULL
2761: */
2762:
2763: xmlChar *
2764: sgmlParseName(sgmlParserCtxtPtr ctxt) {
2765: xmlChar buf[SGML_MAX_NAMELEN];
2766: int len = 0;
2767:
2768: GROW;
2769: if (!IS_LETTER(CUR) && (CUR != '_')) {
2770: return(NULL);
2771: }
2772:
2773: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2774: (CUR == '.') || (CUR == '-') ||
2775: (CUR == '_') || (CUR == ':') ||
2776: (IS_COMBINING(CUR)) ||
2777: (IS_EXTENDER(CUR))) {
2778: buf[len++] = CUR;
2779: NEXT;
2780: if (len >= SGML_MAX_NAMELEN) {
2781: fprintf(stderr,
2782: "sgmlParseName: reached SGML_MAX_NAMELEN limit\n");
2783: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2784: (CUR == '.') || (CUR == '-') ||
2785: (CUR == '_') || (CUR == ':') ||
2786: (IS_COMBINING(CUR)) ||
2787: (IS_EXTENDER(CUR)))
2788: NEXT;
2789: break;
2790: }
2791: }
2792: return(xmlStrndup(buf, len));
2793: }
2794:
2795: /**
2796: * sgmlParseSGMLAttribute:
2797: * @ctxt: an SGML parser context
2798: * @stop: a char stop value
2799: *
2800: * parse an SGML attribute value till the stop (quote), if
2801: * stop is 0 then it stops at the first space
2802: *
2803: * Returns the attribute parsed or NULL
2804: */
2805:
2806: xmlChar *
2807: sgmlParseSGMLAttribute(sgmlParserCtxtPtr ctxt, const xmlChar stop) {
2808: #if 0
2809: xmlChar buf[SGML_MAX_NAMELEN];
2810: int len = 0;
2811:
2812: GROW;
2813: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
2814: if ((stop == 0) && (IS_BLANK(CUR))) break;
2815: buf[len++] = CUR;
2816: NEXT;
2817: if (len >= SGML_MAX_NAMELEN) {
2818: fprintf(stderr,
2819: "sgmlParseSGMLAttribute: reached SGML_MAX_NAMELEN limit\n");
2820: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
2821: (CUR != '>') &&
2822: (CUR != '\'') && (CUR != '"'))
2823: NEXT;
2824: break;
2825: }
2826: }
2827: return(xmlStrndup(buf, len));
2828: #else
2829: xmlChar *buffer = NULL;
2830: int buffer_size = 0;
2831: xmlChar *out = NULL;
2832: xmlChar *name = NULL;
2833:
2834: xmlChar *cur = NULL;
2835: sgmlEntityDescPtr ent;
2836:
2837: /*
2838: * allocate a translation buffer.
2839: */
2840: buffer_size = SGML_PARSER_BIG_BUFFER_SIZE;
2841: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2842: if (buffer == NULL) {
2843: perror("sgmlParseSGMLAttribute: malloc failed");
2844: return(NULL);
2845: }
2846: out = buffer;
2847:
2848: /*
2849: * Ok loop until we reach one of the ending chars
2850: */
2851: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
2852: if ((stop == 0) && (IS_BLANK(CUR))) break;
2853: if (CUR == '&') {
2854: if (NXT(1) == '#') {
2855: unsigned int c;
2856: int bits;
2857:
2858: c = sgmlParseCharRef(ctxt);
2859: if (c < 0x80)
2860: { *out++ = c; bits= -6; }
2861: else if (c < 0x800)
2862: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2863: else if (c < 0x10000)
2864: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2865: else
2866: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2867:
2868: for ( ; bits >= 0; bits-= 6) {
2869: *out++ = ((c >> bits) & 0x3F) | 0x80;
2870: }
2871: } else {
2872: ent = sgmlParseEntityRef(ctxt, &name);
2873: if (name == NULL) {
2874: *out++ = '&';
2875: if (out - buffer > buffer_size - 100) {
2876: int index = out - buffer;
2877:
2878: growBuffer(buffer);
2879: out = &buffer[index];
2880: }
2881: } else if (ent == NULL) {
2882: *out++ = '&';
2883: cur = name;
2884: while (*cur != 0) {
2885: if (out - buffer > buffer_size - 100) {
2886: int index = out - buffer;
2887:
2888: growBuffer(buffer);
2889: out = &buffer[index];
2890: }
2891: *out++ = *cur++;
2892: }
2893: xmlFree(name);
2894: } else {
2895: unsigned int c;
2896: int bits;
2897:
2898: if (out - buffer > buffer_size - 100) {
2899: int index = out - buffer;
2900:
2901: growBuffer(buffer);
2902: out = &buffer[index];
2903: }
2904: c = (xmlChar)ent->value;
2905: if (c < 0x80)
2906: { *out++ = c; bits= -6; }
2907: else if (c < 0x800)
2908: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2909: else if (c < 0x10000)
2910: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2911: else
2912: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2913:
2914: for ( ; bits >= 0; bits-= 6) {
2915: *out++ = ((c >> bits) & 0x3F) | 0x80;
2916: }
2917: xmlFree(name);
2918: }
2919: }
2920: } else {
2921: unsigned int c;
2922: int bits;
2923:
2924: if (out - buffer > buffer_size - 100) {
2925: int index = out - buffer;
2926:
2927: growBuffer(buffer);
2928: out = &buffer[index];
2929: }
2930: c = CUR;
2931: if (c < 0x80)
2932: { *out++ = c; bits= -6; }
2933: else if (c < 0x800)
2934: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2935: else if (c < 0x10000)
2936: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2937: else
2938: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2939:
2940: for ( ; bits >= 0; bits-= 6) {
2941: *out++ = ((c >> bits) & 0x3F) | 0x80;
2942: }
2943: NEXT;
2944: }
2945: }
2946: *out++ = 0;
2947: return(buffer);
2948: #endif
2949: }
2950:
2951: /**
2952: * sgmlParseNmtoken:
2953: * @ctxt: an SGML parser context
2954: *
2955: * parse an SGML Nmtoken.
2956: *
2957: * Returns the Nmtoken parsed or NULL
2958: */
2959:
2960: xmlChar *
2961: sgmlParseNmtoken(sgmlParserCtxtPtr ctxt) {
2962: xmlChar buf[SGML_MAX_NAMELEN];
2963: int len = 0;
2964:
2965: GROW;
2966: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2967: (CUR == '.') || (CUR == '-') ||
2968: (CUR == '_') || (CUR == ':') ||
2969: (IS_COMBINING(CUR)) ||
2970: (IS_EXTENDER(CUR))) {
2971: buf[len++] = CUR;
2972: NEXT;
2973: if (len >= SGML_MAX_NAMELEN) {
2974: fprintf(stderr,
2975: "sgmlParseNmtoken: reached SGML_MAX_NAMELEN limit\n");
2976: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2977: (CUR == '.') || (CUR == '-') ||
2978: (CUR == '_') || (CUR == ':') ||
2979: (IS_COMBINING(CUR)) ||
2980: (IS_EXTENDER(CUR)))
2981: NEXT;
2982: break;
2983: }
2984: }
2985: return(xmlStrndup(buf, len));
2986: }
2987:
2988: /**
2989: * sgmlParseEntityRef:
2990: * @ctxt: an SGML parser context
2991: * @str: location to store the entity name
2992: *
2993: * parse an SGML ENTITY references
2994: *
2995: * [68] EntityRef ::= '&' Name ';'
2996: *
2997: * Returns the associated sgmlEntityDescPtr if found, or NULL otherwise,
2998: * if non-NULL *str will have to be freed by the caller.
2999: */
3000: sgmlEntityDescPtr
3001: sgmlParseEntityRef(sgmlParserCtxtPtr ctxt, xmlChar **str) {
3002: xmlChar *name;
3003: sgmlEntityDescPtr ent = NULL;
3004: *str = NULL;
3005:
3006: if (CUR == '&') {
3007: NEXT;
3008: name = sgmlParseName(ctxt);
3009: if (name == NULL) {
3010: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3011: ctxt->sax->error(ctxt->userData, "sgmlParseEntityRef: no name\n");
3012: ctxt->wellFormed = 0;
3013: } else {
3014: GROW;
3015: if (CUR == ';') {
3016: *str = name;
3017:
3018: /*
3019: * Lookup the entity in the table.
3020: */
3021: ent = sgmlEntityLookup(name);
3022: if (ent != NULL) /* OK that's ugly !!! */
3023: NEXT;
3024: } else {
3025: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3026: ctxt->sax->error(ctxt->userData,
3027: "sgmlParseEntityRef: expecting ';'\n");
3028: *str = name;
3029: }
3030: }
3031: }
3032: return(ent);
3033: }
3034:
3035: /**
3036: * sgmlParseAttValue:
3037: * @ctxt: an SGML parser context
3038: *
3039: * parse a value for an attribute
3040: * Note: the parser won't do substitution of entities here, this
3041: * will be handled later in xmlStringGetNodeList, unless it was
3042: * asked for ctxt->replaceEntities != 0
3043: *
3044: * Returns the AttValue parsed or NULL.
3045: */
3046:
3047: xmlChar *
3048: sgmlParseAttValue(sgmlParserCtxtPtr ctxt) {
3049: xmlChar *ret = NULL;
3050:
3051: if (CUR == '"') {
3052: NEXT;
3053: ret = sgmlParseSGMLAttribute(ctxt, '"');
3054: if (CUR != '"') {
3055: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3056: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
3057: ctxt->wellFormed = 0;
3058: } else
3059: NEXT;
3060: } else if (CUR == '\'') {
3061: NEXT;
3062: ret = sgmlParseSGMLAttribute(ctxt, '\'');
3063: if (CUR != '\'') {
3064: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3065: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
3066: ctxt->wellFormed = 0;
3067: } else
3068: NEXT;
3069: } else {
3070: /*
3071: * That's an SGMLism, the attribute value may not be quoted
3072: */
3073: ret = sgmlParseSGMLAttribute(ctxt, 0);
3074: if (ret == NULL) {
3075: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3076: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
3077: ctxt->wellFormed = 0;
3078: }
3079: }
3080: return(ret);
3081: }
3082:
3083: /**
3084: * sgmlParseSystemLiteral:
3085: * @ctxt: an SGML parser context
3086: *
3087: * parse an SGML Literal
3088: *
3089: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
3090: *
3091: * Returns the SystemLiteral parsed or NULL
3092: */
3093:
3094: xmlChar *
3095: sgmlParseSystemLiteral(sgmlParserCtxtPtr ctxt) {
3096: const xmlChar *q;
3097: xmlChar *ret = NULL;
3098:
3099: if (CUR == '"') {
3100: NEXT;
3101: q = CUR_PTR;
3102: while ((IS_CHAR(CUR)) && (CUR != '"'))
3103: NEXT;
3104: if (!IS_CHAR(CUR)) {
3105: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3106: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
3107: ctxt->wellFormed = 0;
3108: } else {
3109: ret = xmlStrndup(q, CUR_PTR - q);
3110: NEXT;
3111: }
3112: } else if (CUR == '\'') {
3113: NEXT;
3114: q = CUR_PTR;
3115: while ((IS_CHAR(CUR)) && (CUR != '\''))
3116: NEXT;
3117: if (!IS_CHAR(CUR)) {
3118: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3119: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
3120: ctxt->wellFormed = 0;
3121: } else {
3122: ret = xmlStrndup(q, CUR_PTR - q);
3123: NEXT;
3124: }
3125: } else {
3126: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3127: ctxt->sax->error(ctxt->userData,
3128: "SystemLiteral \" or ' expected\n");
3129: ctxt->wellFormed = 0;
3130: }
3131:
3132: return(ret);
3133: }
3134:
3135: /**
3136: * sgmlParsePubidLiteral:
3137: * @ctxt: an SGML parser context
3138: *
3139: * parse an SGML public literal
3140: *
3141: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3142: *
3143: * Returns the PubidLiteral parsed or NULL.
3144: */
3145:
3146: xmlChar *
3147: sgmlParsePubidLiteral(sgmlParserCtxtPtr ctxt) {
3148: const xmlChar *q;
3149: xmlChar *ret = NULL;
3150: /*
3151: * Name ::= (Letter | '_') (NameChar)*
3152: */
3153: if (CUR == '"') {
3154: NEXT;
3155: q = CUR_PTR;
3156: while (IS_PUBIDCHAR(CUR)) NEXT;
3157: if (CUR != '"') {
3158: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3159: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
3160: ctxt->wellFormed = 0;
3161: } else {
3162: ret = xmlStrndup(q, CUR_PTR - q);
3163: NEXT;
3164: }
3165: } else if (CUR == '\'') {
3166: NEXT;
3167: q = CUR_PTR;
3168: while ((IS_LETTER(CUR)) && (CUR != '\''))
3169: NEXT;
3170: if (!IS_LETTER(CUR)) {
3171: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3172: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
3173: ctxt->wellFormed = 0;
3174: } else {
3175: ret = xmlStrndup(q, CUR_PTR - q);
3176: NEXT;
3177: }
3178: } else {
3179: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3180: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
3181: ctxt->wellFormed = 0;
3182: }
3183:
3184: return(ret);
3185: }
3186:
3187: /**
3188: * sgmlParseCharData:
3189: * @ctxt: an SGML parser context
3190: * @cdata: int indicating whether we are within a CDATA section
3191: *
3192: * parse a CharData section.
3193: * if we are within a CDATA section ']]>' marks an end of section.
3194: *
3195: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3196: */
3197:
3198: void
3199: sgmlParseCharData(sgmlParserCtxtPtr ctxt, int cdata) {
3200: xmlChar buf[SGML_PARSER_BIG_BUFFER_SIZE + 5];
3201: int nbchar = 0;
3202: int cur, l;
3203:
3204: SHRINK;
3205: cur = CUR_CHAR(l);
3206: while (((cur != '<') || (ctxt->token == '<')) &&
3207: ((cur != '&') || (ctxt->token == '&')) &&
3208: (IS_CHAR(cur))) {
3209: COPY_BUF(l,buf,nbchar,cur);
3210: if (nbchar >= SGML_PARSER_BIG_BUFFER_SIZE) {
3211: /*
3212: * Ok the segment is to be consumed as chars.
3213: */
3214: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3215: if (areBlanks(ctxt, buf, nbchar)) {
3216: if (ctxt->sax->ignorableWhitespace != NULL)
3217: ctxt->sax->ignorableWhitespace(ctxt->userData,
3218: buf, nbchar);
3219: } else {
3220: sgmlCheckParagraph(ctxt);
3221: if (ctxt->sax->characters != NULL)
3222: ctxt->sax->characters(ctxt->userData, buf, nbchar);
3223: }
3224: }
3225: nbchar = 0;
3226: }
3227: NEXTL(l);
3228: cur = CUR_CHAR(l);
3229: }
3230: if (nbchar != 0) {
3231: /*
3232: * Ok the segment is to be consumed as chars.
3233: */
3234: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3235: if (areBlanks(ctxt, buf, nbchar)) {
3236: if (ctxt->sax->ignorableWhitespace != NULL)
3237: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
3238: } else {
3239: sgmlCheckParagraph(ctxt);
3240: if (ctxt->sax->characters != NULL)
3241: ctxt->sax->characters(ctxt->userData, buf, nbchar);
3242: }
3243: }
3244: }
3245: }
3246:
3247: /**
3248: * sgmlParseExternalID:
3249: * @ctxt: an SGML parser context
3250: * @publicID: a xmlChar** receiving PubidLiteral
3251: * @strict: indicate whether we should restrict parsing to only
3252: * production [75], see NOTE below
3253: *
3254: * Parse an External ID or a Public ID
3255: *
3256: * NOTE: Productions [75] and [83] interract badly since [75] can generate
3257: * 'PUBLIC' S PubidLiteral S SystemLiteral
3258: *
3259: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3260: * | 'PUBLIC' S PubidLiteral S SystemLiteral
3261: *
3262: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3263: *
3264: * Returns the function returns SystemLiteral and in the second
3265: * case publicID receives PubidLiteral, is strict is off
3266: * it is possible to return NULL and have publicID set.
3267: */
3268:
3269: xmlChar *
3270: sgmlParseExternalID(sgmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
3271: xmlChar *URI = NULL;
3272:
3273: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3274: (UPP(2) == 'S') && (UPP(3) == 'T') &&
3275: (UPP(4) == 'E') && (UPP(5) == 'M')) {
3276: SKIP(6);
3277: if (!IS_BLANK(CUR)) {
3278: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3279: ctxt->sax->error(ctxt->userData,
3280: "Space required after 'SYSTEM'\n");
3281: ctxt->wellFormed = 0;
3282: }
3283: SKIP_BLANKS;
3284: URI = sgmlParseSystemLiteral(ctxt);
3285: if (URI == NULL) {
3286: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3287: ctxt->sax->error(ctxt->userData,
3288: "sgmlParseExternalID: SYSTEM, no URI\n");
3289: ctxt->wellFormed = 0;
3290: }
3291: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3292: (UPP(2) == 'B') && (UPP(3) == 'L') &&
3293: (UPP(4) == 'I') && (UPP(5) == 'C')) {
3294: SKIP(6);
3295: if (!IS_BLANK(CUR)) {
3296: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3297: ctxt->sax->error(ctxt->userData,
3298: "Space required after 'PUBLIC'\n");
3299: ctxt->wellFormed = 0;
3300: }
3301: SKIP_BLANKS;
3302: *publicID = sgmlParsePubidLiteral(ctxt);
3303: if (*publicID == NULL) {
3304: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3305: ctxt->sax->error(ctxt->userData,
3306: "sgmlParseExternalID: PUBLIC, no Public Identifier\n");
3307: ctxt->wellFormed = 0;
3308: }
3309: SKIP_BLANKS;
3310: if ((CUR == '"') || (CUR == '\'')) {
3311: URI = sgmlParseSystemLiteral(ctxt);
3312: }
3313: }
3314: return(URI);
3315: }
3316:
3317: /**
3318: * sgmlParseComment:
3319: * @ctxt: an SGML parser context
3320: *
3321: * Parse an XML (SGML) comment <!-- .... -->
3322: *
3323: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3324: */
3325: void
3326: sgmlParseComment(sgmlParserCtxtPtr ctxt) {
3327: xmlChar *buf = NULL;
3328: int len;
3329: int size = SGML_PARSER_BUFFER_SIZE;
3330: int q, ql;
3331: int r, rl;
3332: int cur, l;
3333: xmlParserInputState state;
3334:
3335: /*
3336: * Check that there is a comment right here.
3337: */
3338: if ((RAW != '<') || (NXT(1) != '!') ||
3339: (NXT(2) != '-') || (NXT(3) != '-')) return;
3340:
3341: state = ctxt->instate;
3342: ctxt->instate = XML_PARSER_COMMENT;
3343: SHRINK;
3344: SKIP(4);
3345: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
3346: if (buf == NULL) {
3347: fprintf(stderr, "malloc of %d byte failed\n", size);
3348: ctxt->instate = state;
3349: return;
3350: }
3351: q = CUR_CHAR(ql);
3352: NEXTL(ql);
3353: r = CUR_CHAR(rl);
3354: NEXTL(rl);
3355: cur = CUR_CHAR(l);
3356: len = 0;
3357: while (IS_CHAR(cur) &&
3358: ((cur != '>') ||
3359: (r != '-') || (q != '-'))) {
3360: if (len + 5 >= size) {
3361: size *= 2;
3362: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3363: if (buf == NULL) {
3364: fprintf(stderr, "realloc of %d byte failed\n", size);
3365: ctxt->instate = state;
3366: return;
3367: }
3368: }
3369: COPY_BUF(ql,buf,len,q);
3370: q = r;
3371: ql = rl;
3372: r = cur;
3373: rl = l;
3374: NEXTL(l);
3375: cur = CUR_CHAR(l);
3376: if (cur == 0) {
3377: SHRINK;
3378: GROW;
3379: cur = CUR_CHAR(l);
3380: }
3381: }
3382: buf[len] = 0;
3383: if (!IS_CHAR(cur)) {
3384: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3385: ctxt->sax->error(ctxt->userData,
3386: "Comment not terminated \n<!--%.50s\n", buf);
3387: ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
3388: ctxt->wellFormed = 0;
3389: xmlFree(buf);
3390: } else {
3391: NEXT;
3392: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3393: (!ctxt->disableSAX))
3394: ctxt->sax->comment(ctxt->userData, buf);
3395: xmlFree(buf);
3396: }
3397: ctxt->instate = state;
3398: }
3399:
3400: /**
3401: * sgmlParseCharRef:
3402: * @ctxt: an SGML parser context
3403: *
3404: * parse Reference declarations
3405: *
3406: * [66] CharRef ::= '&#' [0-9]+ ';' |
3407: * '&#x' [0-9a-fA-F]+ ';'
3408: *
3409: * Returns the value parsed (as an int)
3410: */
3411: int
3412: sgmlParseCharRef(sgmlParserCtxtPtr ctxt) {
3413: int val = 0;
3414:
3415: if ((CUR == '&') && (NXT(1) == '#') &&
3416: (NXT(2) == 'x')) {
3417: SKIP(3);
3418: while (CUR != ';') {
3419: if ((CUR >= '0') && (CUR <= '9'))
3420: val = val * 16 + (CUR - '0');
3421: else if ((CUR >= 'a') && (CUR <= 'f'))
3422: val = val * 16 + (CUR - 'a') + 10;
3423: else if ((CUR >= 'A') && (CUR <= 'F'))
3424: val = val * 16 + (CUR - 'A') + 10;
3425: else {
3426: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3427: ctxt->sax->error(ctxt->userData,
3428: "sgmlParseCharRef: invalid hexadecimal value\n");
3429: ctxt->wellFormed = 0;
3430: val = 0;
3431: break;
3432: }
3433: NEXT;
3434: }
3435: if (CUR == ';')
3436: NEXT;
3437: } else if ((CUR == '&') && (NXT(1) == '#')) {
3438: SKIP(2);
3439: while (CUR != ';') {
3440: if ((CUR >= '0') && (CUR <= '9'))
3441: val = val * 10 + (CUR - '0');
3442: else {
3443: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3444: ctxt->sax->error(ctxt->userData,
3445: "sgmlParseCharRef: invalid decimal value\n");
3446: ctxt->wellFormed = 0;
3447: val = 0;
3448: break;
3449: }
3450: NEXT;
3451: }
3452: if (CUR == ';')
3453: NEXT;
3454: } else {
3455: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3456: ctxt->sax->error(ctxt->userData, "sgmlParseCharRef: invalid value\n");
3457: ctxt->wellFormed = 0;
3458: }
3459: /*
3460: * Check the value IS_CHAR ...
3461: */
3462: if (IS_CHAR(val)) {
3463: return(val);
3464: } else {
3465: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3466: ctxt->sax->error(ctxt->userData, "sgmlParseCharRef: invalid xmlChar value %d\n",
3467: val);
3468: ctxt->wellFormed = 0;
3469: }
3470: return(0);
3471: }
3472:
3473:
3474: /**
3475: * sgmlParseDocTypeDecl :
3476: * @ctxt: an SGML parser context
3477: *
3478: * parse a DOCTYPE declaration
3479: *
3480: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3481: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3482: */
3483:
3484: void
3485: sgmlParseDocTypeDecl(sgmlParserCtxtPtr ctxt) {
3486: xmlChar *name;
3487: xmlChar *ExternalID = NULL;
3488: xmlChar *URI = NULL;
3489:
3490: /*
3491: * We know that '<!DOCTYPE' has been detected.
3492: */
3493: SKIP(9);
3494:
3495: SKIP_BLANKS;
3496:
3497: /*
3498: * Parse the DOCTYPE name.
3499: */
3500: name = sgmlParseName(ctxt);
3501: if (name == NULL) {
3502: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3503: ctxt->sax->error(ctxt->userData, "sgmlParseDocTypeDecl : no DOCTYPE name !\n");
3504: ctxt->wellFormed = 0;
3505: }
3506: /*
3507: * Check that upper(name) == "SGML" !!!!!!!!!!!!!
3508: */
3509:
3510: SKIP_BLANKS;
3511:
3512: /*
3513: * Check for SystemID and ExternalID
3514: */
3515: URI = sgmlParseExternalID(ctxt, &ExternalID, 0);
3516: SKIP_BLANKS;
3517:
3518: /*
1.2 veillard 3519: * Create or update the document accordingly to the DOCTYPE
3520: */
3521: if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3522: (!ctxt->disableSAX))
3523: ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3524:
3525: /*
3526: * Is there any internal subset declarations ?
3527: * they are handled separately in sgmlParseInternalSubset()
3528: */
3529: if (RAW == '[')
3530: return;
3531:
3532:
3533: /*
1.1 veillard 3534: * We should be at the end of the DOCTYPE declaration.
3535: */
3536: if (CUR != '>') {
3537: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3538: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
3539: ctxt->wellFormed = 0;
3540: /* We shouldn't try to resynchronize ... */
3541: }
3542: NEXT;
3543:
3544: /*
3545: * Cleanup, since we don't use all those identifiers
3546: */
3547: if (URI != NULL) xmlFree(URI);
3548: if (ExternalID != NULL) xmlFree(ExternalID);
3549: if (name != NULL) xmlFree(name);
3550: }
3551:
3552: /**
3553: * sgmlParseAttribute:
3554: * @ctxt: an SGML parser context
3555: * @value: a xmlChar ** used to store the value of the attribute
3556: *
3557: * parse an attribute
3558: *
3559: * [41] Attribute ::= Name Eq AttValue
3560: *
3561: * [25] Eq ::= S? '=' S?
3562: *
3563: * With namespace:
3564: *
3565: * [NS 11] Attribute ::= QName Eq AttValue
3566: *
3567: * Also the case QName == xmlns:??? is handled independently as a namespace
3568: * definition.
3569: *
3570: * Returns the attribute name, and the value in *value.
3571: */
3572:
3573: xmlChar *
3574: sgmlParseAttribute(sgmlParserCtxtPtr ctxt, xmlChar **value) {
3575: xmlChar *name, *val = NULL;
3576:
3577: *value = NULL;
3578: name = sgmlParseName(ctxt);
3579: if (name == NULL) {
3580: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3581: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3582: ctxt->wellFormed = 0;
3583: return(NULL);
3584: }
3585:
3586: /*
3587: * read the value
3588: */
3589: SKIP_BLANKS;
3590: if (CUR == '=') {
3591: NEXT;
3592: SKIP_BLANKS;
3593: val = sgmlParseAttValue(ctxt);
3594: /******
3595: } else {
3596: * TODO : some attribute must have values, some may not
3597: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3598: ctxt->sax->warning(ctxt->userData,
3599: "No value for attribute %s\n", name); */
3600: }
3601:
3602: *value = val;
3603: return(name);
3604: }
3605:
3606: /**
3607: * sgmlCheckEncoding:
3608: * @ctxt: an SGML parser context
3609: * @attvalue: the attribute value
3610: *
3611: * Checks an http-equiv attribute from a Meta tag to detect
3612: * the encoding
3613: * If a new encoding is detected the parser is switched to decode
3614: * it and pass UTF8
3615: */
3616: void
3617: sgmlCheckEncoding(sgmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3618: const xmlChar *encoding;
3619:
3620: if ((ctxt == NULL) || (attvalue == NULL))
3621: return;
3622:
3623: encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
3624: if (encoding == NULL)
3625: encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
3626: if (encoding == NULL)
3627: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
3628: if (encoding != NULL) {
3629: encoding += 8;
3630: } else {
3631: encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
3632: if (encoding == NULL)
3633: encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
3634: if (encoding == NULL)
3635: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
3636: if (encoding != NULL)
3637: encoding += 9;
3638: }
3639: if (encoding != NULL) {
3640: xmlCharEncoding enc;
3641: xmlCharEncodingHandlerPtr handler;
3642:
3643: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3644:
3645: if (ctxt->input->encoding != NULL)
3646: xmlFree((xmlChar *) ctxt->input->encoding);
3647: ctxt->input->encoding = xmlStrdup(encoding);
3648:
3649: enc = xmlParseCharEncoding((const char *) encoding);
3650: /*
3651: * registered set of known encodings
3652: */
3653: if (enc != XML_CHAR_ENCODING_ERROR) {
3654: xmlSwitchEncoding(ctxt, enc);
3655: ctxt->charset = XML_CHAR_ENCODING_UTF8;
3656: } else {
3657: /*
3658: * fallback for unknown encodings
3659: */
3660: handler = xmlFindCharEncodingHandler((const char *) encoding);
3661: if (handler != NULL) {
3662: xmlSwitchToEncoding(ctxt, handler);
3663: ctxt->charset = XML_CHAR_ENCODING_UTF8;
3664: } else {
3665: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3666: }
3667: }
3668:
3669: if ((ctxt->input->buf != NULL) &&
3670: (ctxt->input->buf->encoder != NULL) &&
3671: (ctxt->input->buf->raw != NULL) &&
3672: (ctxt->input->buf->buffer != NULL)) {
3673: int nbchars;
3674: int processed;
3675:
3676: /*
3677: * convert as much as possible to the parser reading buffer.
3678: */
3679: processed = ctxt->input->cur - ctxt->input->base;
3680: xmlBufferShrink(ctxt->input->buf->buffer, processed);
3681: nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3682: ctxt->input->buf->buffer,
3683: ctxt->input->buf->raw);
3684: if (nbchars < 0) {
3685: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3686: ctxt->sax->error(ctxt->userData,
3687: "sgmlCheckEncoding: encoder error\n");
3688: ctxt->errNo = XML_ERR_INVALID_ENCODING;
3689: }
3690: ctxt->input->base =
3691: ctxt->input->cur = ctxt->input->buf->buffer->content;
3692: }
3693: }
3694: }
3695:
3696: /**
3697: * sgmlCheckMeta:
3698: * @ctxt: an SGML parser context
3699: * @atts: the attributes values
3700: *
3701: * Checks an attributes from a Meta tag
3702: */
3703: void
3704: sgmlCheckMeta(sgmlParserCtxtPtr ctxt, const xmlChar **atts) {
3705: int i;
3706: const xmlChar *att, *value;
3707: int http = 0;
3708: const xmlChar *content = NULL;
3709:
3710: if ((ctxt == NULL) || (atts == NULL))
3711: return;
3712:
3713: i = 0;
3714: att = atts[i++];
3715: while (att != NULL) {
3716: value = atts[i++];
3717: if ((value != NULL) &&
3718: ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
3719: (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
3720: (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
3721: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
3722: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
3723: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
3724: http = 1;
3725: else if ((value != NULL) &&
3726: ((!xmlStrcmp(att, BAD_CAST"content")) ||
3727: (!xmlStrcmp(att, BAD_CAST"Content")) ||
3728: (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
3729: content = value;
3730: att = atts[i++];
3731: }
3732: if ((http) && (content != NULL))
3733: sgmlCheckEncoding(ctxt, content);
3734:
3735: }
3736:
3737: /**
3738: * sgmlParseStartTag:
3739: * @ctxt: an SGML parser context
3740: *
3741: * parse a start of tag either for rule element or
3742: * EmptyElement. In both case we don't parse the tag closing chars.
3743: *
3744: * [40] STag ::= '<' Name (S Attribute)* S? '>'
3745: *
3746: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3747: *
3748: * With namespace:
3749: *
3750: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3751: *
3752: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3753: *
3754: */
3755:
3756: void
3757: sgmlParseStartTag(sgmlParserCtxtPtr ctxt) {
3758: xmlChar *name;
3759: xmlChar *attname;
3760: xmlChar *attvalue;
3761: const xmlChar **atts = NULL;
3762: int nbatts = 0;
3763: int maxatts = 0;
3764: int meta = 0;
3765: int i;
3766:
3767: if (CUR != '<') return;
3768: NEXT;
3769:
3770: GROW;
3771: name = sgmlParseSGMLName(ctxt);
3772: if (name == NULL) {
3773: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3774: ctxt->sax->error(ctxt->userData,
3775: "sgmlParseStartTag: invalid element name\n");
3776: ctxt->wellFormed = 0;
3777: return;
3778: }
3779: if (!xmlStrcmp(name, BAD_CAST"meta"))
3780: meta = 1;
3781:
3782: /*
3783: * Check for auto-closure of SGML elements.
3784: */
3785: sgmlAutoClose(ctxt, name);
3786:
3787: /*
3788: * Check for implied SGML elements.
3789: */
3790: sgmlCheckImplied(ctxt, name);
3791:
3792: /*
3793: * Now parse the attributes, it ends up with the ending
3794: *
3795: * (S Attribute)* S?
3796: */
3797: SKIP_BLANKS;
3798: while ((IS_CHAR(CUR)) &&
3799: (CUR != '>') &&
3800: ((CUR != '/') || (NXT(1) != '>'))) {
3801: long cons = ctxt->nbChars;
3802:
3803: GROW;
3804: attname = sgmlParseAttribute(ctxt, &attvalue);
3805: if (attname != NULL) {
3806:
3807: /*
3808: * Well formedness requires at most one declaration of an attribute
3809: */
3810: for (i = 0; i < nbatts;i += 2) {
3811: if (!xmlStrcmp(atts[i], attname)) {
3812: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3813: ctxt->sax->error(ctxt->userData,
3814: "Attribute %s redefined\n",
3815: attname);
3816: ctxt->wellFormed = 0;
3817: xmlFree(attname);
3818: if (attvalue != NULL)
3819: xmlFree(attvalue);
3820: goto failed;
3821: }
3822: }
3823:
3824: /*
3825: * Add the pair to atts
3826: */
3827: if (atts == NULL) {
3828: maxatts = 10;
3829: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3830: if (atts == NULL) {
3831: fprintf(stderr, "malloc of %ld byte failed\n",
3832: maxatts * (long)sizeof(xmlChar *));
3833: if (name != NULL) xmlFree(name);
3834: return;
3835: }
3836: } else if (nbatts + 4 > maxatts) {
3837: maxatts *= 2;
3838: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
3839: if (atts == NULL) {
3840: fprintf(stderr, "realloc of %ld byte failed\n",
3841: maxatts * (long)sizeof(xmlChar *));
3842: if (name != NULL) xmlFree(name);
3843: return;
3844: }
3845: }
3846: atts[nbatts++] = attname;
3847: atts[nbatts++] = attvalue;
3848: atts[nbatts] = NULL;
3849: atts[nbatts + 1] = NULL;
3850: }
3851:
3852: failed:
3853: SKIP_BLANKS;
3854: if (cons == ctxt->nbChars) {
3855: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3856: ctxt->sax->error(ctxt->userData,
3857: "sgmlParseStartTag: problem parsing attributes\n");
3858: ctxt->wellFormed = 0;
3859: break;
3860: }
3861: }
3862:
3863: /*
3864: * Handle specific association to the META tag
3865: */
3866: if (meta)
3867: sgmlCheckMeta(ctxt, atts);
3868:
3869: /*
3870: * SAX: Start of Element !
3871: */
3872: sgmlnamePush(ctxt, xmlStrdup(name));
3873: #ifdef DEBUG
3874: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
3875: #endif
3876: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3877: ctxt->sax->startElement(ctxt->userData, name, atts);
3878:
3879: if (atts != NULL) {
3880: for (i = 0;i < nbatts;i++) {
3881: if (atts[i] != NULL)
3882: xmlFree((xmlChar *) atts[i]);
3883: }
3884: xmlFree((void *) atts);
3885: }
3886: if (name != NULL) xmlFree(name);
3887: }
3888:
3889: /**
3890: * sgmlParseEndTag:
3891: * @ctxt: an SGML parser context
3892: *
3893: * parse an end of tag
3894: *
3895: * [42] ETag ::= '</' Name S? '>'
3896: *
3897: * With namespace
3898: *
3899: * [NS 9] ETag ::= '</' QName S? '>'
3900: */
3901:
3902: void
3903: sgmlParseEndTag(sgmlParserCtxtPtr ctxt) {
3904: xmlChar *name;
3905: xmlChar *oldname;
3906: int i;
3907:
3908: if ((CUR != '<') || (NXT(1) != '/')) {
3909: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3910: ctxt->sax->error(ctxt->userData, "sgmlParseEndTag: '</' not found\n");
3911: ctxt->wellFormed = 0;
3912: return;
3913: }
3914: SKIP(2);
3915:
3916: name = sgmlParseSGMLName(ctxt);
3917: if (name == NULL) {
3918: if (CUR == '>') {
3919: NEXT;
3920: oldname = sgmlnamePop(ctxt);
3921: if (oldname != NULL) {
3922: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3923: ctxt->sax->endElement(ctxt->userData, name);
3924: #ifdef DEBUG
3925: fprintf(stderr,"End of tag </>: popping out %s\n", oldname);
3926: #endif
3927: xmlFree(oldname);
3928: #ifdef DEBUG
3929: } else {
3930: fprintf(stderr,"End of tag </>: stack empty !!!\n");
3931: #endif
3932: }
3933: return;
3934: } else
3935: return;
3936: }
3937:
3938: /*
3939: * We should definitely be at the ending "S? '>'" part
3940: */
3941: SKIP_BLANKS;
3942: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3943: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3944: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3945: ctxt->wellFormed = 0;
3946: } else
3947: NEXT;
3948:
3949: /*
3950: * If the name read is not one of the element in the parsing stack
3951: * then return, it's just an error.
3952: */
3953: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3954: if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
3955: }
3956: if (i < 0) {
3957: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3958: ctxt->sax->error(ctxt->userData,
3959: "Unexpected end tag : %s\n", name);
3960: xmlFree(name);
3961: ctxt->wellFormed = 0;
3962: return;
3963: }
3964:
3965:
3966: /*
3967: * Check for auto-closure of SGML elements.
3968: */
3969:
3970: sgmlAutoCloseOnClose(ctxt, name);
3971:
3972: /*
3973: * Well formedness constraints, opening and closing must match.
3974: * With the exception that the autoclose may have popped stuff out
3975: * of the stack.
3976: */
3977: if (((name[0] != '/') || (name[1] != 0)) &&
3978: (xmlStrcmp(name, ctxt->name))) {
3979: #ifdef DEBUG
3980: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
3981: #endif
3982: if ((ctxt->name != NULL) &&
3983: (xmlStrcmp(ctxt->name, name))) {
3984: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3985: ctxt->sax->error(ctxt->userData,
3986: "Opening and ending tag mismatch: %s and %s\n",
3987: name, ctxt->name);
3988: ctxt->wellFormed = 0;
3989: }
3990: }
3991:
3992: /*
3993: * SAX: End of Tag
3994: */
3995: oldname = ctxt->name;
3996: if (((name[0] == '/') && (name[1] == 0)) ||
3997: ((oldname != NULL) && (!xmlStrcmp(oldname, name)))) {
3998: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3999: ctxt->sax->endElement(ctxt->userData, name);
4000: oldname = sgmlnamePop(ctxt);
4001: if (oldname != NULL) {
4002: #ifdef DEBUG
4003: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
4004: #endif
4005: xmlFree(oldname);
4006: #ifdef DEBUG
4007: } else {
4008: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
4009: #endif
4010: }
4011: }
4012:
4013: if (name != NULL)
4014: xmlFree(name);
4015:
4016: return;
4017: }
4018:
4019:
4020: /**
4021: * sgmlParseReference:
4022: * @ctxt: an SGML parser context
4023: *
4024: * parse and handle entity references in content,
4025: * this will end-up in a call to character() since this is either a
4026: * CharRef, or a predefined entity.
4027: */
4028: void
4029: sgmlParseReference(sgmlParserCtxtPtr ctxt) {
4030: sgmlEntityDescPtr ent;
4031: xmlChar out[6];
4032: xmlChar *name;
4033: if (CUR != '&') return;
4034:
4035: if (NXT(1) == '#') {
4036: unsigned int c;
4037: int bits, i = 0;
4038:
4039: c = sgmlParseCharRef(ctxt);
4040: if (c < 0x80) { out[i++]= c; bits= -6; }
4041: else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4042: else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4043: else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4044:
4045: for ( ; bits >= 0; bits-= 6) {
4046: out[i++]= ((c >> bits) & 0x3F) | 0x80;
4047: }
4048: out[i] = 0;
4049:
4050: sgmlCheckParagraph(ctxt);
4051: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4052: ctxt->sax->characters(ctxt->userData, out, i);
4053: } else {
4054: ent = sgmlParseEntityRef(ctxt, &name);
4055: if (name == NULL) {
4056: sgmlCheckParagraph(ctxt);
4057: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4058: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4059: return;
4060: }
4061: if ((ent == NULL) || (ent->value <= 0)) {
4062: sgmlCheckParagraph(ctxt);
4063: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4064: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4065: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4066: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4067: }
4068: } else {
4069: unsigned int c;
4070: int bits, i = 0;
4071:
4072: c = ent->value;
4073: if (c < 0x80)
4074: { out[i++]= c; bits= -6; }
4075: else if (c < 0x800)
4076: { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4077: else if (c < 0x10000)
4078: { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4079: else
4080: { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4081:
4082: for ( ; bits >= 0; bits-= 6) {
4083: out[i++]= ((c >> bits) & 0x3F) | 0x80;
4084: }
4085: out[i] = 0;
4086:
4087: sgmlCheckParagraph(ctxt);
4088: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4089: ctxt->sax->characters(ctxt->userData, out, i);
4090: }
4091: xmlFree(name);
4092: }
4093: }
4094:
4095: /**
4096: * sgmlParseContent:
4097: * @ctxt: an SGML parser context
4098: * @name: the node name
4099: *
4100: * Parse a content: comment, sub-element, reference or text.
4101: *
4102: */
4103:
4104: void
4105: sgmlParseContent(sgmlParserCtxtPtr ctxt) {
4106: xmlChar *currentNode;
4107: int depth;
4108:
4109: currentNode = xmlStrdup(ctxt->name);
4110: depth = ctxt->nameNr;
4111: while (1) {
4112: long cons = ctxt->nbChars;
4113:
4114: GROW;
4115: /*
4116: * Our tag or one of it's parent or children is ending.
4117: */
4118: if ((CUR == '<') && (NXT(1) == '/')) {
4119: sgmlParseEndTag(ctxt);
4120: if (currentNode != NULL) xmlFree(currentNode);
4121: return;
4122: }
4123:
4124: /*
4125: * Has this node been popped out during parsing of
4126: * the next element
4127: */
4128: if ((xmlStrcmp(currentNode, ctxt->name)) &&
4129: (depth >= ctxt->nameNr)) {
4130: if (currentNode != NULL) xmlFree(currentNode);
4131: return;
4132: }
4133:
4134: /*
4135: * Sometimes DOCTYPE arrives in the middle of the document
4136: */
4137: if ((CUR == '<') && (NXT(1) == '!') &&
4138: (UPP(2) == 'D') && (UPP(3) == 'O') &&
4139: (UPP(4) == 'C') && (UPP(5) == 'T') &&
4140: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4141: (UPP(8) == 'E')) {
4142: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4143: ctxt->sax->error(ctxt->userData,
4144: "Misplaced DOCTYPE declaration\n");
4145: ctxt->wellFormed = 0;
4146: sgmlParseDocTypeDecl(ctxt);
4147: }
4148:
4149: /*
4150: * First case : a comment
4151: */
4152: if ((CUR == '<') && (NXT(1) == '!') &&
4153: (NXT(2) == '-') && (NXT(3) == '-')) {
4154: sgmlParseComment(ctxt);
4155: }
4156:
4157: /*
4158: * Second case : a sub-element.
4159: */
4160: else if (CUR == '<') {
4161: sgmlParseElement(ctxt);
4162: }
4163:
4164: /*
4165: * Third case : a reference. If if has not been resolved,
4166: * parsing returns it's Name, create the node
4167: */
4168: else if (CUR == '&') {
4169: sgmlParseReference(ctxt);
4170: }
4171:
4172: /*
4173: * Fourth : end of the resource
4174: */
4175: else if (CUR == 0) {
4176: sgmlAutoClose(ctxt, NULL);
4177: }
4178:
4179: /*
4180: * Last case, text. Note that References are handled directly.
4181: */
4182: else {
4183: sgmlParseCharData(ctxt, 0);
4184: }
4185:
4186: if (cons == ctxt->nbChars) {
4187: if (ctxt->node != NULL) {
4188: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4189: ctxt->sax->error(ctxt->userData,
4190: "detected an error in element content\n");
4191: ctxt->wellFormed = 0;
4192: }
4193: break;
4194: }
4195:
4196: GROW;
4197: }
4198: if (currentNode != NULL) xmlFree(currentNode);
4199: }
4200:
4201: /**
4202: * sgmlParseElement:
4203: * @ctxt: an SGML parser context
4204: *
4205: * parse an SGML element, this is highly recursive
4206: *
4207: * [39] element ::= EmptyElemTag | STag content ETag
4208: *
4209: * [41] Attribute ::= Name Eq AttValue
4210: */
4211:
4212: void
4213: sgmlParseElement(sgmlParserCtxtPtr ctxt) {
4214: xmlChar *name;
4215: xmlChar *currentNode = NULL;
4216: sgmlElemDescPtr info;
4217: sgmlParserNodeInfo node_info;
4218: xmlChar *oldname;
4219: int depth = ctxt->nameNr;
4220:
4221: /* Capture start position */
4222: if (ctxt->record_info) {
4223: node_info.begin_pos = ctxt->input->consumed +
4224: (CUR_PTR - ctxt->input->base);
4225: node_info.begin_line = ctxt->input->line;
4226: }
4227:
4228: oldname = xmlStrdup(ctxt->name);
4229: sgmlParseStartTag(ctxt);
4230: name = ctxt->name;
4231: #ifdef DEBUG
4232: if (oldname == NULL)
4233: fprintf(stderr, "Start of element %s\n", name);
4234: else if (name == NULL)
4235: fprintf(stderr, "Start of element failed, was %s\n", oldname);
4236: else
4237: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
4238: #endif
4239: if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
4240: (name == NULL)) {
4241: if (CUR == '>')
4242: NEXT;
4243: if (oldname != NULL)
4244: xmlFree(oldname);
4245: return;
4246: }
4247: if (oldname != NULL)
4248: xmlFree(oldname);
4249:
4250: /*
4251: * Lookup the info for that element.
4252: */
4253: info = sgmlTagLookup(name);
4254: if (info == NULL) {
4255: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4256: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
4257: name);
4258: ctxt->wellFormed = 0;
4259: } else if (info->depr) {
4260: /***************************
4261: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4262: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
4263: name);
4264: ***************************/
4265: }
4266:
4267: /*
4268: * Check for an Empty Element labelled the XML/SGML way
4269: */
4270: if ((CUR == '/') && (NXT(1) == '>')) {
4271: SKIP(2);
4272: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4273: ctxt->sax->endElement(ctxt->userData, name);
4274: oldname = sgmlnamePop(ctxt);
4275: #ifdef DEBUG
4276: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
4277: #endif
4278: if (oldname != NULL)
4279: xmlFree(oldname);
4280: return;
4281: }
4282:
4283: if (CUR == '>') {
4284: NEXT;
4285: } else {
4286: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4287: ctxt->sax->error(ctxt->userData,
4288: "Couldn't find end of Start Tag %s\n",
4289: name);
4290: ctxt->wellFormed = 0;
4291:
4292: /*
4293: * end of parsing of this node.
4294: */
4295: if (!xmlStrcmp(name, ctxt->name)) {
4296: nodePop(ctxt);
4297: oldname = sgmlnamePop(ctxt);
4298: #ifdef DEBUG
4299: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
4300: #endif
4301: if (oldname != NULL)
4302: xmlFree(oldname);
4303: }
4304:
4305: /*
4306: * Capture end position and add node
4307: */
4308: if ( currentNode != NULL && ctxt->record_info ) {
4309: node_info.end_pos = ctxt->input->consumed +
4310: (CUR_PTR - ctxt->input->base);
4311: node_info.end_line = ctxt->input->line;
4312: node_info.node = ctxt->node;
4313: xmlParserAddNodeInfo(ctxt, &node_info);
4314: }
4315: return;
4316: }
4317:
4318: /*
4319: * Check for an Empty Element from DTD definition
4320: */
4321: if ((info != NULL) && (info->empty)) {
4322: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4323: ctxt->sax->endElement(ctxt->userData, name);
4324: oldname = sgmlnamePop(ctxt);
4325: #ifdef DEBUG
4326: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
4327: #endif
4328: if (oldname != NULL)
4329: xmlFree(oldname);
4330: return;
4331: }
4332:
4333: /*
4334: * Parse the content of the element:
4335: */
4336: currentNode = xmlStrdup(ctxt->name);
4337: depth = ctxt->nameNr;
4338: while (IS_CHAR(CUR)) {
4339: sgmlParseContent(ctxt);
4340: if (ctxt->nameNr < depth) break;
4341: }
4342:
4343: if (!IS_CHAR(CUR)) {
4344: /************
4345: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4346: ctxt->sax->error(ctxt->userData,
4347: "Premature end of data in tag %s\n", currentNode);
4348: ctxt->wellFormed = 0;
4349: *************/
4350:
4351: /*
4352: * end of parsing of this node.
4353: */
4354: nodePop(ctxt);
4355: oldname = sgmlnamePop(ctxt);
4356: #ifdef DEBUG
4357: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
4358: #endif
4359: if (oldname != NULL)
4360: xmlFree(oldname);
4361: if (currentNode != NULL)
4362: xmlFree(currentNode);
4363: return;
4364: }
4365:
4366: /*
4367: * Capture end position and add node
4368: */
4369: if ( currentNode != NULL && ctxt->record_info ) {
4370: node_info.end_pos = ctxt->input->consumed +
4371: (CUR_PTR - ctxt->input->base);
4372: node_info.end_line = ctxt->input->line;
4373: node_info.node = ctxt->node;
4374: xmlParserAddNodeInfo(ctxt, &node_info);
4375: }
4376: if (currentNode != NULL)
4377: xmlFree(currentNode);
4378: }
4379:
4380: /**
1.3 ! veillard 4381: * sgmlParseEntityDecl:
! 4382: * @ctxt: an SGML parser context
! 4383: *
! 4384: * parse <!ENTITY declarations
! 4385: *
! 4386: */
! 4387:
! 4388: void
! 4389: sgmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
! 4390: xmlChar *name = NULL;
! 4391: xmlChar *value = NULL;
! 4392: xmlChar *URI = NULL, *literal = NULL;
! 4393: xmlChar *ndata = NULL;
! 4394: int isParameter = 0;
! 4395: xmlChar *orig = NULL;
! 4396:
! 4397: GROW;
! 4398: if ((RAW == '<') && (NXT(1) == '!') &&
! 4399: (NXT(2) == 'E') && (NXT(3) == 'N') &&
! 4400: (NXT(4) == 'T') && (NXT(5) == 'I') &&
! 4401: (NXT(6) == 'T') && (NXT(7) == 'Y')) {
! 4402: xmlParserInputPtr input = ctxt->input;
! 4403: ctxt->instate = XML_PARSER_ENTITY_DECL;
! 4404: SHRINK;
! 4405: SKIP(8);
! 4406: if (!IS_BLANK(CUR)) {
! 4407: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4408: ctxt->sax->error(ctxt->userData,
! 4409: "Space required after '<!ENTITY'\n");
! 4410: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
! 4411: ctxt->wellFormed = 0;
! 4412: ctxt->disableSAX = 1;
! 4413: }
! 4414: SKIP_BLANKS;
! 4415:
! 4416: if (RAW == '%') {
! 4417: NEXT;
! 4418: if (!IS_BLANK(CUR)) {
! 4419: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4420: ctxt->sax->error(ctxt->userData,
! 4421: "Space required after '%'\n");
! 4422: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
! 4423: ctxt->wellFormed = 0;
! 4424: ctxt->disableSAX = 1;
! 4425: }
! 4426: SKIP_BLANKS;
! 4427: isParameter = 1;
! 4428: }
! 4429:
! 4430: name = xmlParseName(ctxt);
! 4431: if (name == NULL) {
! 4432: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4433: ctxt->sax->error(ctxt->userData, "sgmlarseEntityDecl: no name\n");
! 4434: ctxt->errNo = XML_ERR_NAME_REQUIRED;
! 4435: ctxt->wellFormed = 0;
! 4436: ctxt->disableSAX = 1;
! 4437: return;
! 4438: }
! 4439: if (!IS_BLANK(CUR)) {
! 4440: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4441: ctxt->sax->error(ctxt->userData,
! 4442: "Space required after the entity name\n");
! 4443: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
! 4444: ctxt->wellFormed = 0;
! 4445: ctxt->disableSAX = 1;
! 4446: }
! 4447: SKIP_BLANKS;
! 4448:
! 4449: /*
! 4450: * handle the various case of definitions...
! 4451: */
! 4452: if (isParameter) {
! 4453: if ((RAW == '"') || (RAW == '\'')) {
! 4454: value = xmlParseEntityValue(ctxt, &orig);
! 4455: if (value) {
! 4456: if ((ctxt->sax != NULL) &&
! 4457: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
! 4458: ctxt->sax->entityDecl(ctxt->userData, name,
! 4459: XML_INTERNAL_PARAMETER_ENTITY,
! 4460: NULL, NULL, value);
! 4461: }
! 4462: } else {
! 4463: URI = xmlParseExternalID(ctxt, &literal, 1);
! 4464: if ((URI == NULL) && (literal == NULL)) {
! 4465: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4466: ctxt->sax->error(ctxt->userData,
! 4467: "Entity value required\n");
! 4468: ctxt->errNo = XML_ERR_VALUE_REQUIRED;
! 4469: ctxt->wellFormed = 0;
! 4470: ctxt->disableSAX = 1;
! 4471: }
! 4472: if (URI) {
! 4473: xmlURIPtr uri;
! 4474:
! 4475: uri = xmlParseURI((const char *) URI);
! 4476: if (uri == NULL) {
! 4477: if ((ctxt->sax != NULL) &&
! 4478: (!ctxt->disableSAX) &&
! 4479: (ctxt->sax->error != NULL))
! 4480: ctxt->sax->error(ctxt->userData,
! 4481: "Invalid URI: %s\n", URI);
! 4482: ctxt->wellFormed = 0;
! 4483: ctxt->errNo = XML_ERR_INVALID_URI;
! 4484: } else {
! 4485: if (uri->fragment != NULL) {
! 4486: if ((ctxt->sax != NULL) &&
! 4487: (!ctxt->disableSAX) &&
! 4488: (ctxt->sax->error != NULL))
! 4489: ctxt->sax->error(ctxt->userData,
! 4490: "Fragment not allowed: %s\n", URI);
! 4491: ctxt->wellFormed = 0;
! 4492: ctxt->errNo = XML_ERR_URI_FRAGMENT;
! 4493: } else {
! 4494: if ((ctxt->sax != NULL) &&
! 4495: (!ctxt->disableSAX) &&
! 4496: (ctxt->sax->entityDecl != NULL))
! 4497: ctxt->sax->entityDecl(ctxt->userData, name,
! 4498: XML_EXTERNAL_PARAMETER_ENTITY,
! 4499: literal, URI, NULL);
! 4500: }
! 4501: xmlFreeURI(uri);
! 4502: }
! 4503: }
! 4504: }
! 4505: } else {
! 4506: if ((RAW == '"') || (RAW == '\'')) {
! 4507: value = xmlParseEntityValue(ctxt, &orig);
! 4508: if ((ctxt->sax != NULL) &&
! 4509: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
! 4510: ctxt->sax->entityDecl(ctxt->userData, name,
! 4511: XML_INTERNAL_GENERAL_ENTITY,
! 4512: NULL, NULL, value);
! 4513: } else {
! 4514: URI = xmlParseExternalID(ctxt, &literal, 1);
! 4515: if ((URI == NULL) && (literal == NULL)) {
! 4516: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4517: ctxt->sax->error(ctxt->userData,
! 4518: "Entity value required\n");
! 4519: ctxt->errNo = XML_ERR_VALUE_REQUIRED;
! 4520: ctxt->wellFormed = 0;
! 4521: ctxt->disableSAX = 1;
! 4522: }
! 4523: if (URI) {
! 4524: xmlURIPtr uri;
! 4525:
! 4526: uri = xmlParseURI((const char *)URI);
! 4527: if (uri == NULL) {
! 4528: if ((ctxt->sax != NULL) &&
! 4529: (!ctxt->disableSAX) &&
! 4530: (ctxt->sax->error != NULL))
! 4531: ctxt->sax->error(ctxt->userData,
! 4532: "Invalid URI: %s\n", URI);
! 4533: ctxt->wellFormed = 0;
! 4534: ctxt->errNo = XML_ERR_INVALID_URI;
! 4535: } else {
! 4536: if (uri->fragment != NULL) {
! 4537: if ((ctxt->sax != NULL) &&
! 4538: (!ctxt->disableSAX) &&
! 4539: (ctxt->sax->error != NULL))
! 4540: ctxt->sax->error(ctxt->userData,
! 4541: "Fragment not allowed: %s\n", URI);
! 4542: ctxt->wellFormed = 0;
! 4543: ctxt->errNo = XML_ERR_URI_FRAGMENT;
! 4544: }
! 4545: xmlFreeURI(uri);
! 4546: }
! 4547: }
! 4548: if ((RAW != '>') && (!IS_BLANK(CUR))) {
! 4549: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4550: ctxt->sax->error(ctxt->userData,
! 4551: "Space required before content model\n");
! 4552: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
! 4553: ctxt->wellFormed = 0;
! 4554: ctxt->disableSAX = 1;
! 4555: }
! 4556: SKIP_BLANKS;
! 4557:
! 4558: /*
! 4559: * SGML specific: here we can get the content model
! 4560: */
! 4561: if (RAW != '>') {
! 4562: xmlChar *contmod;
! 4563:
! 4564: contmod = xmlParseName(ctxt);
! 4565:
! 4566: if (contmod == NULL) {
! 4567: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4568: ctxt->sax->error(ctxt->userData,
! 4569: "Could not parse entity content model\n");
! 4570: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
! 4571: ctxt->wellFormed = 0;
! 4572: ctxt->disableSAX = 1;
! 4573: } else {
! 4574: if (!xmlStrcmp(contmod, BAD_CAST"NDATA")) {
! 4575: if (!IS_BLANK(CUR)) {
! 4576: if ((ctxt->sax != NULL) &&
! 4577: (ctxt->sax->error != NULL))
! 4578: ctxt->sax->error(ctxt->userData,
! 4579: "Space required after 'NDATA'\n");
! 4580: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
! 4581: ctxt->wellFormed = 0;
! 4582: ctxt->disableSAX = 1;
! 4583: }
! 4584: SKIP_BLANKS;
! 4585: ndata = xmlParseName(ctxt);
! 4586: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
! 4587: (ctxt->sax->unparsedEntityDecl != NULL)) {
! 4588: ctxt->sax->unparsedEntityDecl(ctxt->userData,
! 4589: name, literal, URI, ndata);
! 4590: }
! 4591: } else if (!xmlStrcmp(contmod, BAD_CAST"SUBDOC")) {
! 4592: if ((ctxt->sax != NULL) &&
! 4593: (ctxt->sax->warning != NULL))
! 4594: ctxt->sax->warning(ctxt->userData,
! 4595: "SUBDOC entities are not supported\n");
! 4596: SKIP_BLANKS;
! 4597: ndata = xmlParseName(ctxt);
! 4598: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
! 4599: (ctxt->sax->unparsedEntityDecl != NULL)) {
! 4600: ctxt->sax->unparsedEntityDecl(ctxt->userData,
! 4601: name, literal, URI, ndata);
! 4602: }
! 4603: } else if (!xmlStrcmp(contmod, BAD_CAST"CDATA")) {
! 4604: if ((ctxt->sax != NULL) &&
! 4605: (ctxt->sax->warning != NULL))
! 4606: ctxt->sax->warning(ctxt->userData,
! 4607: "CDATA entities are not supported\n");
! 4608: SKIP_BLANKS;
! 4609: ndata = xmlParseName(ctxt);
! 4610: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
! 4611: (ctxt->sax->unparsedEntityDecl != NULL)) {
! 4612: ctxt->sax->unparsedEntityDecl(ctxt->userData,
! 4613: name, literal, URI, ndata);
! 4614: }
! 4615: }
! 4616: xmlFree(contmod);
! 4617: }
! 4618: } else {
! 4619: if ((ctxt->sax != NULL) &&
! 4620: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
! 4621: ctxt->sax->entityDecl(ctxt->userData, name,
! 4622: XML_EXTERNAL_GENERAL_PARSED_ENTITY,
! 4623: literal, URI, NULL);
! 4624: }
! 4625: }
! 4626: }
! 4627: SKIP_BLANKS;
! 4628: if (RAW != '>') {
! 4629: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4630: ctxt->sax->error(ctxt->userData,
! 4631: "sgmlParseEntityDecl: entity %s not terminated\n", name);
! 4632: ctxt->errNo = XML_ERR_ENTITY_NOT_FINISHED;
! 4633: ctxt->wellFormed = 0;
! 4634: ctxt->disableSAX = 1;
! 4635: } else {
! 4636: if (input != ctxt->input) {
! 4637: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4638: ctxt->sax->error(ctxt->userData,
! 4639: "Entity declaration doesn't start and stop in the same entity\n");
! 4640: ctxt->errNo = XML_ERR_ENTITY_BOUNDARY;
! 4641: ctxt->wellFormed = 0;
! 4642: ctxt->disableSAX = 1;
! 4643: }
! 4644: NEXT;
! 4645: }
! 4646: if (orig != NULL) {
! 4647: /*
! 4648: * Ugly mechanism to save the raw entity value.
! 4649: */
! 4650: xmlEntityPtr cur = NULL;
! 4651:
! 4652: if (isParameter) {
! 4653: if ((ctxt->sax != NULL) &&
! 4654: (ctxt->sax->getParameterEntity != NULL))
! 4655: cur = ctxt->sax->getParameterEntity(ctxt->userData, name);
! 4656: } else {
! 4657: if ((ctxt->sax != NULL) &&
! 4658: (ctxt->sax->getEntity != NULL))
! 4659: cur = ctxt->sax->getEntity(ctxt->userData, name);
! 4660: }
! 4661: if (cur != NULL) {
! 4662: if (cur->orig != NULL)
! 4663: xmlFree(orig);
! 4664: else
! 4665: cur->orig = orig;
! 4666: } else
! 4667: xmlFree(orig);
! 4668: }
! 4669: if (name != NULL) xmlFree(name);
! 4670: if (value != NULL) xmlFree(value);
! 4671: if (URI != NULL) xmlFree(URI);
! 4672: if (literal != NULL) xmlFree(literal);
! 4673: if (ndata != NULL) xmlFree(ndata);
! 4674: }
! 4675: }
! 4676:
! 4677: /**
! 4678: * sgmlParseMarkupDecl:
! 4679: * @ctxt: an SGML parser context
! 4680: *
! 4681: * parse Markup declarations
! 4682: *
! 4683: * [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
! 4684: * NotationDecl | PI | Comment
! 4685: */
! 4686: void
! 4687: sgmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
! 4688: GROW;
! 4689: xmlParseElementDecl(ctxt);
! 4690: xmlParseAttributeListDecl(ctxt);
! 4691: sgmlParseEntityDecl(ctxt);
! 4692: xmlParseNotationDecl(ctxt);
! 4693: xmlParsePI(ctxt);
! 4694: xmlParseComment(ctxt);
! 4695: /*
! 4696: * This is only for internal subset. On external entities,
! 4697: * the replacement is done before parsing stage
! 4698: */
! 4699: if ((ctxt->external == 0) && (ctxt->inputNr == 1))
! 4700: xmlParsePEReference(ctxt);
! 4701: ctxt->instate = XML_PARSER_DTD;
! 4702: }
! 4703:
! 4704: /**
! 4705: * sgmlParseInternalsubset:
! 4706: * @ctxt: an SGML parser context
! 4707: *
! 4708: * parse the internal subset declaration
! 4709: *
! 4710: * [28 end] ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
! 4711: */
! 4712:
! 4713: void
! 4714: sgmlParseInternalSubset(xmlParserCtxtPtr ctxt) {
! 4715: /*
! 4716: * Is there any DTD definition ?
! 4717: */
! 4718: if (RAW == '[') {
! 4719: ctxt->instate = XML_PARSER_DTD;
! 4720: NEXT;
! 4721: /*
! 4722: * Parse the succession of Markup declarations and
! 4723: * PEReferences.
! 4724: * Subsequence (markupdecl | PEReference | S)*
! 4725: */
! 4726: while (RAW != ']') {
! 4727: const xmlChar *check = CUR_PTR;
! 4728: int cons = ctxt->input->consumed;
! 4729:
! 4730: SKIP_BLANKS;
! 4731: sgmlParseMarkupDecl(ctxt);
! 4732: xmlParsePEReference(ctxt);
! 4733:
! 4734: /*
! 4735: * Pop-up of finished entities.
! 4736: */
! 4737: while ((RAW == 0) && (ctxt->inputNr > 1))
! 4738: xmlPopInput(ctxt);
! 4739:
! 4740: if ((CUR_PTR == check) && (cons == ctxt->input->consumed)) {
! 4741: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4742: ctxt->sax->error(ctxt->userData,
! 4743: "sgmlParseInternalSubset: error detected in Markup declaration\n");
! 4744: ctxt->wellFormed = 0;
! 4745: ctxt->disableSAX = 1;
! 4746: ctxt->errNo = XML_ERR_INTERNAL_ERROR;
! 4747: break;
! 4748: }
! 4749: }
! 4750: if (RAW == ']') {
! 4751: NEXT;
! 4752: SKIP_BLANKS;
! 4753: }
! 4754: }
! 4755:
! 4756: /*
! 4757: * We should be at the end of the DOCTYPE declaration.
! 4758: */
! 4759: if (RAW != '>') {
! 4760: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! 4761: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
! 4762: ctxt->wellFormed = 0;
! 4763: ctxt->disableSAX = 1;
! 4764: ctxt->errNo = XML_ERR_DOCTYPE_NOT_FINISHED;
! 4765: }
! 4766: NEXT;
! 4767: }
! 4768:
! 4769: /**
1.2 veillard 4770: * sgmlParseMisc:
4771: * @ctxt: an XML parser context
4772: *
4773: * parse an XML Misc* optionnal field.
4774: *
4775: * [27] Misc ::= Comment | PI | S
4776: */
4777:
4778: void
4779: sgmlParseMisc(xmlParserCtxtPtr ctxt) {
4780: while (((RAW == '<') && (NXT(1) == '?')) ||
4781: ((RAW == '<') && (NXT(1) == '!') &&
4782: (NXT(2) == '-') && (NXT(3) == '-')) ||
4783: IS_BLANK(CUR)) {
4784: if ((RAW == '<') && (NXT(1) == '?')) {
4785: xmlParsePI(ctxt); /* TODO: SGML PIs differs */
4786: } else if (IS_BLANK(CUR)) {
4787: NEXT;
4788: } else
4789: xmlParseComment(ctxt);
4790: }
4791: }
4792:
4793: /**
1.1 veillard 4794: * sgmlParseDocument :
4795: * @ctxt: an SGML parser context
4796: *
4797: * parse an SGML document (and build a tree if using the standard SAX
4798: * interface).
4799: *
4800: * Returns 0, -1 in case of error. the parser context is augmented
4801: * as a result of the parsing.
4802: */
4803:
4804: int
4805: sgmlParseDocument(sgmlParserCtxtPtr ctxt) {
1.2 veillard 4806: xmlChar start[4];
4807: xmlCharEncoding enc;
1.1 veillard 4808: xmlDtdPtr dtd;
4809:
4810: sgmlDefaultSAXHandlerInit();
4811: ctxt->html = 2;
4812:
4813: GROW;
4814: /*
4815: * SAX: beginning of the document processing.
4816: */
4817: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4818: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4819:
1.2 veillard 4820: /*
4821: * Get the 4 first bytes and decode the charset
4822: * if enc != XML_CHAR_ENCODING_NONE
4823: * plug some encoding conversion routines.
4824: */
4825: start[0] = RAW;
4826: start[1] = NXT(1);
4827: start[2] = NXT(2);
4828: start[3] = NXT(3);
4829: enc = xmlDetectCharEncoding(start, 4);
4830: if (enc != XML_CHAR_ENCODING_NONE) {
4831: xmlSwitchEncoding(ctxt, enc);
4832: }
4833:
1.1 veillard 4834: /*
4835: * Wipe out everything which is before the first '<'
4836: */
4837: SKIP_BLANKS;
4838: if (CUR == 0) {
4839: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4840: ctxt->sax->error(ctxt->userData, "Document is empty\n");
4841: ctxt->wellFormed = 0;
4842: }
4843:
4844: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4845: ctxt->sax->startDocument(ctxt->userData);
4846:
4847:
4848: /*
1.2 veillard 4849: * The Misc part of the Prolog
1.1 veillard 4850: */
1.2 veillard 4851: GROW;
4852: sgmlParseMisc(ctxt);
1.1 veillard 4853:
4854: /*
4855: * Then possibly doc type declaration(s) and more Misc
4856: * (doctypedecl Misc*)?
4857: */
1.2 veillard 4858: GROW;
4859: if ((RAW == '<') && (NXT(1) == '!') &&
4860: (NXT(2) == 'D') && (NXT(3) == 'O') &&
4861: (NXT(4) == 'C') && (NXT(5) == 'T') &&
4862: (NXT(6) == 'Y') && (NXT(7) == 'P') &&
4863: (NXT(8) == 'E')) {
4864:
4865: ctxt->inSubset = 1;
1.1 veillard 4866: sgmlParseDocTypeDecl(ctxt);
1.2 veillard 4867: if (RAW == '[') {
4868: ctxt->instate = XML_PARSER_DTD;
1.3 ! veillard 4869: sgmlParseInternalSubset(ctxt);
1.2 veillard 4870: }
4871:
4872: /*
4873: * Create and update the external subset.
4874: */
4875: ctxt->inSubset = 2;
4876: if ((ctxt->sax != NULL) && (ctxt->sax->externalSubset != NULL) &&
4877: (!ctxt->disableSAX))
4878: ctxt->sax->externalSubset(ctxt->userData, ctxt->intSubName,
4879: ctxt->extSubSystem, ctxt->extSubURI);
4880: ctxt->inSubset = 0;
4881:
4882:
4883: ctxt->instate = XML_PARSER_PROLOG;
4884: sgmlParseMisc(ctxt);
1.1 veillard 4885: }
4886:
4887: /*
4888: * Time to start parsing the tree itself
4889: */
4890: sgmlParseContent(ctxt);
4891:
4892: /*
4893: * autoclose
4894: */
4895: if (CUR == 0)
4896: sgmlAutoClose(ctxt, NULL);
4897:
4898:
4899: /*
4900: * SAX: end of the document processing.
4901: */
4902: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4903: ctxt->sax->endDocument(ctxt->userData);
4904:
4905: if (ctxt->myDoc != NULL) {
4906: dtd = xmlGetIntSubset(ctxt->myDoc);
4907: if (dtd == NULL)
4908: ctxt->myDoc->intSubset =
4909: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "SGML",
4910: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
4911: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd");
4912: }
4913: if (! ctxt->wellFormed) return(-1);
4914: return(0);
4915: }
4916:
4917:
4918: /************************************************************************
4919: * *
4920: * Parser contexts handling *
4921: * *
4922: ************************************************************************/
4923:
4924: /**
4925: * xmlInitParserCtxt:
4926: * @ctxt: an SGML parser context
4927: *
4928: * Initialize a parser context
4929: */
4930:
4931: void
4932: sgmlInitParserCtxt(sgmlParserCtxtPtr ctxt)
4933: {
4934: sgmlSAXHandler *sax;
4935:
4936: if (ctxt == NULL) return;
4937: memset(ctxt, 0, sizeof(sgmlParserCtxt));
4938:
4939: sax = (sgmlSAXHandler *) xmlMalloc(sizeof(sgmlSAXHandler));
4940: if (sax == NULL) {
4941: fprintf(stderr, "sgmlInitParserCtxt: out of memory\n");
4942: }
4943: memset(sax, 0, sizeof(sgmlSAXHandler));
4944:
4945: /* Allocate the Input stack */
4946: ctxt->inputTab = (sgmlParserInputPtr *)
4947: xmlMalloc(5 * sizeof(sgmlParserInputPtr));
4948: if (ctxt->inputTab == NULL) {
4949: fprintf(stderr, "sgmlInitParserCtxt: out of memory\n");
4950: }
4951: ctxt->inputNr = 0;
4952: ctxt->inputMax = 5;
4953: ctxt->input = NULL;
4954: ctxt->version = NULL;
4955: ctxt->encoding = NULL;
4956: ctxt->standalone = -1;
4957: ctxt->instate = XML_PARSER_START;
4958:
4959: /* Allocate the Node stack */
4960: ctxt->nodeTab = (sgmlNodePtr *) xmlMalloc(10 * sizeof(sgmlNodePtr));
4961: ctxt->nodeNr = 0;
4962: ctxt->nodeMax = 10;
4963: ctxt->node = NULL;
4964:
4965: /* Allocate the Name stack */
4966: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4967: ctxt->nameNr = 0;
4968: ctxt->nameMax = 10;
4969: ctxt->name = NULL;
4970:
4971: if (sax == NULL) ctxt->sax = &sgmlDefaultSAXHandler;
4972: else {
4973: ctxt->sax = sax;
4974: memcpy(sax, &sgmlDefaultSAXHandler, sizeof(sgmlSAXHandler));
4975: }
4976: ctxt->userData = ctxt;
4977: ctxt->myDoc = NULL;
4978: ctxt->wellFormed = 1;
4979: ctxt->replaceEntities = 0;
4980: ctxt->html = 2;
4981: ctxt->record_info = 0;
4982: ctxt->validate = 0;
4983: ctxt->nbChars = 0;
4984: ctxt->checkIndex = 0;
4985: xmlInitNodeInfoSeq(&ctxt->node_seq);
4986: }
4987:
4988: /**
4989: * sgmlFreeParserCtxt:
4990: * @ctxt: an SGML parser context
4991: *
4992: * Free all the memory used by a parser context. However the parsed
4993: * document in ctxt->myDoc is not freed.
4994: */
4995:
4996: void
4997: sgmlFreeParserCtxt(sgmlParserCtxtPtr ctxt)
4998: {
4999: xmlFreeParserCtxt(ctxt);
5000: }
5001:
5002: /**
5003: * sgmlCreateDocParserCtxt :
5004: * @cur: a pointer to an array of xmlChar
5005: * @encoding: a free form C string describing the SGML document encoding, or NULL
5006: *
5007: * Create a parser context for an SGML document.
5008: *
5009: * Returns the new parser context or NULL
5010: */
5011: sgmlParserCtxtPtr
5012: sgmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
5013: sgmlParserCtxtPtr ctxt;
5014: sgmlParserInputPtr input;
5015: /* sgmlCharEncoding enc; */
5016:
5017: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5018: if (ctxt == NULL) {
5019: perror("malloc");
5020: return(NULL);
5021: }
5022: sgmlInitParserCtxt(ctxt);
5023: input = (sgmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
5024: if (input == NULL) {
5025: perror("malloc");
5026: xmlFree(ctxt);
5027: return(NULL);
5028: }
5029: memset(input, 0, sizeof(sgmlParserInput));
5030:
5031: input->line = 1;
5032: input->col = 1;
5033: input->base = cur;
5034: input->cur = cur;
5035:
5036: inputPush(ctxt, input);
5037: return(ctxt);
5038: }
5039:
5040: /************************************************************************
5041: * *
5042: * Progressive parsing interfaces *
5043: * *
5044: ************************************************************************/
5045:
5046: /**
5047: * sgmlParseLookupSequence:
5048: * @ctxt: an SGML parser context
5049: * @first: the first char to lookup
5050: * @next: the next char to lookup or zero
5051: * @third: the next char to lookup or zero
5052: *
5053: * Try to find if a sequence (first, next, third) or just (first next) or
5054: * (first) is available in the input stream.
5055: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5056: * to avoid rescanning sequences of bytes, it DOES change the state of the
5057: * parser, do not use liberally.
5058: * This is basically similar to xmlParseLookupSequence()
5059: *
5060: * Returns the index to the current parsing point if the full sequence
5061: * is available, -1 otherwise.
5062: */
5063: int
5064: sgmlParseLookupSequence(sgmlParserCtxtPtr ctxt, xmlChar first,
5065: xmlChar next, xmlChar third) {
5066: int base, len;
5067: sgmlParserInputPtr in;
5068: const xmlChar *buf;
5069:
5070: in = ctxt->input;
5071: if (in == NULL) return(-1);
5072: base = in->cur - in->base;
5073: if (base < 0) return(-1);
5074: if (ctxt->checkIndex > base)
5075: base = ctxt->checkIndex;
5076: if (in->buf == NULL) {
5077: buf = in->base;
5078: len = in->length;
5079: } else {
5080: buf = in->buf->buffer->content;
5081: len = in->buf->buffer->use;
5082: }
5083: /* take into account the sequence length */
5084: if (third) len -= 2;
5085: else if (next) len --;
5086: for (;base < len;base++) {
5087: if (buf[base] == first) {
5088: if (third != 0) {
5089: if ((buf[base + 1] != next) ||
5090: (buf[base + 2] != third)) continue;
5091: } else if (next != 0) {
5092: if (buf[base + 1] != next) continue;
5093: }
5094: ctxt->checkIndex = 0;
5095: #ifdef DEBUG_PUSH
5096: if (next == 0)
5097: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
5098: first, base);
5099: else if (third == 0)
5100: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
5101: first, next, base);
5102: else
5103: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
5104: first, next, third, base);
5105: #endif
5106: return(base - (in->cur - in->base));
5107: }
5108: }
5109: ctxt->checkIndex = base;
5110: #ifdef DEBUG_PUSH
5111: if (next == 0)
5112: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
5113: else if (third == 0)
5114: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
5115: else
5116: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
5117: #endif
5118: return(-1);
5119: }
5120:
5121: /**
5122: * sgmlParseTryOrFinish:
5123: * @ctxt: an SGML parser context
5124: * @terminate: last chunk indicator
5125: *
5126: * Try to progress on parsing
5127: *
5128: * Returns zero if no parsing was possible
5129: */
5130: int
5131: sgmlParseTryOrFinish(sgmlParserCtxtPtr ctxt, int terminate) {
5132: int ret = 0;
5133: sgmlParserInputPtr in;
5134: int avail = 0;
5135: xmlChar cur, next;
5136:
5137: #ifdef DEBUG_PUSH
5138: switch (ctxt->instate) {
5139: case XML_PARSER_EOF:
5140: fprintf(stderr, "HPP: try EOF\n"); break;
5141: case XML_PARSER_START:
5142: fprintf(stderr, "HPP: try START\n"); break;
5143: case XML_PARSER_MISC:
5144: fprintf(stderr, "HPP: try MISC\n");break;
5145: case XML_PARSER_COMMENT:
5146: fprintf(stderr, "HPP: try COMMENT\n");break;
5147: case XML_PARSER_PROLOG:
5148: fprintf(stderr, "HPP: try PROLOG\n");break;
5149: case XML_PARSER_START_TAG:
5150: fprintf(stderr, "HPP: try START_TAG\n");break;
5151: case XML_PARSER_CONTENT:
5152: fprintf(stderr, "HPP: try CONTENT\n");break;
5153: case XML_PARSER_CDATA_SECTION:
5154: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
5155: case XML_PARSER_END_TAG:
5156: fprintf(stderr, "HPP: try END_TAG\n");break;
5157: case XML_PARSER_ENTITY_DECL:
5158: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
5159: case XML_PARSER_ENTITY_VALUE:
5160: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
5161: case XML_PARSER_ATTRIBUTE_VALUE:
5162: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
5163: case XML_PARSER_DTD:
5164: fprintf(stderr, "HPP: try DTD\n");break;
5165: case XML_PARSER_EPILOG:
5166: fprintf(stderr, "HPP: try EPILOG\n");break;
5167: case XML_PARSER_PI:
5168: fprintf(stderr, "HPP: try PI\n");break;
5169: }
5170: #endif
5171:
5172: while (1) {
5173:
5174: in = ctxt->input;
5175: if (in == NULL) break;
5176: if (in->buf == NULL)
5177: avail = in->length - (in->cur - in->base);
5178: else
5179: avail = in->buf->buffer->use - (in->cur - in->base);
5180: if ((avail == 0) && (terminate)) {
5181: sgmlAutoClose(ctxt, NULL);
5182: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5183: /*
5184: * SAX: end of the document processing.
5185: */
5186: ctxt->instate = XML_PARSER_EOF;
5187: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5188: ctxt->sax->endDocument(ctxt->userData);
5189: }
5190: }
5191: if (avail < 1)
5192: goto done;
5193: switch (ctxt->instate) {
5194: case XML_PARSER_EOF:
5195: /*
5196: * Document parsing is done !
5197: */
5198: goto done;
5199: case XML_PARSER_START:
5200: /*
5201: * Very first chars read from the document flow.
5202: */
5203: cur = in->cur[0];
5204: if (IS_BLANK(cur)) {
5205: SKIP_BLANKS;
5206: if (in->buf == NULL)
5207: avail = in->length - (in->cur - in->base);
5208: else
5209: avail = in->buf->buffer->use - (in->cur - in->base);
5210: }
5211: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5212: ctxt->sax->setDocumentLocator(ctxt->userData,
5213: &xmlDefaultSAXLocator);
5214: if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5215: (!ctxt->disableSAX))
5216: ctxt->sax->startDocument(ctxt->userData);
5217:
5218: cur = in->cur[0];
5219: next = in->cur[1];
5220: if ((cur == '<') && (next == '!') &&
5221: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5222: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5223: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5224: (UPP(8) == 'E')) {
5225: if ((!terminate) &&
5226: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5227: goto done;
5228: #ifdef DEBUG_PUSH
5229: fprintf(stderr, "HPP: Parsing internal subset\n");
5230: #endif
5231: sgmlParseDocTypeDecl(ctxt);
5232: ctxt->instate = XML_PARSER_PROLOG;
5233: #ifdef DEBUG_PUSH
5234: fprintf(stderr, "HPP: entering PROLOG\n");
5235: #endif
5236: } else {
5237: ctxt->instate = XML_PARSER_MISC;
5238: }
5239: #ifdef DEBUG_PUSH
5240: fprintf(stderr, "HPP: entering MISC\n");
5241: #endif
5242: break;
5243: case XML_PARSER_MISC:
5244: SKIP_BLANKS;
5245: if (in->buf == NULL)
5246: avail = in->length - (in->cur - in->base);
5247: else
5248: avail = in->buf->buffer->use - (in->cur - in->base);
5249: if (avail < 2)
5250: goto done;
5251: cur = in->cur[0];
5252: next = in->cur[1];
5253: if ((cur == '<') && (next == '!') &&
5254: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5255: if ((!terminate) &&
5256: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5257: goto done;
5258: #ifdef DEBUG_PUSH
5259: fprintf(stderr, "HPP: Parsing Comment\n");
5260: #endif
5261: sgmlParseComment(ctxt);
5262: ctxt->instate = XML_PARSER_MISC;
5263: } else if ((cur == '<') && (next == '!') &&
5264: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5265: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5266: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5267: (UPP(8) == 'E')) {
5268: if ((!terminate) &&
5269: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5270: goto done;
5271: #ifdef DEBUG_PUSH
5272: fprintf(stderr, "HPP: Parsing internal subset\n");
5273: #endif
5274: sgmlParseDocTypeDecl(ctxt);
5275: ctxt->instate = XML_PARSER_PROLOG;
5276: #ifdef DEBUG_PUSH
5277: fprintf(stderr, "HPP: entering PROLOG\n");
5278: #endif
5279: } else if ((cur == '<') && (next == '!') &&
5280: (avail < 9)) {
5281: goto done;
5282: } else {
5283: ctxt->instate = XML_PARSER_START_TAG;
5284: #ifdef DEBUG_PUSH
5285: fprintf(stderr, "HPP: entering START_TAG\n");
5286: #endif
5287: }
5288: break;
5289: case XML_PARSER_PROLOG:
5290: SKIP_BLANKS;
5291: if (in->buf == NULL)
5292: avail = in->length - (in->cur - in->base);
5293: else
5294: avail = in->buf->buffer->use - (in->cur - in->base);
5295: if (avail < 2)
5296: goto done;
5297: cur = in->cur[0];
5298: next = in->cur[1];
5299: if ((cur == '<') && (next == '!') &&
5300: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5301: if ((!terminate) &&
5302: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5303: goto done;
5304: #ifdef DEBUG_PUSH
5305: fprintf(stderr, "HPP: Parsing Comment\n");
5306: #endif
5307: sgmlParseComment(ctxt);
5308: ctxt->instate = XML_PARSER_PROLOG;
5309: } else if ((cur == '<') && (next == '!') &&
5310: (avail < 4)) {
5311: goto done;
5312: } else {
5313: ctxt->instate = XML_PARSER_START_TAG;
5314: #ifdef DEBUG_PUSH
5315: fprintf(stderr, "HPP: entering START_TAG\n");
5316: #endif
5317: }
5318: break;
5319: case XML_PARSER_EPILOG:
5320: if (in->buf == NULL)
5321: avail = in->length - (in->cur - in->base);
5322: else
5323: avail = in->buf->buffer->use - (in->cur - in->base);
5324: if (avail < 1)
5325: goto done;
5326: cur = in->cur[0];
5327: if (IS_BLANK(cur)) {
5328: sgmlParseCharData(ctxt, 0);
5329: goto done;
5330: }
5331: if (avail < 2)
5332: goto done;
5333: next = in->cur[1];
5334: if ((cur == '<') && (next == '!') &&
5335: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5336: if ((!terminate) &&
5337: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5338: goto done;
5339: #ifdef DEBUG_PUSH
5340: fprintf(stderr, "HPP: Parsing Comment\n");
5341: #endif
5342: sgmlParseComment(ctxt);
5343: ctxt->instate = XML_PARSER_EPILOG;
5344: } else if ((cur == '<') && (next == '!') &&
5345: (avail < 4)) {
5346: goto done;
5347: } else {
5348: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5349: ctxt->sax->error(ctxt->userData,
5350: "Extra content at the end of the document\n");
5351: ctxt->wellFormed = 0;
5352: ctxt->errNo = XML_ERR_DOCUMENT_END;
5353: ctxt->instate = XML_PARSER_EOF;
5354: #ifdef DEBUG_PUSH
5355: fprintf(stderr, "HPP: entering EOF\n");
5356: #endif
5357: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5358: ctxt->sax->endDocument(ctxt->userData);
5359: goto done;
5360: }
5361: break;
5362: case XML_PARSER_START_TAG: {
5363: xmlChar *name, *oldname;
5364: int depth = ctxt->nameNr;
5365: sgmlElemDescPtr info;
5366:
5367: if (avail < 2)
5368: goto done;
5369: cur = in->cur[0];
5370: if (cur != '<') {
5371: ctxt->instate = XML_PARSER_CONTENT;
5372: #ifdef DEBUG_PUSH
5373: fprintf(stderr, "HPP: entering CONTENT\n");
5374: #endif
5375: break;
5376: }
5377: if ((!terminate) &&
5378: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5379: goto done;
5380:
5381: oldname = xmlStrdup(ctxt->name);
5382: sgmlParseStartTag(ctxt);
5383: name = ctxt->name;
5384: #ifdef DEBUG
5385: if (oldname == NULL)
5386: fprintf(stderr, "Start of element %s\n", name);
5387: else if (name == NULL)
5388: fprintf(stderr, "Start of element failed, was %s\n",
5389: oldname);
5390: else
5391: fprintf(stderr, "Start of element %s, was %s\n",
5392: name, oldname);
5393: #endif
5394: if (((depth == ctxt->nameNr) &&
5395: (!xmlStrcmp(oldname, ctxt->name))) ||
5396: (name == NULL)) {
5397: if (CUR == '>')
5398: NEXT;
5399: if (oldname != NULL)
5400: xmlFree(oldname);
5401: break;
5402: }
5403: if (oldname != NULL)
5404: xmlFree(oldname);
5405:
5406: /*
5407: * Lookup the info for that element.
5408: */
5409: info = sgmlTagLookup(name);
5410: if (info == NULL) {
5411: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5412: ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
5413: name);
5414: ctxt->wellFormed = 0;
5415: } else if (info->depr) {
5416: /***************************
5417: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
5418: ctxt->sax->warning(ctxt->userData,
5419: "Tag %s is deprecated\n",
5420: name);
5421: ***************************/
5422: }
5423:
5424: /*
5425: * Check for an Empty Element labelled the XML/SGML way
5426: */
5427: if ((CUR == '/') && (NXT(1) == '>')) {
5428: SKIP(2);
5429: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5430: ctxt->sax->endElement(ctxt->userData, name);
5431: oldname = sgmlnamePop(ctxt);
5432: #ifdef DEBUG
5433: fprintf(stderr,"End of tag the XML way: popping out %s\n",
5434: oldname);
5435: #endif
5436: if (oldname != NULL)
5437: xmlFree(oldname);
5438: ctxt->instate = XML_PARSER_CONTENT;
5439: #ifdef DEBUG_PUSH
5440: fprintf(stderr, "HPP: entering CONTENT\n");
5441: #endif
5442: break;
5443: }
5444:
5445: if (CUR == '>') {
5446: NEXT;
5447: } else {
5448: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5449: ctxt->sax->error(ctxt->userData,
5450: "Couldn't find end of Start Tag %s\n",
5451: name);
5452: ctxt->wellFormed = 0;
5453:
5454: /*
5455: * end of parsing of this node.
5456: */
5457: if (!xmlStrcmp(name, ctxt->name)) {
5458: nodePop(ctxt);
5459: oldname = sgmlnamePop(ctxt);
5460: #ifdef DEBUG
5461: fprintf(stderr,
5462: "End of start tag problem: popping out %s\n", oldname);
5463: #endif
5464: if (oldname != NULL)
5465: xmlFree(oldname);
5466: }
5467:
5468: ctxt->instate = XML_PARSER_CONTENT;
5469: #ifdef DEBUG_PUSH
5470: fprintf(stderr, "HPP: entering CONTENT\n");
5471: #endif
5472: break;
5473: }
5474:
5475: /*
5476: * Check for an Empty Element from DTD definition
5477: */
5478: if ((info != NULL) && (info->empty)) {
5479: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5480: ctxt->sax->endElement(ctxt->userData, name);
5481: oldname = sgmlnamePop(ctxt);
5482: #ifdef DEBUG
5483: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
5484: #endif
5485: if (oldname != NULL)
5486: xmlFree(oldname);
5487: }
5488: ctxt->instate = XML_PARSER_CONTENT;
5489: #ifdef DEBUG_PUSH
5490: fprintf(stderr, "HPP: entering CONTENT\n");
5491: #endif
5492: break;
5493: }
5494: case XML_PARSER_CONTENT: {
5495: long cons;
5496: /*
5497: * Handle preparsed entities and charRef
5498: */
5499: if (ctxt->token != 0) {
5500: xmlChar chr[2] = { 0 , 0 } ;
5501:
5502: chr[0] = (xmlChar) ctxt->token;
5503: sgmlCheckParagraph(ctxt);
5504: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5505: ctxt->sax->characters(ctxt->userData, chr, 1);
5506: ctxt->token = 0;
5507: ctxt->checkIndex = 0;
5508: }
5509: if ((avail == 1) && (terminate)) {
5510: cur = in->cur[0];
5511: if ((cur != '<') && (cur != '&')) {
5512: if (ctxt->sax != NULL) {
5513: if (IS_BLANK(cur)) {
5514: if (ctxt->sax->ignorableWhitespace != NULL)
5515: ctxt->sax->ignorableWhitespace(
5516: ctxt->userData, &cur, 1);
5517: } else {
5518: sgmlCheckParagraph(ctxt);
5519: if (ctxt->sax->characters != NULL)
5520: ctxt->sax->characters(
5521: ctxt->userData, &cur, 1);
5522: }
5523: }
5524: ctxt->token = 0;
5525: ctxt->checkIndex = 0;
5526: NEXT;
5527: }
5528: break;
5529: }
5530: if (avail < 2)
5531: goto done;
5532: cur = in->cur[0];
5533: next = in->cur[1];
5534: cons = ctxt->nbChars;
5535: /*
5536: * Sometimes DOCTYPE arrives in the middle of the document
5537: */
5538: if ((cur == '<') && (next == '!') &&
5539: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5540: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5541: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5542: (UPP(8) == 'E')) {
5543: if ((!terminate) &&
5544: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5545: goto done;
5546: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5547: ctxt->sax->error(ctxt->userData,
5548: "Misplaced DOCTYPE declaration\n");
5549: ctxt->wellFormed = 0;
5550: sgmlParseDocTypeDecl(ctxt);
5551: } else if ((cur == '<') && (next == '!') &&
5552: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5553: if ((!terminate) &&
5554: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5555: goto done;
5556: #ifdef DEBUG_PUSH
5557: fprintf(stderr, "HPP: Parsing Comment\n");
5558: #endif
5559: sgmlParseComment(ctxt);
5560: ctxt->instate = XML_PARSER_CONTENT;
5561: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5562: goto done;
5563: } else if ((cur == '<') && (next == '/')) {
5564: ctxt->instate = XML_PARSER_END_TAG;
5565: ctxt->checkIndex = 0;
5566: #ifdef DEBUG_PUSH
5567: fprintf(stderr, "HPP: entering END_TAG\n");
5568: #endif
5569: break;
5570: } else if (cur == '<') {
5571: ctxt->instate = XML_PARSER_START_TAG;
5572: ctxt->checkIndex = 0;
5573: #ifdef DEBUG_PUSH
5574: fprintf(stderr, "HPP: entering START_TAG\n");
5575: #endif
5576: break;
5577: } else if (cur == '&') {
5578: if ((!terminate) &&
5579: (sgmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
5580: goto done;
5581: #ifdef DEBUG_PUSH
5582: fprintf(stderr, "HPP: Parsing Reference\n");
5583: #endif
5584: /* TODO: check generation of subtrees if noent !!! */
5585: sgmlParseReference(ctxt);
5586: } else {
5587: /* TODO Avoid the extra copy, handle directly !!!!!! */
5588: /*
5589: * Goal of the following test is :
5590: * - minimize calls to the SAX 'character' callback
5591: * when they are mergeable
5592: */
5593: if ((ctxt->inputNr == 1) &&
5594: (avail < SGML_PARSER_BIG_BUFFER_SIZE)) {
5595: if ((!terminate) &&
5596: (sgmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
5597: goto done;
5598: }
5599: ctxt->checkIndex = 0;
5600: #ifdef DEBUG_PUSH
5601: fprintf(stderr, "HPP: Parsing char data\n");
5602: #endif
5603: sgmlParseCharData(ctxt, 0);
5604: }
5605: if (cons == ctxt->nbChars) {
5606: if (ctxt->node != NULL) {
5607: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5608: ctxt->sax->error(ctxt->userData,
5609: "detected an error in element content\n");
5610: ctxt->wellFormed = 0;
5611: NEXT;
5612: }
5613: break;
5614: }
5615:
5616: break;
5617: }
5618: case XML_PARSER_END_TAG:
5619: if (avail < 2)
5620: goto done;
5621: if ((!terminate) &&
5622: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5623: goto done;
5624: sgmlParseEndTag(ctxt);
5625: if (ctxt->nameNr == 0) {
5626: ctxt->instate = XML_PARSER_EPILOG;
5627: } else {
5628: ctxt->instate = XML_PARSER_CONTENT;
5629: }
5630: ctxt->checkIndex = 0;
5631: #ifdef DEBUG_PUSH
5632: fprintf(stderr, "HPP: entering CONTENT\n");
5633: #endif
5634: break;
5635: case XML_PARSER_CDATA_SECTION:
5636: fprintf(stderr, "HPP: internal error, state == CDATA\n");
5637: ctxt->instate = XML_PARSER_CONTENT;
5638: ctxt->checkIndex = 0;
5639: #ifdef DEBUG_PUSH
5640: fprintf(stderr, "HPP: entering CONTENT\n");
5641: #endif
5642: break;
5643: case XML_PARSER_DTD:
5644: fprintf(stderr, "HPP: internal error, state == DTD\n");
5645: ctxt->instate = XML_PARSER_CONTENT;
5646: ctxt->checkIndex = 0;
5647: #ifdef DEBUG_PUSH
5648: fprintf(stderr, "HPP: entering CONTENT\n");
5649: #endif
5650: break;
5651: case XML_PARSER_COMMENT:
5652: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
5653: ctxt->instate = XML_PARSER_CONTENT;
5654: ctxt->checkIndex = 0;
5655: #ifdef DEBUG_PUSH
5656: fprintf(stderr, "HPP: entering CONTENT\n");
5657: #endif
5658: break;
5659: case XML_PARSER_PI:
5660: fprintf(stderr, "HPP: internal error, state == PI\n");
5661: ctxt->instate = XML_PARSER_CONTENT;
5662: ctxt->checkIndex = 0;
5663: #ifdef DEBUG_PUSH
5664: fprintf(stderr, "HPP: entering CONTENT\n");
5665: #endif
5666: break;
5667: case XML_PARSER_ENTITY_DECL:
5668: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
5669: ctxt->instate = XML_PARSER_CONTENT;
5670: ctxt->checkIndex = 0;
5671: #ifdef DEBUG_PUSH
5672: fprintf(stderr, "HPP: entering CONTENT\n");
5673: #endif
5674: break;
5675: case XML_PARSER_ENTITY_VALUE:
5676: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
5677: ctxt->instate = XML_PARSER_CONTENT;
5678: ctxt->checkIndex = 0;
5679: #ifdef DEBUG_PUSH
5680: fprintf(stderr, "HPP: entering DTD\n");
5681: #endif
5682: break;
5683: case XML_PARSER_ATTRIBUTE_VALUE:
5684: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5685: ctxt->instate = XML_PARSER_START_TAG;
5686: ctxt->checkIndex = 0;
5687: #ifdef DEBUG_PUSH
5688: fprintf(stderr, "HPP: entering START_TAG\n");
5689: #endif
5690: break;
5691: case XML_PARSER_SYSTEM_LITERAL:
5692: fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5693: ctxt->instate = XML_PARSER_CONTENT;
5694: ctxt->checkIndex = 0;
5695: #ifdef DEBUG_PUSH
5696: fprintf(stderr, "HPP: entering CONTENT\n");
5697: #endif
5698: break;
5699: }
5700: }
5701: done:
5702: if ((avail == 0) && (terminate)) {
5703: sgmlAutoClose(ctxt, NULL);
5704: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5705: /*
5706: * SAX: end of the document processing.
5707: */
5708: ctxt->instate = XML_PARSER_EOF;
5709: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5710: ctxt->sax->endDocument(ctxt->userData);
5711: }
5712: }
5713: if ((ctxt->myDoc != NULL) &&
5714: ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5715: (ctxt->instate == XML_PARSER_EPILOG))) {
5716: xmlDtdPtr dtd;
5717: dtd = xmlGetIntSubset(ctxt->myDoc);
5718: if (dtd == NULL)
5719: ctxt->myDoc->intSubset =
5720: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "SGML",
5721: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
5722: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd");
5723: }
5724: #ifdef DEBUG_PUSH
5725: fprintf(stderr, "HPP: done %d\n", ret);
5726: #endif
5727: return(ret);
5728: }
5729:
5730: /**
5731: * sgmlParseTry:
5732: * @ctxt: an SGML parser context
5733: *
5734: * Try to progress on parsing
5735: *
5736: * Returns zero if no parsing was possible
5737: */
5738: int
5739: sgmlParseTry(sgmlParserCtxtPtr ctxt) {
5740: return(sgmlParseTryOrFinish(ctxt, 0));
5741: }
5742:
5743: /**
5744: * sgmlParseChunk:
5745: * @ctxt: an XML parser context
5746: * @chunk: an char array
5747: * @size: the size in byte of the chunk
5748: * @terminate: last chunk indicator
5749: *
5750: * Parse a Chunk of memory
5751: *
5752: * Returns zero if no error, the xmlParserErrors otherwise.
5753: */
5754: int
5755: sgmlParseChunk(sgmlParserCtxtPtr ctxt, const char *chunk, int size,
5756: int terminate) {
5757: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5758: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5759: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5760: int cur = ctxt->input->cur - ctxt->input->base;
5761:
5762: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5763: ctxt->input->base = ctxt->input->buf->buffer->content + base;
5764: ctxt->input->cur = ctxt->input->base + cur;
5765: #ifdef DEBUG_PUSH
5766: fprintf(stderr, "HPP: pushed %d\n", size);
5767: #endif
5768:
5769: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5770: sgmlParseTryOrFinish(ctxt, terminate);
5771: } else if (ctxt->instate != XML_PARSER_EOF) {
5772: xmlParserInputBufferPush(ctxt->input->buf, 0, "");
5773: sgmlParseTryOrFinish(ctxt, terminate);
5774: }
5775: if (terminate) {
5776: if ((ctxt->instate != XML_PARSER_EOF) &&
5777: (ctxt->instate != XML_PARSER_EPILOG) &&
5778: (ctxt->instate != XML_PARSER_MISC)) {
5779: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5780: ctxt->sax->error(ctxt->userData,
5781: "Extra content at the end of the document\n");
5782: ctxt->wellFormed = 0;
5783: ctxt->errNo = XML_ERR_DOCUMENT_END;
5784: }
5785: if (ctxt->instate != XML_PARSER_EOF) {
5786: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5787: ctxt->sax->endDocument(ctxt->userData);
5788: }
5789: ctxt->instate = XML_PARSER_EOF;
5790: }
5791: return((xmlParserErrors) ctxt->errNo);
5792: }
5793:
5794: /************************************************************************
5795: * *
5796: * User entry points *
5797: * *
5798: ************************************************************************/
5799:
5800: /**
5801: * sgmlCreatePushParserCtxt :
5802: * @sax: a SAX handler
5803: * @user_data: The user data returned on SAX callbacks
5804: * @chunk: a pointer to an array of chars
5805: * @size: number of chars in the array
5806: * @filename: an optional file name or URI
5807: * @enc: an optional encoding
5808: *
5809: * Create a parser context for using the SGML parser in push mode
5810: * To allow content encoding detection, @size should be >= 4
5811: * The value of @filename is used for fetching external entities
5812: * and error/warning reports.
5813: *
5814: * Returns the new parser context or NULL
5815: */
5816: sgmlParserCtxtPtr
5817: sgmlCreatePushParserCtxt(sgmlSAXHandlerPtr sax, void *user_data,
5818: const char *chunk, int size, const char *filename,
5819: xmlCharEncoding enc) {
5820: sgmlParserCtxtPtr ctxt;
5821: sgmlParserInputPtr inputStream;
5822: xmlParserInputBufferPtr buf;
5823:
5824: buf = xmlAllocParserInputBuffer(enc);
5825: if (buf == NULL) return(NULL);
5826:
5827: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5828: if (ctxt == NULL) {
5829: xmlFree(buf);
5830: return(NULL);
5831: }
5832: memset(ctxt, 0, sizeof(sgmlParserCtxt));
5833: sgmlInitParserCtxt(ctxt);
5834: if (sax != NULL) {
5835: if (ctxt->sax != &sgmlDefaultSAXHandler)
5836: xmlFree(ctxt->sax);
5837: ctxt->sax = (sgmlSAXHandlerPtr) xmlMalloc(sizeof(sgmlSAXHandler));
5838: if (ctxt->sax == NULL) {
5839: xmlFree(buf);
5840: xmlFree(ctxt);
5841: return(NULL);
5842: }
5843: memcpy(ctxt->sax, sax, sizeof(sgmlSAXHandler));
5844: if (user_data != NULL)
5845: ctxt->userData = user_data;
5846: }
5847: if (filename == NULL) {
5848: ctxt->directory = NULL;
5849: } else {
5850: ctxt->directory = xmlParserGetDirectory(filename);
5851: }
5852:
5853: inputStream = sgmlNewInputStream(ctxt);
5854: if (inputStream == NULL) {
5855: xmlFreeParserCtxt(ctxt);
5856: return(NULL);
5857: }
5858:
5859: if (filename == NULL)
5860: inputStream->filename = NULL;
5861: else
5862: inputStream->filename = xmlMemStrdup(filename);
5863: inputStream->buf = buf;
5864: inputStream->base = inputStream->buf->buffer->content;
5865: inputStream->cur = inputStream->buf->buffer->content;
5866:
5867: inputPush(ctxt, inputStream);
5868:
5869: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5870: (ctxt->input->buf != NULL)) {
5871: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5872: #ifdef DEBUG_PUSH
5873: fprintf(stderr, "HPP: pushed %d\n", size);
5874: #endif
5875: }
5876:
5877: return(ctxt);
5878: }
5879:
5880: /**
5881: * sgmlSAXParseDoc :
5882: * @cur: a pointer to an array of xmlChar
5883: * @encoding: a free form C string describing the SGML document encoding, or NULL
5884: * @sax: the SAX handler block
5885: * @userData: if using SAX, this pointer will be provided on callbacks.
5886: *
5887: * parse an SGML in-memory document and build a tree.
5888: * It use the given SAX function block to handle the parsing callback.
5889: * If sax is NULL, fallback to the default DOM tree building routines.
5890: *
5891: * Returns the resulting document tree
5892: */
5893:
5894: sgmlDocPtr
5895: sgmlSAXParseDoc(xmlChar *cur, const char *encoding, sgmlSAXHandlerPtr sax, void *userData) {
5896: sgmlDocPtr ret;
5897: sgmlParserCtxtPtr ctxt;
5898:
5899: if (cur == NULL) return(NULL);
5900:
5901:
5902: ctxt = sgmlCreateDocParserCtxt(cur, encoding);
5903: if (ctxt == NULL) return(NULL);
5904: if (sax != NULL) {
5905: ctxt->sax = sax;
5906: ctxt->userData = userData;
5907: }
5908:
5909: sgmlParseDocument(ctxt);
5910: ret = ctxt->myDoc;
5911: if (sax != NULL) {
5912: ctxt->sax = NULL;
5913: ctxt->userData = NULL;
5914: }
5915: sgmlFreeParserCtxt(ctxt);
5916:
5917: return(ret);
5918: }
5919:
5920: /**
5921: * sgmlParseDoc :
5922: * @cur: a pointer to an array of xmlChar
5923: * @encoding: a free form C string describing the SGML document encoding, or NULL
5924: *
5925: * parse an SGML in-memory document and build a tree.
5926: *
5927: * Returns the resulting document tree
5928: */
5929:
5930: sgmlDocPtr
5931: sgmlParseDoc(xmlChar *cur, const char *encoding) {
5932: return(sgmlSAXParseDoc(cur, encoding, NULL, NULL));
5933: }
5934:
5935:
5936: /**
5937: * sgmlCreateFileParserCtxt :
5938: * @filename: the filename
5939: * @encoding: a free form C string describing the SGML document encoding, or NULL
5940: *
5941: * Create a parser context for a file content.
5942: * Automatic support for ZLIB/Compress compressed document is provided
5943: * by default if found at compile-time.
5944: *
5945: * Returns the new parser context or NULL
5946: */
5947: sgmlParserCtxtPtr
5948: sgmlCreateFileParserCtxt(const char *filename, const char *encoding)
5949: {
5950: sgmlParserCtxtPtr ctxt;
5951: sgmlParserInputPtr inputStream;
5952: xmlParserInputBufferPtr buf;
5953: /* sgmlCharEncoding enc; */
5954:
5955: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
5956: if (buf == NULL) return(NULL);
5957:
5958: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5959: if (ctxt == NULL) {
5960: perror("malloc");
5961: return(NULL);
5962: }
5963: memset(ctxt, 0, sizeof(sgmlParserCtxt));
5964: sgmlInitParserCtxt(ctxt);
5965: inputStream = (sgmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
5966: if (inputStream == NULL) {
5967: perror("malloc");
5968: xmlFree(ctxt);
5969: return(NULL);
5970: }
5971: memset(inputStream, 0, sizeof(sgmlParserInput));
5972:
5973: inputStream->filename = xmlMemStrdup(filename);
5974: inputStream->line = 1;
5975: inputStream->col = 1;
5976: inputStream->buf = buf;
5977: inputStream->directory = NULL;
5978:
5979: inputStream->base = inputStream->buf->buffer->content;
5980: inputStream->cur = inputStream->buf->buffer->content;
5981: inputStream->free = NULL;
5982:
5983: inputPush(ctxt, inputStream);
5984: return(ctxt);
5985: }
5986:
5987: /**
5988: * sgmlSAXParseFile :
5989: * @filename: the filename
5990: * @encoding: a free form C string describing the SGML document encoding, or NULL
5991: * @sax: the SAX handler block
5992: * @userData: if using SAX, this pointer will be provided on callbacks.
5993: *
5994: * parse an SGML file and build a tree. Automatic support for ZLIB/Compress
5995: * compressed document is provided by default if found at compile-time.
5996: * It use the given SAX function block to handle the parsing callback.
5997: * If sax is NULL, fallback to the default DOM tree building routines.
5998: *
5999: * Returns the resulting document tree
6000: */
6001:
6002: sgmlDocPtr
6003: sgmlSAXParseFile(const char *filename, const char *encoding, sgmlSAXHandlerPtr sax,
6004: void *userData) {
6005: sgmlDocPtr ret;
6006: sgmlParserCtxtPtr ctxt;
6007: sgmlSAXHandlerPtr oldsax = NULL;
6008:
6009: ctxt = sgmlCreateFileParserCtxt(filename, encoding);
6010: if (ctxt == NULL) return(NULL);
6011: if (sax != NULL) {
6012: oldsax = ctxt->sax;
6013: ctxt->sax = sax;
6014: ctxt->userData = userData;
6015: }
6016:
6017: sgmlParseDocument(ctxt);
6018:
6019: ret = ctxt->myDoc;
6020: if (sax != NULL) {
6021: ctxt->sax = oldsax;
6022: ctxt->userData = NULL;
6023: }
6024: sgmlFreeParserCtxt(ctxt);
6025:
6026: return(ret);
6027: }
6028:
6029: /**
6030: * sgmlParseFile :
6031: * @filename: the filename
6032: * @encoding: a free form C string describing the SGML document encoding, or NULL
6033: *
6034: * parse an SGML file and build a tree. Automatic support for ZLIB/Compress
6035: * compressed document is provided by default if found at compile-time.
6036: *
6037: * Returns the resulting document tree
6038: */
6039:
6040: sgmlDocPtr
6041: sgmlParseFile(const char *filename, const char *encoding) {
6042: return(sgmlSAXParseFile(filename, encoding, NULL, NULL));
6043: }
6044:
6045: #endif /* LIBXML_SGML_ENABLED */
Webmaster