Annotation of XML/SGMLparser.c, revision 1.9
1.1 veillard 1: /*
2: * SGMLparser.c : an attempt to parse Docbook documents
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
10: #include "win32config.h"
11: #else
12: #include "config.h"
13: #endif
14:
15: #include "xmlversion.h"
16: #ifdef LIBXML_SGML_ENABLED
17:
18: #include <stdio.h>
19: #include <string.h>
20: #ifdef HAVE_CTYPE_H
21: #include <ctype.h>
22: #endif
23: #ifdef HAVE_STDLIB_H
24: #include <stdlib.h>
25: #endif
26: #ifdef HAVE_SYS_STAT_H
27: #include <sys/stat.h>
28: #endif
29: #ifdef HAVE_FCNTL_H
30: #include <fcntl.h>
31: #endif
32: #ifdef HAVE_UNISTD_H
33: #include <unistd.h>
34: #endif
35: #ifdef HAVE_ZLIB_H
36: #include <zlib.h>
37: #endif
38:
39: #include <libxml/xmlmemory.h>
40: #include <libxml/tree.h>
41: #include <libxml/SGMLparser.h>
42: #include <libxml/entities.h>
43: #include <libxml/encoding.h>
44: #include <libxml/parser.h>
45: #include <libxml/valid.h>
46: #include <libxml/parserInternals.h>
47: #include <libxml/xmlIO.h>
48: #include <libxml/SAX.h>
1.3 veillard 49: #include <libxml/uri.h>
1.8 veillard 50: #include <libxml/xmlerror.h>
1.1 veillard 51:
52: #define SGML_MAX_NAMELEN 1000
53: #define SGML_PARSER_BIG_BUFFER_SIZE 1000
54: #define SGML_PARSER_BUFFER_SIZE 100
55:
56: /* #define DEBUG */
57: /* #define DEBUG_PUSH */
58:
59: /************************************************************************
60: * *
61: * Parser stacks related functions and macros *
62: * *
63: ************************************************************************/
64:
65: /*
66: * Generic function for accessing stacks in the Parser Context
67: */
68:
69: #define PUSH_AND_POP(scope, type, name) \
70: scope int sgml##name##Push(sgmlParserCtxtPtr ctxt, type value) { \
71: if (ctxt->name##Nr >= ctxt->name##Max) { \
72: ctxt->name##Max *= 2; \
73: ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75: if (ctxt->name##Tab == NULL) { \
76: fprintf(stderr, "realloc failed !\n"); \
77: return(0); \
78: } \
79: } \
80: ctxt->name##Tab[ctxt->name##Nr] = value; \
81: ctxt->name = value; \
82: return(ctxt->name##Nr++); \
83: } \
84: scope type sgml##name##Pop(sgmlParserCtxtPtr ctxt) { \
85: type ret; \
86: if (ctxt->name##Nr < 0) return(0); \
87: ctxt->name##Nr--; \
88: if (ctxt->name##Nr < 0) return(0); \
89: if (ctxt->name##Nr > 0) \
90: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
91: else \
92: ctxt->name = NULL; \
93: ret = ctxt->name##Tab[ctxt->name##Nr]; \
94: ctxt->name##Tab[ctxt->name##Nr] = 0; \
95: return(ret); \
96: } \
97:
98: PUSH_AND_POP(extern, xmlNodePtr, node)
99: PUSH_AND_POP(extern, xmlChar*, name)
100:
101: /*
102: * Macros for accessing the content. Those should be used only by the parser,
103: * and not exported.
104: *
105: * Dirty macros, i.e. one need to make assumption on the context to use them
106: *
107: * CUR_PTR return the current pointer to the xmlChar to be parsed.
108: * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
109: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
110: * in UNICODE mode. This should be used internally by the parser
111: * only to compare to ASCII values otherwise it would break when
112: * running with UTF-8 encoding.
113: * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
114: * to compare on ASCII based substring.
115: * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
116: * it should be used only to compare on ASCII based substring.
117: * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
118: * strings within the parser.
119: *
120: * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
121: *
122: * CURRENT Returns the current char value, with the full decoding of
123: * UTF-8 if we are using this mode. It returns an int.
124: * NEXT Skip to the next character, this does the proper decoding
125: * in UTF-8 mode. It also pop-up unfinished entities on the fly.
126: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
127: */
128:
/*
 * All the macros below expect a variable named `ctxt' (the parser
 * context) to be in scope at the point of use.
 */

/* Upper-cased value of the byte at the current input position. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input by val bytes and account for them in nbChars. */
#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val))

/* The byte located val positions after the current one. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(val), upper-cased. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer (usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

#define SHRINK  xmlParserInputShrink(ctxt->input)

#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* The byte at the current input position, as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS sgmlSkipBlankChars(ctxt)

#if 0
#define CUR ((int) (*ctxt->input->cur))
#define NEXT sgmlNextChar(ctxt);
#else
/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
/* Parenthesized so that the comma expression stays a single operand. */
#define NEXT (xmlNextChar(ctxt), ctxt->nbChars++)

/* -1 while a token is pending in the context, else the current byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * NXT and CUR_PTR are already defined unconditionally above, they are
 * not repeated in this branch.
 */

/*
 * Step over the current character, which is l bytes long: update the
 * line/column position, drop any pending token, move the input pointer
 * by l bytes and count one more character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) sgmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append the character v, whose encoded length is l bytes, to buffer b
 * at index i and advance i.  Wrapped in do { } while (0) so that it
 * behaves as one statement, including inside an unbraced if/else.
 */
#define COPY_BUF(l,b,i,v)						\
    do {								\
	if ((l) == 1) (b)[(i)++] = (xmlChar) (v);			\
	else (i) += xmlCopyChar((l),&(b)[(i)],(v));			\
    } while (0)
#endif
182:
183: /**
184: * sgmlCurrentChar:
185: * @ctxt: the SGML parser context
186: * @len: pointer to the length of the char read
187: *
188: * The current char value, if using UTF-8 this may actaully span multiple
189: * bytes in the input buffer. Implement the end of line normalization:
190: * 2.11 End-of-Line Handling
191: * If the encoding is unspecified, in the case we find an ISO-Latin-1
192: * char, then the encoding converter is plugged in automatically.
193: *
194: * Returns the current char value and its lenght
195: */
196:
197: int
198: sgmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
199: if (ctxt->instate == XML_PARSER_EOF)
200: return(0);
201:
202: if (ctxt->token != 0) {
203: *len = 0;
204: return(ctxt->token);
205: }
206: if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
207: /*
208: * We are supposed to handle UTF8, check it's valid
209: * From rfc2044: encoding of the Unicode values on UTF-8:
210: *
211: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
212: * 0000 0000-0000 007F 0xxxxxxx
213: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
214: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
215: *
216: * Check for the 0x110000 limit too
217: */
218: const unsigned char *cur = ctxt->input->cur;
219: unsigned char c;
220: unsigned int val;
221:
222: c = *cur;
223: if (c & 0x80) {
224: if (cur[1] == 0)
225: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
226: if ((cur[1] & 0xc0) != 0x80)
227: goto encoding_error;
228: if ((c & 0xe0) == 0xe0) {
229:
230: if (cur[2] == 0)
231: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
232: if ((cur[2] & 0xc0) != 0x80)
233: goto encoding_error;
234: if ((c & 0xf0) == 0xf0) {
235: if (cur[3] == 0)
236: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
237: if (((c & 0xf8) != 0xf0) ||
238: ((cur[3] & 0xc0) != 0x80))
239: goto encoding_error;
240: /* 4-byte code */
241: *len = 4;
242: val = (cur[0] & 0x7) << 18;
243: val |= (cur[1] & 0x3f) << 12;
244: val |= (cur[2] & 0x3f) << 6;
245: val |= cur[3] & 0x3f;
246: } else {
247: /* 3-byte code */
248: *len = 3;
249: val = (cur[0] & 0xf) << 12;
250: val |= (cur[1] & 0x3f) << 6;
251: val |= cur[2] & 0x3f;
252: }
253: } else {
254: /* 2-byte code */
255: *len = 2;
256: val = (cur[0] & 0x1f) << 6;
257: val |= cur[1] & 0x3f;
258: }
259: if (!IS_CHAR(val)) {
1.6 veillard 260: ctxt->errNo = XML_ERR_INVALID_ENCODING;
1.1 veillard 261: if ((ctxt->sax != NULL) &&
262: (ctxt->sax->error != NULL))
263: ctxt->sax->error(ctxt->userData,
264: "Char 0x%X out of allowed range\n", val);
265: ctxt->wellFormed = 0;
266: ctxt->disableSAX = 1;
267: }
268: return(val);
269: } else {
270: /* 1-byte code */
271: *len = 1;
272: return((int) *ctxt->input->cur);
273: }
274: }
275: /*
276: * Assume it's a fixed lenght encoding (1) with
277: * a compatibke encoding for the ASCII set, since
278: * XML constructs only use < 128 chars
279: */
280: *len = 1;
281: if ((int) *ctxt->input->cur < 0x80)
282: return((int) *ctxt->input->cur);
283:
284: /*
285: * Humm this is bad, do an automatic flow conversion
286: */
287: xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
288: ctxt->charset = XML_CHAR_ENCODING_UTF8;
289: return(xmlCurrentChar(ctxt, len));
290:
291: encoding_error:
292: /*
293: * If we detect an UTF8 error that probably mean that the
294: * input encoding didn't get properly advertized in the
295: * declaration header. Report the error and switch the encoding
296: * to ISO-Latin-1 (if you don't like this policy, just declare the
297: * encoding !)
298: */
1.6 veillard 299: ctxt->errNo = XML_ERR_INVALID_ENCODING;
1.1 veillard 300: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
301: ctxt->sax->error(ctxt->userData,
302: "Input is not proper UTF-8, indicate encoding !\n");
303: ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
304: ctxt->input->cur[0], ctxt->input->cur[1],
305: ctxt->input->cur[2], ctxt->input->cur[3]);
306: }
307:
308: ctxt->charset = XML_CHAR_ENCODING_8859_1;
309: *len = 1;
310: return((int) *ctxt->input->cur);
311: }
312:
313: /**
314: * sgmlNextChar:
315: * @ctxt: the SGML parser context
316: *
317: * Skip to the next char input char.
318: */
319:
320: void
321: sgmlNextChar(sgmlParserCtxtPtr ctxt) {
322: if (ctxt->instate == XML_PARSER_EOF)
323: return;
324: if ((*ctxt->input->cur == 0) &&
325: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
326: xmlPopInput(ctxt);
327: } else {
328: if (*(ctxt->input->cur) == '\n') {
329: ctxt->input->line++; ctxt->input->col = 1;
330: } else ctxt->input->col++;
331: ctxt->input->cur++;
332: ctxt->nbChars++;
333: if (*ctxt->input->cur == 0)
334: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
335: }
336: }
337:
338: /**
339: * sgmlSkipBlankChars:
340: * @ctxt: the SGML parser context
341: *
342: * skip all blanks character found at that point in the input streams.
343: *
344: * Returns the number of space chars skipped
345: */
346:
347: int
348: sgmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
349: int res = 0;
350:
351: while (IS_BLANK(*(ctxt->input->cur))) {
352: if ((*ctxt->input->cur == 0) &&
353: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
354: xmlPopInput(ctxt);
355: } else {
356: if (*(ctxt->input->cur) == '\n') {
357: ctxt->input->line++; ctxt->input->col = 1;
358: } else ctxt->input->col++;
359: ctxt->input->cur++;
360: ctxt->nbChars++;
361: if (*ctxt->input->cur == 0)
362: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
363: }
364: res++;
365: }
366: return(res);
367: }
368:
369:
370:
371: /************************************************************************
372: * *
373: * The list of SGML elements and their properties *
374: * *
375: ************************************************************************/
376:
377: /*
378: * Start Tag: 1 means the start tag can be omitted
379: * End Tag: 1 means the end tag can be omitted
380: * 2 means it's forbidden (empty elements)
381: * Depr: this element is deprecated
382: * DTD: 1 means that this element is valid only in the Loose DTD
383: * 2 means that this element is valid only in the Frameset DTD
384: *
385: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
386: */
387: sgmlElemDesc docbookElementTable[] = {
388: { "abbrev", 0, 0, 0, 3, 0, "" }, /* word */
389: { "abstract", 0, 0, 0, 9, 0, "" }, /* title */
390: { "accel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
391: { "ackno", 0, 0, 0, 4, 0, "" }, /* docinfo */
392: { "acronym", 0, 0, 0, 3, 0, "" }, /* word */
393: { "action", 0, 0, 0, 7, 0, "" }, /* smallcptr */
394: { "address", 0, 0, 0, 1, 0, "" },
395: { "affiliation",0, 0, 0, 9, 0, "" }, /* shortaffil */
396: { "alt", 0, 0, 0, 1, 0, "" },
397: { "anchor", 0, 2, 1, 0, 0, "" },
398: { "answer", 0, 0, 0, 9, 0, "" }, /* label */
399: { "appendix", 0, 0, 0, 9, 0, "" }, /* appendixinfo */
400: { "appendixinfo",0, 0, 0, 9, 0, "" }, /* graphic */
401: { "application",0, 0, 0, 2, 0, "" }, /* para */
402: { "area", 0, 2, 1, 0, 0, "" },
403: { "areaset", 0, 0, 0, 9, 0, "" }, /* area */
404: { "areaspec", 0, 0, 0, 9, 0, "" }, /* area */
405: { "arg", 0, 0, 0, 1, 0, "" },
406: { "article", 0, 0, 0, 9, 0, "" }, /* div.title.content */
407: { "articleinfo",0, 0, 0, 9, 0, "" }, /* graphic */
408: { "artpagenums",0, 0, 0, 4, 0, "" }, /* docinfo */
409: { "attribution",0, 0, 0, 2, 0, "" }, /* para */
410: { "audiodata", 0, 2, 1, 0, 0, "" },
411: { "audioobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
412: { "authorblurb",0, 0, 0, 9, 0, "" }, /* title */
413: { "authorgroup",0, 0, 0, 9, 0, "" }, /* author */
414: { "authorinitials",0, 0, 0, 4, 0, "" }, /* docinfo */
415: { "author", 0, 0, 0, 9, 0, "" }, /* person.ident.mix */
416: { "beginpage", 0, 2, 1, 0, 0, "" },
417: { "bibliodiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
418: { "biblioentry",0, 0, 0, 9, 0, "" }, /* articleinfo */
419: { "bibliography",0, 0, 0, 9, 0, "" }, /* bibliographyinfo */
420: { "bibliographyinfo",0, 0, 0, 9, 0, "" }, /* graphic */
421: { "bibliomisc", 0, 0, 0, 2, 0, "" }, /* para */
422: { "bibliomixed",0, 0, 0, 1, 0, "" }, /* %bibliocomponent.mix, bibliomset) */
423: { "bibliomset", 0, 0, 0, 1, 0, "" }, /* %bibliocomponent.mix; | bibliomset) */
424: { "biblioset", 0, 0, 0, 9, 0, "" }, /* bibliocomponent.mix */
425: { "blockquote", 0, 0, 0, 9, 0, "" }, /* title */
426: { "book", 0, 0, 0, 9, 0, "" }, /* div.title.content */
427: { "bookinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
428: { "bridgehead", 0, 0, 0, 8, 0, "" }, /* title */
429: { "callout", 0, 0, 0, 9, 0, "" }, /* component.mix */
430: { "calloutlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
431: { "caption", 0, 0, 0, 9, 0, "" }, /* textobject.mix */
432: { "caution", 0, 0, 0, 9, 0, "" }, /* title */
433: { "chapter", 0, 0, 0, 9, 0, "" }, /* chapterinfo */
434: { "chapterinfo",0, 0, 0, 9, 0, "" }, /* graphic */
435: { "citation", 0, 0, 0, 2, 0, "" }, /* para */
436: { "citerefentry",0, 0, 0, 9, 0, "" }, /* refentrytitle */
437: { "citetitle", 0, 0, 0, 2, 0, "" }, /* para */
438: { "city", 0, 0, 0, 4, 0, "" }, /* docinfo */
439: { "classname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
440: { "classsynopsisinfo",0,0, 0, 9, 0, "" }, /* cptr */
441: { "classsynopsis",0, 0, 0, 9, 0, "" }, /* ooclass */
442: { "cmdsynopsis",0, 0, 0, 9, 0, "" }, /* command */
443: { "co", 0, 2, 1, 0, 0, "" },
444: { "collab", 0, 0, 0, 9, 0, "" }, /* collabname */
445: { "collabname", 0, 0, 0, 4, 0, "" }, /* docinfo */
446: { "colophon", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
447: { "colspec", 0, 2, 1, 0, 0, "" },
448: { "colspec", 0, 2, 1, 0, 0, "" },
449: { "command", 0, 0, 0, 9, 0, "" }, /* cptr */
450: { "computeroutput",0, 0, 0, 9, 0, "" }, /* cptr */
451: { "confdates", 0, 0, 0, 4, 0, "" }, /* docinfo */
452: { "confgroup", 0, 0, 0, 9, 0, "" }, /* confdates */
453: { "confnum", 0, 0, 0, 4, 0, "" }, /* docinfo */
454: { "confsponsor",0, 0, 0, 4, 0, "" }, /* docinfo */
455: { "conftitle", 0, 0, 0, 4, 0, "" }, /* docinfo */
456: { "constant", 0, 0, 0, 7, 0, "" }, /* smallcptr */
457: { "constructorsynopsis",0,0, 0, 9, 0, "" }, /* modifier */
458: { "contractnum",0, 0, 0, 4, 0, "" }, /* docinfo */
459: { "contractsponsor",0, 0, 0, 4, 0, "" }, /* docinfo */
460: { "contrib", 0, 0, 0, 4, 0, "" }, /* docinfo */
461: { "copyright", 0, 0, 0, 9, 0, "" }, /* year */
462: { "corpauthor", 0, 0, 0, 4, 0, "" }, /* docinfo */
463: { "corpname", 0, 0, 0, 4, 0, "" }, /* docinfo */
464: { "country", 0, 0, 0, 4, 0, "" }, /* docinfo */
465: { "database", 0, 0, 0, 7, 0, "" }, /* smallcptr */
466: { "date", 0, 0, 0, 4, 0, "" }, /* docinfo */
467: { "dedication", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
468: { "destructorsynopsis",0,0, 0, 9, 0, "" }, /* modifier */
469: { "edition", 0, 0, 0, 4, 0, "" }, /* docinfo */
470: { "editor", 0, 0, 0, 9, 0, "" }, /* person.ident.mix */
471: { "email", 0, 0, 0, 4, 0, "" }, /* docinfo */
472: { "emphasis", 0, 0, 0, 2, 0, "" }, /* para */
473: { "entry", 0, 0, 0, 9, 0, "" }, /* tbl.entry.mdl */
474: { "entrytbl", 0, 0, 0, 9, 0, "" }, /* tbl.entrytbl.mdl */
475: { "envar", 0, 0, 0, 7, 0, "" }, /* smallcptr */
476: { "epigraph", 0, 0, 0, 9, 0, "" }, /* attribution */
477: { "equation", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
478: { "errorcode", 0, 0, 0, 7, 0, "" }, /* smallcptr */
479: { "errorname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
480: { "errortype", 0, 0, 0, 7, 0, "" }, /* smallcptr */
481: { "example", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
482: { "exceptionname",0, 0, 0, 7, 0, "" }, /* smallcptr */
483: { "fax", 0, 0, 0, 4, 0, "" }, /* docinfo */
484: { "fieldsynopsis", 0, 0, 0, 9, 0, "" }, /* modifier */
485: { "figure", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
486: { "filename", 0, 0, 0, 7, 0, "" }, /* smallcptr */
487: { "firstname", 0, 0, 0, 4, 0, "" }, /* docinfo */
488: { "firstterm", 0, 0, 0, 3, 0, "" }, /* word */
489: { "footnote", 0, 0, 0, 9, 0, "" }, /* footnote.mix */
490: { "footnoteref",0, 2, 1, 0, 0, "" },
491: { "foreignphrase",0, 0, 0, 2, 0, "" }, /* para */
492: { "formalpara", 0, 0, 0, 9, 0, "" }, /* title */
493: { "funcdef", 0, 0, 0, 1, 0, "" },
494: { "funcparams", 0, 0, 0, 9, 0, "" }, /* cptr */
495: { "funcprototype",0, 0, 0, 9, 0, "" }, /* funcdef */
496: { "funcsynopsis",0, 0, 0, 9, 0, "" }, /* funcsynopsisinfo */
497: { "funcsynopsisinfo", 0, 0, 0, 9, 0, "" }, /* cptr */
498: { "function", 0, 0, 0, 9, 0, "" }, /* cptr */
499: { "glossary", 0, 0, 0, 9, 0, "" }, /* glossaryinfo */
500: { "glossaryinfo",0, 0, 0, 9, 0, "" }, /* graphic */
501: { "glossdef", 0, 0, 0, 9, 0, "" }, /* glossdef.mix */
502: { "glossdiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
503: { "glossentry", 0, 0, 0, 9, 0, "" }, /* glossterm */
504: { "glosslist", 0, 0, 0, 9, 0, "" }, /* glossentry */
505: { "glossseealso",0, 0, 0, 2, 0, "" }, /* para */
506: { "glosssee", 0, 0, 0, 2, 0, "" }, /* para */
507: { "glossterm", 0, 0, 0, 2, 0, "" }, /* para */
508: { "graphic", 0, 2, 1, 0, 0, "" },
509: { "graphicco", 0, 0, 0, 9, 0, "" }, /* areaspec */
510: { "group", 0, 0, 0, 9, 0, "" }, /* arg */
511: { "guibutton", 0, 0, 0, 7, 0, "" }, /* smallcptr */
512: { "guiicon", 0, 0, 0, 7, 0, "" }, /* smallcptr */
513: { "guilabel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
514: { "guimenuitem",0, 0, 0, 7, 0, "" }, /* smallcptr */
515: { "guimenu", 0, 0, 0, 7, 0, "" }, /* smallcptr */
516: { "guisubmenu", 0, 0, 0, 7, 0, "" }, /* smallcptr */
517: { "hardware", 0, 0, 0, 7, 0, "" }, /* smallcptr */
518: { "highlights", 0, 0, 0, 9, 0, "" }, /* highlights.mix */
519: { "holder", 0, 0, 0, 4, 0, "" }, /* docinfo */
520: { "honorific", 0, 0, 0, 4, 0, "" }, /* docinfo */
521: { "imagedata", 0, 2, 1, 0, 0, "" },
522: { "imageobjectco",0, 0, 0, 9, 0, "" }, /* areaspec */
523: { "imageobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
524: { "important", 0, 0, 0, 9, 0, "" }, /* title */
525: { "indexdiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
526: { "indexentry", 0, 0, 0, 9, 0, "" }, /* primaryie */
527: { "index", 0, 0, 0, 9, 0, "" }, /* indexinfo */
528: { "indexinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
529: { "indexterm", 0, 0, 0, 9, 0, "" }, /* primary */
530: { "informalequation",0, 0, 0, 9, 0, "" }, /* equation.content */
531: { "informalexample",0, 0, 0, 9, 0, "" }, /* example.mix */
532: { "informalfigure",0, 0, 0, 9, 0, "" }, /* figure.mix */
533: { "informaltable",0, 0, 0, 9, 0, "" }, /* graphic */
534: { "initializer",0, 0, 0, 7, 0, "" }, /* smallcptr */
535: { "inlineequation",0, 0, 0, 9, 0, "" }, /* inlineequation.content */
536: { "inlinegraphic",0, 2, 1, 0, 0, "" },
537: { "inlinemediaobject",0,0, 0, 9, 0, "" }, /* objectinfo */
538: { "interfacename",0, 0, 0, 7, 0, "" }, /* smallcptr */
539: { "interface", 0, 0, 0, 7, 0, "" }, /* smallcptr */
540: { "invpartnumber",0, 0, 0, 4, 0, "" }, /* docinfo */
541: { "isbn", 0, 0, 0, 4, 0, "" }, /* docinfo */
542: { "issn", 0, 0, 0, 4, 0, "" }, /* docinfo */
543: { "issuenum", 0, 0, 0, 4, 0, "" }, /* docinfo */
544: { "itemizedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
545: { "itermset", 0, 0, 0, 9, 0, "" }, /* indexterm */
546: { "jobtitle", 0, 0, 0, 4, 0, "" }, /* docinfo */
547: { "keycap", 0, 0, 0, 7, 0, "" }, /* smallcptr */
548: { "keycode", 0, 0, 0, 7, 0, "" }, /* smallcptr */
549: { "keycombo", 0, 0, 0, 9, 0, "" }, /* keycap */
550: { "keysym", 0, 0, 0, 7, 0, "" }, /* smallcptr */
551: { "keyword", 0, 0, 0, 1, 0, "" },
552: { "keywordset", 0, 0, 0, 9, 0, "" }, /* keyword */
553: { "label", 0, 0, 0, 3, 0, "" }, /* word */
554: { "legalnotice",0, 0, 0, 9, 0, "" }, /* title */
555: { "lineage", 0, 0, 0, 4, 0, "" }, /* docinfo */
556: { "lineannotation",0, 0, 0, 2, 0, "" }, /* para */
557: { "link", 0, 0, 0, 2, 0, "" }, /* para */
558: { "listitem", 0, 0, 0, 9, 0, "" }, /* component.mix */
559: { "literal", 0, 0, 0, 9, 0, "" }, /* cptr */
560: { "literallayout",0, 0, 0, 2, 0, "" }, /* para */
561: { "lot", 0, 0, 0, 9, 0, "" }, /* bookcomponent.title.content */
562: { "lotentry", 0, 0, 0, 2, 0, "" }, /* para */
563: { "manvolnum", 0, 0, 0, 3, 0, "" }, /* word */
564: { "markup", 0, 0, 0, 7, 0, "" }, /* smallcptr */
565: { "medialabel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
566: { "mediaobjectco",0, 0, 0, 9, 0, "" }, /* objectinfo */
567: { "mediaobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
568: { "member", 0, 0, 0, 2, 0, "" }, /* para */
569: { "menuchoice", 0, 0, 0, 9, 0, "" }, /* shortcut */
570: { "methodname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
571: { "methodparam",0, 0, 0, 9, 0, "" }, /* modifier */
572: { "methodsynopsis",0, 0, 0, 9, 0, "" }, /* modifier */
573: { "modespec", 0, 0, 0, 4, 0, "" }, /* docinfo */
574: { "modifier", 0, 0, 0, 7, 0, "" }, /* smallcptr */
575: { "mousebutton",0, 0, 0, 7, 0, "" }, /* smallcptr */
576: { "msgaud", 0, 0, 0, 2, 0, "" }, /* para */
577: { "msgentry", 0, 0, 0, 9, 0, "" }, /* msg */
578: { "msgexplan", 0, 0, 0, 9, 0, "" }, /* title */
579: { "msginfo", 0, 0, 0, 9, 0, "" }, /* msglevel */
580: { "msglevel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
581: { "msgmain", 0, 0, 0, 9, 0, "" }, /* title */
582: { "msgorig", 0, 0, 0, 7, 0, "" }, /* smallcptr */
583: { "msgrel", 0, 0, 0, 9, 0, "" }, /* title */
584: { "msgset", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
585: { "msgsub", 0, 0, 0, 9, 0, "" }, /* title */
586: { "msgtext", 0, 0, 0, 9, 0, "" }, /* component.mix */
587: { "msg", 0, 0, 0, 9, 0, "" }, /* title */
588: { "note", 0, 0, 0, 9, 0, "" }, /* title */
589: { "objectinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
590: { "olink", 0, 0, 0, 2, 0, "" }, /* para */
591: { "ooclass", 0, 0, 0, 9, 0, "" }, /* modifier */
592: { "ooexception",0, 0, 0, 9, 0, "" }, /* modifier */
593: { "oointerface",0, 0, 0, 9, 0, "" }, /* modifier */
594: { "optional", 0, 0, 0, 9, 0, "" }, /* cptr */
595: { "option", 0, 0, 0, 7, 0, "" }, /* smallcptr */
596: { "orderedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
597: { "orgdiv", 0, 0, 0, 4, 0, "" }, /* docinfo */
598: { "orgname", 0, 0, 0, 4, 0, "" }, /* docinfo */
599: { "otheraddr", 0, 0, 0, 4, 0, "" }, /* docinfo */
600: { "othercredit",0, 0, 0, 9, 0, "" }, /* person.ident.mix */
601: { "othername", 0, 0, 0, 4, 0, "" }, /* docinfo */
602: { "pagenums", 0, 0, 0, 4, 0, "" }, /* docinfo */
603: { "paramdef", 0, 0, 0, 1, 0, "" },
604: { "parameter", 0, 0, 0, 7, 0, "" }, /* smallcptr */
605: { "para", 0, 0, 0, 2, 0, "" }, /* para */
606: { "partinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
607: { "partintro", 0, 0, 0, 9, 0, "" }, /* div.title.content */
608: { "part", 0, 0, 0, 9, 0, "" }, /* partinfo */
609: { "phone", 0, 0, 0, 4, 0, "" }, /* docinfo */
610: { "phrase", 0, 0, 0, 2, 0, "" }, /* para */
611: { "pob", 0, 0, 0, 4, 0, "" }, /* docinfo */
612: { "postcode", 0, 0, 0, 4, 0, "" }, /* docinfo */
613: { "prefaceinfo",0, 0, 0, 9, 0, "" }, /* graphic */
614: { "preface", 0, 0, 0, 9, 0, "" }, /* prefaceinfo */
615: { "primaryie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
616: { "primary ", 0, 0, 0, 4, 0, "" }, /* ndxterm */
617: { "printhistory",0, 0, 0, 9, 0, "" }, /* para.class */
618: { "procedure", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
619: { "productname",0, 0, 0, 2, 0, "" }, /* para */
620: { "productnumber",0, 0, 0, 4, 0, "" }, /* docinfo */
621: { "programlistingco",0, 0, 0, 9, 0, "" }, /* areaspec */
622: { "programlisting",0, 0, 0, 2, 0, "" }, /* para */
623: { "prompt", 0, 0, 0, 7, 0, "" }, /* smallcptr */
624: { "property", 0, 0, 0, 7, 0, "" }, /* smallcptr */
625: { "pubdate", 0, 0, 0, 4, 0, "" }, /* docinfo */
626: { "publishername",0, 0, 0, 4, 0, "" }, /* docinfo */
627: { "publisher", 0, 0, 0, 9, 0, "" }, /* publishername */
628: { "pubsnumber", 0, 0, 0, 4, 0, "" }, /* docinfo */
629: { "qandadiv", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
630: { "qandaentry", 0, 0, 0, 9, 0, "" }, /* revhistory */
631: { "qandaset", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
632: { "question", 0, 0, 0, 9, 0, "" }, /* label */
633: { "quote", 0, 0, 0, 2, 0, "" }, /* para */
634: { "refclass", 0, 0, 0, 9, 0, "" }, /* refclass.char.mix */
635: { "refdescriptor",0, 0, 0, 9, 0, "" }, /* refname.char.mix */
636: { "refentryinfo",0, 0, 0, 9, 0, "" }, /* graphic */
637: { "refentry", 0, 0, 0, 9, 0, "" }, /* ndxterm.class */
638: { "refentrytitle",0, 0, 0, 2, 0, "" }, /* para */
639: { "referenceinfo",0, 0, 0, 9, 0, "" }, /* graphic */
640: { "reference", 0, 0, 0, 9, 0, "" }, /* referenceinfo */
641: { "refmeta", 0, 0, 0, 9, 0, "" }, /* ndxterm.class */
642: { "refmiscinfo",0, 0, 0, 4, 0, "" }, /* docinfo */
643: { "refnamediv", 0, 0, 0, 9, 0, "" }, /* refdescriptor */
644: { "refname", 0, 0, 0, 9, 0, "" }, /* refname.char.mix */
645: { "refpurpose", 0, 0, 0, 9, 0, "" }, /* refinline.char.mix */
646: { "refsect1info",0, 0, 0, 9, 0, "" }, /* graphic */
647: { "refsect1", 0, 0, 0, 9, 0, "" }, /* refsect */
648: { "refsect2info",0, 0, 0, 9, 0, "" }, /* graphic */
649: { "refsect2", 0, 0, 0, 9, 0, "" }, /* refsect */
650: { "refsect3info",0, 0, 0, 9, 0, "" }, /* graphic */
651: { "refsect3", 0, 0, 0, 9, 0, "" }, /* refsect */
652: { "refsynopsisdivinfo",0,0, 0, 9, 0, "" }, /* graphic */
653: { "refsynopsisdiv",0, 0, 0, 9, 0, "" }, /* refsynopsisdivinfo */
654: { "releaseinfo",0, 0, 0, 4, 0, "" }, /* docinfo */
655: { "remark", 0, 0, 0, 2, 0, "" }, /* para */
656: { "replaceable",0, 0, 0, 1, 0, "" },
657: { "returnvalue",0, 0, 0, 7, 0, "" }, /* smallcptr */
658: { "revdescription",0, 0, 0, 9, 0, "" }, /* revdescription.mix */
659: { "revhistory", 0, 0, 0, 9, 0, "" }, /* revision */
660: { "revision", 0, 0, 0, 9, 0, "" }, /* revnumber */
661: { "revnumber", 0, 0, 0, 4, 0, "" }, /* docinfo */
662: { "revremark", 0, 0, 0, 4, 0, "" }, /* docinfo */
663: { "row", 0, 0, 0, 9, 0, "" }, /* tbl.row.mdl */
664: { "row", 0, 0, 0, 9, 0, "" }, /* tbl.row.mdl */
665: { "sbr", 0, 2, 1, 0, 0, "" },
666: { "screenco", 0, 0, 0, 9, 0, "" }, /* areaspec */
667: { "screeninfo", 0, 0, 0, 2, 0, "" }, /* para */
668: { "screen", 0, 0, 0, 2, 0, "" }, /* para */
669: { "screenshot", 0, 0, 0, 9, 0, "" }, /* screeninfo */
670: { "secondaryie",0, 0, 0, 4, 0, "" }, /* ndxterm */
671: { "secondary", 0, 0, 0, 4, 0, "" }, /* ndxterm */
672: { "sect1info", 0, 0, 0, 9, 0, "" }, /* graphic */
673: { "sect1", 0, 0, 0, 9, 0, "" }, /* sect */
674: { "sect2info", 0, 0, 0, 9, 0, "" }, /* graphic */
675: { "sect2", 0, 0, 0, 9, 0, "" }, /* sect */
676: { "sect3info", 0, 0, 0, 9, 0, "" }, /* graphic */
677: { "sect3", 0, 0, 0, 9, 0, "" }, /* sect */
678: { "sect4info", 0, 0, 0, 9, 0, "" }, /* graphic */
679: { "sect4", 0, 0, 0, 9, 0, "" }, /* sect */
680: { "sect5info", 0, 0, 0, 9, 0, "" }, /* graphic */
681: { "sect5", 0, 0, 0, 9, 0, "" }, /* sect */
682: { "sectioninfo",0, 0, 0, 9, 0, "" }, /* graphic */
683: { "section", 0, 0, 0, 9, 0, "" }, /* sectioninfo */
684: { "seealsoie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
685: { "seealso", 0, 0, 0, 4, 0, "" }, /* ndxterm */
686: { "seeie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
687: { "see", 0, 0, 0, 4, 0, "" }, /* ndxterm */
688: { "seglistitem",0, 0, 0, 9, 0, "" }, /* seg */
689: { "segmentedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
690: { "seg", 0, 0, 0, 2, 0, "" }, /* para */
691: { "segtitle", 0, 0, 0, 8, 0, "" }, /* title */
692: { "seriesvolnums", 0, 0, 0, 4, 0, "" }, /* docinfo */
693: { "set", 0, 0, 0, 9, 0, "" }, /* div.title.content */
694: { "setindexinfo",0, 0, 0, 9, 0, "" }, /* graphic */
695: { "setindex", 0, 0, 0, 9, 0, "" }, /* setindexinfo */
696: { "setinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
697: { "sgmltag", 0, 0, 0, 7, 0, "" }, /* smallcptr */
698: { "shortaffil", 0, 0, 0, 4, 0, "" }, /* docinfo */
699: { "shortcut", 0, 0, 0, 9, 0, "" }, /* keycap */
700: { "sidebarinfo",0, 0, 0, 9, 0, "" }, /* graphic */
701: { "sidebar", 0, 0, 0, 9, 0, "" }, /* sidebarinfo */
702: { "simpara", 0, 0, 0, 2, 0, "" }, /* para */
703: { "simplelist", 0, 0, 0, 9, 0, "" }, /* member */
704: { "simplemsgentry", 0, 0, 0, 9, 0, "" }, /* msgtext */
705: { "simplesect", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
706: { "spanspec", 0, 2, 1, 0, 0, "" },
707: { "state", 0, 0, 0, 4, 0, "" }, /* docinfo */
708: { "step", 0, 0, 0, 9, 0, "" }, /* title */
709: { "street", 0, 0, 0, 4, 0, "" }, /* docinfo */
710: { "structfield",0, 0, 0, 7, 0, "" }, /* smallcptr */
711: { "structname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
712: { "subjectset", 0, 0, 0, 9, 0, "" }, /* subject */
713: { "subject", 0, 0, 0, 9, 0, "" }, /* subjectterm */
714: { "subjectterm",0, 0, 0, 1, 0, "" },
715: { "subscript", 0, 0, 0, 1, 0, "" },
716: { "substeps", 0, 0, 0, 9, 0, "" }, /* step */
717: { "subtitle", 0, 0, 0, 8, 0, "" }, /* title */
718: { "superscript", 0, 0, 0, 1, 0, "" },
719: { "surname", 0, 0, 0, 4, 0, "" }, /* docinfo */
720: { "symbol", 0, 0, 0, 7, 0, "" }, /* smallcptr */
721: { "synopfragment", 0, 0, 0, 9, 0, "" }, /* arg */
722: { "synopfragmentref", 0, 0, 0, 1, 0, "" },
723: { "synopsis", 0, 0, 0, 2, 0, "" }, /* para */
724: { "systemitem", 0, 0, 0, 7, 0, "" }, /* smallcptr */
725: { "table", 0, 0, 0, 9, 0, "" }, /* tbl.table.mdl */
726: /* { "%tbl.table.name;", 0, 0, 0, 9, 0, "" },*/ /* tbl.table.mdl */
727: { "tbody", 0, 0, 0, 9, 0, "" }, /* row */
728: { "tbody", 0, 0, 0, 9, 0, "" }, /* row */
729: { "term", 0, 0, 0, 2, 0, "" }, /* para */
730: { "tertiaryie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
731: { "tertiary ", 0, 0, 0, 4, 0, "" }, /* ndxterm */
732: { "textobject", 0, 0, 0, 9, 0, "" }, /* objectinfo */
733: { "tfoot", 0, 0, 0, 9, 0, "" }, /* tbl.hdft.mdl */
734: { "tgroup", 0, 0, 0, 9, 0, "" }, /* tbl.tgroup.mdl */
735: { "tgroup", 0, 0, 0, 9, 0, "" }, /* tbl.tgroup.mdl */
736: { "thead", 0, 0, 0, 9, 0, "" }, /* row */
737: { "thead", 0, 0, 0, 9, 0, "" }, /* tbl.hdft.mdl */
738: { "tip", 0, 0, 0, 9, 0, "" }, /* title */
739: { "titleabbrev",0, 0, 0, 8, 0, "" }, /* title */
740: { "title", 0, 0, 0, 8, 0, "" }, /* title */
741: { "tocback", 0, 0, 0, 2, 0, "" }, /* para */
742: { "toc", 0, 0, 0, 9, 0, "" }, /* bookcomponent.title.content */
743: { "tocchap", 0, 0, 0, 9, 0, "" }, /* tocentry */
744: { "tocentry", 0, 0, 0, 2, 0, "" }, /* para */
745: { "tocfront", 0, 0, 0, 2, 0, "" }, /* para */
746: { "toclevel1", 0, 0, 0, 9, 0, "" }, /* tocentry */
747: { "toclevel2", 0, 0, 0, 9, 0, "" }, /* tocentry */
748: { "toclevel3", 0, 0, 0, 9, 0, "" }, /* tocentry */
749: { "toclevel4", 0, 0, 0, 9, 0, "" }, /* tocentry */
750: { "toclevel5", 0, 0, 0, 9, 0, "" }, /* tocentry */
751: { "tocpart", 0, 0, 0, 9, 0, "" }, /* tocentry */
752: { "token", 0, 0, 0, 7, 0, "" }, /* smallcptr */
753: { "trademark", 0, 0, 0, 1, 0, "" },
754: { "type", 0, 0, 0, 7, 0, "" }, /* smallcptr */
755: { "ulink", 0, 0, 0, 2, 0, "" }, /* para */
756: { "userinput", 0, 0, 0, 9, 0, "" }, /* cptr */
757: { "varargs", 0, 2, 1, 0, 0, "" },
758: { "variablelist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
759: { "varlistentry",0, 0, 0, 9, 0, "" }, /* term */
760: { "varname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
761: { "videodata", 0, 2, 1, 0, 0, "" },
762: { "videoobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
763: { "void", 0, 2, 1, 0, 0, "" },
764: { "volumenum", 0, 0, 0, 4, 0, "" }, /* docinfo */
765: { "warning", 0, 0, 0, 9, 0, "" }, /* title */
766: { "wordasword", 0, 0, 0, 3, 0, "" }, /* word */
767: { "xref", 0, 2, 1, 0, 0, "" },
768: { "year", 0, 0, 0, 4, 0, "" }, /* docinfo */
769: };
770:
/*
 * Equivalence groups of tag names.
 *
 * The table is a sequence of NULL-terminated groups, closed by one extra
 * NULL.  The intent recorded with the table is that a start tag implies the
 * end of the current element when both names belong to the same group.
 *
 * NOTE(review): the names are HTML ones and no reader of this table is
 * visible in this part of the file -- confirm it is still needed.
 */
char *sgmlEquEnd[] = {
    "dt", "dd", "li", "option",
    NULL,       /* end of group */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,       /* end of group ("hr" deliberately absent, see below) */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,       /* end of group */
    NULL        /* end of table */
};
/*
 * According to the SGML DTD, "hr" should be added to the heading group
 * above, as it is not allowed within h1, h2, h3, etc.  That case is
 * tolerated on purpose because many documents contain rules in headings.
 */
787:
/*
 * Auto-close table: start tags that imply the end of the current element.
 *
 * Layout, as read by sgmlInitAutoClose() and sgmlCheckAutoClose(): a
 * sequence of NULL-terminated groups closed by one extra NULL.  The first
 * name of a group is the start tag, the following names are the elements
 * that start tag closes.
 *
 * The table is currently empty, so no start tag auto-closes anything.
 */
char *sgmlStartClose[] = { NULL };
794:
/*
 * NULL-terminated list of element names which are not supposed to hold
 * CDATA content directly: sgmlCheckParagraph() implies a "p" element when
 * the current element is listed here.  The list is currently empty.
 *
 * TODO: extend the list by reading what the SGML DTD says about implied
 *       paragraphs.
 */
static char *sgmlNoContentElements[] = { NULL };
805:
806:
/*
 * Index over sgmlStartClose: each used slot points at the first name of one
 * group of that table; slots are filled by sgmlInitAutoClose() and unused
 * slots are NULL.
 */
static char **sgmlStartCloseIndex[100];

/*
 * Guard consulted by sgmlInitAutoClose() and sgmlCheckAutoClose();
 * 0 means the index above has not been marked as built.
 */
static int sgmlStartCloseIndexinitialized = 0;
809:
810: /************************************************************************
811: * *
812: * functions to handle SGML specific data *
813: * *
814: ************************************************************************/
815:
816: /**
817: * sgmlInitAutoClose:
818: *
819: * Initialize the sgmlStartCloseIndex for fast lookup of closing tags names.
820: *
821: */
822: void
823: sgmlInitAutoClose(void) {
824: int index, i = 0;
825:
826: if (sgmlStartCloseIndexinitialized) return;
827:
828: for (index = 0;index < 100;index ++) sgmlStartCloseIndex[index] = NULL;
829: index = 0;
830: while ((sgmlStartClose[i] != NULL) && (index < 100 - 1)) {
831: sgmlStartCloseIndex[index++] = &sgmlStartClose[i];
832: while (sgmlStartClose[i] != NULL) i++;
833: i++;
834: }
835: }
836:
837: /**
838: * sgmlTagLookup:
839: * @tag: The tag name
840: *
841: * Lookup the SGML tag in the ElementTable
842: *
843: * Returns the related sgmlElemDescPtr or NULL if not found.
844: */
845: sgmlElemDescPtr
846: sgmlTagLookup(const xmlChar *tag) {
847: int i;
848:
849: for (i = 0; i < (sizeof(docbookElementTable) /
850: sizeof(docbookElementTable[0]));i++) {
1.7 veillard 851: if (xmlStrEqual(tag, BAD_CAST docbookElementTable[i].name))
1.1 veillard 852: return(&docbookElementTable[i]);
853: }
854: return(NULL);
855: }
856:
857: /**
858: * sgmlCheckAutoClose:
859: * @newtag: The new tag name
860: * @oldtag: The old tag name
861: *
862: * Checks wether the new tag is one of the registered valid tags for closing old.
863: * Initialize the sgmlStartCloseIndex for fast lookup of closing tags names.
864: *
865: * Returns 0 if no, 1 if yes.
866: */
867: int
868: sgmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
869: int i, index;
870: char **close;
871:
872: if (sgmlStartCloseIndexinitialized == 0) sgmlInitAutoClose();
873:
874: /* inefficient, but not a big deal */
875: for (index = 0; index < 100;index++) {
876: close = sgmlStartCloseIndex[index];
877: if (close == NULL) return(0);
1.7 veillard 878: if (xmlStrEqual(BAD_CAST *close, newtag)) break;
1.1 veillard 879: }
880:
881: i = close - sgmlStartClose;
882: i++;
883: while (sgmlStartClose[i] != NULL) {
1.7 veillard 884: if (xmlStrEqual(BAD_CAST sgmlStartClose[i], oldtag)) {
1.1 veillard 885: return(1);
886: }
887: i++;
888: }
889: return(0);
890: }
891:
892: /**
893: * sgmlAutoCloseOnClose:
894: * @ctxt: an SGML parser context
895: * @newtag: The new tag name
896: *
897: * The HTmL DtD allows an ending tag to implicitely close other tags.
898: */
899: void
900: sgmlAutoCloseOnClose(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
901: sgmlElemDescPtr info;
902: xmlChar *oldname;
903: int i;
904:
905: if ((newtag[0] == '/') && (newtag[1] == 0))
906: return;
907:
908: #ifdef DEBUG
909: fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
910: for (i = 0;i < ctxt->nameNr;i++)
911: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
912: #endif
913:
914: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1.7 veillard 915: if (xmlStrEqual(newtag, ctxt->nameTab[i])) break;
1.1 veillard 916: }
917: if (i < 0) return;
918:
1.7 veillard 919: while (!xmlStrEqual(newtag, ctxt->name)) {
1.1 veillard 920: info = sgmlTagLookup(ctxt->name);
921: if ((info == NULL) || (info->endTag == 1)) {
922: #ifdef DEBUG
923: fprintf(stderr,"sgmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
924: #endif
925: } else {
926: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
927: ctxt->sax->error(ctxt->userData,
928: "Opening and ending tag mismatch: %s and %s\n",
929: newtag, ctxt->name);
930: ctxt->wellFormed = 0;
931: }
932: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
933: ctxt->sax->endElement(ctxt->userData, ctxt->name);
934: oldname = sgmlnamePop(ctxt);
935: if (oldname != NULL) {
936: #ifdef DEBUG
937: fprintf(stderr,"sgmlAutoCloseOnClose: popped %s\n", oldname);
938: #endif
939: xmlFree(oldname);
940: }
941: }
942: }
943:
944: /**
945: * sgmlAutoClose:
946: * @ctxt: an SGML parser context
947: * @newtag: The new tag name or NULL
948: *
949: * The HTmL DtD allows a tag to implicitely close other tags.
950: * The list is kept in sgmlStartClose array. This function is
951: * called when a new tag has been detected and generates the
952: * appropriates closes if possible/needed.
953: * If newtag is NULL this mean we are at the end of the resource
954: * and we should check
955: */
956: void
957: sgmlAutoClose(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
958: xmlChar *oldname;
959: while ((newtag != NULL) && (ctxt->name != NULL) &&
960: (sgmlCheckAutoClose(newtag, ctxt->name))) {
961: #ifdef DEBUG
962: fprintf(stderr,"sgmlAutoClose: %s closes %s\n", newtag, ctxt->name);
963: #endif
964: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
965: ctxt->sax->endElement(ctxt->userData, ctxt->name);
966: oldname = sgmlnamePop(ctxt);
967: if (oldname != NULL) {
968: #ifdef DEBUG
969: fprintf(stderr,"sgmlAutoClose: popped %s\n", oldname);
970: #endif
971: xmlFree(oldname);
972: }
973: }
974: #if 0
975: if (newtag == NULL) {
976: sgmlAutoCloseOnClose(ctxt, BAD_CAST"head");
977: sgmlAutoCloseOnClose(ctxt, BAD_CAST"body");
978: sgmlAutoCloseOnClose(ctxt, BAD_CAST"sgml");
979: }
980: while ((newtag == NULL) && (ctxt->name != NULL) &&
1.7 veillard 981: ((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
982: (xmlStrEqual(ctxt->name, BAD_CAST"body")) ||
983: (xmlStrEqual(ctxt->name, BAD_CAST"sgml")))) {
1.1 veillard 984: #ifdef DEBUG
985: fprintf(stderr,"sgmlAutoClose: EOF closes %s\n", ctxt->name);
986: #endif
987: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
988: ctxt->sax->endElement(ctxt->userData, ctxt->name);
989: oldname = sgmlnamePop(ctxt);
990: if (oldname != NULL) {
991: #ifdef DEBUG
992: fprintf(stderr,"sgmlAutoClose: popped %s\n", oldname);
993: #endif
994: xmlFree(oldname);
995: }
996: }
997: #endif
998: }
999:
1000: /**
1001: * sgmlAutoCloseTag:
1002: * @doc: the SGML document
1003: * @name: The tag name
1004: * @elem: the SGML element
1005: *
1006: * The HTmL DtD allows a tag to implicitely close other tags.
1007: * The list is kept in sgmlStartClose array. This function checks
1008: * if the element or one of it's children would autoclose the
1009: * given tag.
1010: *
1011: * Returns 1 if autoclose, 0 otherwise
1012: */
1013: int
1014: sgmlAutoCloseTag(sgmlDocPtr doc, const xmlChar *name, sgmlNodePtr elem) {
1015: sgmlNodePtr child;
1016:
1017: if (elem == NULL) return(1);
1.7 veillard 1018: if (xmlStrEqual(name, elem->name)) return(0);
1.1 veillard 1019: if (sgmlCheckAutoClose(elem->name, name)) return(1);
1020: child = elem->children;
1021: while (child != NULL) {
1022: if (sgmlAutoCloseTag(doc, name, child)) return(1);
1023: child = child->next;
1024: }
1025: return(0);
1026: }
1027:
1028: /**
1029: * sgmlIsAutoClosed:
1030: * @doc: the SGML document
1031: * @elem: the SGML element
1032: *
1033: * The HTmL DtD allows a tag to implicitely close other tags.
1034: * The list is kept in sgmlStartClose array. This function checks
1035: * if a tag is autoclosed by one of it's child
1036: *
1037: * Returns 1 if autoclosed, 0 otherwise
1038: */
1039: int
1040: sgmlIsAutoClosed(sgmlDocPtr doc, sgmlNodePtr elem) {
1041: sgmlNodePtr child;
1042:
1043: if (elem == NULL) return(1);
1044: child = elem->children;
1045: while (child != NULL) {
1046: if (sgmlAutoCloseTag(doc, elem->name, child)) return(1);
1047: child = child->next;
1048: }
1049: return(0);
1050: }
1051:
1052: /**
1053: * sgmlCheckImplied:
1054: * @ctxt: an SGML parser context
1055: * @newtag: The new tag name
1056: *
1057: * The HTmL DtD allows a tag to exists only implicitely
1058: * called when a new tag has been detected and generates the
1059: * appropriates implicit tags if missing
1060: */
1061: void
1062: sgmlCheckImplied(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1063: #if 0
1.7 veillard 1064: if (xmlStrEqual(newtag, BAD_CAST"sgml"))
1.1 veillard 1065: return;
1066: if (ctxt->nameNr <= 0) {
1067: #ifdef DEBUG
1068: fprintf(stderr,"Implied element sgml: pushed sgml\n");
1069: #endif
1070: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"sgml"));
1071: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1072: ctxt->sax->startElement(ctxt->userData, BAD_CAST"sgml", NULL);
1073: }
1.7 veillard 1074: if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1.1 veillard 1075: return;
1076: if (ctxt->nameNr <= 1) {
1.7 veillard 1077: if ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1078: (xmlStrEqual(newtag, BAD_CAST"style")) ||
1079: (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1080: (xmlStrEqual(newtag, BAD_CAST"link")) ||
1081: (xmlStrEqual(newtag, BAD_CAST"title")) ||
1082: (xmlStrEqual(newtag, BAD_CAST"base"))) {
1.1 veillard 1083: /*
1084: * dropped OBJECT ... i you put it first BODY will be
1085: * assumed !
1086: */
1087: #ifdef DEBUG
1088: fprintf(stderr,"Implied element head: pushed head\n");
1089: #endif
1090: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
1091: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1092: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1093: } else {
1094: #ifdef DEBUG
1095: fprintf(stderr,"Implied element body: pushed body\n");
1096: #endif
1097: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
1098: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1099: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1100: }
1101: }
1102: #endif
1103: }
1104:
1105: /**
1106: * sgmlCheckParagraph
1107: * @ctxt: an SGML parser context
1108: *
1109: * Check whether a p element need to be implied before inserting
1110: * characters in the current element.
1111: *
1112: * Returns 1 if a paragraph has been inserted, 0 if not and -1
1113: * in case of error.
1114: */
1115:
1116: int
1117: sgmlCheckParagraph(sgmlParserCtxtPtr ctxt) {
1118: const xmlChar *tag;
1119: int i;
1120:
1121: if (ctxt == NULL)
1122: return(-1);
1123: tag = ctxt->name;
1124: if (tag == NULL) {
1125: sgmlAutoClose(ctxt, BAD_CAST"p");
1126: sgmlCheckImplied(ctxt, BAD_CAST"p");
1127: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1128: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1129: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1130: return(1);
1131: }
1132: for (i = 0; sgmlNoContentElements[i] != NULL; i++) {
1.7 veillard 1133: if (xmlStrEqual(tag, BAD_CAST sgmlNoContentElements[i])) {
1.1 veillard 1134: #ifdef DEBUG
1135: fprintf(stderr,"Implied element paragraph\n");
1136: #endif
1137: sgmlAutoClose(ctxt, BAD_CAST"p");
1138: sgmlCheckImplied(ctxt, BAD_CAST"p");
1139: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1140: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1141: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1142: return(1);
1143: }
1144: }
1145: return(0);
1146: }
1147:
1148: /************************************************************************
1149: * *
1150: * The list of SGML predefined entities *
1151: * *
1152: ************************************************************************/
1153:
1154:
1155: sgmlEntityDesc docbookEntitiesTable[] = {
1156: /*
1157: * the 4 absolute ones, plus apostrophe.
1158: */
1159: { 0x0026, "amp", "AMPERSAND" },
1160: { 0x003C, "lt", "LESS-THAN SIGN" },
1161:
1162: /*
1163: * Converted with VI macros from docbook ent files
1164: */
1165: { 0x0021, "excl", "EXCLAMATION MARK" },
1166: { 0x0022, "quot", "QUOTATION MARK" },
1167: { 0x0023, "num", "NUMBER SIGN" },
1168: { 0x0024, "dollar", "DOLLAR SIGN" },
1169: { 0x0025, "percnt", "PERCENT SIGN" },
1170: { 0x0027, "apos", "APOSTROPHE" },
1171: { 0x0028, "lpar", "LEFT PARENTHESIS" },
1172: { 0x0029, "rpar", "RIGHT PARENTHESIS" },
1173: { 0x002A, "ast", "ASTERISK OPERATOR" },
1174: { 0x002B, "plus", "PLUS SIGN" },
1175: { 0x002C, "comma", "COMMA" },
1176: { 0x002D, "hyphen", "HYPHEN-MINUS" },
1177: { 0x002E, "period", "FULL STOP" },
1178: { 0x002F, "sol", "SOLIDUS" },
1179: { 0x003A, "colon", "COLON" },
1180: { 0x003B, "semi", "SEMICOLON" },
1181: { 0x003D, "equals", "EQUALS SIGN" },
1182: { 0x003E, "gt", "GREATER-THAN SIGN" },
1183: { 0x003F, "quest", "QUESTION MARK" },
1184: { 0x0040, "commat", "COMMERCIAL AT" },
1185: { 0x005B, "lsqb", "LEFT SQUARE BRACKET" },
1186: { 0x005C, "bsol", "REVERSE SOLIDUS" },
1187: { 0x005D, "rsqb", "RIGHT SQUARE BRACKET" },
1188: { 0x005E, "circ", "RING OPERATOR" },
1189: { 0x005F, "lowbar", "LOW LINE" },
1190: { 0x0060, "grave", "GRAVE ACCENT" },
1191: { 0x007B, "lcub", "LEFT CURLY BRACKET" },
1192: { 0x007C, "verbar", "VERTICAL LINE" },
1193: { 0x007D, "rcub", "RIGHT CURLY BRACKET" },
1194: { 0x00A0, "nbsp", "NO-BREAK SPACE" },
1195: { 0x00A1, "iexcl", "INVERTED EXCLAMATION MARK" },
1196: { 0x00A2, "cent", "CENT SIGN" },
1197: { 0x00A3, "pound", "POUND SIGN" },
1198: { 0x00A4, "curren", "CURRENCY SIGN" },
1199: { 0x00A5, "yen", "YEN SIGN" },
1200: { 0x00A6, "brvbar", "BROKEN BAR" },
1201: { 0x00A7, "sect", "SECTION SIGN" },
1202: { 0x00A8, "die", "" },
1203: { 0x00A8, "Dot", "" },
1204: { 0x00A8, "uml", "" },
1205: { 0x00A9, "copy", "COPYRIGHT SIGN" },
1206: { 0x00AA, "ordf", "FEMININE ORDINAL INDICATOR" },
1207: { 0x00AB, "laquo", "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" },
1208: { 0x00AC, "not", "NOT SIGN" },
1209: { 0x00AD, "shy", "SOFT HYPHEN" },
1210: { 0x00AE, "reg", "REG TRADE MARK SIGN" },
1211: { 0x00AF, "macr", "MACRON" },
1212: { 0x00B0, "deg", "DEGREE SIGN" },
1213: { 0x00B1, "plusmn", "PLUS-MINUS SIGN" },
1214: { 0x00B2, "sup2", "SUPERSCRIPT TWO" },
1215: { 0x00B3, "sup3", "SUPERSCRIPT THREE" },
1216: { 0x00B4, "acute", "ACUTE ACCENT" },
1217: { 0x00B5, "micro", "MICRO SIGN" },
1218: { 0x00B6, "para", "PILCROW SIGN" },
1219: { 0x00B7, "middot", "MIDDLE DOT" },
1220: { 0x00B8, "cedil", "CEDILLA" },
1221: { 0x00B9, "sup1", "SUPERSCRIPT ONE" },
1222: { 0x00BA, "ordm", "MASCULINE ORDINAL INDICATOR" },
1223: { 0x00BB, "raquo", "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" },
1224: { 0x00BC, "frac14", "VULGAR FRACTION ONE QUARTER" },
1225: { 0x00BD, "frac12", "VULGAR FRACTION ONE HALF" },
1226: { 0x00BD, "half", "VULGAR FRACTION ONE HALF" },
1227: { 0x00BE, "frac34", "VULGAR FRACTION THREE QUARTERS" },
1228: { 0x00BF, "iquest", "INVERTED QUESTION MARK" },
1229: { 0x00C0, "Agrave", "LATIN CAPITAL LETTER A WITH GRAVE" },
1230: { 0x00C1, "Aacute", "LATIN CAPITAL LETTER A WITH ACUTE" },
1231: { 0x00C2, "Acirc", "LATIN CAPITAL LETTER A WITH CIRCUMFLEX" },
1232: { 0x00C3, "Atilde", "LATIN CAPITAL LETTER A WITH TILDE" },
1233: { 0x00C4, "Auml", "LATIN CAPITAL LETTER A WITH DIAERESIS" },
1234: { 0x00C5, "Aring", "LATIN CAPITAL LETTER A WITH RING ABOVE" },
1235: { 0x00C6, "AElig", "LATIN CAPITAL LETTER AE" },
1236: { 0x00C7, "Ccedil", "LATIN CAPITAL LETTER C WITH CEDILLA" },
1237: { 0x00C8, "Egrave", "LATIN CAPITAL LETTER E WITH GRAVE" },
1238: { 0x00C9, "Eacute", "LATIN CAPITAL LETTER E WITH ACUTE" },
1239: { 0x00CA, "Ecirc", "LATIN CAPITAL LETTER E WITH CIRCUMFLEX" },
1240: { 0x00CB, "Euml", "LATIN CAPITAL LETTER E WITH DIAERESIS" },
1241: { 0x00CC, "Igrave", "LATIN CAPITAL LETTER I WITH GRAVE" },
1242: { 0x00CD, "Iacute", "LATIN CAPITAL LETTER I WITH ACUTE" },
1243: { 0x00CE, "Icirc", "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" },
1244: { 0x00CF, "Iuml", "LATIN CAPITAL LETTER I WITH DIAERESIS" },
1245: { 0x00D0, "ETH", "LATIN CAPITAL LETTER ETH" },
1246: { 0x00D1, "Ntilde", "LATIN CAPITAL LETTER N WITH TILDE" },
1247: { 0x00D2, "Ograve", "LATIN CAPITAL LETTER O WITH GRAVE" },
1248: { 0x00D3, "Oacute", "LATIN CAPITAL LETTER O WITH ACUTE" },
1249: { 0x00D4, "Ocirc", "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" },
1250: { 0x00D5, "Otilde", "LATIN CAPITAL LETTER O WITH TILDE" },
1251: { 0x00D6, "Ouml", "LATIN CAPITAL LETTER O WITH DIAERESIS" },
1252: { 0x00D7, "times", "MULTIPLICATION SIGN" },
1253: { 0x00D8, "Oslash", "LATIN CAPITAL LETTER O WITH STROKE" },
1254: { 0x00D9, "Ugrave", "LATIN CAPITAL LETTER U WITH GRAVE" },
1255: { 0x00DA, "Uacute", "LATIN CAPITAL LETTER U WITH ACUTE" },
1256: { 0x00DB, "Ucirc", "LATIN CAPITAL LETTER U WITH CIRCUMFLEX" },
1257: { 0x00DC, "Uuml", "LATIN CAPITAL LETTER U WITH DIAERESIS" },
1258: { 0x00DD, "Yacute", "LATIN CAPITAL LETTER Y WITH ACUTE" },
1259: { 0x00DE, "THORN", "LATIN CAPITAL LETTER THORN" },
1260: { 0x00DF, "szlig", "LATIN SMALL LETTER SHARP S" },
1261: { 0x00E0, "agrave", "LATIN SMALL LETTER A WITH GRAVE" },
1262: { 0x00E1, "aacute", "LATIN SMALL LETTER A WITH ACUTE" },
1263: { 0x00E2, "acirc", "LATIN SMALL LETTER A WITH CIRCUMFLEX" },
1264: { 0x00E3, "atilde", "LATIN SMALL LETTER A WITH TILDE" },
1265: { 0x00E4, "auml", "LATIN SMALL LETTER A WITH DIAERESIS" },
1266: { 0x00E5, "aring", "LATIN SMALL LETTER A WITH RING ABOVE" },
1267: { 0x00E6, "aelig", "LATIN SMALL LETTER AE" },
1268: { 0x00E7, "ccedil", "LATIN SMALL LETTER C WITH CEDILLA" },
1269: { 0x00E8, "egrave", "LATIN SMALL LETTER E WITH GRAVE" },
1270: { 0x00E9, "eacute", "LATIN SMALL LETTER E WITH ACUTE" },
1271: { 0x00EA, "ecirc", "LATIN SMALL LETTER E WITH CIRCUMFLEX" },
1272: { 0x00EB, "euml", "LATIN SMALL LETTER E WITH DIAERESIS" },
1273: { 0x00EC, "igrave", "LATIN SMALL LETTER I WITH GRAVE" },
1274: { 0x00ED, "iacute", "LATIN SMALL LETTER I WITH ACUTE" },
1275: { 0x00EE, "icirc", "LATIN SMALL LETTER I WITH CIRCUMFLEX" },
1276: { 0x00EF, "iuml", "LATIN SMALL LETTER I WITH DIAERESIS" },
1277: { 0x00F0, "eth", "LATIN SMALL LETTER ETH" },
1278: { 0x00F1, "ntilde", "LATIN SMALL LETTER N WITH TILDE" },
1279: { 0x00F2, "ograve", "LATIN SMALL LETTER O WITH GRAVE" },
1280: { 0x00F3, "oacute", "LATIN SMALL LETTER O WITH ACUTE" },
1281: { 0x00F4, "ocirc", "LATIN SMALL LETTER O WITH CIRCUMFLEX" },
1282: { 0x00F5, "otilde", "LATIN SMALL LETTER O WITH TILDE" },
1283: { 0x00F6, "ouml", "LATIN SMALL LETTER O WITH DIAERESIS" },
1284: { 0x00F7, "divide", "DIVISION SIGN" },
1285: { 0x00F8, "oslash", "CIRCLED DIVISION SLASH" },
1286: { 0x00F9, "ugrave", "LATIN SMALL LETTER U WITH GRAVE" },
1287: { 0x00FA, "uacute", "LATIN SMALL LETTER U WITH ACUTE" },
1288: { 0x00FB, "ucirc", "LATIN SMALL LETTER U WITH CIRCUMFLEX" },
1289: { 0x00FC, "uuml", "LATIN SMALL LETTER U WITH DIAERESIS" },
1290: { 0x00FD, "yacute", "LATIN SMALL LETTER Y WITH ACUTE" },
1291: { 0x00FE, "thorn", "LATIN SMALL LETTER THORN" },
1292: { 0x00FF, "yuml", "LATIN SMALL LETTER Y WITH DIAERESIS" },
1293: { 0x0100, "Amacr", "LATIN CAPITAL LETTER A WITH MACRON" },
1294: { 0x0101, "amacr", "LATIN SMALL LETTER A WITH MACRON" },
1295: { 0x0102, "Abreve", "LATIN CAPITAL LETTER A WITH BREVE" },
1296: { 0x0103, "abreve", "LATIN SMALL LETTER A WITH BREVE" },
1297: { 0x0104, "Aogon", "LATIN CAPITAL LETTER A WITH OGONEK" },
1298: { 0x0105, "aogon", "LATIN SMALL LETTER A WITH OGONEK" },
1299: { 0x0106, "Cacute", "LATIN CAPITAL LETTER C WITH ACUTE" },
1300: { 0x0107, "cacute", "LATIN SMALL LETTER C WITH ACUTE" },
1301: { 0x0108, "Ccirc", "LATIN CAPITAL LETTER C WITH CIRCUMFLEX" },
1302: { 0x0109, "ccirc", "LATIN SMALL LETTER C WITH CIRCUMFLEX" },
1303: { 0x010A, "Cdot", "LATIN CAPITAL LETTER C WITH DOT ABOVE" },
1304: { 0x010B, "cdot", "DOT OPERATOR" },
1305: { 0x010C, "Ccaron", "LATIN CAPITAL LETTER C WITH CARON" },
1306: { 0x010D, "ccaron", "LATIN SMALL LETTER C WITH CARON" },
1307: { 0x010E, "Dcaron", "LATIN CAPITAL LETTER D WITH CARON" },
1308: { 0x010F, "dcaron", "LATIN SMALL LETTER D WITH CARON" },
1309: { 0x0110, "Dstrok", "LATIN CAPITAL LETTER D WITH STROKE" },
1310: { 0x0111, "dstrok", "LATIN SMALL LETTER D WITH STROKE" },
1311: { 0x0112, "Emacr", "LATIN CAPITAL LETTER E WITH MACRON" },
1312: { 0x0113, "emacr", "LATIN SMALL LETTER E WITH MACRON" },
1313: { 0x0116, "Edot", "LATIN CAPITAL LETTER E WITH DOT ABOVE" },
1314: { 0x0117, "edot", "LATIN SMALL LETTER E WITH DOT ABOVE" },
1315: { 0x0118, "Eogon", "LATIN CAPITAL LETTER E WITH OGONEK" },
1316: { 0x0119, "eogon", "LATIN SMALL LETTER E WITH OGONEK" },
1317: { 0x011A, "Ecaron", "LATIN CAPITAL LETTER E WITH CARON" },
1318: { 0x011B, "ecaron", "LATIN SMALL LETTER E WITH CARON" },
1319: { 0x011C, "Gcirc", "LATIN CAPITAL LETTER G WITH CIRCUMFLEX" },
1320: { 0x011D, "gcirc", "LATIN SMALL LETTER G WITH CIRCUMFLEX" },
1321: { 0x011E, "Gbreve", "LATIN CAPITAL LETTER G WITH BREVE" },
1322: { 0x011F, "gbreve", "LATIN SMALL LETTER G WITH BREVE" },
1323: { 0x0120, "Gdot", "LATIN CAPITAL LETTER G WITH DOT ABOVE" },
1324: { 0x0121, "gdot", "LATIN SMALL LETTER G WITH DOT ABOVE" },
1325: { 0x0122, "Gcedil", "LATIN CAPITAL LETTER G WITH CEDILLA" },
1326: { 0x0124, "Hcirc", "LATIN CAPITAL LETTER H WITH CIRCUMFLEX" },
1327: { 0x0125, "hcirc", "LATIN SMALL LETTER H WITH CIRCUMFLEX" },
1328: { 0x0126, "Hstrok", "LATIN CAPITAL LETTER H WITH STROKE" },
1329: { 0x0127, "hstrok", "LATIN SMALL LETTER H WITH STROKE" },
1330: { 0x0128, "Itilde", "LATIN CAPITAL LETTER I WITH TILDE" },
1331: { 0x0129, "itilde", "LATIN SMALL LETTER I WITH TILDE" },
1332: { 0x012A, "Imacr", "LATIN CAPITAL LETTER I WITH MACRON" },
1333: { 0x012B, "imacr", "LATIN SMALL LETTER I WITH MACRON" },
1334: { 0x012E, "Iogon", "LATIN CAPITAL LETTER I WITH OGONEK" },
1335: { 0x012F, "iogon", "LATIN SMALL LETTER I WITH OGONEK" },
1336: { 0x0130, "Idot", "LATIN CAPITAL LETTER I WITH DOT ABOVE" },
1337: { 0x0131, "inodot", "LATIN SMALL LETTER DOTLESS I" },
1338: { 0x0131, "inodot", "LATIN SMALL LETTER DOTLESS I" },
1339: { 0x0132, "IJlig", "LATIN CAPITAL LIGATURE IJ" },
1340: { 0x0133, "ijlig", "LATIN SMALL LIGATURE IJ" },
1341: { 0x0134, "Jcirc", "LATIN CAPITAL LETTER J WITH CIRCUMFLEX" },
1342: { 0x0135, "jcirc", "LATIN SMALL LETTER J WITH CIRCUMFLEX" },
1343: { 0x0136, "Kcedil", "LATIN CAPITAL LETTER K WITH CEDILLA" },
1344: { 0x0137, "kcedil", "LATIN SMALL LETTER K WITH CEDILLA" },
1345: { 0x0138, "kgreen", "LATIN SMALL LETTER KRA" },
1346: { 0x0139, "Lacute", "LATIN CAPITAL LETTER L WITH ACUTE" },
1347: { 0x013A, "lacute", "LATIN SMALL LETTER L WITH ACUTE" },
1348: { 0x013B, "Lcedil", "LATIN CAPITAL LETTER L WITH CEDILLA" },
1349: { 0x013C, "lcedil", "LATIN SMALL LETTER L WITH CEDILLA" },
1350: { 0x013D, "Lcaron", "LATIN CAPITAL LETTER L WITH CARON" },
1351: { 0x013E, "lcaron", "LATIN SMALL LETTER L WITH CARON" },
1352: { 0x013F, "Lmidot", "LATIN CAPITAL LETTER L WITH MIDDLE DOT" },
1353: { 0x0140, "lmidot", "LATIN SMALL LETTER L WITH MIDDLE DOT" },
1354: { 0x0141, "Lstrok", "LATIN CAPITAL LETTER L WITH STROKE" },
1355: { 0x0142, "lstrok", "LATIN SMALL LETTER L WITH STROKE" },
1356: { 0x0143, "Nacute", "LATIN CAPITAL LETTER N WITH ACUTE" },
1357: { 0x0144, "nacute", "LATIN SMALL LETTER N WITH ACUTE" },
1358: { 0x0145, "Ncedil", "LATIN CAPITAL LETTER N WITH CEDILLA" },
1359: { 0x0146, "ncedil", "LATIN SMALL LETTER N WITH CEDILLA" },
1360: { 0x0147, "Ncaron", "LATIN CAPITAL LETTER N WITH CARON" },
1361: { 0x0148, "ncaron", "LATIN SMALL LETTER N WITH CARON" },
1362: { 0x0149, "napos", "LATIN SMALL LETTER N PRECEDED BY APOSTROPHE" },
1363: { 0x014A, "ENG", "LATIN CAPITAL LETTER ENG" },
1364: { 0x014B, "eng", "LATIN SMALL LETTER ENG" },
1365: { 0x014C, "Omacr", "LATIN CAPITAL LETTER O WITH MACRON" },
1366: { 0x014D, "omacr", "LATIN SMALL LETTER O WITH MACRON" },
1367: { 0x0150, "Odblac", "LATIN CAPITAL LETTER O WITH DOUBLE ACUTE" },
1368: { 0x0151, "odblac", "LATIN SMALL LETTER O WITH DOUBLE ACUTE" },
1369: { 0x0152, "OElig", "LATIN CAPITAL LIGATURE OE" },
1370: { 0x0153, "oelig", "LATIN SMALL LIGATURE OE" },
1371: { 0x0154, "Racute", "LATIN CAPITAL LETTER R WITH ACUTE" },
1372: { 0x0155, "racute", "LATIN SMALL LETTER R WITH ACUTE" },
1373: { 0x0156, "Rcedil", "LATIN CAPITAL LETTER R WITH CEDILLA" },
1374: { 0x0157, "rcedil", "LATIN SMALL LETTER R WITH CEDILLA" },
1375: { 0x0158, "Rcaron", "LATIN CAPITAL LETTER R WITH CARON" },
1376: { 0x0159, "rcaron", "LATIN SMALL LETTER R WITH CARON" },
1377: { 0x015A, "Sacute", "LATIN CAPITAL LETTER S WITH ACUTE" },
1378: { 0x015B, "sacute", "LATIN SMALL LETTER S WITH ACUTE" },
1379: { 0x015C, "Scirc", "LATIN CAPITAL LETTER S WITH CIRCUMFLEX" },
1380: { 0x015D, "scirc", "LATIN SMALL LETTER S WITH CIRCUMFLEX" },
1381: { 0x015E, "Scedil", "LATIN CAPITAL LETTER S WITH CEDILLA" },
1382: { 0x015F, "scedil", "LATIN SMALL LETTER S WITH CEDILLA" },
1383: { 0x0160, "Scaron", "LATIN CAPITAL LETTER S WITH CARON" },
1384: { 0x0161, "scaron", "LATIN SMALL LETTER S WITH CARON" },
1385: { 0x0162, "Tcedil", "LATIN CAPITAL LETTER T WITH CEDILLA" },
1386: { 0x0163, "tcedil", "LATIN SMALL LETTER T WITH CEDILLA" },
1387: { 0x0164, "Tcaron", "LATIN CAPITAL LETTER T WITH CARON" },
1388: { 0x0165, "tcaron", "LATIN SMALL LETTER T WITH CARON" },
1389: { 0x0166, "Tstrok", "LATIN CAPITAL LETTER T WITH STROKE" },
1390: { 0x0167, "tstrok", "LATIN SMALL LETTER T WITH STROKE" },
1391: { 0x0168, "Utilde", "LATIN CAPITAL LETTER U WITH TILDE" },
1392: { 0x0169, "utilde", "LATIN SMALL LETTER U WITH TILDE" },
1393: { 0x016A, "Umacr", "LATIN CAPITAL LETTER U WITH MACRON" },
1394: { 0x016B, "umacr", "LATIN SMALL LETTER U WITH MACRON" },
1395: { 0x016C, "Ubreve", "LATIN CAPITAL LETTER U WITH BREVE" },
1396: { 0x016D, "ubreve", "LATIN SMALL LETTER U WITH BREVE" },
1397: { 0x016E, "Uring", "LATIN CAPITAL LETTER U WITH RING ABOVE" },
1398: { 0x016F, "uring", "LATIN SMALL LETTER U WITH RING ABOVE" },
1399: { 0x0170, "Udblac", "LATIN CAPITAL LETTER U WITH DOUBLE ACUTE" },
1400: { 0x0171, "udblac", "LATIN SMALL LETTER U WITH DOUBLE ACUTE" },
1401: { 0x0172, "Uogon", "LATIN CAPITAL LETTER U WITH OGONEK" },
1402: { 0x0173, "uogon", "LATIN SMALL LETTER U WITH OGONEK" },
1403: { 0x0174, "Wcirc", "LATIN CAPITAL LETTER W WITH CIRCUMFLEX" },
1404: { 0x0175, "wcirc", "LATIN SMALL LETTER W WITH CIRCUMFLEX" },
1405: { 0x0176, "Ycirc", "LATIN CAPITAL LETTER Y WITH CIRCUMFLEX" },
1406: { 0x0177, "ycirc", "LATIN SMALL LETTER Y WITH CIRCUMFLEX" },
1407: { 0x0178, "Yuml", "LATIN CAPITAL LETTER Y WITH DIAERESIS" },
1408: { 0x0179, "Zacute", "LATIN CAPITAL LETTER Z WITH ACUTE" },
1409: { 0x017A, "zacute", "LATIN SMALL LETTER Z WITH ACUTE" },
1410: { 0x017B, "Zdot", "LATIN CAPITAL LETTER Z WITH DOT ABOVE" },
1411: { 0x017C, "zdot", "LATIN SMALL LETTER Z WITH DOT ABOVE" },
1412: { 0x017D, "Zcaron", "LATIN CAPITAL LETTER Z WITH CARON" },
1413: { 0x017E, "zcaron", "LATIN SMALL LETTER Z WITH CARON" },
1414: { 0x0192, "fnof", "LATIN SMALL LETTER F WITH HOOK" },
1415: { 0x01F5, "gacute", "LATIN SMALL LETTER G WITH ACUTE" },
1416: { 0x02C7, "caron", "CARON" },
1417: { 0x02D8, "breve", "BREVE" },
1418: { 0x02D9, "dot", "DOT ABOVE" },
1419: { 0x02DA, "ring", "RING ABOVE" },
1420: { 0x02DB, "ogon", "OGONEK" },
1421: { 0x02DC, "tilde", "TILDE" },
1422: { 0x02DD, "dblac", "DOUBLE ACUTE ACCENT" },
1423: { 0x0386, "Aacgr", "GREEK CAPITAL LETTER ALPHA WITH TONOS" },
1424: { 0x0388, "Eacgr", "GREEK CAPITAL LETTER EPSILON WITH TONOS" },
1425: { 0x0389, "EEacgr", "GREEK CAPITAL LETTER ETA WITH TONOS" },
1426: { 0x038A, "Iacgr", "GREEK CAPITAL LETTER IOTA WITH TONOS" },
1427: { 0x038C, "Oacgr", "GREEK CAPITAL LETTER OMICRON WITH TONOS" },
1428: { 0x038E, "Uacgr", "GREEK CAPITAL LETTER UPSILON WITH TONOS" },
1429: { 0x038F, "OHacgr", "GREEK CAPITAL LETTER OMEGA WITH TONOS" },
1430: { 0x0390, "idiagr", "GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS" },
1431: { 0x0391, "Agr", "GREEK CAPITAL LETTER ALPHA" },
1432: { 0x0392, "Bgr", "GREEK CAPITAL LETTER BETA" },
1433: { 0x0393, "b.Gamma", "GREEK CAPITAL LETTER GAMMA" },
1434: { 0x0393, "Gamma", "GREEK CAPITAL LETTER GAMMA" },
1435: { 0x0393, "Ggr", "GREEK CAPITAL LETTER GAMMA" },
1436: { 0x0394, "b.Delta", "GREEK CAPITAL LETTER DELTA" },
1437: { 0x0394, "Delta", "GREEK CAPITAL LETTER DELTA" },
1438: { 0x0394, "Dgr", "GREEK CAPITAL LETTER DELTA" },
1439: { 0x0395, "Egr", "GREEK CAPITAL LETTER EPSILON" },
1440: { 0x0396, "Zgr", "GREEK CAPITAL LETTER ZETA" },
1441: { 0x0397, "EEgr", "GREEK CAPITAL LETTER ETA" },
1442: { 0x0398, "b.Theta", "GREEK CAPITAL LETTER THETA" },
1443: { 0x0398, "Theta", "GREEK CAPITAL LETTER THETA" },
1444: { 0x0398, "THgr", "GREEK CAPITAL LETTER THETA" },
1445: { 0x0399, "Igr", "GREEK CAPITAL LETTER IOTA" },
1446: { 0x039A, "Kgr", "GREEK CAPITAL LETTER KAPPA" },
1447: { 0x039B, "b.Lambda", "GREEK CAPITAL LETTER LAMDA" },
1448: { 0x039B, "Lambda", "GREEK CAPITAL LETTER LAMDA" },
1449: { 0x039B, "Lgr", "GREEK CAPITAL LETTER LAMDA" },
1450: { 0x039C, "Mgr", "GREEK CAPITAL LETTER MU" },
1451: { 0x039D, "Ngr", "GREEK CAPITAL LETTER NU" },
1452: { 0x039E, "b.Xi", "GREEK CAPITAL LETTER XI" },
1453: { 0x039E, "Xgr", "GREEK CAPITAL LETTER XI" },
1454: { 0x039E, "Xi", "GREEK CAPITAL LETTER XI" },
1455: { 0x039F, "Ogr", "GREEK CAPITAL LETTER OMICRON" },
1456: { 0x03A0, "b.Pi", "GREEK CAPITAL LETTER PI" },
1457: { 0x03A0, "Pgr", "GREEK CAPITAL LETTER PI" },
1458: { 0x03A0, "Pi", "GREEK CAPITAL LETTER PI" },
1459: { 0x03A1, "Rgr", "GREEK CAPITAL LETTER RHO" },
1460: { 0x03A3, "b.Sigma", "GREEK CAPITAL LETTER SIGMA" },
1461: { 0x03A3, "Sgr", "GREEK CAPITAL LETTER SIGMA" },
1462: { 0x03A3, "Sigma", "GREEK CAPITAL LETTER SIGMA" },
1463: { 0x03A4, "Tgr", "GREEK CAPITAL LETTER TAU" },
1464: { 0x03A5, "Ugr", "" },
1465: { 0x03A6, "b.Phi", "GREEK CAPITAL LETTER PHI" },
1466: { 0x03A6, "PHgr", "GREEK CAPITAL LETTER PHI" },
1467: { 0x03A6, "Phi", "GREEK CAPITAL LETTER PHI" },
1468: { 0x03A7, "KHgr", "GREEK CAPITAL LETTER CHI" },
1469: { 0x03A8, "b.Psi", "GREEK CAPITAL LETTER PSI" },
1470: { 0x03A8, "PSgr", "GREEK CAPITAL LETTER PSI" },
1471: { 0x03A8, "Psi", "GREEK CAPITAL LETTER PSI" },
1472: { 0x03A9, "b.Omega", "GREEK CAPITAL LETTER OMEGA" },
1473: { 0x03A9, "OHgr", "GREEK CAPITAL LETTER OMEGA" },
1474: { 0x03A9, "Omega", "GREEK CAPITAL LETTER OMEGA" },
1475: { 0x03AA, "Idigr", "GREEK CAPITAL LETTER IOTA WITH DIALYTIKA" },
1476: { 0x03AB, "Udigr", "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" },
1477: { 0x03AC, "aacgr", "GREEK SMALL LETTER ALPHA WITH TONOS" },
1478: { 0x03AD, "eacgr", "GREEK SMALL LETTER EPSILON WITH TONOS" },
1479: { 0x03AE, "eeacgr", "GREEK SMALL LETTER ETA WITH TONOS" },
1480: { 0x03AF, "iacgr", "GREEK SMALL LETTER IOTA WITH TONOS" },
1481: { 0x03B0, "udiagr", "GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS" },
1482: { 0x03B1, "agr", "" },
1483: { 0x03B1, "alpha", "" },
1484: { 0x03B1, "b.alpha", "" },
1485: { 0x03B2, "b.beta", "GREEK SMALL LETTER BETA" },
1486: { 0x03B2, "beta", "GREEK SMALL LETTER BETA" },
1487: { 0x03B2, "bgr", "GREEK SMALL LETTER BETA" },
1488: { 0x03B3, "b.gamma", "GREEK SMALL LETTER GAMMA" },
1489: { 0x03B3, "gamma", "GREEK SMALL LETTER GAMMA" },
1490: { 0x03B3, "ggr", "GREEK SMALL LETTER GAMMA" },
1491: { 0x03B4, "b.delta", "GREEK SMALL LETTER DELTA" },
1492: { 0x03B4, "delta", "GREEK SMALL LETTER DELTA" },
1493: { 0x03B4, "dgr", "GREEK SMALL LETTER DELTA" },
1494: { 0x03B5, "b.epsi", "" },
1495: { 0x03B5, "b.epsis", "" },
1496: { 0x03B5, "b.epsiv", "" },
1497: { 0x03B5, "egr", "" },
1498: { 0x03B5, "epsiv", "" },
1499: { 0x03B6, "b.zeta", "GREEK SMALL LETTER ZETA" },
1500: { 0x03B6, "zeta", "GREEK SMALL LETTER ZETA" },
1501: { 0x03B6, "zgr", "GREEK SMALL LETTER ZETA" },
1502: { 0x03B7, "b.eta", "GREEK SMALL LETTER ETA" },
1503: { 0x03B7, "eegr", "GREEK SMALL LETTER ETA" },
1504: { 0x03B7, "eta", "GREEK SMALL LETTER ETA" },
1505: { 0x03B8, "b.thetas", "" },
1506: { 0x03B8, "thetas", "" },
1507: { 0x03B8, "thgr", "" },
1508: { 0x03B9, "b.iota", "GREEK SMALL LETTER IOTA" },
1509: { 0x03B9, "igr", "GREEK SMALL LETTER IOTA" },
1510: { 0x03B9, "iota", "GREEK SMALL LETTER IOTA" },
1511: { 0x03BA, "b.kappa", "GREEK SMALL LETTER KAPPA" },
1512: { 0x03BA, "kappa", "GREEK SMALL LETTER KAPPA" },
1513: { 0x03BA, "kgr", "GREEK SMALL LETTER KAPPA" },
1514: { 0x03BB, "b.lambda", "GREEK SMALL LETTER LAMDA" },
1515: { 0x03BB, "lambda", "GREEK SMALL LETTER LAMDA" },
1516: { 0x03BB, "lgr", "GREEK SMALL LETTER LAMDA" },
1517: { 0x03BC, "b.mu", "GREEK SMALL LETTER MU" },
1518: { 0x03BC, "mgr", "GREEK SMALL LETTER MU" },
1519: { 0x03BC, "mu", "GREEK SMALL LETTER MU" },
1520: { 0x03BD, "b.nu", "GREEK SMALL LETTER NU" },
1521: { 0x03BD, "ngr", "GREEK SMALL LETTER NU" },
1522: { 0x03BD, "nu", "GREEK SMALL LETTER NU" },
1523: { 0x03BE, "b.xi", "GREEK SMALL LETTER XI" },
1524: { 0x03BE, "xgr", "GREEK SMALL LETTER XI" },
1525: { 0x03BE, "xi", "GREEK SMALL LETTER XI" },
1526: { 0x03BF, "ogr", "GREEK SMALL LETTER OMICRON" },
1527: { 0x03C0, "b.pi", "GREEK SMALL LETTER PI" },
1528: { 0x03C0, "pgr", "GREEK SMALL LETTER PI" },
1529: { 0x03C0, "pi", "GREEK SMALL LETTER PI" },
1530: { 0x03C1, "b.rho", "GREEK SMALL LETTER RHO" },
1531: { 0x03C1, "rgr", "GREEK SMALL LETTER RHO" },
1532: { 0x03C1, "rho", "GREEK SMALL LETTER RHO" },
1533: { 0x03C2, "b.sigmav", "" },
1534: { 0x03C2, "sfgr", "" },
1535: { 0x03C2, "sigmav", "" },
1536: { 0x03C3, "b.sigma", "GREEK SMALL LETTER SIGMA" },
1537: { 0x03C3, "sgr", "GREEK SMALL LETTER SIGMA" },
1538: { 0x03C3, "sigma", "GREEK SMALL LETTER SIGMA" },
1539: { 0x03C4, "b.tau", "GREEK SMALL LETTER TAU" },
1540: { 0x03C4, "tau", "GREEK SMALL LETTER TAU" },
1541: { 0x03C4, "tgr", "GREEK SMALL LETTER TAU" },
1542: { 0x03C5, "b.upsi", "GREEK SMALL LETTER UPSILON" },
1543: { 0x03C5, "ugr", "GREEK SMALL LETTER UPSILON" },
1544: { 0x03C5, "upsi", "GREEK SMALL LETTER UPSILON" },
1545: { 0x03C6, "b.phis", "GREEK SMALL LETTER PHI" },
1546: { 0x03C6, "phgr", "GREEK SMALL LETTER PHI" },
1547: { 0x03C6, "phis", "GREEK SMALL LETTER PHI" },
1548: { 0x03C7, "b.chi", "GREEK SMALL LETTER CHI" },
1549: { 0x03C7, "chi", "GREEK SMALL LETTER CHI" },
1550: { 0x03C7, "khgr", "GREEK SMALL LETTER CHI" },
1551: { 0x03C8, "b.psi", "GREEK SMALL LETTER PSI" },
1552: { 0x03C8, "psgr", "GREEK SMALL LETTER PSI" },
1553: { 0x03C8, "psi", "GREEK SMALL LETTER PSI" },
1554: { 0x03C9, "b.omega", "GREEK SMALL LETTER OMEGA" },
1555: { 0x03C9, "ohgr", "GREEK SMALL LETTER OMEGA" },
1556: { 0x03C9, "omega", "GREEK SMALL LETTER OMEGA" },
1557: { 0x03CA, "idigr", "GREEK SMALL LETTER IOTA WITH DIALYTIKA" },
1558: { 0x03CB, "udigr", "GREEK SMALL LETTER UPSILON WITH DIALYTIKA" },
1559: { 0x03CC, "oacgr", "GREEK SMALL LETTER OMICRON WITH TONOS" },
1560: { 0x03CD, "uacgr", "GREEK SMALL LETTER UPSILON WITH TONOS" },
1561: { 0x03CE, "ohacgr", "GREEK SMALL LETTER OMEGA WITH TONOS" },
1562: { 0x03D1, "b.thetav", "" },
1563: { 0x03D1, "thetav", "" },
1564: { 0x03D2, "b.Upsi", "" },
1565: { 0x03D2, "Upsi", "" },
1566: { 0x03D5, "b.phiv", "GREEK PHI SYMBOL" },
1567: { 0x03D5, "phiv", "GREEK PHI SYMBOL" },
1568: { 0x03D6, "b.piv", "GREEK PI SYMBOL" },
1569: { 0x03D6, "piv", "GREEK PI SYMBOL" },
1570: { 0x03DC, "b.gammad", "GREEK LETTER DIGAMMA" },
1571: { 0x03DC, "gammad", "GREEK LETTER DIGAMMA" },
1572: { 0x03F0, "b.kappav", "GREEK KAPPA SYMBOL" },
1573: { 0x03F0, "kappav", "GREEK KAPPA SYMBOL" },
1574: { 0x03F1, "b.rhov", "GREEK RHO SYMBOL" },
1575: { 0x03F1, "rhov", "GREEK RHO SYMBOL" },
1576: { 0x0401, "IOcy", "CYRILLIC CAPITAL LETTER IO" },
1577: { 0x0402, "DJcy", "CYRILLIC CAPITAL LETTER DJE" },
1578: { 0x0403, "GJcy", "CYRILLIC CAPITAL LETTER GJE" },
1579: { 0x0404, "Jukcy", "CYRILLIC CAPITAL LETTER UKRAINIAN IE" },
1580: { 0x0405, "DScy", "CYRILLIC CAPITAL LETTER DZE" },
1581: { 0x0406, "Iukcy", "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I" },
1582: { 0x0407, "YIcy", "CYRILLIC CAPITAL LETTER YI" },
1583: { 0x0408, "Jsercy", "CYRILLIC CAPITAL LETTER JE" },
1584: { 0x0409, "LJcy", "CYRILLIC CAPITAL LETTER LJE" },
1585: { 0x040A, "NJcy", "CYRILLIC CAPITAL LETTER NJE" },
1586: { 0x040B, "TSHcy", "CYRILLIC CAPITAL LETTER TSHE" },
1587: { 0x040C, "KJcy", "CYRILLIC CAPITAL LETTER KJE" },
1588: { 0x040E, "Ubrcy", "CYRILLIC CAPITAL LETTER SHORT U" },
1589: { 0x040F, "DZcy", "CYRILLIC CAPITAL LETTER DZHE" },
1590: { 0x0410, "Acy", "CYRILLIC CAPITAL LETTER A" },
1591: { 0x0411, "Bcy", "CYRILLIC CAPITAL LETTER BE" },
1592: { 0x0412, "Vcy", "CYRILLIC CAPITAL LETTER VE" },
1593: { 0x0413, "Gcy", "CYRILLIC CAPITAL LETTER GHE" },
1594: { 0x0414, "Dcy", "CYRILLIC CAPITAL LETTER DE" },
1595: { 0x0415, "IEcy", "CYRILLIC CAPITAL LETTER IE" },
1596: { 0x0416, "ZHcy", "CYRILLIC CAPITAL LETTER ZHE" },
1597: { 0x0417, "Zcy", "CYRILLIC CAPITAL LETTER ZE" },
1598: { 0x0418, "Icy", "CYRILLIC CAPITAL LETTER I" },
1599: { 0x0419, "Jcy", "CYRILLIC CAPITAL LETTER SHORT I" },
1600: { 0x041A, "Kcy", "CYRILLIC CAPITAL LETTER KA" },
1601: { 0x041B, "Lcy", "CYRILLIC CAPITAL LETTER EL" },
1602: { 0x041C, "Mcy", "CYRILLIC CAPITAL LETTER EM" },
1603: { 0x041D, "Ncy", "CYRILLIC CAPITAL LETTER EN" },
1604: { 0x041E, "Ocy", "CYRILLIC CAPITAL LETTER O" },
1605: { 0x041F, "Pcy", "CYRILLIC CAPITAL LETTER PE" },
1606: { 0x0420, "Rcy", "CYRILLIC CAPITAL LETTER ER" },
1607: { 0x0421, "Scy", "CYRILLIC CAPITAL LETTER ES" },
1608: { 0x0422, "Tcy", "CYRILLIC CAPITAL LETTER TE" },
1609: { 0x0423, "Ucy", "CYRILLIC CAPITAL LETTER U" },
1610: { 0x0424, "Fcy", "CYRILLIC CAPITAL LETTER EF" },
1611: { 0x0425, "KHcy", "CYRILLIC CAPITAL LETTER HA" },
1612: { 0x0426, "TScy", "CYRILLIC CAPITAL LETTER TSE" },
1613: { 0x0427, "CHcy", "CYRILLIC CAPITAL LETTER CHE" },
1614: { 0x0428, "SHcy", "CYRILLIC CAPITAL LETTER SHA" },
1615: { 0x0429, "SHCHcy", "CYRILLIC CAPITAL LETTER SHCHA" },
1616: { 0x042A, "HARDcy", "CYRILLIC CAPITAL LETTER HARD SIGN" },
1617: { 0x042B, "Ycy", "CYRILLIC CAPITAL LETTER YERU" },
1618: { 0x042C, "SOFTcy", "CYRILLIC CAPITAL LETTER SOFT SIGN" },
1619: { 0x042D, "Ecy", "CYRILLIC CAPITAL LETTER E" },
1620: { 0x042E, "YUcy", "CYRILLIC CAPITAL LETTER YU" },
1621: { 0x042F, "YAcy", "CYRILLIC CAPITAL LETTER YA" },
1622: { 0x0430, "acy", "CYRILLIC SMALL LETTER A" },
1623: { 0x0431, "bcy", "CYRILLIC SMALL LETTER BE" },
1624: { 0x0432, "vcy", "CYRILLIC SMALL LETTER VE" },
1625: { 0x0433, "gcy", "CYRILLIC SMALL LETTER GHE" },
1626: { 0x0434, "dcy", "CYRILLIC SMALL LETTER DE" },
1627: { 0x0435, "iecy", "CYRILLIC SMALL LETTER IE" },
1628: { 0x0436, "zhcy", "CYRILLIC SMALL LETTER ZHE" },
1629: { 0x0437, "zcy", "CYRILLIC SMALL LETTER ZE" },
1630: { 0x0438, "icy", "CYRILLIC SMALL LETTER I" },
1631: { 0x0439, "jcy", "CYRILLIC SMALL LETTER SHORT I" },
1632: { 0x043A, "kcy", "CYRILLIC SMALL LETTER KA" },
1633: { 0x043B, "lcy", "CYRILLIC SMALL LETTER EL" },
1634: { 0x043C, "mcy", "CYRILLIC SMALL LETTER EM" },
1635: { 0x043D, "ncy", "CYRILLIC SMALL LETTER EN" },
1636: { 0x043E, "ocy", "CYRILLIC SMALL LETTER O" },
1637: { 0x043F, "pcy", "CYRILLIC SMALL LETTER PE" },
1638: { 0x0440, "rcy", "CYRILLIC SMALL LETTER ER" },
1639: { 0x0441, "scy", "CYRILLIC SMALL LETTER ES" },
1640: { 0x0442, "tcy", "CYRILLIC SMALL LETTER TE" },
1641: { 0x0443, "ucy", "CYRILLIC SMALL LETTER U" },
1642: { 0x0444, "fcy", "CYRILLIC SMALL LETTER EF" },
1643: { 0x0445, "khcy", "CYRILLIC SMALL LETTER HA" },
1644: { 0x0446, "tscy", "CYRILLIC SMALL LETTER TSE" },
1645: { 0x0447, "chcy", "CYRILLIC SMALL LETTER CHE" },
1646: { 0x0448, "shcy", "CYRILLIC SMALL LETTER SHA" },
1647: { 0x0449, "shchcy", "CYRILLIC SMALL LETTER SHCHA" },
1648: { 0x044A, "hardcy", "CYRILLIC SMALL LETTER HARD SIGN" },
1649: { 0x044B, "ycy", "CYRILLIC SMALL LETTER YERU" },
1650: { 0x044C, "softcy", "CYRILLIC SMALL LETTER SOFT SIGN" },
1651: { 0x044D, "ecy", "CYRILLIC SMALL LETTER E" },
1652: { 0x044E, "yucy", "CYRILLIC SMALL LETTER YU" },
1653: { 0x044F, "yacy", "CYRILLIC SMALL LETTER YA" },
1654: { 0x0451, "iocy", "CYRILLIC SMALL LETTER IO" },
1655: { 0x0452, "djcy", "CYRILLIC SMALL LETTER DJE" },
1656: { 0x0453, "gjcy", "CYRILLIC SMALL LETTER GJE" },
1657: { 0x0454, "jukcy", "CYRILLIC SMALL LETTER UKRAINIAN IE" },
1658: { 0x0455, "dscy", "CYRILLIC SMALL LETTER DZE" },
1659: { 0x0456, "iukcy", "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I" },
1660: { 0x0457, "yicy", "CYRILLIC SMALL LETTER YI" },
1661: { 0x0458, "jsercy", "CYRILLIC SMALL LETTER JE" },
1662: { 0x0459, "ljcy", "CYRILLIC SMALL LETTER LJE" },
1663: { 0x045A, "njcy", "CYRILLIC SMALL LETTER NJE" },
1664: { 0x045B, "tshcy", "CYRILLIC SMALL LETTER TSHE" },
1665: { 0x045C, "kjcy", "CYRILLIC SMALL LETTER KJE" },
1666: { 0x045E, "ubrcy", "CYRILLIC SMALL LETTER SHORT U" },
1667: { 0x045F, "dzcy", "CYRILLIC SMALL LETTER DZHE" },
1668: { 0x2002, "ensp", "EN SPACE" },
1669: { 0x2003, "emsp", "EM SPACE" },
1670: { 0x2004, "emsp13", "THREE-PER-EM SPACE" },
1671: { 0x2005, "emsp14", "FOUR-PER-EM SPACE" },
1672: { 0x2007, "numsp", "FIGURE SPACE" },
1673: { 0x2008, "puncsp", "PUNCTUATION SPACE" },
1674: { 0x2009, "thinsp", "THIN SPACE" },
1675: { 0x200A, "hairsp", "HAIR SPACE" },
1676: { 0x2010, "dash", "HYPHEN" },
1677: { 0x2013, "ndash", "EN DASH" },
1678: { 0x2014, "mdash", "EM DASH" },
1679: { 0x2015, "horbar", "HORIZONTAL BAR" },
1680: { 0x2016, "Verbar", "DOUBLE VERTICAL LINE" },
1681: { 0x2018, "lsquo", "" },
1682: { 0x2018, "rsquor", "" },
1683: { 0x2019, "rsquo", "RIGHT SINGLE QUOTATION MARK" },
1684: { 0x201A, "lsquor", "SINGLE LOW-9 QUOTATION MARK" },
1685: { 0x201C, "ldquo", "" },
1686: { 0x201C, "rdquor", "" },
1687: { 0x201D, "rdquo", "RIGHT DOUBLE QUOTATION MARK" },
1688: { 0x201E, "ldquor", "DOUBLE LOW-9 QUOTATION MARK" },
1689: { 0x2020, "dagger", "DAGGER" },
1690: { 0x2021, "Dagger", "DOUBLE DAGGER" },
1691: { 0x2022, "bull", "BULLET" },
1692: { 0x2025, "nldr", "TWO DOT LEADER" },
1693: { 0x2026, "hellip", "HORIZONTAL ELLIPSIS" },
1694: { 0x2026, "mldr", "HORIZONTAL ELLIPSIS" },
1695: { 0x2030, "permil", "PER MILLE SIGN" },
1696: { 0x2032, "prime", "PRIME" },
1697: { 0x2032, "vprime", "PRIME" },
1698: { 0x2033, "Prime", "DOUBLE PRIME" },
1699: { 0x2034, "tprime", "TRIPLE PRIME" },
1700: { 0x2035, "bprime", "REVERSED PRIME" },
1701: { 0x2041, "caret", "CARET" },
1702: { 0x2043, "hybull", "HYPHEN BULLET" },
1703: { 0x20DB, "tdot", "COMBINING THREE DOTS ABOVE" },
1704: { 0x20DC, "DotDot", "COMBINING FOUR DOTS ABOVE" },
1705: { 0x2105, "incare", "CARE OF" },
1706: { 0x210B, "hamilt", "SCRIPT CAPITAL H" },
1707: { 0x210F, "planck", "PLANCK CONSTANT OVER TWO PI" },
1708: { 0x2111, "image", "BLACK-LETTER CAPITAL I" },
1709: { 0x2112, "lagran", "SCRIPT CAPITAL L" },
1710: { 0x2113, "ell", "SCRIPT SMALL L" },
1711: { 0x2116, "numero", "NUMERO SIGN" },
1712: { 0x2117, "copysr", "SOUND RECORDING COPYRIGHT" },
1713: { 0x2118, "weierp", "SCRIPT CAPITAL P" },
1714: { 0x211C, "real", "BLACK-LETTER CAPITAL R" },
1715: { 0x211E, "rx", "PRESCRIPTION TAKE" },
1716: { 0x2122, "trade", "TRADE MARK SIGN" },
1717: { 0x2126, "ohm", "OHM SIGN" },
1718: { 0x212B, "angst", "ANGSTROM SIGN" },
1719: { 0x212C, "bernou", "SCRIPT CAPITAL B" },
1720: { 0x2133, "phmmat", "SCRIPT CAPITAL M" },
1721: { 0x2134, "order", "SCRIPT SMALL O" },
1722: { 0x2135, "aleph", "ALEF SYMBOL" },
1723: { 0x2136, "beth", "BET SYMBOL" },
1724: { 0x2137, "gimel", "GIMEL SYMBOL" },
1725: { 0x2138, "daleth", "DALET SYMBOL" },
1726: { 0x2153, "frac13", "VULGAR FRACTION ONE THIRD" },
1727: { 0x2154, "frac23", "VULGAR FRACTION TWO THIRDS" },
1728: { 0x2155, "frac15", "VULGAR FRACTION ONE FIFTH" },
1729: { 0x2156, "frac25", "VULGAR FRACTION TWO FIFTHS" },
1730: { 0x2157, "frac35", "VULGAR FRACTION THREE FIFTHS" },
1731: { 0x2158, "frac45", "VULGAR FRACTION FOUR FIFTHS" },
1732: { 0x2159, "frac16", "VULGAR FRACTION ONE SIXTH" },
1733: { 0x215A, "frac56", "VULGAR FRACTION FIVE SIXTHS" },
1734: { 0x215B, "frac18", "" },
1735: { 0x215C, "frac38", "" },
1736: { 0x215D, "frac58", "" },
1737: { 0x215E, "frac78", "" },
1738: { 0x2190, "larr", "LEFTWARDS DOUBLE ARROW" },
1739: { 0x2191, "uarr", "UPWARDS ARROW" },
1740: { 0x2192, "rarr", "RIGHTWARDS DOUBLE ARROW" },
1741: { 0x2193, "darr", "DOWNWARDS ARROW" },
1742: { 0x2194, "harr", "LEFT RIGHT ARROW" },
1743: { 0x2194, "xhArr", "LEFT RIGHT ARROW" },
1744: { 0x2194, "xharr", "LEFT RIGHT ARROW" },
1745: { 0x2195, "varr", "UP DOWN ARROW" },
1746: { 0x2196, "nwarr", "NORTH WEST ARROW" },
1747: { 0x2197, "nearr", "NORTH EAST ARROW" },
1748: { 0x2198, "drarr", "SOUTH EAST ARROW" },
1749: { 0x2199, "dlarr", "SOUTH WEST ARROW" },
1750: { 0x219A, "nlarr", "LEFTWARDS ARROW WITH STROKE" },
1751: { 0x219B, "nrarr", "RIGHTWARDS ARROW WITH STROKE" },
1752: { 0x219D, "rarrw", "RIGHTWARDS SQUIGGLE ARROW" },
1753: { 0x219E, "Larr", "LEFTWARDS TWO HEADED ARROW" },
1754: { 0x21A0, "Rarr", "RIGHTWARDS TWO HEADED ARROW" },
1755: { 0x21A2, "larrtl", "LEFTWARDS ARROW WITH TAIL" },
1756: { 0x21A3, "rarrtl", "RIGHTWARDS ARROW WITH TAIL" },
1757: { 0x21A6, "map", "RIGHTWARDS ARROW FROM BAR" },
1758: { 0x21A9, "larrhk", "LEFTWARDS ARROW WITH HOOK" },
1759: { 0x21AA, "rarrhk", "RIGHTWARDS ARROW WITH HOOK" },
1760: { 0x21AB, "larrlp", "LEFTWARDS ARROW WITH LOOP" },
1761: { 0x21AC, "rarrlp", "RIGHTWARDS ARROW WITH LOOP" },
1762: { 0x21AD, "harrw", "LEFT RIGHT WAVE ARROW" },
1763: { 0x21AE, "nharr", "LEFT RIGHT ARROW WITH STROKE" },
1764: { 0x21B0, "lsh", "UPWARDS ARROW WITH TIP LEFTWARDS" },
1765: { 0x21B1, "rsh", "UPWARDS ARROW WITH TIP RIGHTWARDS" },
1766: { 0x21B6, "cularr", "ANTICLOCKWISE TOP SEMICIRCLE ARROW" },
1767: { 0x21B7, "curarr", "CLOCKWISE TOP SEMICIRCLE ARROW" },
1768: { 0x21BA, "olarr", "ANTICLOCKWISE OPEN CIRCLE ARROW" },
1769: { 0x21BB, "orarr", "CLOCKWISE OPEN CIRCLE ARROW" },
1770: { 0x21BC, "lharu", "LEFTWARDS HARPOON WITH BARB UPWARDS" },
1771: { 0x21BD, "lhard", "LEFTWARDS HARPOON WITH BARB DOWNWARDS" },
1772: { 0x21BE, "uharr", "UPWARDS HARPOON WITH BARB RIGHTWARDS" },
1773: { 0x21BF, "uharl", "UPWARDS HARPOON WITH BARB LEFTWARDS" },
1774: { 0x21C0, "rharu", "RIGHTWARDS HARPOON WITH BARB UPWARDS" },
1775: { 0x21C1, "rhard", "RIGHTWARDS HARPOON WITH BARB DOWNWARDS" },
1776: { 0x21C2, "dharr", "DOWNWARDS HARPOON WITH BARB RIGHTWARDS" },
1777: { 0x21C3, "dharl", "DOWNWARDS HARPOON WITH BARB LEFTWARDS" },
1778: { 0x21C4, "rlarr2", "RIGHTWARDS ARROW OVER LEFTWARDS ARROW" },
1779: { 0x21C6, "lrarr2", "LEFTWARDS ARROW OVER RIGHTWARDS ARROW" },
1780: { 0x21C7, "larr2", "LEFTWARDS PAIRED ARROWS" },
1781: { 0x21C8, "uarr2", "UPWARDS PAIRED ARROWS" },
1782: { 0x21C9, "rarr2", "RIGHTWARDS PAIRED ARROWS" },
1783: { 0x21CA, "darr2", "DOWNWARDS PAIRED ARROWS" },
1784: { 0x21CB, "lrhar2", "LEFTWARDS HARPOON OVER RIGHTWARDS HARPOON" },
1785: { 0x21CC, "rlhar2", "RIGHTWARDS HARPOON OVER LEFTWARDS HARPOON" },
1786: { 0x21CD, "nlArr", "LEFTWARDS DOUBLE ARROW WITH STROKE" },
1787: { 0x21CE, "nhArr", "LEFT RIGHT DOUBLE ARROW WITH STROKE" },
1788: { 0x21CF, "nrArr", "RIGHTWARDS DOUBLE ARROW WITH STROKE" },
1789: { 0x21D0, "lArr", "LEFTWARDS ARROW" },
1790: { 0x21D0, "xlArr", "LEFTWARDS DOUBLE ARROW" },
1791: { 0x21D1, "uArr", "UPWARDS DOUBLE ARROW" },
1792: { 0x21D2, "rArr", "RIGHTWARDS ARROW" },
1793: { 0x21D2, "xrArr", "RIGHTWARDS DOUBLE ARROW" },
1794: { 0x21D3, "dArr", "DOWNWARDS DOUBLE ARROW" },
1795: { 0x21D4, "hArr", "" },
1796: { 0x21D4, "iff", "LEFT RIGHT DOUBLE ARROW" },
1797: { 0x21D5, "vArr", "UP DOWN DOUBLE ARROW" },
1798: { 0x21DA, "lAarr", "LEFTWARDS TRIPLE ARROW" },
1799: { 0x21DB, "rAarr", "RIGHTWARDS TRIPLE ARROW" },
1800: { 0x2200, "forall", "" },
1801: { 0x2201, "comp", "COMPLEMENT" },
1802: { 0x2202, "part", "" },
1803: { 0x2203, "exist", "" },
1804: { 0x2204, "nexist", "THERE DOES NOT EXIST" },
1805: { 0x2205, "empty", "" },
1806: { 0x2207, "nabla", "NABLA" },
1807: { 0x2209, "notin", "" },
1808: { 0x220A, "epsi", "" },
1809: { 0x220A, "epsis", "" },
1810: { 0x220A, "isin", "" },
1811: { 0x220D, "bepsi", "SMALL CONTAINS AS MEMBER" },
1812: { 0x220D, "ni", "" },
1813: { 0x220F, "prod", "N-ARY PRODUCT" },
1814: { 0x2210, "amalg", "N-ARY COPRODUCT" },
1815: { 0x2210, "coprod", "N-ARY COPRODUCT" },
1816: { 0x2210, "samalg", "" },
1817: { 0x2211, "sum", "N-ARY SUMMATION" },
1818: { 0x2212, "minus", "MINUS SIGN" },
1819: { 0x2213, "mnplus", "" },
1820: { 0x2214, "plusdo", "DOT PLUS" },
1821: { 0x2216, "setmn", "SET MINUS" },
1822: { 0x2216, "ssetmn", "SET MINUS" },
1823: { 0x2217, "lowast", "ASTERISK OPERATOR" },
1824: { 0x2218, "compfn", "RING OPERATOR" },
1825: { 0x221A, "radic", "" },
1826: { 0x221D, "prop", "" },
1827: { 0x221D, "vprop", "" },
1828: { 0x221E, "infin", "" },
1829: { 0x221F, "ang90", "RIGHT ANGLE" },
1830: { 0x2220, "ang", "ANGLE" },
1831: { 0x2221, "angmsd", "MEASURED ANGLE" },
1832: { 0x2222, "angsph", "" },
1833: { 0x2223, "mid", "" },
1834: { 0x2224, "nmid", "DOES NOT DIVIDE" },
1835: { 0x2225, "par", "PARALLEL TO" },
1836: { 0x2225, "spar", "PARALLEL TO" },
1837: { 0x2226, "npar", "NOT PARALLEL TO" },
1838: { 0x2226, "nspar", "NOT PARALLEL TO" },
1839: { 0x2227, "and", "" },
1840: { 0x2228, "or", "" },
1841: { 0x2229, "cap", "" },
1842: { 0x222A, "cup", "" },
1843: { 0x222B, "int", "" },
1844: { 0x222E, "conint", "" },
1845: { 0x2234, "there4", "" },
1846: { 0x2235, "becaus", "BECAUSE" },
1847: { 0x223C, "sim", "" },
1848: { 0x223C, "thksim", "TILDE OPERATOR" },
1849: { 0x223D, "bsim", "" },
1850: { 0x2240, "wreath", "WREATH PRODUCT" },
1851: { 0x2241, "nsim", "" },
1852: { 0x2243, "sime", "" },
1853: { 0x2244, "nsime", "" },
1854: { 0x2245, "cong", "" },
1855: { 0x2247, "ncong", "NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO" },
1856: { 0x2248, "ap", "" },
1857: { 0x2248, "thkap", "ALMOST EQUAL TO" },
1858: { 0x2249, "nap", "NOT ALMOST EQUAL TO" },
1859: { 0x224A, "ape", "" },
1860: { 0x224C, "bcong", "ALL EQUAL TO" },
1861: { 0x224D, "asymp", "EQUIVALENT TO" },
1862: { 0x224E, "bump", "" },
1863: { 0x224F, "bumpe", "" },
1864: { 0x2250, "esdot", "" },
1865: { 0x2251, "eDot", "" },
1866: { 0x2252, "efDot", "" },
1867: { 0x2253, "erDot", "" },
1868: { 0x2254, "colone", "" },
1869: { 0x2255, "ecolon", "" },
1870: { 0x2256, "ecir", "" },
1871: { 0x2257, "cire", "" },
1872: { 0x2259, "wedgeq", "ESTIMATES" },
1873: { 0x225C, "trie", "" },
1874: { 0x2260, "ne", "" },
1875: { 0x2261, "equiv", "" },
1876: { 0x2262, "nequiv", "NOT IDENTICAL TO" },
1877: { 0x2264, "le", "" },
1878: { 0x2264, "les", "LESS-THAN OR EQUAL TO" },
1879: { 0x2265, "ge", "GREATER-THAN OR EQUAL TO" },
1880: { 0x2265, "ges", "GREATER-THAN OR EQUAL TO" },
1881: { 0x2266, "lE", "" },
1882: { 0x2267, "gE", "" },
1883: { 0x2268, "lnE", "" },
1884: { 0x2268, "lne", "" },
1885: { 0x2268, "lvnE", "LESS-THAN BUT NOT EQUAL TO" },
1886: { 0x2269, "gnE", "" },
1887: { 0x2269, "gne", "" },
1888: { 0x2269, "gvnE", "GREATER-THAN BUT NOT EQUAL TO" },
1889: { 0x226A, "Lt", "MUCH LESS-THAN" },
1890: { 0x226B, "Gt", "MUCH GREATER-THAN" },
1891: { 0x226C, "twixt", "BETWEEN" },
1892: { 0x226E, "nlt", "NOT LESS-THAN" },
1893: { 0x226F, "ngt", "NOT GREATER-THAN" },
1894: { 0x2270, "nlE", "" },
1895: { 0x2270, "nle", "NEITHER LESS-THAN NOR EQUAL TO" },
1896: { 0x2270, "nles", "" },
1897: { 0x2271, "ngE", "" },
1898: { 0x2271, "nge", "NEITHER GREATER-THAN NOR EQUAL TO" },
1899: { 0x2271, "nges", "" },
1900: { 0x2272, "lap", "LESS-THAN OR EQUIVALENT TO" },
1901: { 0x2272, "lsim", "LESS-THAN OR EQUIVALENT TO" },
1902: { 0x2273, "gap", "GREATER-THAN OR EQUIVALENT TO" },
1903: { 0x2273, "gsim", "GREATER-THAN OR EQUIVALENT TO" },
1904: { 0x2276, "lg", "LESS-THAN OR GREATER-THAN" },
1905: { 0x2277, "gl", "" },
1906: { 0x227A, "pr", "" },
1907: { 0x227B, "sc", "" },
1908: { 0x227C, "cupre", "" },
1909: { 0x227C, "pre", "" },
1910: { 0x227D, "sccue", "" },
1911: { 0x227D, "sce", "" },
1912: { 0x227E, "prap", "" },
1913: { 0x227E, "prsim", "" },
1914: { 0x227F, "scap", "" },
1915: { 0x227F, "scsim", "" },
1916: { 0x2280, "npr", "DOES NOT PRECEDE" },
1917: { 0x2281, "nsc", "DOES NOT SUCCEED" },
1918: { 0x2282, "sub", "" },
1919: { 0x2283, "sup", "" },
1920: { 0x2284, "nsub", "NOT A SUBSET OF" },
1921: { 0x2285, "nsup", "NOT A SUPERSET OF" },
1922: { 0x2286, "subE", "" },
1923: { 0x2286, "sube", "" },
1924: { 0x2287, "supE", "" },
1925: { 0x2287, "supe", "" },
1926: { 0x2288, "nsubE", "" },
1927: { 0x2288, "nsube", "" },
1928: { 0x2289, "nsupE", "" },
1929: { 0x2289, "nsupe", "" },
1930: { 0x228A, "subne", "" },
1931: { 0x228A, "subnE", "SUBSET OF WITH NOT EQUAL TO" },
1932: { 0x228A, "vsubne", "SUBSET OF WITH NOT EQUAL TO" },
1933: { 0x228B, "supnE", "" },
1934: { 0x228B, "supne", "" },
1935: { 0x228B, "vsupnE", "SUPERSET OF WITH NOT EQUAL TO" },
1936: { 0x228B, "vsupne", "SUPERSET OF WITH NOT EQUAL TO" },
1937: { 0x228E, "uplus", "MULTISET UNION" },
1938: { 0x228F, "sqsub", "" },
1939: { 0x2290, "sqsup", "" },
1940: { 0x2291, "sqsube", "" },
1941: { 0x2292, "sqsupe", "" },
1942: { 0x2293, "sqcap", "SQUARE CAP" },
1943: { 0x2294, "sqcup", "SQUARE CUP" },
1944: { 0x2295, "oplus", "CIRCLED PLUS" },
1945: { 0x2296, "ominus", "CIRCLED MINUS" },
1946: { 0x2297, "otimes", "CIRCLED TIMES" },
1947: { 0x2298, "osol", "CIRCLED DIVISION SLASH" },
1948: { 0x2299, "odot", "CIRCLED DOT OPERATOR" },
1949: { 0x229A, "ocir", "CIRCLED RING OPERATOR" },
1950: { 0x229B, "oast", "CIRCLED ASTERISK OPERATOR" },
1951: { 0x229D, "odash", "CIRCLED DASH" },
1952: { 0x229E, "plusb", "SQUARED PLUS" },
1953: { 0x229F, "minusb", "SQUARED MINUS" },
1954: { 0x22A0, "timesb", "SQUARED TIMES" },
1955: { 0x22A1, "sdotb", "SQUARED DOT OPERATOR" },
1956: { 0x22A2, "vdash", "" },
1957: { 0x22A3, "dashv", "" },
1958: { 0x22A4, "top", "DOWN TACK" },
1959: { 0x22A5, "bottom", "" },
1960: { 0x22A5, "perp", "" },
1961: { 0x22A7, "models", "MODELS" },
1962: { 0x22A8, "vDash", "" },
1963: { 0x22A9, "Vdash", "" },
1964: { 0x22AA, "Vvdash", "" },
1965: { 0x22AC, "nvdash", "DOES NOT PROVE" },
1966: { 0x22AD, "nvDash", "NOT TRUE" },
1967: { 0x22AE, "nVdash", "DOES NOT FORCE" },
1968: { 0x22AF, "nVDash", "NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE" },
1969: { 0x22B2, "vltri", "" },
1970: { 0x22B3, "vrtri", "" },
1971: { 0x22B4, "ltrie", "" },
1972: { 0x22B5, "rtrie", "" },
1973: { 0x22B8, "mumap", "MULTIMAP" },
1974: { 0x22BA, "intcal", "INTERCALATE" },
1975: { 0x22BB, "veebar", "" },
1976: { 0x22BC, "barwed", "NAND" },
1977: { 0x22C4, "diam", "DIAMOND OPERATOR" },
1978: { 0x22C5, "sdot", "DOT OPERATOR" },
1979: { 0x22C6, "sstarf", "STAR OPERATOR" },
1980: { 0x22C6, "star", "STAR OPERATOR" },
1981: { 0x22C7, "divonx", "DIVISION TIMES" },
1982: { 0x22C8, "bowtie", "" },
1983: { 0x22C9, "ltimes", "LEFT NORMAL FACTOR SEMIDIRECT PRODUCT" },
1984: { 0x22CA, "rtimes", "RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT" },
1985: { 0x22CB, "lthree", "LEFT SEMIDIRECT PRODUCT" },
1986: { 0x22CC, "rthree", "RIGHT SEMIDIRECT PRODUCT" },
1987: { 0x22CD, "bsime", "" },
1988: { 0x22CE, "cuvee", "CURLY LOGICAL OR" },
1989: { 0x22CF, "cuwed", "CURLY LOGICAL AND" },
1990: { 0x22D0, "Sub", "" },
1991: { 0x22D1, "Sup", "" },
1992: { 0x22D2, "Cap", "DOUBLE INTERSECTION" },
1993: { 0x22D3, "Cup", "DOUBLE UNION" },
1994: { 0x22D4, "fork", "" },
1995: { 0x22D6, "ldot", "" },
1996: { 0x22D7, "gsdot", "" },
1997: { 0x22D8, "Ll", "" },
1998: { 0x22D9, "Gg", "VERY MUCH GREATER-THAN" },
1999: { 0x22DA, "lEg", "" },
2000: { 0x22DA, "leg", "" },
2001: { 0x22DB, "gEl", "" },
2002: { 0x22DB, "gel", "" },
2003: { 0x22DC, "els", "" },
2004: { 0x22DD, "egs", "" },
2005: { 0x22DE, "cuepr", "" },
2006: { 0x22DF, "cuesc", "" },
2007: { 0x22E0, "npre", "DOES NOT PRECEDE OR EQUAL" },
2008: { 0x22E1, "nsce", "DOES NOT SUCCEED OR EQUAL" },
2009: { 0x22E6, "lnsim", "" },
2010: { 0x22E7, "gnsim", "GREATER-THAN BUT NOT EQUIVALENT TO" },
2011: { 0x22E8, "prnap", "" },
2012: { 0x22E8, "prnsim", "" },
2013: { 0x22E9, "scnap", "" },
2014: { 0x22E9, "scnsim", "" },
2015: { 0x22EA, "nltri", "NOT NORMAL SUBGROUP OF" },
2016: { 0x22EB, "nrtri", "DOES NOT CONTAIN AS NORMAL SUBGROUP" },
2017: { 0x22EC, "nltrie", "NOT NORMAL SUBGROUP OF OR EQUAL TO" },
2018: { 0x22ED, "nrtrie", "DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL" },
2019: { 0x22EE, "vellip", "" },
2020: { 0x2306, "Barwed", "PERSPECTIVE" },
2021: { 0x2308, "lceil", "LEFT CEILING" },
2022: { 0x2309, "rceil", "RIGHT CEILING" },
2023: { 0x230A, "lfloor", "LEFT FLOOR" },
2024: { 0x230B, "rfloor", "RIGHT FLOOR" },
2025: { 0x230C, "drcrop", "BOTTOM RIGHT CROP" },
2026: { 0x230D, "dlcrop", "BOTTOM LEFT CROP" },
2027: { 0x230E, "urcrop", "TOP RIGHT CROP" },
2028: { 0x230F, "ulcrop", "TOP LEFT CROP" },
2029: { 0x2315, "telrec", "TELEPHONE RECORDER" },
2030: { 0x2316, "target", "POSITION INDICATOR" },
2031: { 0x231C, "ulcorn", "TOP LEFT CORNER" },
2032: { 0x231D, "urcorn", "TOP RIGHT CORNER" },
2033: { 0x231E, "dlcorn", "BOTTOM LEFT CORNER" },
2034: { 0x231F, "drcorn", "BOTTOM RIGHT CORNER" },
2035: { 0x2322, "frown", "" },
2036: { 0x2322, "sfrown", "FROWN" },
2037: { 0x2323, "smile", "" },
2038: { 0x2323, "ssmile", "SMILE" },
2039: { 0x2423, "blank", "OPEN BOX" },
2040: { 0x24C8, "oS", "CIRCLED LATIN CAPITAL LETTER S" },
2041: { 0x2500, "boxh", "BOX DRAWINGS LIGHT HORIZONTAL" },
2042: { 0x2502, "boxv", "BOX DRAWINGS LIGHT VERTICAL" },
2043: { 0x250C, "boxdr", "BOX DRAWINGS LIGHT DOWN AND RIGHT" },
2044: { 0x2510, "boxdl", "BOX DRAWINGS LIGHT DOWN AND LEFT" },
2045: { 0x2514, "boxur", "BOX DRAWINGS LIGHT UP AND RIGHT" },
2046: { 0x2518, "boxul", "BOX DRAWINGS LIGHT UP AND LEFT" },
2047: { 0x251C, "boxvr", "BOX DRAWINGS LIGHT VERTICAL AND RIGHT" },
2048: { 0x2524, "boxvl", "BOX DRAWINGS LIGHT VERTICAL AND LEFT" },
2049: { 0x252C, "boxhd", "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL" },
2050: { 0x2534, "boxhu", "BOX DRAWINGS LIGHT UP AND HORIZONTAL" },
2051: { 0x253C, "boxvh", "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL" },
2052: { 0x2550, "boxH", "BOX DRAWINGS DOUBLE HORIZONTAL" },
2053: { 0x2551, "boxV", "BOX DRAWINGS DOUBLE VERTICAL" },
2054: { 0x2552, "boxDR", "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE" },
2055: { 0x2553, "boxDr", "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE" },
2056: { 0x2554, "boxdR", "BOX DRAWINGS DOUBLE DOWN AND RIGHT" },
2057: { 0x2555, "boxDL", "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE" },
2058: { 0x2556, "boxdL", "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE" },
2059: { 0x2557, "boxDl", "BOX DRAWINGS DOUBLE DOWN AND LEFT" },
2060: { 0x2558, "boxUR", "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE" },
2061: { 0x2559, "boxuR", "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE" },
2062: { 0x255A, "boxUr", "BOX DRAWINGS DOUBLE UP AND RIGHT" },
2063: { 0x255B, "boxUL", "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE" },
2064: { 0x255C, "boxUl", "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE" },
2065: { 0x255D, "boxuL", "BOX DRAWINGS DOUBLE UP AND LEFT" },
2066: { 0x255E, "boxvR", "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE" },
2067: { 0x255F, "boxVR", "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE" },
2068: { 0x2560, "boxVr", "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT" },
2069: { 0x2561, "boxvL", "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE" },
2070: { 0x2562, "boxVL", "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE" },
2071: { 0x2563, "boxVl", "BOX DRAWINGS DOUBLE VERTICAL AND LEFT" },
2072: { 0x2564, "boxhD", "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE" },
2073: { 0x2565, "boxHD", "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE" },
2074: { 0x2566, "boxHd", "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL" },
2075: { 0x2567, "boxhU", "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE" },
2076: { 0x2568, "boxHU", "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE" },
2077: { 0x2569, "boxHu", "BOX DRAWINGS DOUBLE UP AND HORIZONTAL" },
2078: { 0x256A, "boxvH", "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE" },
2079: { 0x256B, "boxVH", "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE" },
2080: { 0x256C, "boxVh", "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL" },
2081: { 0x2580, "uhblk", "UPPER HALF BLOCK" },
2082: { 0x2584, "lhblk", "LOWER HALF BLOCK" },
2083: { 0x2588, "block", "FULL BLOCK" },
2084: { 0x2591, "blk14", "LIGHT SHADE" },
2085: { 0x2592, "blk12", "MEDIUM SHADE" },
2086: { 0x2593, "blk34", "DARK SHADE" },
2087: { 0x25A1, "square", "WHITE SQUARE" },
2088: { 0x25A1, "squ", "WHITE SQUARE" },
2089: { 0x25AA, "squf", "" },
2090: { 0x25AD, "rect", "WHITE RECTANGLE" },
2091: { 0x25AE, "marker", "BLACK VERTICAL RECTANGLE" },
2092: { 0x25B3, "xutri", "WHITE UP-POINTING TRIANGLE" },
2093: { 0x25B4, "utrif", "BLACK UP-POINTING TRIANGLE" },
2094: { 0x25B5, "utri", "WHITE UP-POINTING TRIANGLE" },
2095: { 0x25B8, "rtrif", "BLACK RIGHT-POINTING TRIANGLE" },
2096: { 0x25B9, "rtri", "WHITE RIGHT-POINTING TRIANGLE" },
2097: { 0x25BD, "xdtri", "WHITE DOWN-POINTING TRIANGLE" },
2098: { 0x25BE, "dtrif", "BLACK DOWN-POINTING TRIANGLE" },
2099: { 0x25BF, "dtri", "WHITE DOWN-POINTING TRIANGLE" },
2100: { 0x25C2, "ltrif", "BLACK LEFT-POINTING TRIANGLE" },
2101: { 0x25C3, "ltri", "WHITE LEFT-POINTING TRIANGLE" },
2102: { 0x25CA, "loz", "LOZENGE" },
2103: { 0x25CB, "cir", "WHITE CIRCLE" },
2104: { 0x25CB, "xcirc", "WHITE CIRCLE" },
2105: { 0x2605, "starf", "BLACK STAR" },
2106: { 0x260E, "phone", "TELEPHONE SIGN" },
2107: { 0x2640, "female", "" },
2108: { 0x2642, "male", "MALE SIGN" },
2109: { 0x2660, "spades", "BLACK SPADE SUIT" },
2110: { 0x2663, "clubs", "BLACK CLUB SUIT" },
2111: { 0x2665, "hearts", "BLACK HEART SUIT" },
2112: { 0x2666, "diams", "BLACK DIAMOND SUIT" },
2113: { 0x2669, "sung", "" },
2114: { 0x266D, "flat", "MUSIC FLAT SIGN" },
2115: { 0x266E, "natur", "MUSIC NATURAL SIGN" },
2116: { 0x266F, "sharp", "MUSIC SHARP SIGN" },
2117: { 0x2713, "check", "CHECK MARK" },
2118: { 0x2717, "cross", "BALLOT X" },
2119: { 0x2720, "malt", "MALTESE CROSS" },
2120: { 0x2726, "lozf", "" },
2121: { 0x2736, "sext", "SIX POINTED BLACK STAR" },
2122: { 0x3008, "lang", "" },
2123: { 0x3009, "rang", "" },
2124: { 0xE291, "rpargt", "" },
2125: { 0xE2A2, "lnap", "" },
2126: { 0xE2AA, "nsmid", "" },
2127: { 0xE2B3, "prnE", "" },
2128: { 0xE2B5, "scnE", "" },
2129: { 0xE2B8, "vsubnE", "" },
2130: { 0xE301, "smid", "" },
2131: { 0xE411, "gnap", "" },
2132: { 0xFB00, "fflig", "" },
2133: { 0xFB01, "filig", "" },
2134: { 0xFB02, "fllig", "" },
2135: { 0xFB03, "ffilig", "" },
2136: { 0xFB04, "ffllig", "" },
2137: { 0xFE68, "sbsol", "SMALL REVERSE SOLIDUS" },
2138: };
2139:
2140: /************************************************************************
2141: * *
2142: * Commodity functions to handle entities *
2143: * *
2144: ************************************************************************/
2145:
/*
 * Macro used to grow the current buffer.
 *
 * Must be expanded inside a function that has, in scope, an xmlChar
 * pointer named by the macro argument and an integer lvalue with the
 * same name suffixed by "_size" (e.g. `buffer` and `buffer_size`).
 *
 * The size variable is doubled and the buffer is reallocated to that
 * many xmlChar.  The reallocation goes through a temporary pointer so
 * that, when xmlRealloc() fails, the previous allocation is still
 * reachable and can be released instead of being leaked.
 *
 * On failure the macro reports the error with perror(), frees the old
 * buffer, leaves the buffer variable set to NULL and executes
 * `return(NULL)` in the enclosing function; it is therefore only usable
 * where returning NULL is valid.
 *
 * The expansion is deliberately kept as a plain brace block (not a
 * do/while(0)) so that existing call sites keep compiling unchanged.
 */
#define growBuffer(buffer) {						\
    xmlChar *growBuffer_new_;						\
    buffer##_size *= 2;							\
    growBuffer_new_ = (xmlChar *) xmlRealloc(buffer,			\
			    buffer##_size * sizeof(xmlChar));		\
    if (growBuffer_new_ == NULL) {					\
	perror("realloc failed");					\
	/* xmlRealloc failed: the old block is still ours to free */	\
	xmlFree(buffer);						\
	buffer = NULL;							\
	return(NULL);							\
    }									\
    buffer = growBuffer_new_;						\
}
2157:
2158: /**
2159: * sgmlEntityLookup:
2160: * @name: the entity name
2161: *
2162: * Lookup the given entity in EntitiesTable
2163: *
2164: * TODO: the linear scan is really ugly, an hash table is really needed.
2165: *
2166: * Returns the associated sgmlEntityDescPtr if found, NULL otherwise.
2167: */
2168: sgmlEntityDescPtr
2169: sgmlEntityLookup(const xmlChar *name) {
2170: int i;
2171:
2172: for (i = 0;i < (sizeof(docbookEntitiesTable)/
2173: sizeof(docbookEntitiesTable[0]));i++) {
1.7 veillard 2174: if (xmlStrEqual(name, BAD_CAST docbookEntitiesTable[i].name)) {
1.1 veillard 2175: #ifdef DEBUG
2176: fprintf(stderr,"Found entity %s\n", name);
2177: #endif
2178: return(&docbookEntitiesTable[i]);
2179: }
2180: }
2181: return(NULL);
2182: }
2183:
2184: /**
2185: * sgmlEntityValueLookup:
2186: * @value: the entity's unicode value
2187: *
2188: * Lookup the given entity in EntitiesTable
2189: *
2190: * TODO: the linear scan is really ugly, an hash table is really needed.
2191: *
2192: * Returns the associated sgmlEntityDescPtr if found, NULL otherwise.
2193: */
2194: sgmlEntityDescPtr
2195: sgmlEntityValueLookup(int value) {
2196: int i;
2197: #ifdef DEBUG
2198: int lv = 0;
2199: #endif
2200:
2201: for (i = 0;i < (sizeof(docbookEntitiesTable)/
2202: sizeof(docbookEntitiesTable[0]));i++) {
2203: if (docbookEntitiesTable[i].value >= value) {
2204: if (docbookEntitiesTable[i].value > value)
2205: break;
2206: #ifdef DEBUG
2207: fprintf(stderr,"Found entity %s\n", docbookEntitiesTable[i].name);
2208: #endif
2209: return(&docbookEntitiesTable[i]);
2210: }
2211: #ifdef DEBUG
2212: if (lv > docbookEntitiesTable[i].value) {
2213: fprintf(stderr, "docbookEntitiesTable[] is not sorted (%d > %d)!\n",
2214: lv, docbookEntitiesTable[i].value);
2215: }
2216: lv = docbookEntitiesTable[i].value;
2217: #endif
2218: }
2219: return(NULL);
2220: }
2221:
2222: /**
2223: * UTF8ToSgml:
2224: * @out: a pointer to an array of bytes to store the result
2225: * @outlen: the length of @out
2226: * @in: a pointer to an array of UTF-8 chars
2227: * @inlen: the length of @in
2228: *
2229: * Take a block of UTF-8 chars in and try to convert it to an ASCII
2230: * plus SGML entities block of chars out.
2231: *
2232: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2233: * The value of @inlen after return is the number of octets consumed
2234: * as the return value is positive, else unpredictiable.
2235: * The value of @outlen after return is the number of octets consumed.
2236: */
2237: int
2238: UTF8ToSgml(unsigned char* out, int *outlen,
2239: const unsigned char* in, int *inlen) {
2240: const unsigned char* processed = in;
2241: const unsigned char* outend;
2242: const unsigned char* outstart = out;
2243: const unsigned char* instart = in;
2244: const unsigned char* inend;
2245: unsigned int c, d;
2246: int trailing;
2247:
2248: if (in == NULL) {
2249: /*
2250: * initialization nothing to do
2251: */
2252: *outlen = 0;
2253: *inlen = 0;
2254: return(0);
2255: }
2256: inend = in + (*inlen);
2257: outend = out + (*outlen);
2258: while (in < inend) {
2259: d = *in++;
2260: if (d < 0x80) { c= d; trailing= 0; }
2261: else if (d < 0xC0) {
2262: /* trailing byte in leading position */
2263: *outlen = out - outstart;
2264: *inlen = processed - instart;
2265: return(-2);
2266: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2267: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2268: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2269: else {
2270: /* no chance for this in Ascii */
2271: *outlen = out - outstart;
2272: *inlen = processed - instart;
2273: return(-2);
2274: }
2275:
2276: if (inend - in < trailing) {
2277: break;
2278: }
2279:
2280: for ( ; trailing; trailing--) {
2281: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2282: break;
2283: c <<= 6;
2284: c |= d & 0x3F;
2285: }
2286:
2287: /* assertion: c is a single UTF-4 value */
2288: if (c < 0x80) {
2289: if (out + 1 >= outend)
2290: break;
2291: *out++ = c;
2292: } else {
2293: int len;
2294: sgmlEntityDescPtr ent;
2295:
2296: /*
2297: * Try to lookup a predefined SGML entity for it
2298: */
2299:
2300: ent = sgmlEntityValueLookup(c);
2301: if (ent == NULL) {
2302: /* no chance for this in Ascii */
2303: *outlen = out - outstart;
2304: *inlen = processed - instart;
2305: return(-2);
2306: }
2307: len = strlen(ent->name);
2308: if (out + 2 + len >= outend)
2309: break;
2310: *out++ = '&';
2311: memcpy(out, ent->name, len);
2312: out += len;
2313: *out++ = ';';
2314: }
2315: processed = in;
2316: }
2317: *outlen = out - outstart;
2318: *inlen = processed - instart;
2319: return(0);
2320: }
2321:
2322: /**
2323: * sgmlEncodeEntities:
2324: * @out: a pointer to an array of bytes to store the result
2325: * @outlen: the length of @out
2326: * @in: a pointer to an array of UTF-8 chars
2327: * @inlen: the length of @in
2328: * @quoteChar: the quote character to escape (' or ") or zero.
2329: *
2330: * Take a block of UTF-8 chars in and try to convert it to an ASCII
2331: * plus SGML entities block of chars out.
2332: *
2333: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2334: * The value of @inlen after return is the number of octets consumed
2335: * as the return value is positive, else unpredictiable.
2336: * The value of @outlen after return is the number of octets consumed.
2337: */
2338: int
2339: sgmlEncodeEntities(unsigned char* out, int *outlen,
2340: const unsigned char* in, int *inlen, int quoteChar) {
2341: const unsigned char* processed = in;
2342: const unsigned char* outend = out + (*outlen);
2343: const unsigned char* outstart = out;
2344: const unsigned char* instart = in;
2345: const unsigned char* inend = in + (*inlen);
2346: unsigned int c, d;
2347: int trailing;
2348:
2349: while (in < inend) {
2350: d = *in++;
2351: if (d < 0x80) { c= d; trailing= 0; }
2352: else if (d < 0xC0) {
2353: /* trailing byte in leading position */
2354: *outlen = out - outstart;
2355: *inlen = processed - instart;
2356: return(-2);
2357: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2358: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2359: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2360: else {
2361: /* no chance for this in Ascii */
2362: *outlen = out - outstart;
2363: *inlen = processed - instart;
2364: return(-2);
2365: }
2366:
2367: if (inend - in < trailing)
2368: break;
2369:
2370: while (trailing--) {
2371: if (((d= *in++) & 0xC0) != 0x80) {
2372: *outlen = out - outstart;
2373: *inlen = processed - instart;
2374: return(-2);
2375: }
2376: c <<= 6;
2377: c |= d & 0x3F;
2378: }
2379:
2380: /* assertion: c is a single UTF-4 value */
2381: if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
2382: if (out >= outend)
2383: break;
2384: *out++ = c;
2385: } else {
2386: sgmlEntityDescPtr ent;
2387: const char *cp;
2388: char nbuf[16];
2389: int len;
2390:
2391: /*
2392: * Try to lookup a predefined SGML entity for it
2393: */
2394: ent = sgmlEntityValueLookup(c);
2395: if (ent == NULL) {
2396: sprintf(nbuf, "#%u", c);
2397: cp = nbuf;
2398: }
2399: else
2400: cp = ent->name;
2401: len = strlen(cp);
2402: if (out + 2 + len > outend)
2403: break;
2404: *out++ = '&';
2405: memcpy(out, cp, len);
2406: out += len;
2407: *out++ = ';';
2408: }
2409: processed = in;
2410: }
2411: *outlen = out - outstart;
2412: *inlen = processed - instart;
2413: return(0);
2414: }
2415:
2416: /**
2417: * sgmlDecodeEntities:
2418: * @ctxt: the parser context
2419: * @len: the len to decode (in bytes !), -1 for no size limit
2420: * @end: an end marker xmlChar, 0 if none
2421: * @end2: an end marker xmlChar, 0 if none
2422: * @end3: an end marker xmlChar, 0 if none
2423: *
2424: * Subtitute the SGML entities by their value
2425: *
2426: * DEPRECATED !!!!
2427: *
2428: * Returns A newly allocated string with the substitution done. The caller
2429: * must deallocate it !
2430: */
2431: xmlChar *
2432: sgmlDecodeEntities(sgmlParserCtxtPtr ctxt, int len,
2433: xmlChar end, xmlChar end2, xmlChar end3) {
2434: xmlChar *name = NULL;
2435: xmlChar *buffer = NULL;
2436: unsigned int buffer_size = 0;
2437: unsigned int nbchars = 0;
2438: sgmlEntityDescPtr ent;
2439: unsigned int max = (unsigned int) len;
2440: int c,l;
2441:
2442: if (ctxt->depth > 40) {
1.6 veillard 2443: ctxt->errNo = XML_ERR_ENTITY_LOOP;
1.1 veillard 2444: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2445: ctxt->sax->error(ctxt->userData,
2446: "Detected entity reference loop\n");
2447: ctxt->wellFormed = 0;
2448: ctxt->disableSAX = 1;
2449: return(NULL);
2450: }
2451:
2452: /*
2453: * allocate a translation buffer.
2454: */
2455: buffer_size = SGML_PARSER_BIG_BUFFER_SIZE;
2456: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2457: if (buffer == NULL) {
2458: perror("xmlDecodeEntities: malloc failed");
2459: return(NULL);
2460: }
2461:
2462: /*
2463: * Ok loop until we reach one of the ending char or a size limit.
2464: */
2465: c = CUR_CHAR(l);
2466: while ((nbchars < max) && (c != end) &&
2467: (c != end2) && (c != end3)) {
2468:
2469: if (c == 0) break;
2470: if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
2471: int val = sgmlParseCharRef(ctxt);
2472: COPY_BUF(0,buffer,nbchars,val);
2473: NEXTL(l);
2474: } else if ((c == '&') && (ctxt->token != '&')) {
2475: ent = sgmlParseEntityRef(ctxt, &name);
2476: if (name != NULL) {
2477: if (ent != NULL) {
2478: int val = ent->value;
2479: COPY_BUF(0,buffer,nbchars,val);
2480: NEXTL(l);
2481: } else {
2482: const xmlChar *cur = name;
2483:
2484: buffer[nbchars++] = '&';
2485: if (nbchars > buffer_size - SGML_PARSER_BUFFER_SIZE) {
2486: growBuffer(buffer);
2487: }
2488: while (*cur != 0) {
2489: buffer[nbchars++] = *cur++;
2490: }
2491: buffer[nbchars++] = ';';
2492: }
2493: }
2494: } else {
2495: COPY_BUF(l,buffer,nbchars,c);
2496: NEXTL(l);
2497: if (nbchars > buffer_size - SGML_PARSER_BUFFER_SIZE) {
2498: growBuffer(buffer);
2499: }
2500: }
2501: c = CUR_CHAR(l);
2502: }
2503: buffer[nbchars++] = 0;
2504: return(buffer);
2505: }
2506:
2507: /************************************************************************
2508: * *
2509: * Commodity functions to handle streams *
2510: * *
2511: ************************************************************************/
2512:
2513: /**
2514: * sgmlFreeInputStream:
2515: * @input: an sgmlParserInputPtr
2516: *
2517: * Free up an input stream.
2518: */
2519: void
2520: sgmlFreeInputStream(sgmlParserInputPtr input) {
2521: if (input == NULL) return;
2522:
2523: if (input->filename != NULL) xmlFree((char *) input->filename);
2524: if (input->directory != NULL) xmlFree((char *) input->directory);
2525: if ((input->free != NULL) && (input->base != NULL))
2526: input->free((xmlChar *) input->base);
2527: if (input->buf != NULL)
2528: xmlFreeParserInputBuffer(input->buf);
2529: memset(input, -1, sizeof(sgmlParserInput));
2530: xmlFree(input);
2531: }
2532:
2533: /**
2534: * sgmlNewInputStream:
2535: * @ctxt: an SGML parser context
2536: *
2537: * Create a new input stream structure
2538: * Returns the new input stream or NULL
2539: */
2540: sgmlParserInputPtr
2541: sgmlNewInputStream(sgmlParserCtxtPtr ctxt) {
2542: sgmlParserInputPtr input;
2543:
2544: input = (xmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
2545: if (input == NULL) {
2546: ctxt->errNo = XML_ERR_NO_MEMORY;
2547: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2548: ctxt->sax->error(ctxt->userData,
2549: "malloc: couldn't allocate a new input stream\n");
2550: return(NULL);
2551: }
2552: memset(input, 0, sizeof(sgmlParserInput));
2553: input->filename = NULL;
2554: input->directory = NULL;
2555: input->base = NULL;
2556: input->cur = NULL;
2557: input->buf = NULL;
2558: input->line = 1;
2559: input->col = 1;
2560: input->buf = NULL;
2561: input->free = NULL;
2562: input->version = NULL;
2563: input->consumed = 0;
2564: input->length = 0;
2565: return(input);
2566: }
2567:
2568:
2569: /************************************************************************
2570: * *
2571: * Commodity functions, cleanup needed ? *
2572: * *
2573: ************************************************************************/
2574:
2575: /**
2576: * areBlanks:
2577: * @ctxt: an SGML parser context
2578: * @str: a xmlChar *
2579: * @len: the size of @str
2580: *
2581: * Is this a sequence of blank chars that one can ignore ?
2582: *
2583: * Returns 1 if ignorable 0 otherwise.
2584: */
2585:
2586: static int areBlanks(sgmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2587: int i;
2588: xmlNodePtr lastChild;
2589:
2590: for (i = 0;i < len;i++)
2591: if (!(IS_BLANK(str[i]))) return(0);
2592:
2593: if (CUR == 0) return(1);
2594: if (CUR != '<') return(0);
2595: if (ctxt->name == NULL)
2596: return(1);
2597: #if 0
1.7 veillard 2598: if (xmlStrEqual(ctxt->name, BAD_CAST"sgml"))
1.1 veillard 2599: return(1);
1.7 veillard 2600: if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1.1 veillard 2601: return(1);
1.7 veillard 2602: if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1.1 veillard 2603: return(1);
2604: #endif
2605: if (ctxt->node == NULL) return(0);
2606: lastChild = xmlGetLastChild(ctxt->node);
2607: if (lastChild == NULL) {
2608: if (ctxt->node->content != NULL) return(0);
2609: } else if (xmlNodeIsText(lastChild))
2610: return(0);
2611: return(1);
2612: }
2613:
2614: /**
2615: * sgmlHandleEntity:
2616: * @ctxt: an SGML parser context
2617: * @entity: an XML entity pointer.
2618: *
2619: * Default handling of an SGML entity, call the parser with the
2620: * substitution string
2621: */
2622:
2623: void
2624: sgmlHandleEntity(sgmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
2625: int len;
2626:
2627: if (entity->content == NULL) {
2628: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2629: ctxt->sax->error(ctxt->userData, "sgmlHandleEntity %s: content == NULL\n",
2630: entity->name);
2631: ctxt->wellFormed = 0;
2632: return;
2633: }
2634: len = xmlStrlen(entity->content);
2635:
2636: /*
2637: * Just handle the content as a set of chars.
2638: */
2639: sgmlCheckParagraph(ctxt);
2640: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2641: ctxt->sax->characters(ctxt->userData, entity->content, len);
2642:
2643: }
2644:
2645: /**
2646: * sgmlNewDocNoDtD:
2647: * @URI: URI for the dtd, or NULL
2648: * @ExternalID: the external ID of the DTD, or NULL
2649: *
2650: * Returns a new document, do not intialize the DTD if not provided
2651: */
2652: sgmlDocPtr
2653: sgmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2654: xmlDocPtr cur;
2655:
2656: /*
2657: * Allocate a new document and fill the fields.
2658: */
2659: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2660: if (cur == NULL) {
2661: fprintf(stderr, "xmlNewDoc : malloc failed\n");
2662: return(NULL);
2663: }
2664: memset(cur, 0, sizeof(xmlDoc));
2665:
2666: cur->type = XML_SGML_DOCUMENT_NODE;
2667: cur->version = NULL;
2668: cur->intSubset = NULL;
2669: if ((ExternalID != NULL) ||
2670: (URI != NULL))
2671: xmlCreateIntSubset(cur, BAD_CAST "SGML", ExternalID, URI);
2672: cur->doc = cur;
2673: cur->name = NULL;
2674: cur->children = NULL;
2675: cur->extSubset = NULL;
2676: cur->oldNs = NULL;
2677: cur->encoding = NULL;
2678: cur->standalone = 1;
2679: cur->compression = 0;
2680: cur->ids = NULL;
2681: cur->refs = NULL;
2682: #ifndef XML_WITHOUT_CORBA
2683: cur->_private = NULL;
2684: #endif
2685: return(cur);
2686: }
2687:
2688: /**
2689: * sgmlNewDoc:
2690: * @URI: URI for the dtd, or NULL
2691: * @ExternalID: the external ID of the DTD, or NULL
2692: *
2693: * Returns a new document
2694: */
2695: sgmlDocPtr
2696: sgmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2697: if ((URI == NULL) && (ExternalID == NULL))
2698: return(sgmlNewDocNoDtD(
2699: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
2700: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd"));
2701:
2702: return(sgmlNewDocNoDtD(URI, ExternalID));
2703: }
2704:
2705:
2706: /************************************************************************
2707: * *
2708: * The parser itself *
2709: * Relates to http://www.w3.org/TR/docbook *
2710: * *
2711: ************************************************************************/
2712:
2713: /************************************************************************
2714: * *
2715: * The parser itself *
2716: * *
2717: ************************************************************************/
2718:
2719: /**
2720: * sgmlParseSGMLName:
2721: * @ctxt: an SGML parser context
2722: *
2723: * parse an SGML tag or attribute name, note that we convert it to lowercase
2724: * since SGML names are not case-sensitive.
2725: *
2726: * Returns the Tag Name parsed or NULL
2727: */
2728:
2729: xmlChar *
2730: sgmlParseSGMLName(sgmlParserCtxtPtr ctxt) {
2731: xmlChar *ret = NULL;
2732: int i = 0;
2733: xmlChar loc[SGML_PARSER_BUFFER_SIZE];
2734:
2735: if (!IS_LETTER(CUR) && (CUR != '_') &&
2736: (CUR != ':')) return(NULL);
2737:
2738: while ((i < SGML_PARSER_BUFFER_SIZE) &&
2739: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2740: (CUR == ':') || (CUR == '_'))) {
2741: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2742: else loc[i] = CUR;
2743: i++;
2744:
2745: NEXT;
2746: }
2747:
2748: ret = xmlStrndup(loc, i);
2749:
2750: return(ret);
2751: }
2752:
2753: /**
2754: * sgmlParseName:
2755: * @ctxt: an SGML parser context
2756: *
2757: * parse an SGML name, this routine is case sensistive.
2758: *
2759: * Returns the Name parsed or NULL
2760: */
2761:
2762: xmlChar *
2763: sgmlParseName(sgmlParserCtxtPtr ctxt) {
2764: xmlChar buf[SGML_MAX_NAMELEN];
2765: int len = 0;
2766:
2767: GROW;
2768: if (!IS_LETTER(CUR) && (CUR != '_')) {
2769: return(NULL);
2770: }
2771:
2772: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2773: (CUR == '.') || (CUR == '-') ||
2774: (CUR == '_') || (CUR == ':') ||
2775: (IS_COMBINING(CUR)) ||
2776: (IS_EXTENDER(CUR))) {
2777: buf[len++] = CUR;
2778: NEXT;
2779: if (len >= SGML_MAX_NAMELEN) {
2780: fprintf(stderr,
2781: "sgmlParseName: reached SGML_MAX_NAMELEN limit\n");
2782: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2783: (CUR == '.') || (CUR == '-') ||
2784: (CUR == '_') || (CUR == ':') ||
2785: (IS_COMBINING(CUR)) ||
2786: (IS_EXTENDER(CUR)))
2787: NEXT;
2788: break;
2789: }
2790: }
2791: return(xmlStrndup(buf, len));
2792: }
2793:
2794: /**
2795: * sgmlParseSGMLAttribute:
2796: * @ctxt: an SGML parser context
2797: * @stop: a char stop value
2798: *
2799: * parse an SGML attribute value till the stop (quote), if
2800: * stop is 0 then it stops at the first space
2801: *
2802: * Returns the attribute parsed or NULL
2803: */
2804:
2805: xmlChar *
2806: sgmlParseSGMLAttribute(sgmlParserCtxtPtr ctxt, const xmlChar stop) {
2807: #if 0
2808: xmlChar buf[SGML_MAX_NAMELEN];
2809: int len = 0;
2810:
2811: GROW;
2812: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
2813: if ((stop == 0) && (IS_BLANK(CUR))) break;
2814: buf[len++] = CUR;
2815: NEXT;
2816: if (len >= SGML_MAX_NAMELEN) {
2817: fprintf(stderr,
2818: "sgmlParseSGMLAttribute: reached SGML_MAX_NAMELEN limit\n");
2819: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
2820: (CUR != '>') &&
2821: (CUR != '\'') && (CUR != '"'))
2822: NEXT;
2823: break;
2824: }
2825: }
2826: return(xmlStrndup(buf, len));
2827: #else
2828: xmlChar *buffer = NULL;
2829: int buffer_size = 0;
2830: xmlChar *out = NULL;
2831: xmlChar *name = NULL;
2832:
2833: xmlChar *cur = NULL;
2834: sgmlEntityDescPtr ent;
2835:
2836: /*
2837: * allocate a translation buffer.
2838: */
2839: buffer_size = SGML_PARSER_BIG_BUFFER_SIZE;
2840: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2841: if (buffer == NULL) {
2842: perror("sgmlParseSGMLAttribute: malloc failed");
2843: return(NULL);
2844: }
2845: out = buffer;
2846:
2847: /*
2848: * Ok loop until we reach one of the ending chars
2849: */
2850: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
2851: if ((stop == 0) && (IS_BLANK(CUR))) break;
2852: if (CUR == '&') {
2853: if (NXT(1) == '#') {
2854: unsigned int c;
2855: int bits;
2856:
2857: c = sgmlParseCharRef(ctxt);
2858: if (c < 0x80)
2859: { *out++ = c; bits= -6; }
2860: else if (c < 0x800)
2861: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2862: else if (c < 0x10000)
2863: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2864: else
2865: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2866:
2867: for ( ; bits >= 0; bits-= 6) {
2868: *out++ = ((c >> bits) & 0x3F) | 0x80;
2869: }
2870: } else {
2871: ent = sgmlParseEntityRef(ctxt, &name);
2872: if (name == NULL) {
2873: *out++ = '&';
2874: if (out - buffer > buffer_size - 100) {
2875: int index = out - buffer;
2876:
2877: growBuffer(buffer);
2878: out = &buffer[index];
2879: }
2880: } else if (ent == NULL) {
2881: *out++ = '&';
2882: cur = name;
2883: while (*cur != 0) {
2884: if (out - buffer > buffer_size - 100) {
2885: int index = out - buffer;
2886:
2887: growBuffer(buffer);
2888: out = &buffer[index];
2889: }
2890: *out++ = *cur++;
2891: }
2892: xmlFree(name);
2893: } else {
2894: unsigned int c;
2895: int bits;
2896:
2897: if (out - buffer > buffer_size - 100) {
2898: int index = out - buffer;
2899:
2900: growBuffer(buffer);
2901: out = &buffer[index];
2902: }
2903: c = (xmlChar)ent->value;
2904: if (c < 0x80)
2905: { *out++ = c; bits= -6; }
2906: else if (c < 0x800)
2907: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2908: else if (c < 0x10000)
2909: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2910: else
2911: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2912:
2913: for ( ; bits >= 0; bits-= 6) {
2914: *out++ = ((c >> bits) & 0x3F) | 0x80;
2915: }
2916: xmlFree(name);
2917: }
2918: }
2919: } else {
2920: unsigned int c;
2921: int bits;
2922:
2923: if (out - buffer > buffer_size - 100) {
2924: int index = out - buffer;
2925:
2926: growBuffer(buffer);
2927: out = &buffer[index];
2928: }
2929: c = CUR;
2930: if (c < 0x80)
2931: { *out++ = c; bits= -6; }
2932: else if (c < 0x800)
2933: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2934: else if (c < 0x10000)
2935: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2936: else
2937: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2938:
2939: for ( ; bits >= 0; bits-= 6) {
2940: *out++ = ((c >> bits) & 0x3F) | 0x80;
2941: }
2942: NEXT;
2943: }
2944: }
2945: *out++ = 0;
2946: return(buffer);
2947: #endif
2948: }
2949:
2950: /**
2951: * sgmlParseNmtoken:
2952: * @ctxt: an SGML parser context
2953: *
2954: * parse an SGML Nmtoken.
2955: *
2956: * Returns the Nmtoken parsed or NULL
2957: */
2958:
2959: xmlChar *
2960: sgmlParseNmtoken(sgmlParserCtxtPtr ctxt) {
2961: xmlChar buf[SGML_MAX_NAMELEN];
2962: int len = 0;
2963:
2964: GROW;
2965: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2966: (CUR == '.') || (CUR == '-') ||
2967: (CUR == '_') || (CUR == ':') ||
2968: (IS_COMBINING(CUR)) ||
2969: (IS_EXTENDER(CUR))) {
2970: buf[len++] = CUR;
2971: NEXT;
2972: if (len >= SGML_MAX_NAMELEN) {
2973: fprintf(stderr,
2974: "sgmlParseNmtoken: reached SGML_MAX_NAMELEN limit\n");
2975: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2976: (CUR == '.') || (CUR == '-') ||
2977: (CUR == '_') || (CUR == ':') ||
2978: (IS_COMBINING(CUR)) ||
2979: (IS_EXTENDER(CUR)))
2980: NEXT;
2981: break;
2982: }
2983: }
2984: return(xmlStrndup(buf, len));
2985: }
2986:
2987: /**
2988: * sgmlParseEntityRef:
2989: * @ctxt: an SGML parser context
2990: * @str: location to store the entity name
2991: *
2992: * parse an SGML ENTITY references
2993: *
2994: * [68] EntityRef ::= '&' Name ';'
2995: *
2996: * Returns the associated sgmlEntityDescPtr if found, or NULL otherwise,
2997: * if non-NULL *str will have to be freed by the caller.
2998: */
2999: sgmlEntityDescPtr
3000: sgmlParseEntityRef(sgmlParserCtxtPtr ctxt, xmlChar **str) {
3001: xmlChar *name;
3002: sgmlEntityDescPtr ent = NULL;
3003: *str = NULL;
3004:
3005: if (CUR == '&') {
3006: NEXT;
3007: name = sgmlParseName(ctxt);
3008: if (name == NULL) {
3009: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3010: ctxt->sax->error(ctxt->userData, "sgmlParseEntityRef: no name\n");
3011: ctxt->wellFormed = 0;
3012: } else {
3013: GROW;
3014: if (CUR == ';') {
3015: *str = name;
3016:
3017: /*
3018: * Lookup the entity in the table.
3019: */
3020: ent = sgmlEntityLookup(name);
3021: if (ent != NULL) /* OK that's ugly !!! */
3022: NEXT;
3023: } else {
3024: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3025: ctxt->sax->error(ctxt->userData,
3026: "sgmlParseEntityRef: expecting ';'\n");
3027: *str = name;
3028: }
3029: }
3030: }
3031: return(ent);
3032: }
3033:
3034: /**
3035: * sgmlParseAttValue:
3036: * @ctxt: an SGML parser context
3037: *
3038: * parse a value for an attribute
3039: * Note: the parser won't do substitution of entities here, this
3040: * will be handled later in xmlStringGetNodeList, unless it was
3041: * asked for ctxt->replaceEntities != 0
3042: *
3043: * Returns the AttValue parsed or NULL.
3044: */
3045:
3046: xmlChar *
3047: sgmlParseAttValue(sgmlParserCtxtPtr ctxt) {
3048: xmlChar *ret = NULL;
3049:
3050: if (CUR == '"') {
3051: NEXT;
3052: ret = sgmlParseSGMLAttribute(ctxt, '"');
3053: if (CUR != '"') {
3054: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3055: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
3056: ctxt->wellFormed = 0;
3057: } else
3058: NEXT;
3059: } else if (CUR == '\'') {
3060: NEXT;
3061: ret = sgmlParseSGMLAttribute(ctxt, '\'');
3062: if (CUR != '\'') {
3063: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3064: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
3065: ctxt->wellFormed = 0;
3066: } else
3067: NEXT;
3068: } else {
3069: /*
3070: * That's an SGMLism, the attribute value may not be quoted
3071: */
3072: ret = sgmlParseSGMLAttribute(ctxt, 0);
3073: if (ret == NULL) {
3074: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3075: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
3076: ctxt->wellFormed = 0;
3077: }
3078: }
3079: return(ret);
3080: }
3081:
3082: /**
3083: * sgmlParseSystemLiteral:
3084: * @ctxt: an SGML parser context
3085: *
3086: * parse an SGML Literal
3087: *
3088: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
3089: *
3090: * Returns the SystemLiteral parsed or NULL
3091: */
3092:
3093: xmlChar *
3094: sgmlParseSystemLiteral(sgmlParserCtxtPtr ctxt) {
3095: const xmlChar *q;
3096: xmlChar *ret = NULL;
3097:
3098: if (CUR == '"') {
3099: NEXT;
3100: q = CUR_PTR;
3101: while ((IS_CHAR(CUR)) && (CUR != '"'))
3102: NEXT;
3103: if (!IS_CHAR(CUR)) {
3104: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3105: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
3106: ctxt->wellFormed = 0;
3107: } else {
3108: ret = xmlStrndup(q, CUR_PTR - q);
3109: NEXT;
3110: }
3111: } else if (CUR == '\'') {
3112: NEXT;
3113: q = CUR_PTR;
3114: while ((IS_CHAR(CUR)) && (CUR != '\''))
3115: NEXT;
3116: if (!IS_CHAR(CUR)) {
3117: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3118: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
3119: ctxt->wellFormed = 0;
3120: } else {
3121: ret = xmlStrndup(q, CUR_PTR - q);
3122: NEXT;
3123: }
3124: } else {
3125: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3126: ctxt->sax->error(ctxt->userData,
3127: "SystemLiteral \" or ' expected\n");
3128: ctxt->wellFormed = 0;
3129: }
3130:
3131: return(ret);
3132: }
3133:
3134: /**
3135: * sgmlParsePubidLiteral:
3136: * @ctxt: an SGML parser context
3137: *
3138: * parse an SGML public literal
3139: *
3140: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3141: *
3142: * Returns the PubidLiteral parsed or NULL.
3143: */
3144:
3145: xmlChar *
3146: sgmlParsePubidLiteral(sgmlParserCtxtPtr ctxt) {
3147: const xmlChar *q;
3148: xmlChar *ret = NULL;
3149: /*
3150: * Name ::= (Letter | '_') (NameChar)*
3151: */
3152: if (CUR == '"') {
3153: NEXT;
3154: q = CUR_PTR;
3155: while (IS_PUBIDCHAR(CUR)) NEXT;
3156: if (CUR != '"') {
3157: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3158: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
3159: ctxt->wellFormed = 0;
3160: } else {
3161: ret = xmlStrndup(q, CUR_PTR - q);
3162: NEXT;
3163: }
3164: } else if (CUR == '\'') {
3165: NEXT;
3166: q = CUR_PTR;
3167: while ((IS_LETTER(CUR)) && (CUR != '\''))
3168: NEXT;
3169: if (!IS_LETTER(CUR)) {
3170: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3171: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
3172: ctxt->wellFormed = 0;
3173: } else {
3174: ret = xmlStrndup(q, CUR_PTR - q);
3175: NEXT;
3176: }
3177: } else {
3178: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3179: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
3180: ctxt->wellFormed = 0;
3181: }
3182:
3183: return(ret);
3184: }
3185:
3186: /**
3187: * sgmlParseCharData:
3188: * @ctxt: an SGML parser context
3189: * @cdata: int indicating whether we are within a CDATA section
3190: *
3191: * parse a CharData section.
3192: * if we are within a CDATA section ']]>' marks an end of section.
3193: *
3194: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3195: */
3196:
3197: void
3198: sgmlParseCharData(sgmlParserCtxtPtr ctxt, int cdata) {
3199: xmlChar buf[SGML_PARSER_BIG_BUFFER_SIZE + 5];
3200: int nbchar = 0;
3201: int cur, l;
3202:
3203: SHRINK;
3204: cur = CUR_CHAR(l);
3205: while (((cur != '<') || (ctxt->token == '<')) &&
3206: ((cur != '&') || (ctxt->token == '&')) &&
3207: (IS_CHAR(cur))) {
3208: COPY_BUF(l,buf,nbchar,cur);
3209: if (nbchar >= SGML_PARSER_BIG_BUFFER_SIZE) {
3210: /*
3211: * Ok the segment is to be consumed as chars.
3212: */
3213: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3214: if (areBlanks(ctxt, buf, nbchar)) {
3215: if (ctxt->sax->ignorableWhitespace != NULL)
3216: ctxt->sax->ignorableWhitespace(ctxt->userData,
3217: buf, nbchar);
3218: } else {
3219: sgmlCheckParagraph(ctxt);
3220: if (ctxt->sax->characters != NULL)
3221: ctxt->sax->characters(ctxt->userData, buf, nbchar);
3222: }
3223: }
3224: nbchar = 0;
3225: }
3226: NEXTL(l);
3227: cur = CUR_CHAR(l);
3228: }
3229: if (nbchar != 0) {
3230: /*
3231: * Ok the segment is to be consumed as chars.
3232: */
3233: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3234: if (areBlanks(ctxt, buf, nbchar)) {
3235: if (ctxt->sax->ignorableWhitespace != NULL)
3236: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
3237: } else {
3238: sgmlCheckParagraph(ctxt);
3239: if (ctxt->sax->characters != NULL)
3240: ctxt->sax->characters(ctxt->userData, buf, nbchar);
3241: }
3242: }
3243: }
3244: }
3245:
3246: /**
3247: * sgmlParseExternalID:
3248: * @ctxt: an SGML parser context
3249: * @publicID: a xmlChar** receiving PubidLiteral
3250: * @strict: indicate whether we should restrict parsing to only
3251: * production [75], see NOTE below
3252: *
3253: * Parse an External ID or a Public ID
3254: *
3255: * NOTE: Productions [75] and [83] interract badly since [75] can generate
3256: * 'PUBLIC' S PubidLiteral S SystemLiteral
3257: *
3258: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3259: * | 'PUBLIC' S PubidLiteral S SystemLiteral
3260: *
3261: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3262: *
3263: * Returns the function returns SystemLiteral and in the second
3264: * case publicID receives PubidLiteral, is strict is off
3265: * it is possible to return NULL and have publicID set.
3266: */
3267:
3268: xmlChar *
3269: sgmlParseExternalID(sgmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
3270: xmlChar *URI = NULL;
3271:
3272: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3273: (UPP(2) == 'S') && (UPP(3) == 'T') &&
3274: (UPP(4) == 'E') && (UPP(5) == 'M')) {
3275: SKIP(6);
3276: if (!IS_BLANK(CUR)) {
3277: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3278: ctxt->sax->error(ctxt->userData,
3279: "Space required after 'SYSTEM'\n");
3280: ctxt->wellFormed = 0;
3281: }
3282: SKIP_BLANKS;
3283: URI = sgmlParseSystemLiteral(ctxt);
3284: if (URI == NULL) {
3285: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3286: ctxt->sax->error(ctxt->userData,
3287: "sgmlParseExternalID: SYSTEM, no URI\n");
3288: ctxt->wellFormed = 0;
3289: }
3290: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3291: (UPP(2) == 'B') && (UPP(3) == 'L') &&
3292: (UPP(4) == 'I') && (UPP(5) == 'C')) {
3293: SKIP(6);
3294: if (!IS_BLANK(CUR)) {
3295: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3296: ctxt->sax->error(ctxt->userData,
3297: "Space required after 'PUBLIC'\n");
3298: ctxt->wellFormed = 0;
3299: }
3300: SKIP_BLANKS;
3301: *publicID = sgmlParsePubidLiteral(ctxt);
3302: if (*publicID == NULL) {
3303: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3304: ctxt->sax->error(ctxt->userData,
3305: "sgmlParseExternalID: PUBLIC, no Public Identifier\n");
3306: ctxt->wellFormed = 0;
3307: }
3308: SKIP_BLANKS;
3309: if ((CUR == '"') || (CUR == '\'')) {
3310: URI = sgmlParseSystemLiteral(ctxt);
3311: }
3312: }
3313: return(URI);
3314: }
3315:
3316: /**
3317: * sgmlParseComment:
3318: * @ctxt: an SGML parser context
3319: *
3320: * Parse an XML (SGML) comment <!-- .... -->
3321: *
3322: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3323: */
3324: void
3325: sgmlParseComment(sgmlParserCtxtPtr ctxt) {
3326: xmlChar *buf = NULL;
3327: int len;
3328: int size = SGML_PARSER_BUFFER_SIZE;
3329: int q, ql;
3330: int r, rl;
3331: int cur, l;
3332: xmlParserInputState state;
3333:
3334: /*
3335: * Check that there is a comment right here.
3336: */
3337: if ((RAW != '<') || (NXT(1) != '!') ||
3338: (NXT(2) != '-') || (NXT(3) != '-')) return;
3339:
3340: state = ctxt->instate;
3341: ctxt->instate = XML_PARSER_COMMENT;
3342: SHRINK;
3343: SKIP(4);
3344: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
3345: if (buf == NULL) {
3346: fprintf(stderr, "malloc of %d byte failed\n", size);
3347: ctxt->instate = state;
3348: return;
3349: }
3350: q = CUR_CHAR(ql);
3351: NEXTL(ql);
3352: r = CUR_CHAR(rl);
3353: NEXTL(rl);
3354: cur = CUR_CHAR(l);
3355: len = 0;
3356: while (IS_CHAR(cur) &&
3357: ((cur != '>') ||
3358: (r != '-') || (q != '-'))) {
3359: if (len + 5 >= size) {
3360: size *= 2;
3361: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3362: if (buf == NULL) {
3363: fprintf(stderr, "realloc of %d byte failed\n", size);
3364: ctxt->instate = state;
3365: return;
3366: }
3367: }
3368: COPY_BUF(ql,buf,len,q);
3369: q = r;
3370: ql = rl;
3371: r = cur;
3372: rl = l;
3373: NEXTL(l);
3374: cur = CUR_CHAR(l);
3375: if (cur == 0) {
3376: SHRINK;
3377: GROW;
3378: cur = CUR_CHAR(l);
3379: }
3380: }
3381: buf[len] = 0;
3382: if (!IS_CHAR(cur)) {
1.6 veillard 3383: ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
1.1 veillard 3384: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3385: ctxt->sax->error(ctxt->userData,
3386: "Comment not terminated \n<!--%.50s\n", buf);
3387: ctxt->wellFormed = 0;
3388: xmlFree(buf);
3389: } else {
3390: NEXT;
3391: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3392: (!ctxt->disableSAX))
3393: ctxt->sax->comment(ctxt->userData, buf);
3394: xmlFree(buf);
3395: }
3396: ctxt->instate = state;
3397: }
3398:
3399: /**
3400: * sgmlParseCharRef:
3401: * @ctxt: an SGML parser context
3402: *
3403: * parse Reference declarations
3404: *
3405: * [66] CharRef ::= '&#' [0-9]+ ';' |
3406: * '&#x' [0-9a-fA-F]+ ';'
3407: *
3408: * Returns the value parsed (as an int)
3409: */
3410: int
3411: sgmlParseCharRef(sgmlParserCtxtPtr ctxt) {
3412: int val = 0;
3413:
3414: if ((CUR == '&') && (NXT(1) == '#') &&
3415: (NXT(2) == 'x')) {
3416: SKIP(3);
3417: while (CUR != ';') {
3418: if ((CUR >= '0') && (CUR <= '9'))
3419: val = val * 16 + (CUR - '0');
3420: else if ((CUR >= 'a') && (CUR <= 'f'))
3421: val = val * 16 + (CUR - 'a') + 10;
3422: else if ((CUR >= 'A') && (CUR <= 'F'))
3423: val = val * 16 + (CUR - 'A') + 10;
3424: else {
3425: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3426: ctxt->sax->error(ctxt->userData,
3427: "sgmlParseCharRef: invalid hexadecimal value\n");
3428: ctxt->wellFormed = 0;
3429: val = 0;
3430: break;
3431: }
3432: NEXT;
3433: }
3434: if (CUR == ';')
3435: NEXT;
3436: } else if ((CUR == '&') && (NXT(1) == '#')) {
3437: SKIP(2);
3438: while (CUR != ';') {
3439: if ((CUR >= '0') && (CUR <= '9'))
3440: val = val * 10 + (CUR - '0');
3441: else {
3442: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3443: ctxt->sax->error(ctxt->userData,
3444: "sgmlParseCharRef: invalid decimal value\n");
3445: ctxt->wellFormed = 0;
3446: val = 0;
3447: break;
3448: }
3449: NEXT;
3450: }
3451: if (CUR == ';')
3452: NEXT;
3453: } else {
3454: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3455: ctxt->sax->error(ctxt->userData, "sgmlParseCharRef: invalid value\n");
3456: ctxt->wellFormed = 0;
3457: }
3458: /*
3459: * Check the value IS_CHAR ...
3460: */
3461: if (IS_CHAR(val)) {
3462: return(val);
3463: } else {
3464: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3465: ctxt->sax->error(ctxt->userData, "sgmlParseCharRef: invalid xmlChar value %d\n",
3466: val);
3467: ctxt->wellFormed = 0;
3468: }
3469: return(0);
3470: }
3471:
3472:
3473: /**
3474: * sgmlParseDocTypeDecl :
3475: * @ctxt: an SGML parser context
3476: *
3477: * parse a DOCTYPE declaration
3478: *
3479: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3480: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3481: */
3482:
3483: void
3484: sgmlParseDocTypeDecl(sgmlParserCtxtPtr ctxt) {
3485: xmlChar *name;
3486: xmlChar *ExternalID = NULL;
3487: xmlChar *URI = NULL;
3488:
3489: /*
3490: * We know that '<!DOCTYPE' has been detected.
3491: */
3492: SKIP(9);
3493:
3494: SKIP_BLANKS;
3495:
3496: /*
3497: * Parse the DOCTYPE name.
3498: */
3499: name = sgmlParseName(ctxt);
3500: if (name == NULL) {
3501: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3502: ctxt->sax->error(ctxt->userData, "sgmlParseDocTypeDecl : no DOCTYPE name !\n");
3503: ctxt->wellFormed = 0;
3504: }
3505: /*
3506: * Check that upper(name) == "SGML" !!!!!!!!!!!!!
3507: */
3508:
3509: SKIP_BLANKS;
3510:
3511: /*
3512: * Check for SystemID and ExternalID
3513: */
3514: URI = sgmlParseExternalID(ctxt, &ExternalID, 0);
3515: SKIP_BLANKS;
3516:
3517: /*
1.2 veillard 3518: * Create or update the document accordingly to the DOCTYPE
3519: */
3520: if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3521: (!ctxt->disableSAX))
3522: ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3523:
3524: /*
3525: * Is there any internal subset declarations ?
3526: * they are handled separately in sgmlParseInternalSubset()
3527: */
3528: if (RAW == '[')
3529: return;
3530:
3531:
3532: /*
1.1 veillard 3533: * We should be at the end of the DOCTYPE declaration.
3534: */
3535: if (CUR != '>') {
3536: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3537: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
3538: ctxt->wellFormed = 0;
3539: /* We shouldn't try to resynchronize ... */
3540: }
3541: NEXT;
3542:
3543: /*
3544: * Cleanup, since we don't use all those identifiers
3545: */
3546: if (URI != NULL) xmlFree(URI);
3547: if (ExternalID != NULL) xmlFree(ExternalID);
3548: if (name != NULL) xmlFree(name);
3549: }
3550:
3551: /**
3552: * sgmlParseAttribute:
3553: * @ctxt: an SGML parser context
3554: * @value: a xmlChar ** used to store the value of the attribute
3555: *
3556: * parse an attribute
3557: *
3558: * [41] Attribute ::= Name Eq AttValue
3559: *
3560: * [25] Eq ::= S? '=' S?
3561: *
3562: * With namespace:
3563: *
3564: * [NS 11] Attribute ::= QName Eq AttValue
3565: *
3566: * Also the case QName == xmlns:??? is handled independently as a namespace
3567: * definition.
3568: *
3569: * Returns the attribute name, and the value in *value.
3570: */
3571:
3572: xmlChar *
3573: sgmlParseAttribute(sgmlParserCtxtPtr ctxt, xmlChar **value) {
3574: xmlChar *name, *val = NULL;
3575:
3576: *value = NULL;
3577: name = sgmlParseName(ctxt);
3578: if (name == NULL) {
3579: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3580: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3581: ctxt->wellFormed = 0;
3582: return(NULL);
3583: }
3584:
3585: /*
3586: * read the value
3587: */
3588: SKIP_BLANKS;
3589: if (CUR == '=') {
3590: NEXT;
3591: SKIP_BLANKS;
3592: val = sgmlParseAttValue(ctxt);
3593: /******
3594: } else {
3595: * TODO : some attribute must have values, some may not
3596: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3597: ctxt->sax->warning(ctxt->userData,
3598: "No value for attribute %s\n", name); */
3599: }
3600:
3601: *value = val;
3602: return(name);
3603: }
3604:
3605: /**
3606: * sgmlCheckEncoding:
3607: * @ctxt: an SGML parser context
3608: * @attvalue: the attribute value
3609: *
3610: * Checks an http-equiv attribute from a Meta tag to detect
3611: * the encoding
3612: * If a new encoding is detected the parser is switched to decode
3613: * it and pass UTF8
3614: */
3615: void
3616: sgmlCheckEncoding(sgmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3617: const xmlChar *encoding;
3618:
3619: if ((ctxt == NULL) || (attvalue == NULL))
3620: return;
3621:
3622: encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
3623: if (encoding == NULL)
3624: encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
3625: if (encoding == NULL)
3626: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
3627: if (encoding != NULL) {
3628: encoding += 8;
3629: } else {
3630: encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
3631: if (encoding == NULL)
3632: encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
3633: if (encoding == NULL)
3634: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
3635: if (encoding != NULL)
3636: encoding += 9;
3637: }
3638: if (encoding != NULL) {
3639: xmlCharEncoding enc;
3640: xmlCharEncodingHandlerPtr handler;
3641:
3642: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3643:
3644: if (ctxt->input->encoding != NULL)
3645: xmlFree((xmlChar *) ctxt->input->encoding);
3646: ctxt->input->encoding = xmlStrdup(encoding);
3647:
3648: enc = xmlParseCharEncoding((const char *) encoding);
3649: /*
3650: * registered set of known encodings
3651: */
3652: if (enc != XML_CHAR_ENCODING_ERROR) {
3653: xmlSwitchEncoding(ctxt, enc);
3654: ctxt->charset = XML_CHAR_ENCODING_UTF8;
3655: } else {
3656: /*
3657: * fallback for unknown encodings
3658: */
3659: handler = xmlFindCharEncodingHandler((const char *) encoding);
3660: if (handler != NULL) {
3661: xmlSwitchToEncoding(ctxt, handler);
3662: ctxt->charset = XML_CHAR_ENCODING_UTF8;
3663: } else {
3664: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3665: }
3666: }
3667:
3668: if ((ctxt->input->buf != NULL) &&
3669: (ctxt->input->buf->encoder != NULL) &&
3670: (ctxt->input->buf->raw != NULL) &&
3671: (ctxt->input->buf->buffer != NULL)) {
3672: int nbchars;
3673: int processed;
3674:
3675: /*
3676: * convert as much as possible to the parser reading buffer.
3677: */
3678: processed = ctxt->input->cur - ctxt->input->base;
3679: xmlBufferShrink(ctxt->input->buf->buffer, processed);
3680: nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3681: ctxt->input->buf->buffer,
3682: ctxt->input->buf->raw);
3683: if (nbchars < 0) {
1.6 veillard 3684: ctxt->errNo = XML_ERR_INVALID_ENCODING;
1.1 veillard 3685: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3686: ctxt->sax->error(ctxt->userData,
3687: "sgmlCheckEncoding: encoder error\n");
3688: }
3689: ctxt->input->base =
3690: ctxt->input->cur = ctxt->input->buf->buffer->content;
3691: }
3692: }
3693: }
3694:
3695: /**
3696: * sgmlCheckMeta:
3697: * @ctxt: an SGML parser context
3698: * @atts: the attributes values
3699: *
3700: * Checks an attributes from a Meta tag
3701: */
3702: void
3703: sgmlCheckMeta(sgmlParserCtxtPtr ctxt, const xmlChar **atts) {
3704: int i;
3705: const xmlChar *att, *value;
3706: int http = 0;
3707: const xmlChar *content = NULL;
3708:
3709: if ((ctxt == NULL) || (atts == NULL))
3710: return;
3711:
3712: i = 0;
3713: att = atts[i++];
3714: while (att != NULL) {
3715: value = atts[i++];
3716: if ((value != NULL) &&
1.7 veillard 3717: ((xmlStrEqual(att, BAD_CAST"http-equiv")) ||
3718: (xmlStrEqual(att, BAD_CAST"Http-Equiv")) ||
3719: (xmlStrEqual(att, BAD_CAST"HTTP-EQUIV"))) &&
3720: ((xmlStrEqual(value, BAD_CAST"Content-Type")) ||
3721: (xmlStrEqual(value, BAD_CAST"content-type")) ||
3722: (xmlStrEqual(value, BAD_CAST"CONTENT-TYPE"))))
1.1 veillard 3723: http = 1;
3724: else if ((value != NULL) &&
1.7 veillard 3725: ((xmlStrEqual(att, BAD_CAST"content")) ||
3726: (xmlStrEqual(att, BAD_CAST"Content")) ||
3727: (xmlStrEqual(att, BAD_CAST"CONTENT"))))
1.1 veillard 3728: content = value;
3729: att = atts[i++];
3730: }
3731: if ((http) && (content != NULL))
3732: sgmlCheckEncoding(ctxt, content);
3733:
3734: }
3735:
3736: /**
3737: * sgmlParseStartTag:
3738: * @ctxt: an SGML parser context
3739: *
3740: * parse a start of tag either for rule element or
3741: * EmptyElement. In both case we don't parse the tag closing chars.
3742: *
3743: * [40] STag ::= '<' Name (S Attribute)* S? '>'
3744: *
3745: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3746: *
3747: * With namespace:
3748: *
3749: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3750: *
3751: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3752: *
3753: */
3754:
3755: void
3756: sgmlParseStartTag(sgmlParserCtxtPtr ctxt) {
3757: xmlChar *name;
3758: xmlChar *attname;
3759: xmlChar *attvalue;
3760: const xmlChar **atts = NULL;
3761: int nbatts = 0;
3762: int maxatts = 0;
3763: int meta = 0;
3764: int i;
3765:
3766: if (CUR != '<') return;
3767: NEXT;
3768:
3769: GROW;
3770: name = sgmlParseSGMLName(ctxt);
3771: if (name == NULL) {
3772: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3773: ctxt->sax->error(ctxt->userData,
3774: "sgmlParseStartTag: invalid element name\n");
3775: ctxt->wellFormed = 0;
3776: return;
3777: }
1.7 veillard 3778: if (xmlStrEqual(name, BAD_CAST"meta"))
1.1 veillard 3779: meta = 1;
3780:
3781: /*
3782: * Check for auto-closure of SGML elements.
3783: */
3784: sgmlAutoClose(ctxt, name);
3785:
3786: /*
3787: * Check for implied SGML elements.
3788: */
3789: sgmlCheckImplied(ctxt, name);
3790:
3791: /*
3792: * Now parse the attributes, it ends up with the ending
3793: *
3794: * (S Attribute)* S?
3795: */
3796: SKIP_BLANKS;
3797: while ((IS_CHAR(CUR)) &&
3798: (CUR != '>') &&
3799: ((CUR != '/') || (NXT(1) != '>'))) {
3800: long cons = ctxt->nbChars;
3801:
3802: GROW;
3803: attname = sgmlParseAttribute(ctxt, &attvalue);
3804: if (attname != NULL) {
3805:
3806: /*
3807: * Well formedness requires at most one declaration of an attribute
3808: */
3809: for (i = 0; i < nbatts;i += 2) {
1.7 veillard 3810: if (xmlStrEqual(atts[i], attname)) {
1.1 veillard 3811: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3812: ctxt->sax->error(ctxt->userData,
3813: "Attribute %s redefined\n",
3814: attname);
3815: ctxt->wellFormed = 0;
3816: xmlFree(attname);
3817: if (attvalue != NULL)
3818: xmlFree(attvalue);
3819: goto failed;
3820: }
3821: }
3822:
3823: /*
3824: * Add the pair to atts
3825: */
3826: if (atts == NULL) {
3827: maxatts = 10;
3828: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3829: if (atts == NULL) {
3830: fprintf(stderr, "malloc of %ld byte failed\n",
3831: maxatts * (long)sizeof(xmlChar *));
3832: if (name != NULL) xmlFree(name);
3833: return;
3834: }
3835: } else if (nbatts + 4 > maxatts) {
3836: maxatts *= 2;
3837: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
3838: if (atts == NULL) {
3839: fprintf(stderr, "realloc of %ld byte failed\n",
3840: maxatts * (long)sizeof(xmlChar *));
3841: if (name != NULL) xmlFree(name);
3842: return;
3843: }
3844: }
3845: atts[nbatts++] = attname;
3846: atts[nbatts++] = attvalue;
3847: atts[nbatts] = NULL;
3848: atts[nbatts + 1] = NULL;
3849: }
3850:
3851: failed:
3852: SKIP_BLANKS;
3853: if (cons == ctxt->nbChars) {
3854: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3855: ctxt->sax->error(ctxt->userData,
3856: "sgmlParseStartTag: problem parsing attributes\n");
3857: ctxt->wellFormed = 0;
3858: break;
3859: }
3860: }
3861:
3862: /*
3863: * Handle specific association to the META tag
3864: */
3865: if (meta)
3866: sgmlCheckMeta(ctxt, atts);
3867:
3868: /*
3869: * SAX: Start of Element !
3870: */
3871: sgmlnamePush(ctxt, xmlStrdup(name));
3872: #ifdef DEBUG
3873: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
3874: #endif
3875: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3876: ctxt->sax->startElement(ctxt->userData, name, atts);
3877:
3878: if (atts != NULL) {
3879: for (i = 0;i < nbatts;i++) {
3880: if (atts[i] != NULL)
3881: xmlFree((xmlChar *) atts[i]);
3882: }
3883: xmlFree((void *) atts);
3884: }
3885: if (name != NULL) xmlFree(name);
3886: }
3887:
3888: /**
3889: * sgmlParseEndTag:
3890: * @ctxt: an SGML parser context
3891: *
3892: * parse an end of tag
3893: *
3894: * [42] ETag ::= '</' Name S? '>'
3895: *
3896: * With namespace
3897: *
3898: * [NS 9] ETag ::= '</' QName S? '>'
3899: */
3900:
3901: void
3902: sgmlParseEndTag(sgmlParserCtxtPtr ctxt) {
3903: xmlChar *name;
3904: xmlChar *oldname;
3905: int i;
3906:
3907: if ((CUR != '<') || (NXT(1) != '/')) {
3908: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3909: ctxt->sax->error(ctxt->userData, "sgmlParseEndTag: '</' not found\n");
3910: ctxt->wellFormed = 0;
3911: return;
3912: }
3913: SKIP(2);
3914:
3915: name = sgmlParseSGMLName(ctxt);
3916: if (name == NULL) {
3917: if (CUR == '>') {
3918: NEXT;
3919: oldname = sgmlnamePop(ctxt);
3920: if (oldname != NULL) {
3921: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3922: ctxt->sax->endElement(ctxt->userData, name);
3923: #ifdef DEBUG
3924: fprintf(stderr,"End of tag </>: popping out %s\n", oldname);
3925: #endif
3926: xmlFree(oldname);
3927: #ifdef DEBUG
3928: } else {
3929: fprintf(stderr,"End of tag </>: stack empty !!!\n");
3930: #endif
3931: }
3932: return;
3933: } else
3934: return;
3935: }
3936:
3937: /*
3938: * We should definitely be at the ending "S? '>'" part
3939: */
3940: SKIP_BLANKS;
3941: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3942: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3943: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3944: ctxt->wellFormed = 0;
3945: } else
3946: NEXT;
3947:
3948: /*
3949: * If the name read is not one of the element in the parsing stack
3950: * then return, it's just an error.
3951: */
3952: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
1.7 veillard 3953: if (xmlStrEqual(name, ctxt->nameTab[i])) break;
1.1 veillard 3954: }
3955: if (i < 0) {
3956: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3957: ctxt->sax->error(ctxt->userData,
3958: "Unexpected end tag : %s\n", name);
3959: xmlFree(name);
3960: ctxt->wellFormed = 0;
3961: return;
3962: }
3963:
3964:
3965: /*
3966: * Check for auto-closure of SGML elements.
3967: */
3968:
3969: sgmlAutoCloseOnClose(ctxt, name);
3970:
3971: /*
3972: * Well formedness constraints, opening and closing must match.
3973: * With the exception that the autoclose may have popped stuff out
3974: * of the stack.
3975: */
3976: if (((name[0] != '/') || (name[1] != 0)) &&
1.7 veillard 3977: (!xmlStrEqual(name, ctxt->name))) {
1.1 veillard 3978: #ifdef DEBUG
3979: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
3980: #endif
3981: if ((ctxt->name != NULL) &&
1.7 veillard 3982: (!xmlStrEqual(ctxt->name, name))) {
1.1 veillard 3983: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3984: ctxt->sax->error(ctxt->userData,
3985: "Opening and ending tag mismatch: %s and %s\n",
3986: name, ctxt->name);
3987: ctxt->wellFormed = 0;
3988: }
3989: }
3990:
3991: /*
3992: * SAX: End of Tag
3993: */
3994: oldname = ctxt->name;
3995: if (((name[0] == '/') && (name[1] == 0)) ||
1.7 veillard 3996: ((oldname != NULL) && (xmlStrEqual(oldname, name)))) {
1.1 veillard 3997: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3998: ctxt->sax->endElement(ctxt->userData, name);
3999: oldname = sgmlnamePop(ctxt);
4000: if (oldname != NULL) {
4001: #ifdef DEBUG
4002: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
4003: #endif
4004: xmlFree(oldname);
4005: #ifdef DEBUG
4006: } else {
4007: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
4008: #endif
4009: }
4010: }
4011:
4012: if (name != NULL)
4013: xmlFree(name);
4014:
4015: return;
4016: }
4017:
4018:
4019: /**
4020: * sgmlParseReference:
4021: * @ctxt: an SGML parser context
4022: *
4023: * parse and handle entity references in content,
4024: * this will end-up in a call to character() since this is either a
4025: * CharRef, or a predefined entity.
4026: */
4027: void
4028: sgmlParseReference(sgmlParserCtxtPtr ctxt) {
4029: sgmlEntityDescPtr ent;
4030: xmlChar out[6];
4031: xmlChar *name;
4032: if (CUR != '&') return;
4033:
4034: if (NXT(1) == '#') {
4035: unsigned int c;
4036: int bits, i = 0;
4037:
4038: c = sgmlParseCharRef(ctxt);
4039: if (c < 0x80) { out[i++]= c; bits= -6; }
4040: else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4041: else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4042: else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4043:
4044: for ( ; bits >= 0; bits-= 6) {
4045: out[i++]= ((c >> bits) & 0x3F) | 0x80;
4046: }
4047: out[i] = 0;
4048:
4049: sgmlCheckParagraph(ctxt);
4050: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4051: ctxt->sax->characters(ctxt->userData, out, i);
4052: } else {
4053: ent = sgmlParseEntityRef(ctxt, &name);
4054: if (name == NULL) {
4055: sgmlCheckParagraph(ctxt);
4056: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4057: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4058: return;
4059: }
4060: if ((ent == NULL) || (ent->value <= 0)) {
4061: sgmlCheckParagraph(ctxt);
4062: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4063: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4064: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4065: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4066: }
4067: } else {
4068: unsigned int c;
4069: int bits, i = 0;
4070:
4071: c = ent->value;
4072: if (c < 0x80)
4073: { out[i++]= c; bits= -6; }
4074: else if (c < 0x800)
4075: { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4076: else if (c < 0x10000)
4077: { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4078: else
4079: { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4080:
4081: for ( ; bits >= 0; bits-= 6) {
4082: out[i++]= ((c >> bits) & 0x3F) | 0x80;
4083: }
4084: out[i] = 0;
4085:
4086: sgmlCheckParagraph(ctxt);
4087: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4088: ctxt->sax->characters(ctxt->userData, out, i);
4089: }
4090: xmlFree(name);
4091: }
4092: }
4093:
4094: /**
4095: * sgmlParseContent:
4096: * @ctxt: an SGML parser context
4097: * @name: the node name
4098: *
4099: * Parse a content: comment, sub-element, reference or text.
4100: *
4101: */
4102:
4103: void
4104: sgmlParseContent(sgmlParserCtxtPtr ctxt) {
4105: xmlChar *currentNode;
4106: int depth;
4107:
4108: currentNode = xmlStrdup(ctxt->name);
4109: depth = ctxt->nameNr;
4110: while (1) {
4111: long cons = ctxt->nbChars;
4112:
4113: GROW;
4114: /*
4115: * Our tag or one of it's parent or children is ending.
4116: */
4117: if ((CUR == '<') && (NXT(1) == '/')) {
4118: sgmlParseEndTag(ctxt);
4119: if (currentNode != NULL) xmlFree(currentNode);
4120: return;
4121: }
4122:
4123: /*
4124: * Has this node been popped out during parsing of
4125: * the next element
4126: */
1.7 veillard 4127: if ((!xmlStrEqual(currentNode, ctxt->name)) &&
1.1 veillard 4128: (depth >= ctxt->nameNr)) {
4129: if (currentNode != NULL) xmlFree(currentNode);
4130: return;
4131: }
4132:
4133: /*
4134: * Sometimes DOCTYPE arrives in the middle of the document
4135: */
4136: if ((CUR == '<') && (NXT(1) == '!') &&
4137: (UPP(2) == 'D') && (UPP(3) == 'O') &&
4138: (UPP(4) == 'C') && (UPP(5) == 'T') &&
4139: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4140: (UPP(8) == 'E')) {
4141: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4142: ctxt->sax->error(ctxt->userData,
4143: "Misplaced DOCTYPE declaration\n");
4144: ctxt->wellFormed = 0;
4145: sgmlParseDocTypeDecl(ctxt);
4146: }
4147:
4148: /*
4149: * First case : a comment
4150: */
4151: if ((CUR == '<') && (NXT(1) == '!') &&
4152: (NXT(2) == '-') && (NXT(3) == '-')) {
4153: sgmlParseComment(ctxt);
4154: }
4155:
4156: /*
4157: * Second case : a sub-element.
4158: */
4159: else if (CUR == '<') {
4160: sgmlParseElement(ctxt);
4161: }
4162:
4163: /*
4164: * Third case : a reference. If if has not been resolved,
4165: * parsing returns it's Name, create the node
4166: */
4167: else if (CUR == '&') {
4168: sgmlParseReference(ctxt);
4169: }
4170:
4171: /*
4172: * Fourth : end of the resource
4173: */
4174: else if (CUR == 0) {
4175: sgmlAutoClose(ctxt, NULL);
4176: }
4177:
4178: /*
4179: * Last case, text. Note that References are handled directly.
4180: */
4181: else {
4182: sgmlParseCharData(ctxt, 0);
4183: }
4184:
4185: if (cons == ctxt->nbChars) {
4186: if (ctxt->node != NULL) {
4187: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4188: ctxt->sax->error(ctxt->userData,
4189: "detected an error in element content\n");
4190: ctxt->wellFormed = 0;
4191: }
4192: break;
4193: }
4194:
4195: GROW;
4196: }
4197: if (currentNode != NULL) xmlFree(currentNode);
4198: }
4199:
4200: /**
4201: * sgmlParseElement:
4202: * @ctxt: an SGML parser context
4203: *
4204: * parse an SGML element, this is highly recursive
4205: *
4206: * [39] element ::= EmptyElemTag | STag content ETag
4207: *
4208: * [41] Attribute ::= Name Eq AttValue
4209: */
4210:
4211: void
4212: sgmlParseElement(sgmlParserCtxtPtr ctxt) {
4213: xmlChar *name;
4214: xmlChar *currentNode = NULL;
4215: sgmlElemDescPtr info;
4216: sgmlParserNodeInfo node_info;
4217: xmlChar *oldname;
4218: int depth = ctxt->nameNr;
4219:
4220: /* Capture start position */
4221: if (ctxt->record_info) {
4222: node_info.begin_pos = ctxt->input->consumed +
4223: (CUR_PTR - ctxt->input->base);
4224: node_info.begin_line = ctxt->input->line;
4225: }
4226:
4227: oldname = xmlStrdup(ctxt->name);
4228: sgmlParseStartTag(ctxt);
4229: name = ctxt->name;
4230: #ifdef DEBUG
4231: if (oldname == NULL)
4232: fprintf(stderr, "Start of element %s\n", name);
4233: else if (name == NULL)
4234: fprintf(stderr, "Start of element failed, was %s\n", oldname);
4235: else
4236: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
4237: #endif
1.7 veillard 4238: if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
1.1 veillard 4239: (name == NULL)) {
4240: if (CUR == '>')
4241: NEXT;
4242: if (oldname != NULL)
4243: xmlFree(oldname);
4244: return;
4245: }
4246: if (oldname != NULL)
4247: xmlFree(oldname);
4248:
4249: /*
4250: * Lookup the info for that element.
4251: */
4252: info = sgmlTagLookup(name);
4253: if (info == NULL) {
4254: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.4 veillard 4255: ctxt->sax->error(ctxt->userData, "Tag %s unknown\n",
1.1 veillard 4256: name);
4257: ctxt->wellFormed = 0;
4258: } else if (info->depr) {
4259: /***************************
4260: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4261: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
4262: name);
4263: ***************************/
4264: }
4265:
4266: /*
4267: * Check for an Empty Element labelled the XML/SGML way
4268: */
4269: if ((CUR == '/') && (NXT(1) == '>')) {
4270: SKIP(2);
4271: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4272: ctxt->sax->endElement(ctxt->userData, name);
4273: oldname = sgmlnamePop(ctxt);
4274: #ifdef DEBUG
4275: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
4276: #endif
4277: if (oldname != NULL)
4278: xmlFree(oldname);
4279: return;
4280: }
4281:
4282: if (CUR == '>') {
4283: NEXT;
4284: } else {
4285: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4286: ctxt->sax->error(ctxt->userData,
4287: "Couldn't find end of Start Tag %s\n",
4288: name);
4289: ctxt->wellFormed = 0;
4290:
4291: /*
4292: * end of parsing of this node.
4293: */
1.7 veillard 4294: if (xmlStrEqual(name, ctxt->name)) {
1.1 veillard 4295: nodePop(ctxt);
4296: oldname = sgmlnamePop(ctxt);
4297: #ifdef DEBUG
4298: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
4299: #endif
4300: if (oldname != NULL)
4301: xmlFree(oldname);
4302: }
4303:
4304: /*
4305: * Capture end position and add node
4306: */
4307: if ( currentNode != NULL && ctxt->record_info ) {
4308: node_info.end_pos = ctxt->input->consumed +
4309: (CUR_PTR - ctxt->input->base);
4310: node_info.end_line = ctxt->input->line;
4311: node_info.node = ctxt->node;
4312: xmlParserAddNodeInfo(ctxt, &node_info);
4313: }
4314: return;
4315: }
4316:
4317: /*
4318: * Check for an Empty Element from DTD definition
4319: */
4320: if ((info != NULL) && (info->empty)) {
4321: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4322: ctxt->sax->endElement(ctxt->userData, name);
4323: oldname = sgmlnamePop(ctxt);
4324: #ifdef DEBUG
4325: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
4326: #endif
4327: if (oldname != NULL)
4328: xmlFree(oldname);
4329: return;
4330: }
4331:
4332: /*
4333: * Parse the content of the element:
4334: */
4335: currentNode = xmlStrdup(ctxt->name);
4336: depth = ctxt->nameNr;
4337: while (IS_CHAR(CUR)) {
4338: sgmlParseContent(ctxt);
4339: if (ctxt->nameNr < depth) break;
4340: }
4341:
4342: if (!IS_CHAR(CUR)) {
4343: /************
4344: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4345: ctxt->sax->error(ctxt->userData,
4346: "Premature end of data in tag %s\n", currentNode);
4347: ctxt->wellFormed = 0;
4348: *************/
4349:
4350: /*
4351: * end of parsing of this node.
4352: */
4353: nodePop(ctxt);
4354: oldname = sgmlnamePop(ctxt);
4355: #ifdef DEBUG
4356: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
4357: #endif
4358: if (oldname != NULL)
4359: xmlFree(oldname);
4360: if (currentNode != NULL)
4361: xmlFree(currentNode);
4362: return;
4363: }
4364:
4365: /*
4366: * Capture end position and add node
4367: */
4368: if ( currentNode != NULL && ctxt->record_info ) {
4369: node_info.end_pos = ctxt->input->consumed +
4370: (CUR_PTR - ctxt->input->base);
4371: node_info.end_line = ctxt->input->line;
4372: node_info.node = ctxt->node;
4373: xmlParserAddNodeInfo(ctxt, &node_info);
4374: }
4375: if (currentNode != NULL)
4376: xmlFree(currentNode);
4377: }
4378:
4379: /**
1.3 veillard 4380: * sgmlParseEntityDecl:
4381: * @ctxt: an SGML parser context
4382: *
4383: * parse <!ENTITY declarations
4384: *
4385: */
4386:
4387: void
4388: sgmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
4389: xmlChar *name = NULL;
4390: xmlChar *value = NULL;
4391: xmlChar *URI = NULL, *literal = NULL;
4392: xmlChar *ndata = NULL;
4393: int isParameter = 0;
4394: xmlChar *orig = NULL;
4395:
4396: GROW;
4397: if ((RAW == '<') && (NXT(1) == '!') &&
4398: (NXT(2) == 'E') && (NXT(3) == 'N') &&
4399: (NXT(4) == 'T') && (NXT(5) == 'I') &&
4400: (NXT(6) == 'T') && (NXT(7) == 'Y')) {
4401: xmlParserInputPtr input = ctxt->input;
4402: ctxt->instate = XML_PARSER_ENTITY_DECL;
4403: SHRINK;
4404: SKIP(8);
4405: if (!IS_BLANK(CUR)) {
1.6 veillard 4406: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
1.3 veillard 4407: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4408: ctxt->sax->error(ctxt->userData,
4409: "Space required after '<!ENTITY'\n");
4410: ctxt->wellFormed = 0;
4411: ctxt->disableSAX = 1;
4412: }
4413: SKIP_BLANKS;
4414:
4415: if (RAW == '%') {
4416: NEXT;
4417: if (!IS_BLANK(CUR)) {
1.6 veillard 4418: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
1.3 veillard 4419: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4420: ctxt->sax->error(ctxt->userData,
4421: "Space required after '%'\n");
4422: ctxt->wellFormed = 0;
4423: ctxt->disableSAX = 1;
4424: }
4425: SKIP_BLANKS;
4426: isParameter = 1;
4427: }
4428:
4429: name = xmlParseName(ctxt);
4430: if (name == NULL) {
1.6 veillard 4431: ctxt->errNo = XML_ERR_NAME_REQUIRED;
1.3 veillard 4432: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4433: ctxt->sax->error(ctxt->userData, "sgmlarseEntityDecl: no name\n");
4434: ctxt->wellFormed = 0;
4435: ctxt->disableSAX = 1;
4436: return;
4437: }
4438: if (!IS_BLANK(CUR)) {
1.6 veillard 4439: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
1.3 veillard 4440: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4441: ctxt->sax->error(ctxt->userData,
4442: "Space required after the entity name\n");
4443: ctxt->wellFormed = 0;
4444: ctxt->disableSAX = 1;
4445: }
4446: SKIP_BLANKS;
4447:
4448: /*
4449: * handle the various case of definitions...
4450: */
4451: if (isParameter) {
4452: if ((RAW == '"') || (RAW == '\'')) {
4453: value = xmlParseEntityValue(ctxt, &orig);
4454: if (value) {
4455: if ((ctxt->sax != NULL) &&
4456: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
4457: ctxt->sax->entityDecl(ctxt->userData, name,
4458: XML_INTERNAL_PARAMETER_ENTITY,
4459: NULL, NULL, value);
4460: }
4461: } else {
4462: URI = xmlParseExternalID(ctxt, &literal, 1);
4463: if ((URI == NULL) && (literal == NULL)) {
1.6 veillard 4464: ctxt->errNo = XML_ERR_VALUE_REQUIRED;
1.3 veillard 4465: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4466: ctxt->sax->error(ctxt->userData,
4467: "Entity value required\n");
4468: ctxt->wellFormed = 0;
4469: ctxt->disableSAX = 1;
4470: }
4471: if (URI) {
4472: xmlURIPtr uri;
4473:
4474: uri = xmlParseURI((const char *) URI);
4475: if (uri == NULL) {
1.6 veillard 4476: ctxt->errNo = XML_ERR_INVALID_URI;
1.3 veillard 4477: if ((ctxt->sax != NULL) &&
4478: (!ctxt->disableSAX) &&
4479: (ctxt->sax->error != NULL))
4480: ctxt->sax->error(ctxt->userData,
4481: "Invalid URI: %s\n", URI);
4482: ctxt->wellFormed = 0;
4483: } else {
4484: if (uri->fragment != NULL) {
1.6 veillard 4485: ctxt->errNo = XML_ERR_URI_FRAGMENT;
1.3 veillard 4486: if ((ctxt->sax != NULL) &&
4487: (!ctxt->disableSAX) &&
4488: (ctxt->sax->error != NULL))
4489: ctxt->sax->error(ctxt->userData,
4490: "Fragment not allowed: %s\n", URI);
4491: ctxt->wellFormed = 0;
4492: } else {
4493: if ((ctxt->sax != NULL) &&
4494: (!ctxt->disableSAX) &&
4495: (ctxt->sax->entityDecl != NULL))
4496: ctxt->sax->entityDecl(ctxt->userData, name,
4497: XML_EXTERNAL_PARAMETER_ENTITY,
4498: literal, URI, NULL);
4499: }
4500: xmlFreeURI(uri);
4501: }
4502: }
4503: }
4504: } else {
4505: if ((RAW == '"') || (RAW == '\'')) {
4506: value = xmlParseEntityValue(ctxt, &orig);
4507: if ((ctxt->sax != NULL) &&
4508: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
4509: ctxt->sax->entityDecl(ctxt->userData, name,
4510: XML_INTERNAL_GENERAL_ENTITY,
4511: NULL, NULL, value);
4512: } else {
4513: URI = xmlParseExternalID(ctxt, &literal, 1);
4514: if ((URI == NULL) && (literal == NULL)) {
1.6 veillard 4515: ctxt->errNo = XML_ERR_VALUE_REQUIRED;
1.3 veillard 4516: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4517: ctxt->sax->error(ctxt->userData,
4518: "Entity value required\n");
4519: ctxt->wellFormed = 0;
4520: ctxt->disableSAX = 1;
4521: }
4522: if (URI) {
4523: xmlURIPtr uri;
4524:
4525: uri = xmlParseURI((const char *)URI);
4526: if (uri == NULL) {
1.6 veillard 4527: ctxt->errNo = XML_ERR_INVALID_URI;
1.3 veillard 4528: if ((ctxt->sax != NULL) &&
4529: (!ctxt->disableSAX) &&
4530: (ctxt->sax->error != NULL))
4531: ctxt->sax->error(ctxt->userData,
4532: "Invalid URI: %s\n", URI);
4533: ctxt->wellFormed = 0;
4534: } else {
4535: if (uri->fragment != NULL) {
1.6 veillard 4536: ctxt->errNo = XML_ERR_URI_FRAGMENT;
1.3 veillard 4537: if ((ctxt->sax != NULL) &&
4538: (!ctxt->disableSAX) &&
4539: (ctxt->sax->error != NULL))
4540: ctxt->sax->error(ctxt->userData,
4541: "Fragment not allowed: %s\n", URI);
4542: ctxt->wellFormed = 0;
4543: }
4544: xmlFreeURI(uri);
4545: }
4546: }
4547: if ((RAW != '>') && (!IS_BLANK(CUR))) {
1.6 veillard 4548: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
1.3 veillard 4549: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4550: ctxt->sax->error(ctxt->userData,
4551: "Space required before content model\n");
4552: ctxt->wellFormed = 0;
4553: ctxt->disableSAX = 1;
4554: }
4555: SKIP_BLANKS;
4556:
4557: /*
4558: * SGML specific: here we can get the content model
4559: */
4560: if (RAW != '>') {
4561: xmlChar *contmod;
4562:
4563: contmod = xmlParseName(ctxt);
4564:
4565: if (contmod == NULL) {
1.6 veillard 4566: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
1.3 veillard 4567: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4568: ctxt->sax->error(ctxt->userData,
4569: "Could not parse entity content model\n");
4570: ctxt->wellFormed = 0;
4571: ctxt->disableSAX = 1;
4572: } else {
1.7 veillard 4573: if (xmlStrEqual(contmod, BAD_CAST"NDATA")) {
1.3 veillard 4574: if (!IS_BLANK(CUR)) {
1.6 veillard 4575: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
1.3 veillard 4576: if ((ctxt->sax != NULL) &&
4577: (ctxt->sax->error != NULL))
4578: ctxt->sax->error(ctxt->userData,
4579: "Space required after 'NDATA'\n");
4580: ctxt->wellFormed = 0;
4581: ctxt->disableSAX = 1;
4582: }
4583: SKIP_BLANKS;
4584: ndata = xmlParseName(ctxt);
4585: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4586: (ctxt->sax->unparsedEntityDecl != NULL)) {
4587: ctxt->sax->unparsedEntityDecl(ctxt->userData,
4588: name, literal, URI, ndata);
4589: }
1.7 veillard 4590: } else if (xmlStrEqual(contmod, BAD_CAST"SUBDOC")) {
1.3 veillard 4591: if ((ctxt->sax != NULL) &&
4592: (ctxt->sax->warning != NULL))
4593: ctxt->sax->warning(ctxt->userData,
4594: "SUBDOC entities are not supported\n");
4595: SKIP_BLANKS;
4596: ndata = xmlParseName(ctxt);
4597: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4598: (ctxt->sax->unparsedEntityDecl != NULL)) {
4599: ctxt->sax->unparsedEntityDecl(ctxt->userData,
4600: name, literal, URI, ndata);
4601: }
1.7 veillard 4602: } else if (xmlStrEqual(contmod, BAD_CAST"CDATA")) {
1.3 veillard 4603: if ((ctxt->sax != NULL) &&
4604: (ctxt->sax->warning != NULL))
4605: ctxt->sax->warning(ctxt->userData,
4606: "CDATA entities are not supported\n");
4607: SKIP_BLANKS;
4608: ndata = xmlParseName(ctxt);
4609: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4610: (ctxt->sax->unparsedEntityDecl != NULL)) {
4611: ctxt->sax->unparsedEntityDecl(ctxt->userData,
4612: name, literal, URI, ndata);
4613: }
4614: }
4615: xmlFree(contmod);
4616: }
4617: } else {
4618: if ((ctxt->sax != NULL) &&
4619: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
4620: ctxt->sax->entityDecl(ctxt->userData, name,
4621: XML_EXTERNAL_GENERAL_PARSED_ENTITY,
4622: literal, URI, NULL);
4623: }
4624: }
4625: }
4626: SKIP_BLANKS;
4627: if (RAW != '>') {
1.6 veillard 4628: ctxt->errNo = XML_ERR_ENTITY_NOT_FINISHED;
1.3 veillard 4629: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4630: ctxt->sax->error(ctxt->userData,
4631: "sgmlParseEntityDecl: entity %s not terminated\n", name);
4632: ctxt->wellFormed = 0;
4633: ctxt->disableSAX = 1;
4634: } else {
4635: if (input != ctxt->input) {
1.6 veillard 4636: ctxt->errNo = XML_ERR_ENTITY_BOUNDARY;
1.3 veillard 4637: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4638: ctxt->sax->error(ctxt->userData,
4639: "Entity declaration doesn't start and stop in the same entity\n");
4640: ctxt->wellFormed = 0;
4641: ctxt->disableSAX = 1;
4642: }
4643: NEXT;
4644: }
4645: if (orig != NULL) {
4646: /*
4647: * Ugly mechanism to save the raw entity value.
4648: */
4649: xmlEntityPtr cur = NULL;
4650:
4651: if (isParameter) {
4652: if ((ctxt->sax != NULL) &&
4653: (ctxt->sax->getParameterEntity != NULL))
4654: cur = ctxt->sax->getParameterEntity(ctxt->userData, name);
4655: } else {
4656: if ((ctxt->sax != NULL) &&
4657: (ctxt->sax->getEntity != NULL))
4658: cur = ctxt->sax->getEntity(ctxt->userData, name);
4659: }
4660: if (cur != NULL) {
4661: if (cur->orig != NULL)
4662: xmlFree(orig);
4663: else
4664: cur->orig = orig;
4665: } else
4666: xmlFree(orig);
4667: }
4668: if (name != NULL) xmlFree(name);
4669: if (value != NULL) xmlFree(value);
4670: if (URI != NULL) xmlFree(URI);
4671: if (literal != NULL) xmlFree(literal);
4672: if (ndata != NULL) xmlFree(ndata);
4673: }
4674: }
4675:
4676: /**
4677: * sgmlParseMarkupDecl:
4678: * @ctxt: an SGML parser context
4679: *
4680: * parse Markup declarations
4681: *
4682: * [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
4683: * NotationDecl | PI | Comment
4684: */
4685: void
4686: sgmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
4687: GROW;
4688: xmlParseElementDecl(ctxt);
4689: xmlParseAttributeListDecl(ctxt);
4690: sgmlParseEntityDecl(ctxt);
4691: xmlParseNotationDecl(ctxt);
4692: xmlParsePI(ctxt);
4693: xmlParseComment(ctxt);
4694: /*
4695: * This is only for internal subset. On external entities,
4696: * the replacement is done before parsing stage
4697: */
4698: if ((ctxt->external == 0) && (ctxt->inputNr == 1))
4699: xmlParsePEReference(ctxt);
4700: ctxt->instate = XML_PARSER_DTD;
4701: }
4702:
4703: /**
4704: * sgmlParseInternalsubset:
4705: * @ctxt: an SGML parser context
4706: *
4707: * parse the internal subset declaration
4708: *
4709: * [28 end] ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
4710: */
4711:
4712: void
4713: sgmlParseInternalSubset(xmlParserCtxtPtr ctxt) {
4714: /*
4715: * Is there any DTD definition ?
4716: */
4717: if (RAW == '[') {
4718: ctxt->instate = XML_PARSER_DTD;
4719: NEXT;
4720: /*
4721: * Parse the succession of Markup declarations and
4722: * PEReferences.
4723: * Subsequence (markupdecl | PEReference | S)*
4724: */
4725: while (RAW != ']') {
4726: const xmlChar *check = CUR_PTR;
4727: int cons = ctxt->input->consumed;
4728:
4729: SKIP_BLANKS;
4730: sgmlParseMarkupDecl(ctxt);
4731: xmlParsePEReference(ctxt);
4732:
4733: /*
4734: * Pop-up of finished entities.
4735: */
4736: while ((RAW == 0) && (ctxt->inputNr > 1))
4737: xmlPopInput(ctxt);
4738:
4739: if ((CUR_PTR == check) && (cons == ctxt->input->consumed)) {
1.6 veillard 4740: ctxt->errNo = XML_ERR_INTERNAL_ERROR;
1.3 veillard 4741: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4742: ctxt->sax->error(ctxt->userData,
4743: "sgmlParseInternalSubset: error detected in Markup declaration\n");
4744: ctxt->wellFormed = 0;
4745: ctxt->disableSAX = 1;
4746: break;
4747: }
4748: }
4749: if (RAW == ']') {
4750: NEXT;
4751: SKIP_BLANKS;
4752: }
4753: }
4754:
4755: /*
4756: * We should be at the end of the DOCTYPE declaration.
4757: */
4758: if (RAW != '>') {
1.6 veillard 4759: ctxt->errNo = XML_ERR_DOCTYPE_NOT_FINISHED;
1.3 veillard 4760: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4761: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
4762: ctxt->wellFormed = 0;
4763: ctxt->disableSAX = 1;
4764: }
4765: NEXT;
4766: }
4767:
4768: /**
1.2 veillard 4769: * sgmlParseMisc:
4770: * @ctxt: an XML parser context
4771: *
4772: * parse an XML Misc* optionnal field.
4773: *
4774: * [27] Misc ::= Comment | PI | S
4775: */
4776:
4777: void
4778: sgmlParseMisc(xmlParserCtxtPtr ctxt) {
4779: while (((RAW == '<') && (NXT(1) == '?')) ||
4780: ((RAW == '<') && (NXT(1) == '!') &&
4781: (NXT(2) == '-') && (NXT(3) == '-')) ||
4782: IS_BLANK(CUR)) {
4783: if ((RAW == '<') && (NXT(1) == '?')) {
4784: xmlParsePI(ctxt); /* TODO: SGML PIs differs */
4785: } else if (IS_BLANK(CUR)) {
4786: NEXT;
4787: } else
4788: xmlParseComment(ctxt);
4789: }
4790: }
4791:
4792: /**
1.1 veillard 4793: * sgmlParseDocument :
4794: * @ctxt: an SGML parser context
4795: *
4796: * parse an SGML document (and build a tree if using the standard SAX
4797: * interface).
4798: *
4799: * Returns 0, -1 in case of error. the parser context is augmented
4800: * as a result of the parsing.
4801: */
4802:
4803: int
4804: sgmlParseDocument(sgmlParserCtxtPtr ctxt) {
1.2 veillard 4805: xmlChar start[4];
4806: xmlCharEncoding enc;
1.1 veillard 4807: xmlDtdPtr dtd;
4808:
4809: sgmlDefaultSAXHandlerInit();
4810: ctxt->html = 2;
4811:
4812: GROW;
4813: /*
4814: * SAX: beginning of the document processing.
4815: */
4816: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4817: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4818:
1.2 veillard 4819: /*
4820: * Get the 4 first bytes and decode the charset
4821: * if enc != XML_CHAR_ENCODING_NONE
4822: * plug some encoding conversion routines.
4823: */
4824: start[0] = RAW;
4825: start[1] = NXT(1);
4826: start[2] = NXT(2);
4827: start[3] = NXT(3);
4828: enc = xmlDetectCharEncoding(start, 4);
4829: if (enc != XML_CHAR_ENCODING_NONE) {
4830: xmlSwitchEncoding(ctxt, enc);
4831: }
4832:
1.1 veillard 4833: /*
4834: * Wipe out everything which is before the first '<'
4835: */
4836: SKIP_BLANKS;
4837: if (CUR == 0) {
4838: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4839: ctxt->sax->error(ctxt->userData, "Document is empty\n");
4840: ctxt->wellFormed = 0;
4841: }
4842:
4843: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4844: ctxt->sax->startDocument(ctxt->userData);
4845:
4846:
4847: /*
1.2 veillard 4848: * The Misc part of the Prolog
1.1 veillard 4849: */
1.2 veillard 4850: GROW;
4851: sgmlParseMisc(ctxt);
1.1 veillard 4852:
4853: /*
4854: * Then possibly doc type declaration(s) and more Misc
4855: * (doctypedecl Misc*)?
4856: */
1.2 veillard 4857: GROW;
4858: if ((RAW == '<') && (NXT(1) == '!') &&
4859: (NXT(2) == 'D') && (NXT(3) == 'O') &&
4860: (NXT(4) == 'C') && (NXT(5) == 'T') &&
4861: (NXT(6) == 'Y') && (NXT(7) == 'P') &&
4862: (NXT(8) == 'E')) {
4863:
4864: ctxt->inSubset = 1;
1.1 veillard 4865: sgmlParseDocTypeDecl(ctxt);
1.2 veillard 4866: if (RAW == '[') {
4867: ctxt->instate = XML_PARSER_DTD;
1.3 veillard 4868: sgmlParseInternalSubset(ctxt);
1.2 veillard 4869: }
4870:
4871: /*
4872: * Create and update the external subset.
4873: */
4874: ctxt->inSubset = 2;
4875: if ((ctxt->sax != NULL) && (ctxt->sax->externalSubset != NULL) &&
4876: (!ctxt->disableSAX))
4877: ctxt->sax->externalSubset(ctxt->userData, ctxt->intSubName,
4878: ctxt->extSubSystem, ctxt->extSubURI);
4879: ctxt->inSubset = 0;
4880:
4881:
4882: ctxt->instate = XML_PARSER_PROLOG;
4883: sgmlParseMisc(ctxt);
1.1 veillard 4884: }
4885:
4886: /*
4887: * Time to start parsing the tree itself
4888: */
4889: sgmlParseContent(ctxt);
4890:
4891: /*
4892: * autoclose
4893: */
4894: if (CUR == 0)
4895: sgmlAutoClose(ctxt, NULL);
4896:
4897:
4898: /*
4899: * SAX: end of the document processing.
4900: */
4901: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4902: ctxt->sax->endDocument(ctxt->userData);
4903:
4904: if (ctxt->myDoc != NULL) {
4905: dtd = xmlGetIntSubset(ctxt->myDoc);
4906: if (dtd == NULL)
4907: ctxt->myDoc->intSubset =
4908: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "SGML",
4909: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
4910: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd");
4911: }
4912: if (! ctxt->wellFormed) return(-1);
4913: return(0);
4914: }
4915:
4916:
4917: /************************************************************************
4918: * *
4919: * Parser contexts handling *
4920: * *
4921: ************************************************************************/
4922:
4923: /**
4924: * xmlInitParserCtxt:
4925: * @ctxt: an SGML parser context
4926: *
4927: * Initialize a parser context
4928: */
4929:
4930: void
4931: sgmlInitParserCtxt(sgmlParserCtxtPtr ctxt)
4932: {
4933: sgmlSAXHandler *sax;
4934:
4935: if (ctxt == NULL) return;
4936: memset(ctxt, 0, sizeof(sgmlParserCtxt));
4937:
4938: sax = (sgmlSAXHandler *) xmlMalloc(sizeof(sgmlSAXHandler));
4939: if (sax == NULL) {
4940: fprintf(stderr, "sgmlInitParserCtxt: out of memory\n");
4941: }
4942: memset(sax, 0, sizeof(sgmlSAXHandler));
4943:
4944: /* Allocate the Input stack */
4945: ctxt->inputTab = (sgmlParserInputPtr *)
4946: xmlMalloc(5 * sizeof(sgmlParserInputPtr));
4947: if (ctxt->inputTab == NULL) {
4948: fprintf(stderr, "sgmlInitParserCtxt: out of memory\n");
4949: }
4950: ctxt->inputNr = 0;
4951: ctxt->inputMax = 5;
4952: ctxt->input = NULL;
4953: ctxt->version = NULL;
4954: ctxt->encoding = NULL;
4955: ctxt->standalone = -1;
4956: ctxt->instate = XML_PARSER_START;
4957:
4958: /* Allocate the Node stack */
4959: ctxt->nodeTab = (sgmlNodePtr *) xmlMalloc(10 * sizeof(sgmlNodePtr));
4960: ctxt->nodeNr = 0;
4961: ctxt->nodeMax = 10;
4962: ctxt->node = NULL;
4963:
4964: /* Allocate the Name stack */
4965: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4966: ctxt->nameNr = 0;
4967: ctxt->nameMax = 10;
4968: ctxt->name = NULL;
4969:
4970: if (sax == NULL) ctxt->sax = &sgmlDefaultSAXHandler;
4971: else {
4972: ctxt->sax = sax;
4973: memcpy(sax, &sgmlDefaultSAXHandler, sizeof(sgmlSAXHandler));
4974: }
4975: ctxt->userData = ctxt;
4976: ctxt->myDoc = NULL;
4977: ctxt->wellFormed = 1;
4978: ctxt->replaceEntities = 0;
4979: ctxt->html = 2;
4980: ctxt->record_info = 0;
4981: ctxt->validate = 0;
4982: ctxt->nbChars = 0;
4983: ctxt->checkIndex = 0;
4984: xmlInitNodeInfoSeq(&ctxt->node_seq);
4985: }
4986:
4987: /**
4988: * sgmlFreeParserCtxt:
4989: * @ctxt: an SGML parser context
4990: *
4991: * Free all the memory used by a parser context. However the parsed
4992: * document in ctxt->myDoc is not freed.
4993: */
4994:
4995: void
4996: sgmlFreeParserCtxt(sgmlParserCtxtPtr ctxt)
4997: {
4998: xmlFreeParserCtxt(ctxt);
4999: }
5000:
5001: /**
5002: * sgmlCreateDocParserCtxt :
5003: * @cur: a pointer to an array of xmlChar
5004: * @encoding: a free form C string describing the SGML document encoding, or NULL
5005: *
5006: * Create a parser context for an SGML document.
5007: *
5008: * Returns the new parser context or NULL
5009: */
5010: sgmlParserCtxtPtr
5011: sgmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
5012: sgmlParserCtxtPtr ctxt;
5013: sgmlParserInputPtr input;
5014: /* sgmlCharEncoding enc; */
5015:
5016: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5017: if (ctxt == NULL) {
5018: perror("malloc");
5019: return(NULL);
5020: }
5021: sgmlInitParserCtxt(ctxt);
5022: input = (sgmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
5023: if (input == NULL) {
5024: perror("malloc");
5025: xmlFree(ctxt);
5026: return(NULL);
5027: }
5028: memset(input, 0, sizeof(sgmlParserInput));
5029:
5030: input->line = 1;
5031: input->col = 1;
5032: input->base = cur;
5033: input->cur = cur;
5034:
5035: inputPush(ctxt, input);
5036: return(ctxt);
5037: }
5038:
5039: /************************************************************************
5040: * *
5041: * Progressive parsing interfaces *
5042: * *
5043: ************************************************************************/
5044:
5045: /**
5046: * sgmlParseLookupSequence:
5047: * @ctxt: an SGML parser context
5048: * @first: the first char to lookup
5049: * @next: the next char to lookup or zero
5050: * @third: the next char to lookup or zero
5051: *
5052: * Try to find if a sequence (first, next, third) or just (first next) or
5053: * (first) is available in the input stream.
5054: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5055: * to avoid rescanning sequences of bytes, it DOES change the state of the
5056: * parser, do not use liberally.
5057: * This is basically similar to xmlParseLookupSequence()
5058: *
5059: * Returns the index to the current parsing point if the full sequence
5060: * is available, -1 otherwise.
5061: */
5062: int
5063: sgmlParseLookupSequence(sgmlParserCtxtPtr ctxt, xmlChar first,
5064: xmlChar next, xmlChar third) {
5065: int base, len;
5066: sgmlParserInputPtr in;
5067: const xmlChar *buf;
5068:
5069: in = ctxt->input;
5070: if (in == NULL) return(-1);
5071: base = in->cur - in->base;
5072: if (base < 0) return(-1);
5073: if (ctxt->checkIndex > base)
5074: base = ctxt->checkIndex;
5075: if (in->buf == NULL) {
5076: buf = in->base;
5077: len = in->length;
5078: } else {
5079: buf = in->buf->buffer->content;
5080: len = in->buf->buffer->use;
5081: }
5082: /* take into account the sequence length */
5083: if (third) len -= 2;
5084: else if (next) len --;
5085: for (;base < len;base++) {
5086: if (buf[base] == first) {
5087: if (third != 0) {
5088: if ((buf[base + 1] != next) ||
5089: (buf[base + 2] != third)) continue;
5090: } else if (next != 0) {
5091: if (buf[base + 1] != next) continue;
5092: }
5093: ctxt->checkIndex = 0;
5094: #ifdef DEBUG_PUSH
5095: if (next == 0)
5096: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
5097: first, base);
5098: else if (third == 0)
5099: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
5100: first, next, base);
5101: else
5102: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
5103: first, next, third, base);
5104: #endif
5105: return(base - (in->cur - in->base));
5106: }
5107: }
5108: ctxt->checkIndex = base;
5109: #ifdef DEBUG_PUSH
5110: if (next == 0)
5111: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
5112: else if (third == 0)
5113: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
5114: else
5115: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
5116: #endif
5117: return(-1);
5118: }
5119:
5120: /**
5121: * sgmlParseTryOrFinish:
5122: * @ctxt: an SGML parser context
5123: * @terminate: last chunk indicator
5124: *
5125: * Try to progress on parsing
5126: *
5127: * Returns zero if no parsing was possible
5128: */
5129: int
5130: sgmlParseTryOrFinish(sgmlParserCtxtPtr ctxt, int terminate) {
5131: int ret = 0;
5132: sgmlParserInputPtr in;
5133: int avail = 0;
5134: xmlChar cur, next;
5135:
5136: #ifdef DEBUG_PUSH
5137: switch (ctxt->instate) {
5138: case XML_PARSER_EOF:
5139: fprintf(stderr, "HPP: try EOF\n"); break;
5140: case XML_PARSER_START:
5141: fprintf(stderr, "HPP: try START\n"); break;
5142: case XML_PARSER_MISC:
5143: fprintf(stderr, "HPP: try MISC\n");break;
5144: case XML_PARSER_COMMENT:
5145: fprintf(stderr, "HPP: try COMMENT\n");break;
5146: case XML_PARSER_PROLOG:
5147: fprintf(stderr, "HPP: try PROLOG\n");break;
5148: case XML_PARSER_START_TAG:
5149: fprintf(stderr, "HPP: try START_TAG\n");break;
5150: case XML_PARSER_CONTENT:
5151: fprintf(stderr, "HPP: try CONTENT\n");break;
5152: case XML_PARSER_CDATA_SECTION:
5153: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
5154: case XML_PARSER_END_TAG:
5155: fprintf(stderr, "HPP: try END_TAG\n");break;
5156: case XML_PARSER_ENTITY_DECL:
5157: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
5158: case XML_PARSER_ENTITY_VALUE:
5159: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
5160: case XML_PARSER_ATTRIBUTE_VALUE:
5161: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
5162: case XML_PARSER_DTD:
5163: fprintf(stderr, "HPP: try DTD\n");break;
5164: case XML_PARSER_EPILOG:
5165: fprintf(stderr, "HPP: try EPILOG\n");break;
5166: case XML_PARSER_PI:
5167: fprintf(stderr, "HPP: try PI\n");break;
5168: }
5169: #endif
5170:
5171: while (1) {
5172:
5173: in = ctxt->input;
5174: if (in == NULL) break;
5175: if (in->buf == NULL)
5176: avail = in->length - (in->cur - in->base);
5177: else
5178: avail = in->buf->buffer->use - (in->cur - in->base);
5179: if ((avail == 0) && (terminate)) {
5180: sgmlAutoClose(ctxt, NULL);
5181: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5182: /*
5183: * SAX: end of the document processing.
5184: */
5185: ctxt->instate = XML_PARSER_EOF;
5186: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5187: ctxt->sax->endDocument(ctxt->userData);
5188: }
5189: }
5190: if (avail < 1)
5191: goto done;
5192: switch (ctxt->instate) {
5193: case XML_PARSER_EOF:
5194: /*
5195: * Document parsing is done !
5196: */
5197: goto done;
5198: case XML_PARSER_START:
5199: /*
5200: * Very first chars read from the document flow.
5201: */
5202: cur = in->cur[0];
5203: if (IS_BLANK(cur)) {
5204: SKIP_BLANKS;
5205: if (in->buf == NULL)
5206: avail = in->length - (in->cur - in->base);
5207: else
5208: avail = in->buf->buffer->use - (in->cur - in->base);
5209: }
5210: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5211: ctxt->sax->setDocumentLocator(ctxt->userData,
5212: &xmlDefaultSAXLocator);
5213: if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5214: (!ctxt->disableSAX))
5215: ctxt->sax->startDocument(ctxt->userData);
5216:
5217: cur = in->cur[0];
5218: next = in->cur[1];
5219: if ((cur == '<') && (next == '!') &&
5220: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5221: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5222: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5223: (UPP(8) == 'E')) {
5224: if ((!terminate) &&
5225: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5226: goto done;
5227: #ifdef DEBUG_PUSH
5228: fprintf(stderr, "HPP: Parsing internal subset\n");
5229: #endif
5230: sgmlParseDocTypeDecl(ctxt);
5231: ctxt->instate = XML_PARSER_PROLOG;
5232: #ifdef DEBUG_PUSH
5233: fprintf(stderr, "HPP: entering PROLOG\n");
5234: #endif
5235: } else {
5236: ctxt->instate = XML_PARSER_MISC;
5237: }
5238: #ifdef DEBUG_PUSH
5239: fprintf(stderr, "HPP: entering MISC\n");
5240: #endif
5241: break;
5242: case XML_PARSER_MISC:
5243: SKIP_BLANKS;
5244: if (in->buf == NULL)
5245: avail = in->length - (in->cur - in->base);
5246: else
5247: avail = in->buf->buffer->use - (in->cur - in->base);
5248: if (avail < 2)
5249: goto done;
5250: cur = in->cur[0];
5251: next = in->cur[1];
5252: if ((cur == '<') && (next == '!') &&
5253: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5254: if ((!terminate) &&
5255: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5256: goto done;
5257: #ifdef DEBUG_PUSH
5258: fprintf(stderr, "HPP: Parsing Comment\n");
5259: #endif
5260: sgmlParseComment(ctxt);
5261: ctxt->instate = XML_PARSER_MISC;
5262: } else if ((cur == '<') && (next == '!') &&
5263: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5264: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5265: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5266: (UPP(8) == 'E')) {
5267: if ((!terminate) &&
5268: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5269: goto done;
5270: #ifdef DEBUG_PUSH
5271: fprintf(stderr, "HPP: Parsing internal subset\n");
5272: #endif
5273: sgmlParseDocTypeDecl(ctxt);
5274: ctxt->instate = XML_PARSER_PROLOG;
5275: #ifdef DEBUG_PUSH
5276: fprintf(stderr, "HPP: entering PROLOG\n");
5277: #endif
5278: } else if ((cur == '<') && (next == '!') &&
5279: (avail < 9)) {
5280: goto done;
5281: } else {
5282: ctxt->instate = XML_PARSER_START_TAG;
5283: #ifdef DEBUG_PUSH
5284: fprintf(stderr, "HPP: entering START_TAG\n");
5285: #endif
5286: }
5287: break;
5288: case XML_PARSER_PROLOG:
5289: SKIP_BLANKS;
5290: if (in->buf == NULL)
5291: avail = in->length - (in->cur - in->base);
5292: else
5293: avail = in->buf->buffer->use - (in->cur - in->base);
5294: if (avail < 2)
5295: goto done;
5296: cur = in->cur[0];
5297: next = in->cur[1];
5298: if ((cur == '<') && (next == '!') &&
5299: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5300: if ((!terminate) &&
5301: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5302: goto done;
5303: #ifdef DEBUG_PUSH
5304: fprintf(stderr, "HPP: Parsing Comment\n");
5305: #endif
5306: sgmlParseComment(ctxt);
5307: ctxt->instate = XML_PARSER_PROLOG;
5308: } else if ((cur == '<') && (next == '!') &&
5309: (avail < 4)) {
5310: goto done;
5311: } else {
5312: ctxt->instate = XML_PARSER_START_TAG;
5313: #ifdef DEBUG_PUSH
5314: fprintf(stderr, "HPP: entering START_TAG\n");
5315: #endif
5316: }
5317: break;
5318: case XML_PARSER_EPILOG:
5319: if (in->buf == NULL)
5320: avail = in->length - (in->cur - in->base);
5321: else
5322: avail = in->buf->buffer->use - (in->cur - in->base);
5323: if (avail < 1)
5324: goto done;
5325: cur = in->cur[0];
5326: if (IS_BLANK(cur)) {
5327: sgmlParseCharData(ctxt, 0);
5328: goto done;
5329: }
5330: if (avail < 2)
5331: goto done;
5332: next = in->cur[1];
5333: if ((cur == '<') && (next == '!') &&
5334: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5335: if ((!terminate) &&
5336: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5337: goto done;
5338: #ifdef DEBUG_PUSH
5339: fprintf(stderr, "HPP: Parsing Comment\n");
5340: #endif
5341: sgmlParseComment(ctxt);
5342: ctxt->instate = XML_PARSER_EPILOG;
5343: } else if ((cur == '<') && (next == '!') &&
5344: (avail < 4)) {
5345: goto done;
5346: } else {
1.6 veillard 5347: ctxt->errNo = XML_ERR_DOCUMENT_END;
1.1 veillard 5348: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5349: ctxt->sax->error(ctxt->userData,
5350: "Extra content at the end of the document\n");
5351: ctxt->wellFormed = 0;
5352: ctxt->instate = XML_PARSER_EOF;
5353: #ifdef DEBUG_PUSH
5354: fprintf(stderr, "HPP: entering EOF\n");
5355: #endif
5356: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5357: ctxt->sax->endDocument(ctxt->userData);
5358: goto done;
5359: }
5360: break;
5361: case XML_PARSER_START_TAG: {
5362: xmlChar *name, *oldname;
5363: int depth = ctxt->nameNr;
5364: sgmlElemDescPtr info;
5365:
5366: if (avail < 2)
5367: goto done;
5368: cur = in->cur[0];
5369: if (cur != '<') {
5370: ctxt->instate = XML_PARSER_CONTENT;
5371: #ifdef DEBUG_PUSH
5372: fprintf(stderr, "HPP: entering CONTENT\n");
5373: #endif
5374: break;
5375: }
5376: if ((!terminate) &&
5377: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5378: goto done;
5379:
5380: oldname = xmlStrdup(ctxt->name);
5381: sgmlParseStartTag(ctxt);
5382: name = ctxt->name;
5383: #ifdef DEBUG
5384: if (oldname == NULL)
5385: fprintf(stderr, "Start of element %s\n", name);
5386: else if (name == NULL)
5387: fprintf(stderr, "Start of element failed, was %s\n",
5388: oldname);
5389: else
5390: fprintf(stderr, "Start of element %s, was %s\n",
5391: name, oldname);
5392: #endif
5393: if (((depth == ctxt->nameNr) &&
1.7 veillard 5394: (xmlStrEqual(oldname, ctxt->name))) ||
1.1 veillard 5395: (name == NULL)) {
5396: if (CUR == '>')
5397: NEXT;
5398: if (oldname != NULL)
5399: xmlFree(oldname);
5400: break;
5401: }
5402: if (oldname != NULL)
5403: xmlFree(oldname);
5404:
5405: /*
5406: * Lookup the info for that element.
5407: */
5408: info = sgmlTagLookup(name);
5409: if (info == NULL) {
5410: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.4 veillard 5411: ctxt->sax->error(ctxt->userData, "Tag %s unknown\n",
1.1 veillard 5412: name);
5413: ctxt->wellFormed = 0;
5414: } else if (info->depr) {
5415: /***************************
5416: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
5417: ctxt->sax->warning(ctxt->userData,
5418: "Tag %s is deprecated\n",
5419: name);
5420: ***************************/
5421: }
5422:
5423: /*
5424: * Check for an Empty Element labelled the XML/SGML way
5425: */
5426: if ((CUR == '/') && (NXT(1) == '>')) {
5427: SKIP(2);
5428: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5429: ctxt->sax->endElement(ctxt->userData, name);
5430: oldname = sgmlnamePop(ctxt);
5431: #ifdef DEBUG
5432: fprintf(stderr,"End of tag the XML way: popping out %s\n",
5433: oldname);
5434: #endif
5435: if (oldname != NULL)
5436: xmlFree(oldname);
5437: ctxt->instate = XML_PARSER_CONTENT;
5438: #ifdef DEBUG_PUSH
5439: fprintf(stderr, "HPP: entering CONTENT\n");
5440: #endif
5441: break;
5442: }
5443:
5444: if (CUR == '>') {
5445: NEXT;
5446: } else {
5447: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5448: ctxt->sax->error(ctxt->userData,
5449: "Couldn't find end of Start Tag %s\n",
5450: name);
5451: ctxt->wellFormed = 0;
5452:
5453: /*
5454: * end of parsing of this node.
5455: */
1.7 veillard 5456: if (xmlStrEqual(name, ctxt->name)) {
1.1 veillard 5457: nodePop(ctxt);
5458: oldname = sgmlnamePop(ctxt);
5459: #ifdef DEBUG
5460: fprintf(stderr,
5461: "End of start tag problem: popping out %s\n", oldname);
5462: #endif
5463: if (oldname != NULL)
5464: xmlFree(oldname);
5465: }
5466:
5467: ctxt->instate = XML_PARSER_CONTENT;
5468: #ifdef DEBUG_PUSH
5469: fprintf(stderr, "HPP: entering CONTENT\n");
5470: #endif
5471: break;
5472: }
5473:
5474: /*
5475: * Check for an Empty Element from DTD definition
5476: */
5477: if ((info != NULL) && (info->empty)) {
5478: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5479: ctxt->sax->endElement(ctxt->userData, name);
5480: oldname = sgmlnamePop(ctxt);
5481: #ifdef DEBUG
5482: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
5483: #endif
5484: if (oldname != NULL)
5485: xmlFree(oldname);
5486: }
5487: ctxt->instate = XML_PARSER_CONTENT;
5488: #ifdef DEBUG_PUSH
5489: fprintf(stderr, "HPP: entering CONTENT\n");
5490: #endif
5491: break;
5492: }
5493: case XML_PARSER_CONTENT: {
5494: long cons;
5495: /*
5496: * Handle preparsed entities and charRef
5497: */
5498: if (ctxt->token != 0) {
5499: xmlChar chr[2] = { 0 , 0 } ;
5500:
5501: chr[0] = (xmlChar) ctxt->token;
5502: sgmlCheckParagraph(ctxt);
5503: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5504: ctxt->sax->characters(ctxt->userData, chr, 1);
5505: ctxt->token = 0;
5506: ctxt->checkIndex = 0;
5507: }
5508: if ((avail == 1) && (terminate)) {
5509: cur = in->cur[0];
5510: if ((cur != '<') && (cur != '&')) {
5511: if (ctxt->sax != NULL) {
5512: if (IS_BLANK(cur)) {
5513: if (ctxt->sax->ignorableWhitespace != NULL)
5514: ctxt->sax->ignorableWhitespace(
5515: ctxt->userData, &cur, 1);
5516: } else {
5517: sgmlCheckParagraph(ctxt);
5518: if (ctxt->sax->characters != NULL)
5519: ctxt->sax->characters(
5520: ctxt->userData, &cur, 1);
5521: }
5522: }
5523: ctxt->token = 0;
5524: ctxt->checkIndex = 0;
5525: NEXT;
5526: }
5527: break;
5528: }
5529: if (avail < 2)
5530: goto done;
5531: cur = in->cur[0];
5532: next = in->cur[1];
5533: cons = ctxt->nbChars;
5534: /*
5535: * Sometimes DOCTYPE arrives in the middle of the document
5536: */
5537: if ((cur == '<') && (next == '!') &&
5538: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5539: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5540: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5541: (UPP(8) == 'E')) {
5542: if ((!terminate) &&
5543: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5544: goto done;
5545: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5546: ctxt->sax->error(ctxt->userData,
5547: "Misplaced DOCTYPE declaration\n");
5548: ctxt->wellFormed = 0;
5549: sgmlParseDocTypeDecl(ctxt);
5550: } else if ((cur == '<') && (next == '!') &&
5551: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5552: if ((!terminate) &&
5553: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5554: goto done;
5555: #ifdef DEBUG_PUSH
5556: fprintf(stderr, "HPP: Parsing Comment\n");
5557: #endif
5558: sgmlParseComment(ctxt);
5559: ctxt->instate = XML_PARSER_CONTENT;
5560: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5561: goto done;
5562: } else if ((cur == '<') && (next == '/')) {
5563: ctxt->instate = XML_PARSER_END_TAG;
5564: ctxt->checkIndex = 0;
5565: #ifdef DEBUG_PUSH
5566: fprintf(stderr, "HPP: entering END_TAG\n");
5567: #endif
5568: break;
5569: } else if (cur == '<') {
5570: ctxt->instate = XML_PARSER_START_TAG;
5571: ctxt->checkIndex = 0;
5572: #ifdef DEBUG_PUSH
5573: fprintf(stderr, "HPP: entering START_TAG\n");
5574: #endif
5575: break;
5576: } else if (cur == '&') {
5577: if ((!terminate) &&
5578: (sgmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
5579: goto done;
5580: #ifdef DEBUG_PUSH
5581: fprintf(stderr, "HPP: Parsing Reference\n");
5582: #endif
5583: /* TODO: check generation of subtrees if noent !!! */
5584: sgmlParseReference(ctxt);
5585: } else {
5586: /* TODO Avoid the extra copy, handle directly !!!!!! */
5587: /*
5588: * Goal of the following test is :
5589: * - minimize calls to the SAX 'character' callback
5590: * when they are mergeable
5591: */
5592: if ((ctxt->inputNr == 1) &&
5593: (avail < SGML_PARSER_BIG_BUFFER_SIZE)) {
5594: if ((!terminate) &&
5595: (sgmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
5596: goto done;
5597: }
5598: ctxt->checkIndex = 0;
5599: #ifdef DEBUG_PUSH
5600: fprintf(stderr, "HPP: Parsing char data\n");
5601: #endif
5602: sgmlParseCharData(ctxt, 0);
5603: }
5604: if (cons == ctxt->nbChars) {
5605: if (ctxt->node != NULL) {
5606: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5607: ctxt->sax->error(ctxt->userData,
5608: "detected an error in element content\n");
5609: ctxt->wellFormed = 0;
5610: NEXT;
5611: }
5612: break;
5613: }
5614:
5615: break;
5616: }
5617: case XML_PARSER_END_TAG:
5618: if (avail < 2)
5619: goto done;
5620: if ((!terminate) &&
5621: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5622: goto done;
5623: sgmlParseEndTag(ctxt);
5624: if (ctxt->nameNr == 0) {
5625: ctxt->instate = XML_PARSER_EPILOG;
5626: } else {
5627: ctxt->instate = XML_PARSER_CONTENT;
5628: }
5629: ctxt->checkIndex = 0;
5630: #ifdef DEBUG_PUSH
5631: fprintf(stderr, "HPP: entering CONTENT\n");
5632: #endif
5633: break;
5634: case XML_PARSER_CDATA_SECTION:
5635: fprintf(stderr, "HPP: internal error, state == CDATA\n");
5636: ctxt->instate = XML_PARSER_CONTENT;
5637: ctxt->checkIndex = 0;
5638: #ifdef DEBUG_PUSH
5639: fprintf(stderr, "HPP: entering CONTENT\n");
5640: #endif
5641: break;
5642: case XML_PARSER_DTD:
5643: fprintf(stderr, "HPP: internal error, state == DTD\n");
5644: ctxt->instate = XML_PARSER_CONTENT;
5645: ctxt->checkIndex = 0;
5646: #ifdef DEBUG_PUSH
5647: fprintf(stderr, "HPP: entering CONTENT\n");
5648: #endif
5649: break;
5650: case XML_PARSER_COMMENT:
5651: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
5652: ctxt->instate = XML_PARSER_CONTENT;
5653: ctxt->checkIndex = 0;
5654: #ifdef DEBUG_PUSH
5655: fprintf(stderr, "HPP: entering CONTENT\n");
5656: #endif
5657: break;
5658: case XML_PARSER_PI:
5659: fprintf(stderr, "HPP: internal error, state == PI\n");
5660: ctxt->instate = XML_PARSER_CONTENT;
5661: ctxt->checkIndex = 0;
5662: #ifdef DEBUG_PUSH
5663: fprintf(stderr, "HPP: entering CONTENT\n");
5664: #endif
5665: break;
5666: case XML_PARSER_ENTITY_DECL:
5667: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
5668: ctxt->instate = XML_PARSER_CONTENT;
5669: ctxt->checkIndex = 0;
5670: #ifdef DEBUG_PUSH
5671: fprintf(stderr, "HPP: entering CONTENT\n");
5672: #endif
5673: break;
5674: case XML_PARSER_ENTITY_VALUE:
5675: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
5676: ctxt->instate = XML_PARSER_CONTENT;
5677: ctxt->checkIndex = 0;
5678: #ifdef DEBUG_PUSH
5679: fprintf(stderr, "HPP: entering DTD\n");
5680: #endif
5681: break;
5682: case XML_PARSER_ATTRIBUTE_VALUE:
5683: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5684: ctxt->instate = XML_PARSER_START_TAG;
5685: ctxt->checkIndex = 0;
5686: #ifdef DEBUG_PUSH
5687: fprintf(stderr, "HPP: entering START_TAG\n");
5688: #endif
5689: break;
5690: case XML_PARSER_SYSTEM_LITERAL:
5691: fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5692: ctxt->instate = XML_PARSER_CONTENT;
5693: ctxt->checkIndex = 0;
5694: #ifdef DEBUG_PUSH
5695: fprintf(stderr, "HPP: entering CONTENT\n");
5696: #endif
5697: break;
5698: }
5699: }
5700: done:
5701: if ((avail == 0) && (terminate)) {
5702: sgmlAutoClose(ctxt, NULL);
5703: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5704: /*
5705: * SAX: end of the document processing.
5706: */
5707: ctxt->instate = XML_PARSER_EOF;
5708: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5709: ctxt->sax->endDocument(ctxt->userData);
5710: }
5711: }
5712: if ((ctxt->myDoc != NULL) &&
5713: ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5714: (ctxt->instate == XML_PARSER_EPILOG))) {
5715: xmlDtdPtr dtd;
5716: dtd = xmlGetIntSubset(ctxt->myDoc);
5717: if (dtd == NULL)
5718: ctxt->myDoc->intSubset =
5719: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "SGML",
5720: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
5721: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd");
5722: }
5723: #ifdef DEBUG_PUSH
5724: fprintf(stderr, "HPP: done %d\n", ret);
5725: #endif
5726: return(ret);
5727: }
5728:
5729: /**
5730: * sgmlParseTry:
5731: * @ctxt: an SGML parser context
5732: *
5733: * Try to progress on parsing
5734: *
5735: * Returns zero if no parsing was possible
5736: */
5737: int
5738: sgmlParseTry(sgmlParserCtxtPtr ctxt) {
5739: return(sgmlParseTryOrFinish(ctxt, 0));
5740: }
5741:
5742: /**
5743: * sgmlParseChunk:
5744: * @ctxt: an XML parser context
5745: * @chunk: an char array
5746: * @size: the size in byte of the chunk
5747: * @terminate: last chunk indicator
5748: *
5749: * Parse a Chunk of memory
5750: *
5751: * Returns zero if no error, the xmlParserErrors otherwise.
5752: */
5753: int
5754: sgmlParseChunk(sgmlParserCtxtPtr ctxt, const char *chunk, int size,
5755: int terminate) {
5756: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5757: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5758: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5759: int cur = ctxt->input->cur - ctxt->input->base;
5760:
5761: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5762: ctxt->input->base = ctxt->input->buf->buffer->content + base;
5763: ctxt->input->cur = ctxt->input->base + cur;
5764: #ifdef DEBUG_PUSH
5765: fprintf(stderr, "HPP: pushed %d\n", size);
5766: #endif
5767:
5768: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5769: sgmlParseTryOrFinish(ctxt, terminate);
5770: } else if (ctxt->instate != XML_PARSER_EOF) {
5771: xmlParserInputBufferPush(ctxt->input->buf, 0, "");
5772: sgmlParseTryOrFinish(ctxt, terminate);
5773: }
5774: if (terminate) {
5775: if ((ctxt->instate != XML_PARSER_EOF) &&
5776: (ctxt->instate != XML_PARSER_EPILOG) &&
5777: (ctxt->instate != XML_PARSER_MISC)) {
1.6 veillard 5778: ctxt->errNo = XML_ERR_DOCUMENT_END;
1.1 veillard 5779: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5780: ctxt->sax->error(ctxt->userData,
5781: "Extra content at the end of the document\n");
5782: ctxt->wellFormed = 0;
5783: }
5784: if (ctxt->instate != XML_PARSER_EOF) {
5785: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5786: ctxt->sax->endDocument(ctxt->userData);
5787: }
5788: ctxt->instate = XML_PARSER_EOF;
5789: }
5790: return((xmlParserErrors) ctxt->errNo);
5791: }
5792:
5793: /************************************************************************
5794: * *
5795: * User entry points *
5796: * *
5797: ************************************************************************/
5798:
5799: /**
5800: * sgmlCreatePushParserCtxt :
5801: * @sax: a SAX handler
5802: * @user_data: The user data returned on SAX callbacks
5803: * @chunk: a pointer to an array of chars
5804: * @size: number of chars in the array
5805: * @filename: an optional file name or URI
5806: * @enc: an optional encoding
5807: *
5808: * Create a parser context for using the SGML parser in push mode
5809: * To allow content encoding detection, @size should be >= 4
5810: * The value of @filename is used for fetching external entities
5811: * and error/warning reports.
5812: *
5813: * Returns the new parser context or NULL
5814: */
5815: sgmlParserCtxtPtr
5816: sgmlCreatePushParserCtxt(sgmlSAXHandlerPtr sax, void *user_data,
5817: const char *chunk, int size, const char *filename,
5818: xmlCharEncoding enc) {
5819: sgmlParserCtxtPtr ctxt;
5820: sgmlParserInputPtr inputStream;
5821: xmlParserInputBufferPtr buf;
5822:
5823: buf = xmlAllocParserInputBuffer(enc);
5824: if (buf == NULL) return(NULL);
5825:
5826: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5827: if (ctxt == NULL) {
5828: xmlFree(buf);
5829: return(NULL);
5830: }
5831: memset(ctxt, 0, sizeof(sgmlParserCtxt));
5832: sgmlInitParserCtxt(ctxt);
5833: if (sax != NULL) {
5834: if (ctxt->sax != &sgmlDefaultSAXHandler)
5835: xmlFree(ctxt->sax);
5836: ctxt->sax = (sgmlSAXHandlerPtr) xmlMalloc(sizeof(sgmlSAXHandler));
5837: if (ctxt->sax == NULL) {
5838: xmlFree(buf);
5839: xmlFree(ctxt);
5840: return(NULL);
5841: }
5842: memcpy(ctxt->sax, sax, sizeof(sgmlSAXHandler));
5843: if (user_data != NULL)
5844: ctxt->userData = user_data;
5845: }
5846: if (filename == NULL) {
5847: ctxt->directory = NULL;
5848: } else {
5849: ctxt->directory = xmlParserGetDirectory(filename);
5850: }
5851:
5852: inputStream = sgmlNewInputStream(ctxt);
5853: if (inputStream == NULL) {
5854: xmlFreeParserCtxt(ctxt);
5855: return(NULL);
5856: }
5857:
5858: if (filename == NULL)
5859: inputStream->filename = NULL;
5860: else
5861: inputStream->filename = xmlMemStrdup(filename);
5862: inputStream->buf = buf;
5863: inputStream->base = inputStream->buf->buffer->content;
5864: inputStream->cur = inputStream->buf->buffer->content;
5865:
5866: inputPush(ctxt, inputStream);
5867:
5868: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5869: (ctxt->input->buf != NULL)) {
5870: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5871: #ifdef DEBUG_PUSH
5872: fprintf(stderr, "HPP: pushed %d\n", size);
5873: #endif
5874: }
5875:
5876: return(ctxt);
5877: }
5878:
5879: /**
5880: * sgmlSAXParseDoc :
5881: * @cur: a pointer to an array of xmlChar
5882: * @encoding: a free form C string describing the SGML document encoding, or NULL
5883: * @sax: the SAX handler block
5884: * @userData: if using SAX, this pointer will be provided on callbacks.
5885: *
5886: * parse an SGML in-memory document and build a tree.
5887: * It use the given SAX function block to handle the parsing callback.
5888: * If sax is NULL, fallback to the default DOM tree building routines.
5889: *
5890: * Returns the resulting document tree
5891: */
5892:
5893: sgmlDocPtr
5894: sgmlSAXParseDoc(xmlChar *cur, const char *encoding, sgmlSAXHandlerPtr sax, void *userData) {
5895: sgmlDocPtr ret;
5896: sgmlParserCtxtPtr ctxt;
5897:
5898: if (cur == NULL) return(NULL);
5899:
5900:
5901: ctxt = sgmlCreateDocParserCtxt(cur, encoding);
5902: if (ctxt == NULL) return(NULL);
5903: if (sax != NULL) {
5904: ctxt->sax = sax;
5905: ctxt->userData = userData;
5906: }
5907:
5908: sgmlParseDocument(ctxt);
5909: ret = ctxt->myDoc;
5910: if (sax != NULL) {
5911: ctxt->sax = NULL;
5912: ctxt->userData = NULL;
5913: }
5914: sgmlFreeParserCtxt(ctxt);
5915:
5916: return(ret);
5917: }
5918:
5919: /**
5920: * sgmlParseDoc :
5921: * @cur: a pointer to an array of xmlChar
5922: * @encoding: a free form C string describing the SGML document encoding, or NULL
5923: *
5924: * parse an SGML in-memory document and build a tree.
5925: *
5926: * Returns the resulting document tree
5927: */
5928:
5929: sgmlDocPtr
5930: sgmlParseDoc(xmlChar *cur, const char *encoding) {
5931: return(sgmlSAXParseDoc(cur, encoding, NULL, NULL));
5932: }
5933:
5934:
5935: /**
5936: * sgmlCreateFileParserCtxt :
5937: * @filename: the filename
5938: * @encoding: a free form C string describing the SGML document encoding, or NULL
5939: *
5940: * Create a parser context for a file content.
5941: * Automatic support for ZLIB/Compress compressed document is provided
5942: * by default if found at compile-time.
5943: *
5944: * Returns the new parser context or NULL
5945: */
5946: sgmlParserCtxtPtr
5947: sgmlCreateFileParserCtxt(const char *filename, const char *encoding)
5948: {
5949: sgmlParserCtxtPtr ctxt;
5950: sgmlParserInputPtr inputStream;
5951: xmlParserInputBufferPtr buf;
5952: /* sgmlCharEncoding enc; */
5953:
5954: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
5955: if (buf == NULL) return(NULL);
5956:
5957: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5958: if (ctxt == NULL) {
5959: perror("malloc");
5960: return(NULL);
5961: }
5962: memset(ctxt, 0, sizeof(sgmlParserCtxt));
5963: sgmlInitParserCtxt(ctxt);
5964: inputStream = (sgmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
5965: if (inputStream == NULL) {
5966: perror("malloc");
5967: xmlFree(ctxt);
5968: return(NULL);
5969: }
5970: memset(inputStream, 0, sizeof(sgmlParserInput));
5971:
5972: inputStream->filename = xmlMemStrdup(filename);
5973: inputStream->line = 1;
5974: inputStream->col = 1;
5975: inputStream->buf = buf;
5976: inputStream->directory = NULL;
5977:
5978: inputStream->base = inputStream->buf->buffer->content;
5979: inputStream->cur = inputStream->buf->buffer->content;
5980: inputStream->free = NULL;
5981:
5982: inputPush(ctxt, inputStream);
5983: return(ctxt);
5984: }
5985:
5986: /**
5987: * sgmlSAXParseFile :
5988: * @filename: the filename
5989: * @encoding: a free form C string describing the SGML document encoding, or NULL
5990: * @sax: the SAX handler block
5991: * @userData: if using SAX, this pointer will be provided on callbacks.
5992: *
5993: * parse an SGML file and build a tree. Automatic support for ZLIB/Compress
5994: * compressed document is provided by default if found at compile-time.
5995: * It use the given SAX function block to handle the parsing callback.
5996: * If sax is NULL, fallback to the default DOM tree building routines.
5997: *
5998: * Returns the resulting document tree
5999: */
6000:
6001: sgmlDocPtr
6002: sgmlSAXParseFile(const char *filename, const char *encoding, sgmlSAXHandlerPtr sax,
6003: void *userData) {
6004: sgmlDocPtr ret;
6005: sgmlParserCtxtPtr ctxt;
6006: sgmlSAXHandlerPtr oldsax = NULL;
6007:
6008: ctxt = sgmlCreateFileParserCtxt(filename, encoding);
6009: if (ctxt == NULL) return(NULL);
6010: if (sax != NULL) {
6011: oldsax = ctxt->sax;
6012: ctxt->sax = sax;
6013: ctxt->userData = userData;
6014: }
6015:
6016: sgmlParseDocument(ctxt);
6017:
6018: ret = ctxt->myDoc;
6019: if (sax != NULL) {
6020: ctxt->sax = oldsax;
6021: ctxt->userData = NULL;
6022: }
6023: sgmlFreeParserCtxt(ctxt);
6024:
6025: return(ret);
6026: }
6027:
6028: /**
6029: * sgmlParseFile :
6030: * @filename: the filename
6031: * @encoding: a free form C string describing the SGML document encoding, or NULL
6032: *
6033: * parse an SGML file and build a tree. Automatic support for ZLIB/Compress
6034: * compressed document is provided by default if found at compile-time.
6035: *
6036: * Returns the resulting document tree
6037: */
6038:
6039: sgmlDocPtr
6040: sgmlParseFile(const char *filename, const char *encoding) {
6041: return(sgmlSAXParseFile(filename, encoding, NULL, NULL));
6042: }
6043:
6044: #endif /* LIBXML_SGML_ENABLED */
Webmaster