Annotation of XML/SGMLparser.c, revision 1.5
1.1 veillard 1: /*
2: * SGMLparser.c : an attempt to parse Docbook documents
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
9: #ifdef WIN32
10: #include "win32config.h"
11: #else
12: #include "config.h"
13: #endif
14:
15: #include "xmlversion.h"
16: #ifdef LIBXML_SGML_ENABLED
17:
18: #include <stdio.h>
19: #include <string.h>
20: #ifdef HAVE_CTYPE_H
21: #include <ctype.h>
22: #endif
23: #ifdef HAVE_STDLIB_H
24: #include <stdlib.h>
25: #endif
26: #ifdef HAVE_SYS_STAT_H
27: #include <sys/stat.h>
28: #endif
29: #ifdef HAVE_FCNTL_H
30: #include <fcntl.h>
31: #endif
32: #ifdef HAVE_UNISTD_H
33: #include <unistd.h>
34: #endif
35: #ifdef HAVE_ZLIB_H
36: #include <zlib.h>
37: #endif
38:
39: #include <libxml/xmlmemory.h>
40: #include <libxml/tree.h>
41: #include <libxml/SGMLparser.h>
42: #include <libxml/entities.h>
43: #include <libxml/encoding.h>
44: #include <libxml/parser.h>
45: #include <libxml/valid.h>
46: #include <libxml/parserInternals.h>
47: #include <libxml/xmlIO.h>
48: #include <libxml/SAX.h>
1.3 veillard 49: #include <libxml/uri.h>
1.1 veillard 50: #include "xml-error.h"
51:
/*
 * Size limits for the SGML parser.
 * NOTE(review): judging by the names these bound element-name length and
 * the parser's working buffers; none of them is referenced in this part of
 * the file, so confirm against the use sites before changing a value.
 */
52: #define SGML_MAX_NAMELEN 1000
53: #define SGML_PARSER_BIG_BUFFER_SIZE 1000
54: #define SGML_PARSER_BUFFER_SIZE 100
55:
56: /* #define DEBUG */
57: /* #define DEBUG_PUSH */
58:
59: /************************************************************************
60: * *
61: * Parser stacks related functions and macros *
62: * *
63: ************************************************************************/
64:
65: /*
66: * Generic function for accessing stacks in the Parser Context
67: */
68:
69: #define PUSH_AND_POP(scope, type, name) \
70: scope int sgml##name##Push(sgmlParserCtxtPtr ctxt, type value) { \
71: if (ctxt->name##Nr >= ctxt->name##Max) { \
72: ctxt->name##Max *= 2; \
73: ctxt->name##Tab = (type *) xmlRealloc(ctxt->name##Tab, \
74: ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \
75: if (ctxt->name##Tab == NULL) { \
76: fprintf(stderr, "realloc failed !\n"); \
77: return(0); \
78: } \
79: } \
80: ctxt->name##Tab[ctxt->name##Nr] = value; \
81: ctxt->name = value; \
82: return(ctxt->name##Nr++); \
83: } \
84: scope type sgml##name##Pop(sgmlParserCtxtPtr ctxt) { \
85: type ret; \
86: if (ctxt->name##Nr < 0) return(0); \
87: ctxt->name##Nr--; \
88: if (ctxt->name##Nr < 0) return(0); \
89: if (ctxt->name##Nr > 0) \
90: ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \
91: else \
92: ctxt->name = NULL; \
93: ret = ctxt->name##Tab[ctxt->name##Nr]; \
94: ctxt->name##Tab[ctxt->name##Nr] = 0; \
95: return(ret); \
96: } \
97:
98: PUSH_AND_POP(extern, xmlNodePtr, node)
99: PUSH_AND_POP(extern, xmlChar*, name)
100:
101: /*
102: * Macros for accessing the content. Those should be used only by the parser,
103: * and not exported.
104: *
105: * Dirty macros, i.e. one needs to make assumptions about the context to use them
106: *
107: * CUR_PTR return the current pointer to the xmlChar to be parsed.
108: * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
109: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
110: * in UNICODE mode. This should be used internally by the parser
111: * only to compare to ASCII values otherwise it would break when
112: * running with UTF-8 encoding.
113: * NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
114: * to compare on ASCII based substring.
115: * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
116: * it should be used only to compare on ASCII based substring.
117: * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
118: * strings within the parser.
119: *
120: * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
121: *
122: * CURRENT Returns the current char value, with the full decoding of
123: * UTF-8 if we are using this mode. It returns an int.
124: * NEXT Skip to the next character, this does the proper decoding
125: * in UTF-8 mode. It also pop-up unfinished entities on the fly.
126: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
127: */
128:
/*
 * All of the macros below expand to expressions or statements that use a
 * variable named ctxt, which must be in scope at the point of use.
 */
/* UPPER: toupper() applied to the byte at the current input position. */
129: #define UPPER (toupper(*ctxt->input->cur))
130:
/* SKIP: advance both ctxt->nbChars and the input cursor by val. */
131: #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
132:
/* NXT: the byte val positions after the input cursor. */
133: #define NXT(val) ctxt->input->cur[(val)]
134:
/* UPP: toupper() applied to the byte val positions after the input cursor. */
135: #define UPP(val) (toupper(ctxt->input->cur[(val)]))
136:
/* CUR_PTR: the input cursor itself. */
137: #define CUR_PTR ctxt->input->cur
138:
/* SHRINK / GROW: call xmlParserInputShrink / xmlParserInputGrow on the current input. */
139: #define SHRINK xmlParserInputShrink(ctxt->input)
140:
141: #define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
142:
/* CURRENT: the byte at the input cursor, converted to int. */
143: #define CURRENT ((int) (*ctxt->input->cur))
144:
/* SKIP_BLANKS: calls sgmlSkipBlankChars(); the expansion already ends with ';'. */
145: #define SKIP_BLANKS sgmlSkipBlankChars(ctxt);
146:
/* The "#if 0" branch is disabled; the definitions after "#else" are the ones in effect. */
147: #if 0
148: #define CUR ((int) (*ctxt->input->cur))
149: #define NEXT sgmlNextChar(ctxt);
150: #else
151: /* Imported from XML */
152:
153: /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
154: #define CUR ((int) (*ctxt->input->cur))
/*
 * NEXT expands to two separate statements (xmlNextChar() then the nbChars
 * increment), so it must not be used as the unbraced body of an if/else/loop.
 */
155: #define NEXT xmlNextChar(ctxt);ctxt->nbChars++;
156:
/* RAW: -1 while ctxt->token is non-zero, otherwise the byte at the input cursor. */
157: #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* NXT and CUR_PTR are defined again here with the same replacement text as above. */
158: #define NXT(val) ctxt->input->cur[(val)]
159: #define CUR_PTR ctxt->input->cur
160:
161:
/*
 * NEXTL(l): update line/col for the byte at the cursor, clear ctxt->token,
 * advance the cursor by l and count one more character in nbChars.
 * Expands to several statements; see the note on NEXT.
 */
162: #define NEXTL(l) \
163: if (*(ctxt->input->cur) == '\n') { \
164: ctxt->input->line++; ctxt->input->col = 1; \
165: } else ctxt->input->col++; \
166: ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;
167:
168: /************
169: \
170: if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
171: if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
172: ************/
173:
/*
 * CUR_CHAR / CUR_SCHAR: call sgmlCurrentChar() / xmlStringCurrentChar(),
 * passing the address of l to receive the length. Both expansions already
 * end with ';'.
 */
174: #define CUR_CHAR(l) sgmlCurrentChar(ctxt, &l);
175: #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l);
176:
/*
 * COPY_BUF(l,b,i,v): store v in b at index i and advance i: a single
 * xmlChar when l is 1, otherwise whatever xmlCopyChar() reports as written.
 * Expands to an if/else statement; the arguments are not parenthesized.
 */
177: #define COPY_BUF(l,b,i,v) \
178: if (l == 1) b[i++] = (xmlChar) v; \
179: else i += xmlCopyChar(l,&b[i],v);
180: #endif
181:
182: /**
183: * sgmlCurrentChar:
184: * @ctxt: the SGML parser context
185: * @len: pointer to the length of the char read
186: *
187: * The current char value, if using UTF-8 this may actaully span multiple
188: * bytes in the input buffer. Implement the end of line normalization:
189: * 2.11 End-of-Line Handling
190: * If the encoding is unspecified, in the case we find an ISO-Latin-1
191: * char, then the encoding converter is plugged in automatically.
192: *
193: * Returns the current char value and its lenght
194: */
195:
196: int
197: sgmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
198: if (ctxt->instate == XML_PARSER_EOF)
199: return(0);
200:
201: if (ctxt->token != 0) {
202: *len = 0;
203: return(ctxt->token);
204: }
205: if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
206: /*
207: * We are supposed to handle UTF8, check it's valid
208: * From rfc2044: encoding of the Unicode values on UTF-8:
209: *
210: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
211: * 0000 0000-0000 007F 0xxxxxxx
212: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
213: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
214: *
215: * Check for the 0x110000 limit too
216: */
217: const unsigned char *cur = ctxt->input->cur;
218: unsigned char c;
219: unsigned int val;
220:
221: c = *cur;
222: if (c & 0x80) {
223: if (cur[1] == 0)
224: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
225: if ((cur[1] & 0xc0) != 0x80)
226: goto encoding_error;
227: if ((c & 0xe0) == 0xe0) {
228:
229: if (cur[2] == 0)
230: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
231: if ((cur[2] & 0xc0) != 0x80)
232: goto encoding_error;
233: if ((c & 0xf0) == 0xf0) {
234: if (cur[3] == 0)
235: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
236: if (((c & 0xf8) != 0xf0) ||
237: ((cur[3] & 0xc0) != 0x80))
238: goto encoding_error;
239: /* 4-byte code */
240: *len = 4;
241: val = (cur[0] & 0x7) << 18;
242: val |= (cur[1] & 0x3f) << 12;
243: val |= (cur[2] & 0x3f) << 6;
244: val |= cur[3] & 0x3f;
245: } else {
246: /* 3-byte code */
247: *len = 3;
248: val = (cur[0] & 0xf) << 12;
249: val |= (cur[1] & 0x3f) << 6;
250: val |= cur[2] & 0x3f;
251: }
252: } else {
253: /* 2-byte code */
254: *len = 2;
255: val = (cur[0] & 0x1f) << 6;
256: val |= cur[1] & 0x3f;
257: }
258: if (!IS_CHAR(val)) {
259: if ((ctxt->sax != NULL) &&
260: (ctxt->sax->error != NULL))
261: ctxt->sax->error(ctxt->userData,
262: "Char 0x%X out of allowed range\n", val);
263: ctxt->errNo = XML_ERR_INVALID_ENCODING;
264: ctxt->wellFormed = 0;
265: ctxt->disableSAX = 1;
266: }
267: return(val);
268: } else {
269: /* 1-byte code */
270: *len = 1;
271: return((int) *ctxt->input->cur);
272: }
273: }
274: /*
275: * Assume it's a fixed lenght encoding (1) with
276: * a compatibke encoding for the ASCII set, since
277: * XML constructs only use < 128 chars
278: */
279: *len = 1;
280: if ((int) *ctxt->input->cur < 0x80)
281: return((int) *ctxt->input->cur);
282:
283: /*
284: * Humm this is bad, do an automatic flow conversion
285: */
286: xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
287: ctxt->charset = XML_CHAR_ENCODING_UTF8;
288: return(xmlCurrentChar(ctxt, len));
289:
290: encoding_error:
291: /*
292: * If we detect an UTF8 error that probably mean that the
293: * input encoding didn't get properly advertized in the
294: * declaration header. Report the error and switch the encoding
295: * to ISO-Latin-1 (if you don't like this policy, just declare the
296: * encoding !)
297: */
298: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
299: ctxt->sax->error(ctxt->userData,
300: "Input is not proper UTF-8, indicate encoding !\n");
301: ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
302: ctxt->input->cur[0], ctxt->input->cur[1],
303: ctxt->input->cur[2], ctxt->input->cur[3]);
304: }
305: ctxt->errNo = XML_ERR_INVALID_ENCODING;
306:
307: ctxt->charset = XML_CHAR_ENCODING_8859_1;
308: *len = 1;
309: return((int) *ctxt->input->cur);
310: }
311:
312: /**
313: * sgmlNextChar:
314: * @ctxt: the SGML parser context
315: *
316: * Skip to the next char input char.
317: */
318:
319: void
320: sgmlNextChar(sgmlParserCtxtPtr ctxt) {
321: if (ctxt->instate == XML_PARSER_EOF)
322: return;
323: if ((*ctxt->input->cur == 0) &&
324: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
325: xmlPopInput(ctxt);
326: } else {
327: if (*(ctxt->input->cur) == '\n') {
328: ctxt->input->line++; ctxt->input->col = 1;
329: } else ctxt->input->col++;
330: ctxt->input->cur++;
331: ctxt->nbChars++;
332: if (*ctxt->input->cur == 0)
333: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
334: }
335: }
336:
337: /**
338: * sgmlSkipBlankChars:
339: * @ctxt: the SGML parser context
340: *
341: * skip all blanks character found at that point in the input streams.
342: *
343: * Returns the number of space chars skipped
344: */
345:
346: int
347: sgmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
348: int res = 0;
349:
350: while (IS_BLANK(*(ctxt->input->cur))) {
351: if ((*ctxt->input->cur == 0) &&
352: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
353: xmlPopInput(ctxt);
354: } else {
355: if (*(ctxt->input->cur) == '\n') {
356: ctxt->input->line++; ctxt->input->col = 1;
357: } else ctxt->input->col++;
358: ctxt->input->cur++;
359: ctxt->nbChars++;
360: if (*ctxt->input->cur == 0)
361: xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
362: }
363: res++;
364: }
365: return(res);
366: }
367:
368:
369:
370: /************************************************************************
371: * *
372: * The list of SGML elements and their properties *
373: * *
374: ************************************************************************/
375:
376: /*
377: * Start Tag: 1 means the start tag can be ommited
378: * End Tag: 1 means the end tag can be ommited
379: * 2 means it's forbidden (empty elements)
380: * Depr: this element is deprecated
381: * DTD: 1 means that this element is valid only in the Loose DTD
382: * 2 means that this element is valid only in the Frameset DTD
383: *
384: * Name,Start Tag,End Tag, Empty, Depr., DTD, Description
385: */
386: sgmlElemDesc docbookElementTable[] = {
387: { "abbrev", 0, 0, 0, 3, 0, "" }, /* word */
388: { "abstract", 0, 0, 0, 9, 0, "" }, /* title */
389: { "accel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
390: { "ackno", 0, 0, 0, 4, 0, "" }, /* docinfo */
391: { "acronym", 0, 0, 0, 3, 0, "" }, /* word */
392: { "action", 0, 0, 0, 7, 0, "" }, /* smallcptr */
393: { "address", 0, 0, 0, 1, 0, "" },
394: { "affiliation",0, 0, 0, 9, 0, "" }, /* shortaffil */
395: { "alt", 0, 0, 0, 1, 0, "" },
396: { "anchor", 0, 2, 1, 0, 0, "" },
397: { "answer", 0, 0, 0, 9, 0, "" }, /* label */
398: { "appendix", 0, 0, 0, 9, 0, "" }, /* appendixinfo */
399: { "appendixinfo",0, 0, 0, 9, 0, "" }, /* graphic */
400: { "application",0, 0, 0, 2, 0, "" }, /* para */
401: { "area", 0, 2, 1, 0, 0, "" },
402: { "areaset", 0, 0, 0, 9, 0, "" }, /* area */
403: { "areaspec", 0, 0, 0, 9, 0, "" }, /* area */
404: { "arg", 0, 0, 0, 1, 0, "" },
405: { "article", 0, 0, 0, 9, 0, "" }, /* div.title.content */
406: { "articleinfo",0, 0, 0, 9, 0, "" }, /* graphic */
407: { "artpagenums",0, 0, 0, 4, 0, "" }, /* docinfo */
408: { "attribution",0, 0, 0, 2, 0, "" }, /* para */
409: { "audiodata", 0, 2, 1, 0, 0, "" },
410: { "audioobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
411: { "authorblurb",0, 0, 0, 9, 0, "" }, /* title */
412: { "authorgroup",0, 0, 0, 9, 0, "" }, /* author */
413: { "authorinitials",0, 0, 0, 4, 0, "" }, /* docinfo */
414: { "author", 0, 0, 0, 9, 0, "" }, /* person.ident.mix */
415: { "beginpage", 0, 2, 1, 0, 0, "" },
416: { "bibliodiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
417: { "biblioentry",0, 0, 0, 9, 0, "" }, /* articleinfo */
418: { "bibliography",0, 0, 0, 9, 0, "" }, /* bibliographyinfo */
419: { "bibliographyinfo",0, 0, 0, 9, 0, "" }, /* graphic */
420: { "bibliomisc", 0, 0, 0, 2, 0, "" }, /* para */
421: { "bibliomixed",0, 0, 0, 1, 0, "" }, /* %bibliocomponent.mix, bibliomset) */
422: { "bibliomset", 0, 0, 0, 1, 0, "" }, /* %bibliocomponent.mix; | bibliomset) */
423: { "biblioset", 0, 0, 0, 9, 0, "" }, /* bibliocomponent.mix */
424: { "blockquote", 0, 0, 0, 9, 0, "" }, /* title */
425: { "book", 0, 0, 0, 9, 0, "" }, /* div.title.content */
426: { "bookinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
427: { "bridgehead", 0, 0, 0, 8, 0, "" }, /* title */
428: { "callout", 0, 0, 0, 9, 0, "" }, /* component.mix */
429: { "calloutlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
430: { "caption", 0, 0, 0, 9, 0, "" }, /* textobject.mix */
431: { "caution", 0, 0, 0, 9, 0, "" }, /* title */
432: { "chapter", 0, 0, 0, 9, 0, "" }, /* chapterinfo */
433: { "chapterinfo",0, 0, 0, 9, 0, "" }, /* graphic */
434: { "citation", 0, 0, 0, 2, 0, "" }, /* para */
435: { "citerefentry",0, 0, 0, 9, 0, "" }, /* refentrytitle */
436: { "citetitle", 0, 0, 0, 2, 0, "" }, /* para */
437: { "city", 0, 0, 0, 4, 0, "" }, /* docinfo */
438: { "classname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
439: { "classsynopsisinfo",0,0, 0, 9, 0, "" }, /* cptr */
440: { "classsynopsis",0, 0, 0, 9, 0, "" }, /* ooclass */
441: { "cmdsynopsis",0, 0, 0, 9, 0, "" }, /* command */
442: { "co", 0, 2, 1, 0, 0, "" },
443: { "collab", 0, 0, 0, 9, 0, "" }, /* collabname */
444: { "collabname", 0, 0, 0, 4, 0, "" }, /* docinfo */
445: { "colophon", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
446: { "colspec", 0, 2, 1, 0, 0, "" },
447: { "colspec", 0, 2, 1, 0, 0, "" },
448: { "command", 0, 0, 0, 9, 0, "" }, /* cptr */
449: { "computeroutput",0, 0, 0, 9, 0, "" }, /* cptr */
450: { "confdates", 0, 0, 0, 4, 0, "" }, /* docinfo */
451: { "confgroup", 0, 0, 0, 9, 0, "" }, /* confdates */
452: { "confnum", 0, 0, 0, 4, 0, "" }, /* docinfo */
453: { "confsponsor",0, 0, 0, 4, 0, "" }, /* docinfo */
454: { "conftitle", 0, 0, 0, 4, 0, "" }, /* docinfo */
455: { "constant", 0, 0, 0, 7, 0, "" }, /* smallcptr */
456: { "constructorsynopsis",0,0, 0, 9, 0, "" }, /* modifier */
457: { "contractnum",0, 0, 0, 4, 0, "" }, /* docinfo */
458: { "contractsponsor",0, 0, 0, 4, 0, "" }, /* docinfo */
459: { "contrib", 0, 0, 0, 4, 0, "" }, /* docinfo */
460: { "copyright", 0, 0, 0, 9, 0, "" }, /* year */
461: { "corpauthor", 0, 0, 0, 4, 0, "" }, /* docinfo */
462: { "corpname", 0, 0, 0, 4, 0, "" }, /* docinfo */
463: { "country", 0, 0, 0, 4, 0, "" }, /* docinfo */
464: { "database", 0, 0, 0, 7, 0, "" }, /* smallcptr */
465: { "date", 0, 0, 0, 4, 0, "" }, /* docinfo */
466: { "dedication", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
467: { "destructorsynopsis",0,0, 0, 9, 0, "" }, /* modifier */
468: { "edition", 0, 0, 0, 4, 0, "" }, /* docinfo */
469: { "editor", 0, 0, 0, 9, 0, "" }, /* person.ident.mix */
470: { "email", 0, 0, 0, 4, 0, "" }, /* docinfo */
471: { "emphasis", 0, 0, 0, 2, 0, "" }, /* para */
472: { "entry", 0, 0, 0, 9, 0, "" }, /* tbl.entry.mdl */
473: { "entrytbl", 0, 0, 0, 9, 0, "" }, /* tbl.entrytbl.mdl */
474: { "envar", 0, 0, 0, 7, 0, "" }, /* smallcptr */
475: { "epigraph", 0, 0, 0, 9, 0, "" }, /* attribution */
476: { "equation", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
477: { "errorcode", 0, 0, 0, 7, 0, "" }, /* smallcptr */
478: { "errorname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
479: { "errortype", 0, 0, 0, 7, 0, "" }, /* smallcptr */
480: { "example", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
481: { "exceptionname",0, 0, 0, 7, 0, "" }, /* smallcptr */
482: { "fax", 0, 0, 0, 4, 0, "" }, /* docinfo */
483: { "fieldsynopsis", 0, 0, 0, 9, 0, "" }, /* modifier */
484: { "figure", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
485: { "filename", 0, 0, 0, 7, 0, "" }, /* smallcptr */
486: { "firstname", 0, 0, 0, 4, 0, "" }, /* docinfo */
487: { "firstterm", 0, 0, 0, 3, 0, "" }, /* word */
488: { "footnote", 0, 0, 0, 9, 0, "" }, /* footnote.mix */
489: { "footnoteref",0, 2, 1, 0, 0, "" },
490: { "foreignphrase",0, 0, 0, 2, 0, "" }, /* para */
491: { "formalpara", 0, 0, 0, 9, 0, "" }, /* title */
492: { "funcdef", 0, 0, 0, 1, 0, "" },
493: { "funcparams", 0, 0, 0, 9, 0, "" }, /* cptr */
494: { "funcprototype",0, 0, 0, 9, 0, "" }, /* funcdef */
495: { "funcsynopsis",0, 0, 0, 9, 0, "" }, /* funcsynopsisinfo */
496: { "funcsynopsisinfo", 0, 0, 0, 9, 0, "" }, /* cptr */
497: { "function", 0, 0, 0, 9, 0, "" }, /* cptr */
498: { "glossary", 0, 0, 0, 9, 0, "" }, /* glossaryinfo */
499: { "glossaryinfo",0, 0, 0, 9, 0, "" }, /* graphic */
500: { "glossdef", 0, 0, 0, 9, 0, "" }, /* glossdef.mix */
501: { "glossdiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
502: { "glossentry", 0, 0, 0, 9, 0, "" }, /* glossterm */
503: { "glosslist", 0, 0, 0, 9, 0, "" }, /* glossentry */
504: { "glossseealso",0, 0, 0, 2, 0, "" }, /* para */
505: { "glosssee", 0, 0, 0, 2, 0, "" }, /* para */
506: { "glossterm", 0, 0, 0, 2, 0, "" }, /* para */
507: { "graphic", 0, 2, 1, 0, 0, "" },
508: { "graphicco", 0, 0, 0, 9, 0, "" }, /* areaspec */
509: { "group", 0, 0, 0, 9, 0, "" }, /* arg */
510: { "guibutton", 0, 0, 0, 7, 0, "" }, /* smallcptr */
511: { "guiicon", 0, 0, 0, 7, 0, "" }, /* smallcptr */
512: { "guilabel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
513: { "guimenuitem",0, 0, 0, 7, 0, "" }, /* smallcptr */
514: { "guimenu", 0, 0, 0, 7, 0, "" }, /* smallcptr */
515: { "guisubmenu", 0, 0, 0, 7, 0, "" }, /* smallcptr */
516: { "hardware", 0, 0, 0, 7, 0, "" }, /* smallcptr */
517: { "highlights", 0, 0, 0, 9, 0, "" }, /* highlights.mix */
518: { "holder", 0, 0, 0, 4, 0, "" }, /* docinfo */
519: { "honorific", 0, 0, 0, 4, 0, "" }, /* docinfo */
520: { "imagedata", 0, 2, 1, 0, 0, "" },
521: { "imageobjectco",0, 0, 0, 9, 0, "" }, /* areaspec */
522: { "imageobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
523: { "important", 0, 0, 0, 9, 0, "" }, /* title */
524: { "indexdiv", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
525: { "indexentry", 0, 0, 0, 9, 0, "" }, /* primaryie */
526: { "index", 0, 0, 0, 9, 0, "" }, /* indexinfo */
527: { "indexinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
528: { "indexterm", 0, 0, 0, 9, 0, "" }, /* primary */
529: { "informalequation",0, 0, 0, 9, 0, "" }, /* equation.content */
530: { "informalexample",0, 0, 0, 9, 0, "" }, /* example.mix */
531: { "informalfigure",0, 0, 0, 9, 0, "" }, /* figure.mix */
532: { "informaltable",0, 0, 0, 9, 0, "" }, /* graphic */
533: { "initializer",0, 0, 0, 7, 0, "" }, /* smallcptr */
534: { "inlineequation",0, 0, 0, 9, 0, "" }, /* inlineequation.content */
535: { "inlinegraphic",0, 2, 1, 0, 0, "" },
536: { "inlinemediaobject",0,0, 0, 9, 0, "" }, /* objectinfo */
537: { "interfacename",0, 0, 0, 7, 0, "" }, /* smallcptr */
538: { "interface", 0, 0, 0, 7, 0, "" }, /* smallcptr */
539: { "invpartnumber",0, 0, 0, 4, 0, "" }, /* docinfo */
540: { "isbn", 0, 0, 0, 4, 0, "" }, /* docinfo */
541: { "issn", 0, 0, 0, 4, 0, "" }, /* docinfo */
542: { "issuenum", 0, 0, 0, 4, 0, "" }, /* docinfo */
543: { "itemizedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
544: { "itermset", 0, 0, 0, 9, 0, "" }, /* indexterm */
545: { "jobtitle", 0, 0, 0, 4, 0, "" }, /* docinfo */
546: { "keycap", 0, 0, 0, 7, 0, "" }, /* smallcptr */
547: { "keycode", 0, 0, 0, 7, 0, "" }, /* smallcptr */
548: { "keycombo", 0, 0, 0, 9, 0, "" }, /* keycap */
549: { "keysym", 0, 0, 0, 7, 0, "" }, /* smallcptr */
550: { "keyword", 0, 0, 0, 1, 0, "" },
551: { "keywordset", 0, 0, 0, 9, 0, "" }, /* keyword */
552: { "label", 0, 0, 0, 3, 0, "" }, /* word */
553: { "legalnotice",0, 0, 0, 9, 0, "" }, /* title */
554: { "lineage", 0, 0, 0, 4, 0, "" }, /* docinfo */
555: { "lineannotation",0, 0, 0, 2, 0, "" }, /* para */
556: { "link", 0, 0, 0, 2, 0, "" }, /* para */
557: { "listitem", 0, 0, 0, 9, 0, "" }, /* component.mix */
558: { "literal", 0, 0, 0, 9, 0, "" }, /* cptr */
559: { "literallayout",0, 0, 0, 2, 0, "" }, /* para */
560: { "lot", 0, 0, 0, 9, 0, "" }, /* bookcomponent.title.content */
561: { "lotentry", 0, 0, 0, 2, 0, "" }, /* para */
562: { "manvolnum", 0, 0, 0, 3, 0, "" }, /* word */
563: { "markup", 0, 0, 0, 7, 0, "" }, /* smallcptr */
564: { "medialabel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
565: { "mediaobjectco",0, 0, 0, 9, 0, "" }, /* objectinfo */
566: { "mediaobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
567: { "member", 0, 0, 0, 2, 0, "" }, /* para */
568: { "menuchoice", 0, 0, 0, 9, 0, "" }, /* shortcut */
569: { "methodname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
570: { "methodparam",0, 0, 0, 9, 0, "" }, /* modifier */
571: { "methodsynopsis",0, 0, 0, 9, 0, "" }, /* modifier */
572: { "modespec", 0, 0, 0, 4, 0, "" }, /* docinfo */
573: { "modifier", 0, 0, 0, 7, 0, "" }, /* smallcptr */
574: { "mousebutton",0, 0, 0, 7, 0, "" }, /* smallcptr */
575: { "msgaud", 0, 0, 0, 2, 0, "" }, /* para */
576: { "msgentry", 0, 0, 0, 9, 0, "" }, /* msg */
577: { "msgexplan", 0, 0, 0, 9, 0, "" }, /* title */
578: { "msginfo", 0, 0, 0, 9, 0, "" }, /* msglevel */
579: { "msglevel", 0, 0, 0, 7, 0, "" }, /* smallcptr */
580: { "msgmain", 0, 0, 0, 9, 0, "" }, /* title */
581: { "msgorig", 0, 0, 0, 7, 0, "" }, /* smallcptr */
582: { "msgrel", 0, 0, 0, 9, 0, "" }, /* title */
583: { "msgset", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
584: { "msgsub", 0, 0, 0, 9, 0, "" }, /* title */
585: { "msgtext", 0, 0, 0, 9, 0, "" }, /* component.mix */
586: { "msg", 0, 0, 0, 9, 0, "" }, /* title */
587: { "note", 0, 0, 0, 9, 0, "" }, /* title */
588: { "objectinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
589: { "olink", 0, 0, 0, 2, 0, "" }, /* para */
590: { "ooclass", 0, 0, 0, 9, 0, "" }, /* modifier */
591: { "ooexception",0, 0, 0, 9, 0, "" }, /* modifier */
592: { "oointerface",0, 0, 0, 9, 0, "" }, /* modifier */
593: { "optional", 0, 0, 0, 9, 0, "" }, /* cptr */
594: { "option", 0, 0, 0, 7, 0, "" }, /* smallcptr */
595: { "orderedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
596: { "orgdiv", 0, 0, 0, 4, 0, "" }, /* docinfo */
597: { "orgname", 0, 0, 0, 4, 0, "" }, /* docinfo */
598: { "otheraddr", 0, 0, 0, 4, 0, "" }, /* docinfo */
599: { "othercredit",0, 0, 0, 9, 0, "" }, /* person.ident.mix */
600: { "othername", 0, 0, 0, 4, 0, "" }, /* docinfo */
601: { "pagenums", 0, 0, 0, 4, 0, "" }, /* docinfo */
602: { "paramdef", 0, 0, 0, 1, 0, "" },
603: { "parameter", 0, 0, 0, 7, 0, "" }, /* smallcptr */
604: { "para", 0, 0, 0, 2, 0, "" }, /* para */
605: { "partinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
606: { "partintro", 0, 0, 0, 9, 0, "" }, /* div.title.content */
607: { "part", 0, 0, 0, 9, 0, "" }, /* partinfo */
608: { "phone", 0, 0, 0, 4, 0, "" }, /* docinfo */
609: { "phrase", 0, 0, 0, 2, 0, "" }, /* para */
610: { "pob", 0, 0, 0, 4, 0, "" }, /* docinfo */
611: { "postcode", 0, 0, 0, 4, 0, "" }, /* docinfo */
612: { "prefaceinfo",0, 0, 0, 9, 0, "" }, /* graphic */
613: { "preface", 0, 0, 0, 9, 0, "" }, /* prefaceinfo */
614: { "primaryie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
615: { "primary ", 0, 0, 0, 4, 0, "" }, /* ndxterm */
616: { "printhistory",0, 0, 0, 9, 0, "" }, /* para.class */
617: { "procedure", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
618: { "productname",0, 0, 0, 2, 0, "" }, /* para */
619: { "productnumber",0, 0, 0, 4, 0, "" }, /* docinfo */
620: { "programlistingco",0, 0, 0, 9, 0, "" }, /* areaspec */
621: { "programlisting",0, 0, 0, 2, 0, "" }, /* para */
622: { "prompt", 0, 0, 0, 7, 0, "" }, /* smallcptr */
623: { "property", 0, 0, 0, 7, 0, "" }, /* smallcptr */
624: { "pubdate", 0, 0, 0, 4, 0, "" }, /* docinfo */
625: { "publishername",0, 0, 0, 4, 0, "" }, /* docinfo */
626: { "publisher", 0, 0, 0, 9, 0, "" }, /* publishername */
627: { "pubsnumber", 0, 0, 0, 4, 0, "" }, /* docinfo */
628: { "qandadiv", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
629: { "qandaentry", 0, 0, 0, 9, 0, "" }, /* revhistory */
630: { "qandaset", 0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
631: { "question", 0, 0, 0, 9, 0, "" }, /* label */
632: { "quote", 0, 0, 0, 2, 0, "" }, /* para */
633: { "refclass", 0, 0, 0, 9, 0, "" }, /* refclass.char.mix */
634: { "refdescriptor",0, 0, 0, 9, 0, "" }, /* refname.char.mix */
635: { "refentryinfo",0, 0, 0, 9, 0, "" }, /* graphic */
636: { "refentry", 0, 0, 0, 9, 0, "" }, /* ndxterm.class */
637: { "refentrytitle",0, 0, 0, 2, 0, "" }, /* para */
638: { "referenceinfo",0, 0, 0, 9, 0, "" }, /* graphic */
639: { "reference", 0, 0, 0, 9, 0, "" }, /* referenceinfo */
640: { "refmeta", 0, 0, 0, 9, 0, "" }, /* ndxterm.class */
641: { "refmiscinfo",0, 0, 0, 4, 0, "" }, /* docinfo */
642: { "refnamediv", 0, 0, 0, 9, 0, "" }, /* refdescriptor */
643: { "refname", 0, 0, 0, 9, 0, "" }, /* refname.char.mix */
644: { "refpurpose", 0, 0, 0, 9, 0, "" }, /* refinline.char.mix */
645: { "refsect1info",0, 0, 0, 9, 0, "" }, /* graphic */
646: { "refsect1", 0, 0, 0, 9, 0, "" }, /* refsect */
647: { "refsect2info",0, 0, 0, 9, 0, "" }, /* graphic */
648: { "refsect2", 0, 0, 0, 9, 0, "" }, /* refsect */
649: { "refsect3info",0, 0, 0, 9, 0, "" }, /* graphic */
650: { "refsect3", 0, 0, 0, 9, 0, "" }, /* refsect */
651: { "refsynopsisdivinfo",0,0, 0, 9, 0, "" }, /* graphic */
652: { "refsynopsisdiv",0, 0, 0, 9, 0, "" }, /* refsynopsisdivinfo */
653: { "releaseinfo",0, 0, 0, 4, 0, "" }, /* docinfo */
654: { "remark", 0, 0, 0, 2, 0, "" }, /* para */
655: { "replaceable",0, 0, 0, 1, 0, "" },
656: { "returnvalue",0, 0, 0, 7, 0, "" }, /* smallcptr */
657: { "revdescription",0, 0, 0, 9, 0, "" }, /* revdescription.mix */
658: { "revhistory", 0, 0, 0, 9, 0, "" }, /* revision */
659: { "revision", 0, 0, 0, 9, 0, "" }, /* revnumber */
660: { "revnumber", 0, 0, 0, 4, 0, "" }, /* docinfo */
661: { "revremark", 0, 0, 0, 4, 0, "" }, /* docinfo */
662: { "row", 0, 0, 0, 9, 0, "" }, /* tbl.row.mdl */
663: { "row", 0, 0, 0, 9, 0, "" }, /* tbl.row.mdl */
664: { "sbr", 0, 2, 1, 0, 0, "" },
665: { "screenco", 0, 0, 0, 9, 0, "" }, /* areaspec */
666: { "screeninfo", 0, 0, 0, 2, 0, "" }, /* para */
667: { "screen", 0, 0, 0, 2, 0, "" }, /* para */
668: { "screenshot", 0, 0, 0, 9, 0, "" }, /* screeninfo */
669: { "secondaryie",0, 0, 0, 4, 0, "" }, /* ndxterm */
670: { "secondary", 0, 0, 0, 4, 0, "" }, /* ndxterm */
671: { "sect1info", 0, 0, 0, 9, 0, "" }, /* graphic */
672: { "sect1", 0, 0, 0, 9, 0, "" }, /* sect */
673: { "sect2info", 0, 0, 0, 9, 0, "" }, /* graphic */
674: { "sect2", 0, 0, 0, 9, 0, "" }, /* sect */
675: { "sect3info", 0, 0, 0, 9, 0, "" }, /* graphic */
676: { "sect3", 0, 0, 0, 9, 0, "" }, /* sect */
677: { "sect4info", 0, 0, 0, 9, 0, "" }, /* graphic */
678: { "sect4", 0, 0, 0, 9, 0, "" }, /* sect */
679: { "sect5info", 0, 0, 0, 9, 0, "" }, /* graphic */
680: { "sect5", 0, 0, 0, 9, 0, "" }, /* sect */
681: { "sectioninfo",0, 0, 0, 9, 0, "" }, /* graphic */
682: { "section", 0, 0, 0, 9, 0, "" }, /* sectioninfo */
683: { "seealsoie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
684: { "seealso", 0, 0, 0, 4, 0, "" }, /* ndxterm */
685: { "seeie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
686: { "see", 0, 0, 0, 4, 0, "" }, /* ndxterm */
687: { "seglistitem",0, 0, 0, 9, 0, "" }, /* seg */
688: { "segmentedlist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
689: { "seg", 0, 0, 0, 2, 0, "" }, /* para */
690: { "segtitle", 0, 0, 0, 8, 0, "" }, /* title */
691: { "seriesvolnums", 0, 0, 0, 4, 0, "" }, /* docinfo */
692: { "set", 0, 0, 0, 9, 0, "" }, /* div.title.content */
693: { "setindexinfo",0, 0, 0, 9, 0, "" }, /* graphic */
694: { "setindex", 0, 0, 0, 9, 0, "" }, /* setindexinfo */
695: { "setinfo", 0, 0, 0, 9, 0, "" }, /* graphic */
696: { "sgmltag", 0, 0, 0, 7, 0, "" }, /* smallcptr */
697: { "shortaffil", 0, 0, 0, 4, 0, "" }, /* docinfo */
698: { "shortcut", 0, 0, 0, 9, 0, "" }, /* keycap */
699: { "sidebarinfo",0, 0, 0, 9, 0, "" }, /* graphic */
700: { "sidebar", 0, 0, 0, 9, 0, "" }, /* sidebarinfo */
701: { "simpara", 0, 0, 0, 2, 0, "" }, /* para */
702: { "simplelist", 0, 0, 0, 9, 0, "" }, /* member */
703: { "simplemsgentry", 0, 0, 0, 9, 0, "" }, /* msgtext */
704: { "simplesect", 0, 0, 0, 9, 0, "" }, /* sect.title.content */
705: { "spanspec", 0, 2, 1, 0, 0, "" },
706: { "state", 0, 0, 0, 4, 0, "" }, /* docinfo */
707: { "step", 0, 0, 0, 9, 0, "" }, /* title */
708: { "street", 0, 0, 0, 4, 0, "" }, /* docinfo */
709: { "structfield",0, 0, 0, 7, 0, "" }, /* smallcptr */
710: { "structname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
711: { "subjectset", 0, 0, 0, 9, 0, "" }, /* subject */
712: { "subject", 0, 0, 0, 9, 0, "" }, /* subjectterm */
713: { "subjectterm",0, 0, 0, 1, 0, "" },
714: { "subscript", 0, 0, 0, 1, 0, "" },
715: { "substeps", 0, 0, 0, 9, 0, "" }, /* step */
716: { "subtitle", 0, 0, 0, 8, 0, "" }, /* title */
717: { "superscript", 0, 0, 0, 1, 0, "" },
718: { "surname", 0, 0, 0, 4, 0, "" }, /* docinfo */
719: { "symbol", 0, 0, 0, 7, 0, "" }, /* smallcptr */
720: { "synopfragment", 0, 0, 0, 9, 0, "" }, /* arg */
721: { "synopfragmentref", 0, 0, 0, 1, 0, "" },
722: { "synopsis", 0, 0, 0, 2, 0, "" }, /* para */
723: { "systemitem", 0, 0, 0, 7, 0, "" }, /* smallcptr */
724: { "table", 0, 0, 0, 9, 0, "" }, /* tbl.table.mdl */
725: /* { "%tbl.table.name;", 0, 0, 0, 9, 0, "" },*/ /* tbl.table.mdl */
726: { "tbody", 0, 0, 0, 9, 0, "" }, /* row */
727: { "tbody", 0, 0, 0, 9, 0, "" }, /* row */
728: { "term", 0, 0, 0, 2, 0, "" }, /* para */
729: { "tertiaryie", 0, 0, 0, 4, 0, "" }, /* ndxterm */
730: { "tertiary ", 0, 0, 0, 4, 0, "" }, /* ndxterm */
731: { "textobject", 0, 0, 0, 9, 0, "" }, /* objectinfo */
732: { "tfoot", 0, 0, 0, 9, 0, "" }, /* tbl.hdft.mdl */
733: { "tgroup", 0, 0, 0, 9, 0, "" }, /* tbl.tgroup.mdl */
734: { "tgroup", 0, 0, 0, 9, 0, "" }, /* tbl.tgroup.mdl */
735: { "thead", 0, 0, 0, 9, 0, "" }, /* row */
736: { "thead", 0, 0, 0, 9, 0, "" }, /* tbl.hdft.mdl */
737: { "tip", 0, 0, 0, 9, 0, "" }, /* title */
738: { "titleabbrev",0, 0, 0, 8, 0, "" }, /* title */
739: { "title", 0, 0, 0, 8, 0, "" }, /* title */
740: { "tocback", 0, 0, 0, 2, 0, "" }, /* para */
741: { "toc", 0, 0, 0, 9, 0, "" }, /* bookcomponent.title.content */
742: { "tocchap", 0, 0, 0, 9, 0, "" }, /* tocentry */
743: { "tocentry", 0, 0, 0, 2, 0, "" }, /* para */
744: { "tocfront", 0, 0, 0, 2, 0, "" }, /* para */
745: { "toclevel1", 0, 0, 0, 9, 0, "" }, /* tocentry */
746: { "toclevel2", 0, 0, 0, 9, 0, "" }, /* tocentry */
747: { "toclevel3", 0, 0, 0, 9, 0, "" }, /* tocentry */
748: { "toclevel4", 0, 0, 0, 9, 0, "" }, /* tocentry */
749: { "toclevel5", 0, 0, 0, 9, 0, "" }, /* tocentry */
750: { "tocpart", 0, 0, 0, 9, 0, "" }, /* tocentry */
751: { "token", 0, 0, 0, 7, 0, "" }, /* smallcptr */
752: { "trademark", 0, 0, 0, 1, 0, "" },
753: { "type", 0, 0, 0, 7, 0, "" }, /* smallcptr */
754: { "ulink", 0, 0, 0, 2, 0, "" }, /* para */
755: { "userinput", 0, 0, 0, 9, 0, "" }, /* cptr */
756: { "varargs", 0, 2, 1, 0, 0, "" },
757: { "variablelist",0, 0, 0, 9, 0, "" }, /* formalobject.title.content */
758: { "varlistentry",0, 0, 0, 9, 0, "" }, /* term */
759: { "varname", 0, 0, 0, 7, 0, "" }, /* smallcptr */
760: { "videodata", 0, 2, 1, 0, 0, "" },
761: { "videoobject",0, 0, 0, 9, 0, "" }, /* objectinfo */
762: { "void", 0, 2, 1, 0, 0, "" },
763: { "volumenum", 0, 0, 0, 4, 0, "" }, /* docinfo */
764: { "warning", 0, 0, 0, 9, 0, "" }, /* title */
765: { "wordasword", 0, 0, 0, 3, 0, "" }, /* word */
766: { "xref", 0, 2, 1, 0, 0, "" },
767: { "year", 0, 0, 0, 4, 0, "" }, /* docinfo */
768: };
769:
/*
 * Groups of start tags that imply the end of the current element.
 * Each group is terminated by a NULL entry and the whole table by an
 * additional NULL: a start tag listed in a group implies the end of the
 * current element when that element's type is listed in the same group.
 *
 * NOTE(review): the names below are HTML element names; no reader of this
 * table is visible in this part of the file - confirm it is still needed.
 */
char *sgmlEquEnd[] = {
    /* list items and options */
    "dt", "dd", "li", "option",
    NULL,
    /* headings */
    "h1", "h2", "h3", "h4", "h5", "h6",
    NULL,
    /* lists and preformatted blocks */
    "ol", "menu", "dir", "address", "pre", "listing", "xmp",
    NULL,
    /* end of table */
    NULL
};
/*
 * According to the DTD, HR should be added to the 2nd line above, as it
 * is not allowed within H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */
786:
/*
 * Start tags that imply the end of the current element.
 *
 * Layout expected by sgmlInitAutoClose() and sgmlCheckAutoClose(): a
 * sequence of NULL-terminated groups followed by a final NULL. The first
 * name of a group is the newly opened tag, the remaining names of the
 * group are the open elements which that tag closes.
 *
 * The table currently holds no group at all.
 */
char *sgmlStartClose[] = {
    NULL    /* end of table */
};
793:
/*
 * NULL-terminated list of element names that are not supposed to hold
 * character data directly; sgmlCheckParagraph() implies a "p" element
 * when the current element is one of them.
 *
 * The list is currently empty.
 *
 * TODO: extend that list by reading the DTD rules on implied paragraphs
 */
static char *sgmlNoContentElements[] = {
    NULL    /* end of list */
};
804:
805:
/*
 * Lookup index over sgmlStartClose: each used slot points at the first
 * entry of one group of that table, unused slots are NULL.
 */
static char **sgmlStartCloseIndex[100];

/*
 * Flag tested by sgmlInitAutoClose() and sgmlCheckAutoClose() to decide
 * whether the index above still has to be built.
 */
static int sgmlStartCloseIndexinitialized = 0;
808:
809: /************************************************************************
810: * *
811: * functions to handle SGML specific data *
812: * *
813: ************************************************************************/
814:
815: /**
816: * sgmlInitAutoClose:
817: *
818: * Initialize the sgmlStartCloseIndex for fast lookup of closing tags names.
819: *
820: */
821: void
822: sgmlInitAutoClose(void) {
823: int index, i = 0;
824:
825: if (sgmlStartCloseIndexinitialized) return;
826:
827: for (index = 0;index < 100;index ++) sgmlStartCloseIndex[index] = NULL;
828: index = 0;
829: while ((sgmlStartClose[i] != NULL) && (index < 100 - 1)) {
830: sgmlStartCloseIndex[index++] = &sgmlStartClose[i];
831: while (sgmlStartClose[i] != NULL) i++;
832: i++;
833: }
834: }
835:
836: /**
837: * sgmlTagLookup:
838: * @tag: The tag name
839: *
840: * Lookup the SGML tag in the ElementTable
841: *
842: * Returns the related sgmlElemDescPtr or NULL if not found.
843: */
844: sgmlElemDescPtr
845: sgmlTagLookup(const xmlChar *tag) {
846: int i;
847:
848: for (i = 0; i < (sizeof(docbookElementTable) /
849: sizeof(docbookElementTable[0]));i++) {
850: if (!xmlStrcmp(tag, BAD_CAST docbookElementTable[i].name))
851: return(&docbookElementTable[i]);
852: }
853: return(NULL);
854: }
855:
856: /**
857: * sgmlCheckAutoClose:
858: * @newtag: The new tag name
859: * @oldtag: The old tag name
860: *
861: * Checks wether the new tag is one of the registered valid tags for closing old.
862: * Initialize the sgmlStartCloseIndex for fast lookup of closing tags names.
863: *
864: * Returns 0 if no, 1 if yes.
865: */
866: int
867: sgmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
868: int i, index;
869: char **close;
870:
871: if (sgmlStartCloseIndexinitialized == 0) sgmlInitAutoClose();
872:
873: /* inefficient, but not a big deal */
874: for (index = 0; index < 100;index++) {
875: close = sgmlStartCloseIndex[index];
876: if (close == NULL) return(0);
877: if (!xmlStrcmp(BAD_CAST *close, newtag)) break;
878: }
879:
880: i = close - sgmlStartClose;
881: i++;
882: while (sgmlStartClose[i] != NULL) {
883: if (!xmlStrcmp(BAD_CAST sgmlStartClose[i], oldtag)) {
884: return(1);
885: }
886: i++;
887: }
888: return(0);
889: }
890:
891: /**
892: * sgmlAutoCloseOnClose:
893: * @ctxt: an SGML parser context
894: * @newtag: The new tag name
895: *
896: * The HTmL DtD allows an ending tag to implicitely close other tags.
897: */
898: void
899: sgmlAutoCloseOnClose(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
900: sgmlElemDescPtr info;
901: xmlChar *oldname;
902: int i;
903:
904: if ((newtag[0] == '/') && (newtag[1] == 0))
905: return;
906:
907: #ifdef DEBUG
908: fprintf(stderr,"Close of %s stack: %d elements\n", newtag, ctxt->nameNr);
909: for (i = 0;i < ctxt->nameNr;i++)
910: fprintf(stderr,"%d : %s\n", i, ctxt->nameTab[i]);
911: #endif
912:
913: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
914: if (!xmlStrcmp(newtag, ctxt->nameTab[i])) break;
915: }
916: if (i < 0) return;
917:
918: while (xmlStrcmp(newtag, ctxt->name)) {
919: info = sgmlTagLookup(ctxt->name);
920: if ((info == NULL) || (info->endTag == 1)) {
921: #ifdef DEBUG
922: fprintf(stderr,"sgmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
923: #endif
924: } else {
925: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
926: ctxt->sax->error(ctxt->userData,
927: "Opening and ending tag mismatch: %s and %s\n",
928: newtag, ctxt->name);
929: ctxt->wellFormed = 0;
930: }
931: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
932: ctxt->sax->endElement(ctxt->userData, ctxt->name);
933: oldname = sgmlnamePop(ctxt);
934: if (oldname != NULL) {
935: #ifdef DEBUG
936: fprintf(stderr,"sgmlAutoCloseOnClose: popped %s\n", oldname);
937: #endif
938: xmlFree(oldname);
939: }
940: }
941: }
942:
943: /**
944: * sgmlAutoClose:
945: * @ctxt: an SGML parser context
946: * @newtag: The new tag name or NULL
947: *
948: * The HTmL DtD allows a tag to implicitely close other tags.
949: * The list is kept in sgmlStartClose array. This function is
950: * called when a new tag has been detected and generates the
951: * appropriates closes if possible/needed.
952: * If newtag is NULL this mean we are at the end of the resource
953: * and we should check
954: */
955: void
956: sgmlAutoClose(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
957: xmlChar *oldname;
958: while ((newtag != NULL) && (ctxt->name != NULL) &&
959: (sgmlCheckAutoClose(newtag, ctxt->name))) {
960: #ifdef DEBUG
961: fprintf(stderr,"sgmlAutoClose: %s closes %s\n", newtag, ctxt->name);
962: #endif
963: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
964: ctxt->sax->endElement(ctxt->userData, ctxt->name);
965: oldname = sgmlnamePop(ctxt);
966: if (oldname != NULL) {
967: #ifdef DEBUG
968: fprintf(stderr,"sgmlAutoClose: popped %s\n", oldname);
969: #endif
970: xmlFree(oldname);
971: }
972: }
973: #if 0
974: if (newtag == NULL) {
975: sgmlAutoCloseOnClose(ctxt, BAD_CAST"head");
976: sgmlAutoCloseOnClose(ctxt, BAD_CAST"body");
977: sgmlAutoCloseOnClose(ctxt, BAD_CAST"sgml");
978: }
979: while ((newtag == NULL) && (ctxt->name != NULL) &&
980: ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
981: (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
982: (!xmlStrcmp(ctxt->name, BAD_CAST"sgml")))) {
983: #ifdef DEBUG
984: fprintf(stderr,"sgmlAutoClose: EOF closes %s\n", ctxt->name);
985: #endif
986: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
987: ctxt->sax->endElement(ctxt->userData, ctxt->name);
988: oldname = sgmlnamePop(ctxt);
989: if (oldname != NULL) {
990: #ifdef DEBUG
991: fprintf(stderr,"sgmlAutoClose: popped %s\n", oldname);
992: #endif
993: xmlFree(oldname);
994: }
995: }
996: #endif
997: }
998:
999: /**
1000: * sgmlAutoCloseTag:
1001: * @doc: the SGML document
1002: * @name: The tag name
1003: * @elem: the SGML element
1004: *
1005: * The HTmL DtD allows a tag to implicitely close other tags.
1006: * The list is kept in sgmlStartClose array. This function checks
1007: * if the element or one of it's children would autoclose the
1008: * given tag.
1009: *
1010: * Returns 1 if autoclose, 0 otherwise
1011: */
1012: int
1013: sgmlAutoCloseTag(sgmlDocPtr doc, const xmlChar *name, sgmlNodePtr elem) {
1014: sgmlNodePtr child;
1015:
1016: if (elem == NULL) return(1);
1017: if (!xmlStrcmp(name, elem->name)) return(0);
1018: if (sgmlCheckAutoClose(elem->name, name)) return(1);
1019: child = elem->children;
1020: while (child != NULL) {
1021: if (sgmlAutoCloseTag(doc, name, child)) return(1);
1022: child = child->next;
1023: }
1024: return(0);
1025: }
1026:
1027: /**
1028: * sgmlIsAutoClosed:
1029: * @doc: the SGML document
1030: * @elem: the SGML element
1031: *
1032: * The HTmL DtD allows a tag to implicitely close other tags.
1033: * The list is kept in sgmlStartClose array. This function checks
1034: * if a tag is autoclosed by one of it's child
1035: *
1036: * Returns 1 if autoclosed, 0 otherwise
1037: */
1038: int
1039: sgmlIsAutoClosed(sgmlDocPtr doc, sgmlNodePtr elem) {
1040: sgmlNodePtr child;
1041:
1042: if (elem == NULL) return(1);
1043: child = elem->children;
1044: while (child != NULL) {
1045: if (sgmlAutoCloseTag(doc, elem->name, child)) return(1);
1046: child = child->next;
1047: }
1048: return(0);
1049: }
1050:
1051: /**
1052: * sgmlCheckImplied:
1053: * @ctxt: an SGML parser context
1054: * @newtag: The new tag name
1055: *
1056: * The HTmL DtD allows a tag to exists only implicitely
1057: * called when a new tag has been detected and generates the
1058: * appropriates implicit tags if missing
1059: */
1060: void
1061: sgmlCheckImplied(sgmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1062: #if 0
1063: if (!xmlStrcmp(newtag, BAD_CAST"sgml"))
1064: return;
1065: if (ctxt->nameNr <= 0) {
1066: #ifdef DEBUG
1067: fprintf(stderr,"Implied element sgml: pushed sgml\n");
1068: #endif
1069: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"sgml"));
1070: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1071: ctxt->sax->startElement(ctxt->userData, BAD_CAST"sgml", NULL);
1072: }
1073: if ((!xmlStrcmp(newtag, BAD_CAST"body")) || (!xmlStrcmp(newtag, BAD_CAST"head")))
1074: return;
1075: if (ctxt->nameNr <= 1) {
1076: if ((!xmlStrcmp(newtag, BAD_CAST"script")) ||
1077: (!xmlStrcmp(newtag, BAD_CAST"style")) ||
1078: (!xmlStrcmp(newtag, BAD_CAST"meta")) ||
1079: (!xmlStrcmp(newtag, BAD_CAST"link")) ||
1080: (!xmlStrcmp(newtag, BAD_CAST"title")) ||
1081: (!xmlStrcmp(newtag, BAD_CAST"base"))) {
1082: /*
1083: * dropped OBJECT ... i you put it first BODY will be
1084: * assumed !
1085: */
1086: #ifdef DEBUG
1087: fprintf(stderr,"Implied element head: pushed head\n");
1088: #endif
1089: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"head"));
1090: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1091: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1092: } else {
1093: #ifdef DEBUG
1094: fprintf(stderr,"Implied element body: pushed body\n");
1095: #endif
1096: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"body"));
1097: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1098: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1099: }
1100: }
1101: #endif
1102: }
1103:
1104: /**
1105: * sgmlCheckParagraph
1106: * @ctxt: an SGML parser context
1107: *
1108: * Check whether a p element need to be implied before inserting
1109: * characters in the current element.
1110: *
1111: * Returns 1 if a paragraph has been inserted, 0 if not and -1
1112: * in case of error.
1113: */
1114:
1115: int
1116: sgmlCheckParagraph(sgmlParserCtxtPtr ctxt) {
1117: const xmlChar *tag;
1118: int i;
1119:
1120: if (ctxt == NULL)
1121: return(-1);
1122: tag = ctxt->name;
1123: if (tag == NULL) {
1124: sgmlAutoClose(ctxt, BAD_CAST"p");
1125: sgmlCheckImplied(ctxt, BAD_CAST"p");
1126: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1127: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1128: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1129: return(1);
1130: }
1131: for (i = 0; sgmlNoContentElements[i] != NULL; i++) {
1132: if (!xmlStrcmp(tag, BAD_CAST sgmlNoContentElements[i])) {
1133: #ifdef DEBUG
1134: fprintf(stderr,"Implied element paragraph\n");
1135: #endif
1136: sgmlAutoClose(ctxt, BAD_CAST"p");
1137: sgmlCheckImplied(ctxt, BAD_CAST"p");
1138: sgmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
1139: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1140: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1141: return(1);
1142: }
1143: }
1144: return(0);
1145: }
1146:
1147: /************************************************************************
1148: * *
1149: * The list of SGML predefined entities *
1150: * *
1151: ************************************************************************/
1152:
1153:
1154: sgmlEntityDesc docbookEntitiesTable[] = {
1155: /*
1156: * the 4 absolute ones, plus apostrophe.
1157: */
1158: { 0x0026, "amp", "AMPERSAND" },
1159: { 0x003C, "lt", "LESS-THAN SIGN" },
1160:
1161: /*
1162: * Converted with VI macros from docbook ent files
1163: */
1164: { 0x0021, "excl", "EXCLAMATION MARK" },
1165: { 0x0022, "quot", "QUOTATION MARK" },
1166: { 0x0023, "num", "NUMBER SIGN" },
1167: { 0x0024, "dollar", "DOLLAR SIGN" },
1168: { 0x0025, "percnt", "PERCENT SIGN" },
1169: { 0x0027, "apos", "APOSTROPHE" },
1170: { 0x0028, "lpar", "LEFT PARENTHESIS" },
1171: { 0x0029, "rpar", "RIGHT PARENTHESIS" },
1172: { 0x002A, "ast", "ASTERISK OPERATOR" },
1173: { 0x002B, "plus", "PLUS SIGN" },
1174: { 0x002C, "comma", "COMMA" },
1175: { 0x002D, "hyphen", "HYPHEN-MINUS" },
1176: { 0x002E, "period", "FULL STOP" },
1177: { 0x002F, "sol", "SOLIDUS" },
1178: { 0x003A, "colon", "COLON" },
1179: { 0x003B, "semi", "SEMICOLON" },
1180: { 0x003D, "equals", "EQUALS SIGN" },
1181: { 0x003E, "gt", "GREATER-THAN SIGN" },
1182: { 0x003F, "quest", "QUESTION MARK" },
1183: { 0x0040, "commat", "COMMERCIAL AT" },
1184: { 0x005B, "lsqb", "LEFT SQUARE BRACKET" },
1185: { 0x005C, "bsol", "REVERSE SOLIDUS" },
1186: { 0x005D, "rsqb", "RIGHT SQUARE BRACKET" },
1187: { 0x005E, "circ", "RING OPERATOR" },
1188: { 0x005F, "lowbar", "LOW LINE" },
1189: { 0x0060, "grave", "GRAVE ACCENT" },
1190: { 0x007B, "lcub", "LEFT CURLY BRACKET" },
1191: { 0x007C, "verbar", "VERTICAL LINE" },
1192: { 0x007D, "rcub", "RIGHT CURLY BRACKET" },
1193: { 0x00A0, "nbsp", "NO-BREAK SPACE" },
1194: { 0x00A1, "iexcl", "INVERTED EXCLAMATION MARK" },
1195: { 0x00A2, "cent", "CENT SIGN" },
1196: { 0x00A3, "pound", "POUND SIGN" },
1197: { 0x00A4, "curren", "CURRENCY SIGN" },
1198: { 0x00A5, "yen", "YEN SIGN" },
1199: { 0x00A6, "brvbar", "BROKEN BAR" },
1200: { 0x00A7, "sect", "SECTION SIGN" },
1201: { 0x00A8, "die", "" },
1202: { 0x00A8, "Dot", "" },
1203: { 0x00A8, "uml", "" },
1204: { 0x00A9, "copy", "COPYRIGHT SIGN" },
1205: { 0x00AA, "ordf", "FEMININE ORDINAL INDICATOR" },
1206: { 0x00AB, "laquo", "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" },
1207: { 0x00AC, "not", "NOT SIGN" },
1208: { 0x00AD, "shy", "SOFT HYPHEN" },
1209: { 0x00AE, "reg", "REG TRADE MARK SIGN" },
1210: { 0x00AF, "macr", "MACRON" },
1211: { 0x00B0, "deg", "DEGREE SIGN" },
1212: { 0x00B1, "plusmn", "PLUS-MINUS SIGN" },
1213: { 0x00B2, "sup2", "SUPERSCRIPT TWO" },
1214: { 0x00B3, "sup3", "SUPERSCRIPT THREE" },
1215: { 0x00B4, "acute", "ACUTE ACCENT" },
1216: { 0x00B5, "micro", "MICRO SIGN" },
1217: { 0x00B6, "para", "PILCROW SIGN" },
1218: { 0x00B7, "middot", "MIDDLE DOT" },
1219: { 0x00B8, "cedil", "CEDILLA" },
1220: { 0x00B9, "sup1", "SUPERSCRIPT ONE" },
1221: { 0x00BA, "ordm", "MASCULINE ORDINAL INDICATOR" },
1222: { 0x00BB, "raquo", "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" },
1223: { 0x00BC, "frac14", "VULGAR FRACTION ONE QUARTER" },
1224: { 0x00BD, "frac12", "VULGAR FRACTION ONE HALF" },
1225: { 0x00BD, "half", "VULGAR FRACTION ONE HALF" },
1226: { 0x00BE, "frac34", "VULGAR FRACTION THREE QUARTERS" },
1227: { 0x00BF, "iquest", "INVERTED QUESTION MARK" },
1228: { 0x00C0, "Agrave", "LATIN CAPITAL LETTER A WITH GRAVE" },
1229: { 0x00C1, "Aacute", "LATIN CAPITAL LETTER A WITH ACUTE" },
1230: { 0x00C2, "Acirc", "LATIN CAPITAL LETTER A WITH CIRCUMFLEX" },
1231: { 0x00C3, "Atilde", "LATIN CAPITAL LETTER A WITH TILDE" },
1232: { 0x00C4, "Auml", "LATIN CAPITAL LETTER A WITH DIAERESIS" },
1233: { 0x00C5, "Aring", "LATIN CAPITAL LETTER A WITH RING ABOVE" },
1234: { 0x00C6, "AElig", "LATIN CAPITAL LETTER AE" },
1235: { 0x00C7, "Ccedil", "LATIN CAPITAL LETTER C WITH CEDILLA" },
1236: { 0x00C8, "Egrave", "LATIN CAPITAL LETTER E WITH GRAVE" },
1237: { 0x00C9, "Eacute", "LATIN CAPITAL LETTER E WITH ACUTE" },
1238: { 0x00CA, "Ecirc", "LATIN CAPITAL LETTER E WITH CIRCUMFLEX" },
1239: { 0x00CB, "Euml", "LATIN CAPITAL LETTER E WITH DIAERESIS" },
1240: { 0x00CC, "Igrave", "LATIN CAPITAL LETTER I WITH GRAVE" },
1241: { 0x00CD, "Iacute", "LATIN CAPITAL LETTER I WITH ACUTE" },
1242: { 0x00CE, "Icirc", "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" },
1243: { 0x00CF, "Iuml", "LATIN CAPITAL LETTER I WITH DIAERESIS" },
1244: { 0x00D0, "ETH", "LATIN CAPITAL LETTER ETH" },
1245: { 0x00D1, "Ntilde", "LATIN CAPITAL LETTER N WITH TILDE" },
1246: { 0x00D2, "Ograve", "LATIN CAPITAL LETTER O WITH GRAVE" },
1247: { 0x00D3, "Oacute", "LATIN CAPITAL LETTER O WITH ACUTE" },
1248: { 0x00D4, "Ocirc", "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" },
1249: { 0x00D5, "Otilde", "LATIN CAPITAL LETTER O WITH TILDE" },
1250: { 0x00D6, "Ouml", "LATIN CAPITAL LETTER O WITH DIAERESIS" },
1251: { 0x00D7, "times", "MULTIPLICATION SIGN" },
1252: { 0x00D8, "Oslash", "LATIN CAPITAL LETTER O WITH STROKE" },
1253: { 0x00D9, "Ugrave", "LATIN CAPITAL LETTER U WITH GRAVE" },
1254: { 0x00DA, "Uacute", "LATIN CAPITAL LETTER U WITH ACUTE" },
1255: { 0x00DB, "Ucirc", "LATIN CAPITAL LETTER U WITH CIRCUMFLEX" },
1256: { 0x00DC, "Uuml", "LATIN CAPITAL LETTER U WITH DIAERESIS" },
1257: { 0x00DD, "Yacute", "LATIN CAPITAL LETTER Y WITH ACUTE" },
1258: { 0x00DE, "THORN", "LATIN CAPITAL LETTER THORN" },
1259: { 0x00DF, "szlig", "LATIN SMALL LETTER SHARP S" },
1260: { 0x00E0, "agrave", "LATIN SMALL LETTER A WITH GRAVE" },
1261: { 0x00E1, "aacute", "LATIN SMALL LETTER A WITH ACUTE" },
1262: { 0x00E2, "acirc", "LATIN SMALL LETTER A WITH CIRCUMFLEX" },
1263: { 0x00E3, "atilde", "LATIN SMALL LETTER A WITH TILDE" },
1264: { 0x00E4, "auml", "LATIN SMALL LETTER A WITH DIAERESIS" },
1265: { 0x00E5, "aring", "LATIN SMALL LETTER A WITH RING ABOVE" },
1266: { 0x00E6, "aelig", "LATIN SMALL LETTER AE" },
1267: { 0x00E7, "ccedil", "LATIN SMALL LETTER C WITH CEDILLA" },
1268: { 0x00E8, "egrave", "LATIN SMALL LETTER E WITH GRAVE" },
1269: { 0x00E9, "eacute", "LATIN SMALL LETTER E WITH ACUTE" },
1270: { 0x00EA, "ecirc", "LATIN SMALL LETTER E WITH CIRCUMFLEX" },
1271: { 0x00EB, "euml", "LATIN SMALL LETTER E WITH DIAERESIS" },
1272: { 0x00EC, "igrave", "LATIN SMALL LETTER I WITH GRAVE" },
1273: { 0x00ED, "iacute", "LATIN SMALL LETTER I WITH ACUTE" },
1274: { 0x00EE, "icirc", "LATIN SMALL LETTER I WITH CIRCUMFLEX" },
1275: { 0x00EF, "iuml", "LATIN SMALL LETTER I WITH DIAERESIS" },
1276: { 0x00F0, "eth", "LATIN SMALL LETTER ETH" },
1277: { 0x00F1, "ntilde", "LATIN SMALL LETTER N WITH TILDE" },
1278: { 0x00F2, "ograve", "LATIN SMALL LETTER O WITH GRAVE" },
1279: { 0x00F3, "oacute", "LATIN SMALL LETTER O WITH ACUTE" },
1280: { 0x00F4, "ocirc", "LATIN SMALL LETTER O WITH CIRCUMFLEX" },
1281: { 0x00F5, "otilde", "LATIN SMALL LETTER O WITH TILDE" },
1282: { 0x00F6, "ouml", "LATIN SMALL LETTER O WITH DIAERESIS" },
1283: { 0x00F7, "divide", "DIVISION SIGN" },
1284: { 0x00F8, "oslash", "CIRCLED DIVISION SLASH" },
1285: { 0x00F9, "ugrave", "LATIN SMALL LETTER U WITH GRAVE" },
1286: { 0x00FA, "uacute", "LATIN SMALL LETTER U WITH ACUTE" },
1287: { 0x00FB, "ucirc", "LATIN SMALL LETTER U WITH CIRCUMFLEX" },
1288: { 0x00FC, "uuml", "LATIN SMALL LETTER U WITH DIAERESIS" },
1289: { 0x00FD, "yacute", "LATIN SMALL LETTER Y WITH ACUTE" },
1290: { 0x00FE, "thorn", "LATIN SMALL LETTER THORN" },
1291: { 0x00FF, "yuml", "LATIN SMALL LETTER Y WITH DIAERESIS" },
1292: { 0x0100, "Amacr", "LATIN CAPITAL LETTER A WITH MACRON" },
1293: { 0x0101, "amacr", "LATIN SMALL LETTER A WITH MACRON" },
1294: { 0x0102, "Abreve", "LATIN CAPITAL LETTER A WITH BREVE" },
1295: { 0x0103, "abreve", "LATIN SMALL LETTER A WITH BREVE" },
1296: { 0x0104, "Aogon", "LATIN CAPITAL LETTER A WITH OGONEK" },
1297: { 0x0105, "aogon", "LATIN SMALL LETTER A WITH OGONEK" },
1298: { 0x0106, "Cacute", "LATIN CAPITAL LETTER C WITH ACUTE" },
1299: { 0x0107, "cacute", "LATIN SMALL LETTER C WITH ACUTE" },
1300: { 0x0108, "Ccirc", "LATIN CAPITAL LETTER C WITH CIRCUMFLEX" },
1301: { 0x0109, "ccirc", "LATIN SMALL LETTER C WITH CIRCUMFLEX" },
1302: { 0x010A, "Cdot", "LATIN CAPITAL LETTER C WITH DOT ABOVE" },
1303: { 0x010B, "cdot", "DOT OPERATOR" },
1304: { 0x010C, "Ccaron", "LATIN CAPITAL LETTER C WITH CARON" },
1305: { 0x010D, "ccaron", "LATIN SMALL LETTER C WITH CARON" },
1306: { 0x010E, "Dcaron", "LATIN CAPITAL LETTER D WITH CARON" },
1307: { 0x010F, "dcaron", "LATIN SMALL LETTER D WITH CARON" },
1308: { 0x0110, "Dstrok", "LATIN CAPITAL LETTER D WITH STROKE" },
1309: { 0x0111, "dstrok", "LATIN SMALL LETTER D WITH STROKE" },
1310: { 0x0112, "Emacr", "LATIN CAPITAL LETTER E WITH MACRON" },
1311: { 0x0113, "emacr", "LATIN SMALL LETTER E WITH MACRON" },
1312: { 0x0116, "Edot", "LATIN CAPITAL LETTER E WITH DOT ABOVE" },
1313: { 0x0117, "edot", "LATIN SMALL LETTER E WITH DOT ABOVE" },
1314: { 0x0118, "Eogon", "LATIN CAPITAL LETTER E WITH OGONEK" },
1315: { 0x0119, "eogon", "LATIN SMALL LETTER E WITH OGONEK" },
1316: { 0x011A, "Ecaron", "LATIN CAPITAL LETTER E WITH CARON" },
1317: { 0x011B, "ecaron", "LATIN SMALL LETTER E WITH CARON" },
1318: { 0x011C, "Gcirc", "LATIN CAPITAL LETTER G WITH CIRCUMFLEX" },
1319: { 0x011D, "gcirc", "LATIN SMALL LETTER G WITH CIRCUMFLEX" },
1320: { 0x011E, "Gbreve", "LATIN CAPITAL LETTER G WITH BREVE" },
1321: { 0x011F, "gbreve", "LATIN SMALL LETTER G WITH BREVE" },
1322: { 0x0120, "Gdot", "LATIN CAPITAL LETTER G WITH DOT ABOVE" },
1323: { 0x0121, "gdot", "LATIN SMALL LETTER G WITH DOT ABOVE" },
1324: { 0x0122, "Gcedil", "LATIN CAPITAL LETTER G WITH CEDILLA" },
1325: { 0x0124, "Hcirc", "LATIN CAPITAL LETTER H WITH CIRCUMFLEX" },
1326: { 0x0125, "hcirc", "LATIN SMALL LETTER H WITH CIRCUMFLEX" },
1327: { 0x0126, "Hstrok", "LATIN CAPITAL LETTER H WITH STROKE" },
1328: { 0x0127, "hstrok", "LATIN SMALL LETTER H WITH STROKE" },
1329: { 0x0128, "Itilde", "LATIN CAPITAL LETTER I WITH TILDE" },
1330: { 0x0129, "itilde", "LATIN SMALL LETTER I WITH TILDE" },
1331: { 0x012A, "Imacr", "LATIN CAPITAL LETTER I WITH MACRON" },
1332: { 0x012B, "imacr", "LATIN SMALL LETTER I WITH MACRON" },
1333: { 0x012E, "Iogon", "LATIN CAPITAL LETTER I WITH OGONEK" },
1334: { 0x012F, "iogon", "LATIN SMALL LETTER I WITH OGONEK" },
1335: { 0x0130, "Idot", "LATIN CAPITAL LETTER I WITH DOT ABOVE" },
1336: { 0x0131, "inodot", "LATIN SMALL LETTER DOTLESS I" },
1337: { 0x0131, "inodot", "LATIN SMALL LETTER DOTLESS I" },
1338: { 0x0132, "IJlig", "LATIN CAPITAL LIGATURE IJ" },
1339: { 0x0133, "ijlig", "LATIN SMALL LIGATURE IJ" },
1340: { 0x0134, "Jcirc", "LATIN CAPITAL LETTER J WITH CIRCUMFLEX" },
1341: { 0x0135, "jcirc", "LATIN SMALL LETTER J WITH CIRCUMFLEX" },
1342: { 0x0136, "Kcedil", "LATIN CAPITAL LETTER K WITH CEDILLA" },
1343: { 0x0137, "kcedil", "LATIN SMALL LETTER K WITH CEDILLA" },
1344: { 0x0138, "kgreen", "LATIN SMALL LETTER KRA" },
1345: { 0x0139, "Lacute", "LATIN CAPITAL LETTER L WITH ACUTE" },
1346: { 0x013A, "lacute", "LATIN SMALL LETTER L WITH ACUTE" },
1347: { 0x013B, "Lcedil", "LATIN CAPITAL LETTER L WITH CEDILLA" },
1348: { 0x013C, "lcedil", "LATIN SMALL LETTER L WITH CEDILLA" },
1349: { 0x013D, "Lcaron", "LATIN CAPITAL LETTER L WITH CARON" },
1350: { 0x013E, "lcaron", "LATIN SMALL LETTER L WITH CARON" },
1351: { 0x013F, "Lmidot", "LATIN CAPITAL LETTER L WITH MIDDLE DOT" },
1352: { 0x0140, "lmidot", "LATIN SMALL LETTER L WITH MIDDLE DOT" },
1353: { 0x0141, "Lstrok", "LATIN CAPITAL LETTER L WITH STROKE" },
1354: { 0x0142, "lstrok", "LATIN SMALL LETTER L WITH STROKE" },
1355: { 0x0143, "Nacute", "LATIN CAPITAL LETTER N WITH ACUTE" },
1356: { 0x0144, "nacute", "LATIN SMALL LETTER N WITH ACUTE" },
1357: { 0x0145, "Ncedil", "LATIN CAPITAL LETTER N WITH CEDILLA" },
1358: { 0x0146, "ncedil", "LATIN SMALL LETTER N WITH CEDILLA" },
1359: { 0x0147, "Ncaron", "LATIN CAPITAL LETTER N WITH CARON" },
1360: { 0x0148, "ncaron", "LATIN SMALL LETTER N WITH CARON" },
1361: { 0x0149, "napos", "LATIN SMALL LETTER N PRECEDED BY APOSTROPHE" },
1362: { 0x014A, "ENG", "LATIN CAPITAL LETTER ENG" },
1363: { 0x014B, "eng", "LATIN SMALL LETTER ENG" },
1364: { 0x014C, "Omacr", "LATIN CAPITAL LETTER O WITH MACRON" },
1365: { 0x014D, "omacr", "LATIN SMALL LETTER O WITH MACRON" },
1366: { 0x0150, "Odblac", "LATIN CAPITAL LETTER O WITH DOUBLE ACUTE" },
1367: { 0x0151, "odblac", "LATIN SMALL LETTER O WITH DOUBLE ACUTE" },
1368: { 0x0152, "OElig", "LATIN CAPITAL LIGATURE OE" },
1369: { 0x0153, "oelig", "LATIN SMALL LIGATURE OE" },
1370: { 0x0154, "Racute", "LATIN CAPITAL LETTER R WITH ACUTE" },
1371: { 0x0155, "racute", "LATIN SMALL LETTER R WITH ACUTE" },
1372: { 0x0156, "Rcedil", "LATIN CAPITAL LETTER R WITH CEDILLA" },
1373: { 0x0157, "rcedil", "LATIN SMALL LETTER R WITH CEDILLA" },
1374: { 0x0158, "Rcaron", "LATIN CAPITAL LETTER R WITH CARON" },
1375: { 0x0159, "rcaron", "LATIN SMALL LETTER R WITH CARON" },
1376: { 0x015A, "Sacute", "LATIN CAPITAL LETTER S WITH ACUTE" },
1377: { 0x015B, "sacute", "LATIN SMALL LETTER S WITH ACUTE" },
1378: { 0x015C, "Scirc", "LATIN CAPITAL LETTER S WITH CIRCUMFLEX" },
1379: { 0x015D, "scirc", "LATIN SMALL LETTER S WITH CIRCUMFLEX" },
1380: { 0x015E, "Scedil", "LATIN CAPITAL LETTER S WITH CEDILLA" },
1381: { 0x015F, "scedil", "LATIN SMALL LETTER S WITH CEDILLA" },
1382: { 0x0160, "Scaron", "LATIN CAPITAL LETTER S WITH CARON" },
1383: { 0x0161, "scaron", "LATIN SMALL LETTER S WITH CARON" },
1384: { 0x0162, "Tcedil", "LATIN CAPITAL LETTER T WITH CEDILLA" },
1385: { 0x0163, "tcedil", "LATIN SMALL LETTER T WITH CEDILLA" },
1386: { 0x0164, "Tcaron", "LATIN CAPITAL LETTER T WITH CARON" },
1387: { 0x0165, "tcaron", "LATIN SMALL LETTER T WITH CARON" },
1388: { 0x0166, "Tstrok", "LATIN CAPITAL LETTER T WITH STROKE" },
1389: { 0x0167, "tstrok", "LATIN SMALL LETTER T WITH STROKE" },
1390: { 0x0168, "Utilde", "LATIN CAPITAL LETTER U WITH TILDE" },
1391: { 0x0169, "utilde", "LATIN SMALL LETTER U WITH TILDE" },
1392: { 0x016A, "Umacr", "LATIN CAPITAL LETTER U WITH MACRON" },
1393: { 0x016B, "umacr", "LATIN SMALL LETTER U WITH MACRON" },
1394: { 0x016C, "Ubreve", "LATIN CAPITAL LETTER U WITH BREVE" },
1395: { 0x016D, "ubreve", "LATIN SMALL LETTER U WITH BREVE" },
1396: { 0x016E, "Uring", "LATIN CAPITAL LETTER U WITH RING ABOVE" },
1397: { 0x016F, "uring", "LATIN SMALL LETTER U WITH RING ABOVE" },
1398: { 0x0170, "Udblac", "LATIN CAPITAL LETTER U WITH DOUBLE ACUTE" },
1399: { 0x0171, "udblac", "LATIN SMALL LETTER U WITH DOUBLE ACUTE" },
1400: { 0x0172, "Uogon", "LATIN CAPITAL LETTER U WITH OGONEK" },
1401: { 0x0173, "uogon", "LATIN SMALL LETTER U WITH OGONEK" },
1402: { 0x0174, "Wcirc", "LATIN CAPITAL LETTER W WITH CIRCUMFLEX" },
1403: { 0x0175, "wcirc", "LATIN SMALL LETTER W WITH CIRCUMFLEX" },
1404: { 0x0176, "Ycirc", "LATIN CAPITAL LETTER Y WITH CIRCUMFLEX" },
1405: { 0x0177, "ycirc", "LATIN SMALL LETTER Y WITH CIRCUMFLEX" },
1406: { 0x0178, "Yuml", "LATIN CAPITAL LETTER Y WITH DIAERESIS" },
1407: { 0x0179, "Zacute", "LATIN CAPITAL LETTER Z WITH ACUTE" },
1408: { 0x017A, "zacute", "LATIN SMALL LETTER Z WITH ACUTE" },
1409: { 0x017B, "Zdot", "LATIN CAPITAL LETTER Z WITH DOT ABOVE" },
1410: { 0x017C, "zdot", "LATIN SMALL LETTER Z WITH DOT ABOVE" },
1411: { 0x017D, "Zcaron", "LATIN CAPITAL LETTER Z WITH CARON" },
1412: { 0x017E, "zcaron", "LATIN SMALL LETTER Z WITH CARON" },
1413: { 0x0192, "fnof", "LATIN SMALL LETTER F WITH HOOK" },
1414: { 0x01F5, "gacute", "LATIN SMALL LETTER G WITH ACUTE" },
1415: { 0x02C7, "caron", "CARON" },
1416: { 0x02D8, "breve", "BREVE" },
1417: { 0x02D9, "dot", "DOT ABOVE" },
1418: { 0x02DA, "ring", "RING ABOVE" },
1419: { 0x02DB, "ogon", "OGONEK" },
1420: { 0x02DC, "tilde", "TILDE" },
1421: { 0x02DD, "dblac", "DOUBLE ACUTE ACCENT" },
1422: { 0x0386, "Aacgr", "GREEK CAPITAL LETTER ALPHA WITH TONOS" },
1423: { 0x0388, "Eacgr", "GREEK CAPITAL LETTER EPSILON WITH TONOS" },
1424: { 0x0389, "EEacgr", "GREEK CAPITAL LETTER ETA WITH TONOS" },
1425: { 0x038A, "Iacgr", "GREEK CAPITAL LETTER IOTA WITH TONOS" },
1426: { 0x038C, "Oacgr", "GREEK CAPITAL LETTER OMICRON WITH TONOS" },
1427: { 0x038E, "Uacgr", "GREEK CAPITAL LETTER UPSILON WITH TONOS" },
1428: { 0x038F, "OHacgr", "GREEK CAPITAL LETTER OMEGA WITH TONOS" },
1429: { 0x0390, "idiagr", "GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS" },
1430: { 0x0391, "Agr", "GREEK CAPITAL LETTER ALPHA" },
1431: { 0x0392, "Bgr", "GREEK CAPITAL LETTER BETA" },
1432: { 0x0393, "b.Gamma", "GREEK CAPITAL LETTER GAMMA" },
1433: { 0x0393, "Gamma", "GREEK CAPITAL LETTER GAMMA" },
1434: { 0x0393, "Ggr", "GREEK CAPITAL LETTER GAMMA" },
1435: { 0x0394, "b.Delta", "GREEK CAPITAL LETTER DELTA" },
1436: { 0x0394, "Delta", "GREEK CAPITAL LETTER DELTA" },
1437: { 0x0394, "Dgr", "GREEK CAPITAL LETTER DELTA" },
1438: { 0x0395, "Egr", "GREEK CAPITAL LETTER EPSILON" },
1439: { 0x0396, "Zgr", "GREEK CAPITAL LETTER ZETA" },
1440: { 0x0397, "EEgr", "GREEK CAPITAL LETTER ETA" },
1441: { 0x0398, "b.Theta", "GREEK CAPITAL LETTER THETA" },
1442: { 0x0398, "Theta", "GREEK CAPITAL LETTER THETA" },
1443: { 0x0398, "THgr", "GREEK CAPITAL LETTER THETA" },
1444: { 0x0399, "Igr", "GREEK CAPITAL LETTER IOTA" },
1445: { 0x039A, "Kgr", "GREEK CAPITAL LETTER KAPPA" },
1446: { 0x039B, "b.Lambda", "GREEK CAPITAL LETTER LAMDA" },
1447: { 0x039B, "Lambda", "GREEK CAPITAL LETTER LAMDA" },
1448: { 0x039B, "Lgr", "GREEK CAPITAL LETTER LAMDA" },
1449: { 0x039C, "Mgr", "GREEK CAPITAL LETTER MU" },
1450: { 0x039D, "Ngr", "GREEK CAPITAL LETTER NU" },
1451: { 0x039E, "b.Xi", "GREEK CAPITAL LETTER XI" },
1452: { 0x039E, "Xgr", "GREEK CAPITAL LETTER XI" },
1453: { 0x039E, "Xi", "GREEK CAPITAL LETTER XI" },
1454: { 0x039F, "Ogr", "GREEK CAPITAL LETTER OMICRON" },
1455: { 0x03A0, "b.Pi", "GREEK CAPITAL LETTER PI" },
1456: { 0x03A0, "Pgr", "GREEK CAPITAL LETTER PI" },
1457: { 0x03A0, "Pi", "GREEK CAPITAL LETTER PI" },
1458: { 0x03A1, "Rgr", "GREEK CAPITAL LETTER RHO" },
1459: { 0x03A3, "b.Sigma", "GREEK CAPITAL LETTER SIGMA" },
1460: { 0x03A3, "Sgr", "GREEK CAPITAL LETTER SIGMA" },
1461: { 0x03A3, "Sigma", "GREEK CAPITAL LETTER SIGMA" },
1462: { 0x03A4, "Tgr", "GREEK CAPITAL LETTER TAU" },
1463: { 0x03A5, "Ugr", "" },
1464: { 0x03A6, "b.Phi", "GREEK CAPITAL LETTER PHI" },
1465: { 0x03A6, "PHgr", "GREEK CAPITAL LETTER PHI" },
1466: { 0x03A6, "Phi", "GREEK CAPITAL LETTER PHI" },
1467: { 0x03A7, "KHgr", "GREEK CAPITAL LETTER CHI" },
1468: { 0x03A8, "b.Psi", "GREEK CAPITAL LETTER PSI" },
1469: { 0x03A8, "PSgr", "GREEK CAPITAL LETTER PSI" },
1470: { 0x03A8, "Psi", "GREEK CAPITAL LETTER PSI" },
1471: { 0x03A9, "b.Omega", "GREEK CAPITAL LETTER OMEGA" },
1472: { 0x03A9, "OHgr", "GREEK CAPITAL LETTER OMEGA" },
1473: { 0x03A9, "Omega", "GREEK CAPITAL LETTER OMEGA" },
1474: { 0x03AA, "Idigr", "GREEK CAPITAL LETTER IOTA WITH DIALYTIKA" },
1475: { 0x03AB, "Udigr", "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" },
1476: { 0x03AC, "aacgr", "GREEK SMALL LETTER ALPHA WITH TONOS" },
1477: { 0x03AD, "eacgr", "GREEK SMALL LETTER EPSILON WITH TONOS" },
1478: { 0x03AE, "eeacgr", "GREEK SMALL LETTER ETA WITH TONOS" },
1479: { 0x03AF, "iacgr", "GREEK SMALL LETTER IOTA WITH TONOS" },
1480: { 0x03B0, "udiagr", "GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS" },
1481: { 0x03B1, "agr", "" },
1482: { 0x03B1, "alpha", "" },
1483: { 0x03B1, "b.alpha", "" },
1484: { 0x03B2, "b.beta", "GREEK SMALL LETTER BETA" },
1485: { 0x03B2, "beta", "GREEK SMALL LETTER BETA" },
1486: { 0x03B2, "bgr", "GREEK SMALL LETTER BETA" },
1487: { 0x03B3, "b.gamma", "GREEK SMALL LETTER GAMMA" },
1488: { 0x03B3, "gamma", "GREEK SMALL LETTER GAMMA" },
1489: { 0x03B3, "ggr", "GREEK SMALL LETTER GAMMA" },
1490: { 0x03B4, "b.delta", "GREEK SMALL LETTER DELTA" },
1491: { 0x03B4, "delta", "GREEK SMALL LETTER DELTA" },
1492: { 0x03B4, "dgr", "GREEK SMALL LETTER DELTA" },
1493: { 0x03B5, "b.epsi", "" },
1494: { 0x03B5, "b.epsis", "" },
1495: { 0x03B5, "b.epsiv", "" },
1496: { 0x03B5, "egr", "" },
1497: { 0x03B5, "epsiv", "" },
1498: { 0x03B6, "b.zeta", "GREEK SMALL LETTER ZETA" },
1499: { 0x03B6, "zeta", "GREEK SMALL LETTER ZETA" },
1500: { 0x03B6, "zgr", "GREEK SMALL LETTER ZETA" },
1501: { 0x03B7, "b.eta", "GREEK SMALL LETTER ETA" },
1502: { 0x03B7, "eegr", "GREEK SMALL LETTER ETA" },
1503: { 0x03B7, "eta", "GREEK SMALL LETTER ETA" },
1504: { 0x03B8, "b.thetas", "" },
1505: { 0x03B8, "thetas", "" },
1506: { 0x03B8, "thgr", "" },
1507: { 0x03B9, "b.iota", "GREEK SMALL LETTER IOTA" },
1508: { 0x03B9, "igr", "GREEK SMALL LETTER IOTA" },
1509: { 0x03B9, "iota", "GREEK SMALL LETTER IOTA" },
1510: { 0x03BA, "b.kappa", "GREEK SMALL LETTER KAPPA" },
1511: { 0x03BA, "kappa", "GREEK SMALL LETTER KAPPA" },
1512: { 0x03BA, "kgr", "GREEK SMALL LETTER KAPPA" },
1513: { 0x03BB, "b.lambda", "GREEK SMALL LETTER LAMDA" },
1514: { 0x03BB, "lambda", "GREEK SMALL LETTER LAMDA" },
1515: { 0x03BB, "lgr", "GREEK SMALL LETTER LAMDA" },
1516: { 0x03BC, "b.mu", "GREEK SMALL LETTER MU" },
1517: { 0x03BC, "mgr", "GREEK SMALL LETTER MU" },
1518: { 0x03BC, "mu", "GREEK SMALL LETTER MU" },
1519: { 0x03BD, "b.nu", "GREEK SMALL LETTER NU" },
1520: { 0x03BD, "ngr", "GREEK SMALL LETTER NU" },
1521: { 0x03BD, "nu", "GREEK SMALL LETTER NU" },
1522: { 0x03BE, "b.xi", "GREEK SMALL LETTER XI" },
1523: { 0x03BE, "xgr", "GREEK SMALL LETTER XI" },
1524: { 0x03BE, "xi", "GREEK SMALL LETTER XI" },
1525: { 0x03BF, "ogr", "GREEK SMALL LETTER OMICRON" },
1526: { 0x03C0, "b.pi", "GREEK SMALL LETTER PI" },
1527: { 0x03C0, "pgr", "GREEK SMALL LETTER PI" },
1528: { 0x03C0, "pi", "GREEK SMALL LETTER PI" },
1529: { 0x03C1, "b.rho", "GREEK SMALL LETTER RHO" },
1530: { 0x03C1, "rgr", "GREEK SMALL LETTER RHO" },
1531: { 0x03C1, "rho", "GREEK SMALL LETTER RHO" },
1532: { 0x03C2, "b.sigmav", "" },
1533: { 0x03C2, "sfgr", "" },
1534: { 0x03C2, "sigmav", "" },
1535: { 0x03C3, "b.sigma", "GREEK SMALL LETTER SIGMA" },
1536: { 0x03C3, "sgr", "GREEK SMALL LETTER SIGMA" },
1537: { 0x03C3, "sigma", "GREEK SMALL LETTER SIGMA" },
1538: { 0x03C4, "b.tau", "GREEK SMALL LETTER TAU" },
1539: { 0x03C4, "tau", "GREEK SMALL LETTER TAU" },
1540: { 0x03C4, "tgr", "GREEK SMALL LETTER TAU" },
1541: { 0x03C5, "b.upsi", "GREEK SMALL LETTER UPSILON" },
1542: { 0x03C5, "ugr", "GREEK SMALL LETTER UPSILON" },
1543: { 0x03C5, "upsi", "GREEK SMALL LETTER UPSILON" },
1544: { 0x03C6, "b.phis", "GREEK SMALL LETTER PHI" },
1545: { 0x03C6, "phgr", "GREEK SMALL LETTER PHI" },
1546: { 0x03C6, "phis", "GREEK SMALL LETTER PHI" },
1547: { 0x03C7, "b.chi", "GREEK SMALL LETTER CHI" },
1548: { 0x03C7, "chi", "GREEK SMALL LETTER CHI" },
1549: { 0x03C7, "khgr", "GREEK SMALL LETTER CHI" },
1550: { 0x03C8, "b.psi", "GREEK SMALL LETTER PSI" },
1551: { 0x03C8, "psgr", "GREEK SMALL LETTER PSI" },
1552: { 0x03C8, "psi", "GREEK SMALL LETTER PSI" },
1553: { 0x03C9, "b.omega", "GREEK SMALL LETTER OMEGA" },
1554: { 0x03C9, "ohgr", "GREEK SMALL LETTER OMEGA" },
1555: { 0x03C9, "omega", "GREEK SMALL LETTER OMEGA" },
1556: { 0x03CA, "idigr", "GREEK SMALL LETTER IOTA WITH DIALYTIKA" },
1557: { 0x03CB, "udigr", "GREEK SMALL LETTER UPSILON WITH DIALYTIKA" },
1558: { 0x03CC, "oacgr", "GREEK SMALL LETTER OMICRON WITH TONOS" },
1559: { 0x03CD, "uacgr", "GREEK SMALL LETTER UPSILON WITH TONOS" },
1560: { 0x03CE, "ohacgr", "GREEK SMALL LETTER OMEGA WITH TONOS" },
1561: { 0x03D1, "b.thetav", "" },
1562: { 0x03D1, "thetav", "" },
1563: { 0x03D2, "b.Upsi", "" },
1564: { 0x03D2, "Upsi", "" },
1565: { 0x03D5, "b.phiv", "GREEK PHI SYMBOL" },
1566: { 0x03D5, "phiv", "GREEK PHI SYMBOL" },
1567: { 0x03D6, "b.piv", "GREEK PI SYMBOL" },
1568: { 0x03D6, "piv", "GREEK PI SYMBOL" },
1569: { 0x03DC, "b.gammad", "GREEK LETTER DIGAMMA" },
1570: { 0x03DC, "gammad", "GREEK LETTER DIGAMMA" },
1571: { 0x03F0, "b.kappav", "GREEK KAPPA SYMBOL" },
1572: { 0x03F0, "kappav", "GREEK KAPPA SYMBOL" },
1573: { 0x03F1, "b.rhov", "GREEK RHO SYMBOL" },
1574: { 0x03F1, "rhov", "GREEK RHO SYMBOL" },
1575: { 0x0401, "IOcy", "CYRILLIC CAPITAL LETTER IO" },
1576: { 0x0402, "DJcy", "CYRILLIC CAPITAL LETTER DJE" },
1577: { 0x0403, "GJcy", "CYRILLIC CAPITAL LETTER GJE" },
1578: { 0x0404, "Jukcy", "CYRILLIC CAPITAL LETTER UKRAINIAN IE" },
1579: { 0x0405, "DScy", "CYRILLIC CAPITAL LETTER DZE" },
1580: { 0x0406, "Iukcy", "CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I" },
1581: { 0x0407, "YIcy", "CYRILLIC CAPITAL LETTER YI" },
1582: { 0x0408, "Jsercy", "CYRILLIC CAPITAL LETTER JE" },
1583: { 0x0409, "LJcy", "CYRILLIC CAPITAL LETTER LJE" },
1584: { 0x040A, "NJcy", "CYRILLIC CAPITAL LETTER NJE" },
1585: { 0x040B, "TSHcy", "CYRILLIC CAPITAL LETTER TSHE" },
1586: { 0x040C, "KJcy", "CYRILLIC CAPITAL LETTER KJE" },
1587: { 0x040E, "Ubrcy", "CYRILLIC CAPITAL LETTER SHORT U" },
1588: { 0x040F, "DZcy", "CYRILLIC CAPITAL LETTER DZHE" },
1589: { 0x0410, "Acy", "CYRILLIC CAPITAL LETTER A" },
1590: { 0x0411, "Bcy", "CYRILLIC CAPITAL LETTER BE" },
1591: { 0x0412, "Vcy", "CYRILLIC CAPITAL LETTER VE" },
1592: { 0x0413, "Gcy", "CYRILLIC CAPITAL LETTER GHE" },
1593: { 0x0414, "Dcy", "CYRILLIC CAPITAL LETTER DE" },
1594: { 0x0415, "IEcy", "CYRILLIC CAPITAL LETTER IE" },
1595: { 0x0416, "ZHcy", "CYRILLIC CAPITAL LETTER ZHE" },
1596: { 0x0417, "Zcy", "CYRILLIC CAPITAL LETTER ZE" },
1597: { 0x0418, "Icy", "CYRILLIC CAPITAL LETTER I" },
1598: { 0x0419, "Jcy", "CYRILLIC CAPITAL LETTER SHORT I" },
1599: { 0x041A, "Kcy", "CYRILLIC CAPITAL LETTER KA" },
1600: { 0x041B, "Lcy", "CYRILLIC CAPITAL LETTER EL" },
1601: { 0x041C, "Mcy", "CYRILLIC CAPITAL LETTER EM" },
1602: { 0x041D, "Ncy", "CYRILLIC CAPITAL LETTER EN" },
1603: { 0x041E, "Ocy", "CYRILLIC CAPITAL LETTER O" },
1604: { 0x041F, "Pcy", "CYRILLIC CAPITAL LETTER PE" },
1605: { 0x0420, "Rcy", "CYRILLIC CAPITAL LETTER ER" },
1606: { 0x0421, "Scy", "CYRILLIC CAPITAL LETTER ES" },
1607: { 0x0422, "Tcy", "CYRILLIC CAPITAL LETTER TE" },
1608: { 0x0423, "Ucy", "CYRILLIC CAPITAL LETTER U" },
1609: { 0x0424, "Fcy", "CYRILLIC CAPITAL LETTER EF" },
1610: { 0x0425, "KHcy", "CYRILLIC CAPITAL LETTER HA" },
1611: { 0x0426, "TScy", "CYRILLIC CAPITAL LETTER TSE" },
1612: { 0x0427, "CHcy", "CYRILLIC CAPITAL LETTER CHE" },
1613: { 0x0428, "SHcy", "CYRILLIC CAPITAL LETTER SHA" },
1614: { 0x0429, "SHCHcy", "CYRILLIC CAPITAL LETTER SHCHA" },
1615: { 0x042A, "HARDcy", "CYRILLIC CAPITAL LETTER HARD SIGN" },
1616: { 0x042B, "Ycy", "CYRILLIC CAPITAL LETTER YERU" },
1617: { 0x042C, "SOFTcy", "CYRILLIC CAPITAL LETTER SOFT SIGN" },
1618: { 0x042D, "Ecy", "CYRILLIC CAPITAL LETTER E" },
1619: { 0x042E, "YUcy", "CYRILLIC CAPITAL LETTER YU" },
1620: { 0x042F, "YAcy", "CYRILLIC CAPITAL LETTER YA" },
1621: { 0x0430, "acy", "CYRILLIC SMALL LETTER A" },
1622: { 0x0431, "bcy", "CYRILLIC SMALL LETTER BE" },
1623: { 0x0432, "vcy", "CYRILLIC SMALL LETTER VE" },
1624: { 0x0433, "gcy", "CYRILLIC SMALL LETTER GHE" },
1625: { 0x0434, "dcy", "CYRILLIC SMALL LETTER DE" },
1626: { 0x0435, "iecy", "CYRILLIC SMALL LETTER IE" },
1627: { 0x0436, "zhcy", "CYRILLIC SMALL LETTER ZHE" },
1628: { 0x0437, "zcy", "CYRILLIC SMALL LETTER ZE" },
1629: { 0x0438, "icy", "CYRILLIC SMALL LETTER I" },
1630: { 0x0439, "jcy", "CYRILLIC SMALL LETTER SHORT I" },
1631: { 0x043A, "kcy", "CYRILLIC SMALL LETTER KA" },
1632: { 0x043B, "lcy", "CYRILLIC SMALL LETTER EL" },
1633: { 0x043C, "mcy", "CYRILLIC SMALL LETTER EM" },
1634: { 0x043D, "ncy", "CYRILLIC SMALL LETTER EN" },
1635: { 0x043E, "ocy", "CYRILLIC SMALL LETTER O" },
1636: { 0x043F, "pcy", "CYRILLIC SMALL LETTER PE" },
1637: { 0x0440, "rcy", "CYRILLIC SMALL LETTER ER" },
1638: { 0x0441, "scy", "CYRILLIC SMALL LETTER ES" },
1639: { 0x0442, "tcy", "CYRILLIC SMALL LETTER TE" },
1640: { 0x0443, "ucy", "CYRILLIC SMALL LETTER U" },
1641: { 0x0444, "fcy", "CYRILLIC SMALL LETTER EF" },
1642: { 0x0445, "khcy", "CYRILLIC SMALL LETTER HA" },
1643: { 0x0446, "tscy", "CYRILLIC SMALL LETTER TSE" },
1644: { 0x0447, "chcy", "CYRILLIC SMALL LETTER CHE" },
1645: { 0x0448, "shcy", "CYRILLIC SMALL LETTER SHA" },
1646: { 0x0449, "shchcy", "CYRILLIC SMALL LETTER SHCHA" },
1647: { 0x044A, "hardcy", "CYRILLIC SMALL LETTER HARD SIGN" },
1648: { 0x044B, "ycy", "CYRILLIC SMALL LETTER YERU" },
1649: { 0x044C, "softcy", "CYRILLIC SMALL LETTER SOFT SIGN" },
1650: { 0x044D, "ecy", "CYRILLIC SMALL LETTER E" },
1651: { 0x044E, "yucy", "CYRILLIC SMALL LETTER YU" },
1652: { 0x044F, "yacy", "CYRILLIC SMALL LETTER YA" },
1653: { 0x0451, "iocy", "CYRILLIC SMALL LETTER IO" },
1654: { 0x0452, "djcy", "CYRILLIC SMALL LETTER DJE" },
1655: { 0x0453, "gjcy", "CYRILLIC SMALL LETTER GJE" },
1656: { 0x0454, "jukcy", "CYRILLIC SMALL LETTER UKRAINIAN IE" },
1657: { 0x0455, "dscy", "CYRILLIC SMALL LETTER DZE" },
1658: { 0x0456, "iukcy", "CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I" },
1659: { 0x0457, "yicy", "CYRILLIC SMALL LETTER YI" },
1660: { 0x0458, "jsercy", "CYRILLIC SMALL LETTER JE" },
1661: { 0x0459, "ljcy", "CYRILLIC SMALL LETTER LJE" },
1662: { 0x045A, "njcy", "CYRILLIC SMALL LETTER NJE" },
1663: { 0x045B, "tshcy", "CYRILLIC SMALL LETTER TSHE" },
1664: { 0x045C, "kjcy", "CYRILLIC SMALL LETTER KJE" },
1665: { 0x045E, "ubrcy", "CYRILLIC SMALL LETTER SHORT U" },
1666: { 0x045F, "dzcy", "CYRILLIC SMALL LETTER DZHE" },
1667: { 0x2002, "ensp", "EN SPACE" },
1668: { 0x2003, "emsp", "EM SPACE" },
1669: { 0x2004, "emsp13", "THREE-PER-EM SPACE" },
1670: { 0x2005, "emsp14", "FOUR-PER-EM SPACE" },
1671: { 0x2007, "numsp", "FIGURE SPACE" },
1672: { 0x2008, "puncsp", "PUNCTUATION SPACE" },
1673: { 0x2009, "thinsp", "THIN SPACE" },
1674: { 0x200A, "hairsp", "HAIR SPACE" },
1675: { 0x2010, "dash", "HYPHEN" },
1676: { 0x2013, "ndash", "EN DASH" },
1677: { 0x2014, "mdash", "EM DASH" },
1678: { 0x2015, "horbar", "HORIZONTAL BAR" },
1679: { 0x2016, "Verbar", "DOUBLE VERTICAL LINE" },
1680: { 0x2018, "lsquo", "" },
1681: { 0x2018, "rsquor", "" },
1682: { 0x2019, "rsquo", "RIGHT SINGLE QUOTATION MARK" },
1683: { 0x201A, "lsquor", "SINGLE LOW-9 QUOTATION MARK" },
1684: { 0x201C, "ldquo", "" },
1685: { 0x201C, "rdquor", "" },
1686: { 0x201D, "rdquo", "RIGHT DOUBLE QUOTATION MARK" },
1687: { 0x201E, "ldquor", "DOUBLE LOW-9 QUOTATION MARK" },
1688: { 0x2020, "dagger", "DAGGER" },
1689: { 0x2021, "Dagger", "DOUBLE DAGGER" },
1690: { 0x2022, "bull", "BULLET" },
1691: { 0x2025, "nldr", "TWO DOT LEADER" },
1692: { 0x2026, "hellip", "HORIZONTAL ELLIPSIS" },
1693: { 0x2026, "mldr", "HORIZONTAL ELLIPSIS" },
1694: { 0x2030, "permil", "PER MILLE SIGN" },
1695: { 0x2032, "prime", "PRIME" },
1696: { 0x2032, "vprime", "PRIME" },
1697: { 0x2033, "Prime", "DOUBLE PRIME" },
1698: { 0x2034, "tprime", "TRIPLE PRIME" },
1699: { 0x2035, "bprime", "REVERSED PRIME" },
1700: { 0x2041, "caret", "CARET" },
1701: { 0x2043, "hybull", "HYPHEN BULLET" },
1702: { 0x20DB, "tdot", "COMBINING THREE DOTS ABOVE" },
1703: { 0x20DC, "DotDot", "COMBINING FOUR DOTS ABOVE" },
1704: { 0x2105, "incare", "CARE OF" },
1705: { 0x210B, "hamilt", "SCRIPT CAPITAL H" },
1706: { 0x210F, "planck", "PLANCK CONSTANT OVER TWO PI" },
1707: { 0x2111, "image", "BLACK-LETTER CAPITAL I" },
1708: { 0x2112, "lagran", "SCRIPT CAPITAL L" },
1709: { 0x2113, "ell", "SCRIPT SMALL L" },
1710: { 0x2116, "numero", "NUMERO SIGN" },
1711: { 0x2117, "copysr", "SOUND RECORDING COPYRIGHT" },
1712: { 0x2118, "weierp", "SCRIPT CAPITAL P" },
1713: { 0x211C, "real", "BLACK-LETTER CAPITAL R" },
1714: { 0x211E, "rx", "PRESCRIPTION TAKE" },
1715: { 0x2122, "trade", "TRADE MARK SIGN" },
1716: { 0x2126, "ohm", "OHM SIGN" },
1717: { 0x212B, "angst", "ANGSTROM SIGN" },
1718: { 0x212C, "bernou", "SCRIPT CAPITAL B" },
1719: { 0x2133, "phmmat", "SCRIPT CAPITAL M" },
1720: { 0x2134, "order", "SCRIPT SMALL O" },
1721: { 0x2135, "aleph", "ALEF SYMBOL" },
1722: { 0x2136, "beth", "BET SYMBOL" },
1723: { 0x2137, "gimel", "GIMEL SYMBOL" },
1724: { 0x2138, "daleth", "DALET SYMBOL" },
1725: { 0x2153, "frac13", "VULGAR FRACTION ONE THIRD" },
1726: { 0x2154, "frac23", "VULGAR FRACTION TWO THIRDS" },
1727: { 0x2155, "frac15", "VULGAR FRACTION ONE FIFTH" },
1728: { 0x2156, "frac25", "VULGAR FRACTION TWO FIFTHS" },
1729: { 0x2157, "frac35", "VULGAR FRACTION THREE FIFTHS" },
1730: { 0x2158, "frac45", "VULGAR FRACTION FOUR FIFTHS" },
1731: { 0x2159, "frac16", "VULGAR FRACTION ONE SIXTH" },
1732: { 0x215A, "frac56", "VULGAR FRACTION FIVE SIXTHS" },
1733: { 0x215B, "frac18", "" },
1734: { 0x215C, "frac38", "" },
1735: { 0x215D, "frac58", "" },
1736: { 0x215E, "frac78", "" },
1737: { 0x2190, "larr", "LEFTWARDS DOUBLE ARROW" },
1738: { 0x2191, "uarr", "UPWARDS ARROW" },
1739: { 0x2192, "rarr", "RIGHTWARDS DOUBLE ARROW" },
1740: { 0x2193, "darr", "DOWNWARDS ARROW" },
1741: { 0x2194, "harr", "LEFT RIGHT ARROW" },
1742: { 0x2194, "xhArr", "LEFT RIGHT ARROW" },
1743: { 0x2194, "xharr", "LEFT RIGHT ARROW" },
1744: { 0x2195, "varr", "UP DOWN ARROW" },
1745: { 0x2196, "nwarr", "NORTH WEST ARROW" },
1746: { 0x2197, "nearr", "NORTH EAST ARROW" },
1747: { 0x2198, "drarr", "SOUTH EAST ARROW" },
1748: { 0x2199, "dlarr", "SOUTH WEST ARROW" },
1749: { 0x219A, "nlarr", "LEFTWARDS ARROW WITH STROKE" },
1750: { 0x219B, "nrarr", "RIGHTWARDS ARROW WITH STROKE" },
1751: { 0x219D, "rarrw", "RIGHTWARDS SQUIGGLE ARROW" },
1752: { 0x219E, "Larr", "LEFTWARDS TWO HEADED ARROW" },
1753: { 0x21A0, "Rarr", "RIGHTWARDS TWO HEADED ARROW" },
1754: { 0x21A2, "larrtl", "LEFTWARDS ARROW WITH TAIL" },
1755: { 0x21A3, "rarrtl", "RIGHTWARDS ARROW WITH TAIL" },
1756: { 0x21A6, "map", "RIGHTWARDS ARROW FROM BAR" },
1757: { 0x21A9, "larrhk", "LEFTWARDS ARROW WITH HOOK" },
1758: { 0x21AA, "rarrhk", "RIGHTWARDS ARROW WITH HOOK" },
1759: { 0x21AB, "larrlp", "LEFTWARDS ARROW WITH LOOP" },
1760: { 0x21AC, "rarrlp", "RIGHTWARDS ARROW WITH LOOP" },
1761: { 0x21AD, "harrw", "LEFT RIGHT WAVE ARROW" },
1762: { 0x21AE, "nharr", "LEFT RIGHT ARROW WITH STROKE" },
1763: { 0x21B0, "lsh", "UPWARDS ARROW WITH TIP LEFTWARDS" },
1764: { 0x21B1, "rsh", "UPWARDS ARROW WITH TIP RIGHTWARDS" },
1765: { 0x21B6, "cularr", "ANTICLOCKWISE TOP SEMICIRCLE ARROW" },
1766: { 0x21B7, "curarr", "CLOCKWISE TOP SEMICIRCLE ARROW" },
1767: { 0x21BA, "olarr", "ANTICLOCKWISE OPEN CIRCLE ARROW" },
1768: { 0x21BB, "orarr", "CLOCKWISE OPEN CIRCLE ARROW" },
1769: { 0x21BC, "lharu", "LEFTWARDS HARPOON WITH BARB UPWARDS" },
1770: { 0x21BD, "lhard", "LEFTWARDS HARPOON WITH BARB DOWNWARDS" },
1771: { 0x21BE, "uharr", "UPWARDS HARPOON WITH BARB RIGHTWARDS" },
1772: { 0x21BF, "uharl", "UPWARDS HARPOON WITH BARB LEFTWARDS" },
1773: { 0x21C0, "rharu", "RIGHTWARDS HARPOON WITH BARB UPWARDS" },
1774: { 0x21C1, "rhard", "RIGHTWARDS HARPOON WITH BARB DOWNWARDS" },
1775: { 0x21C2, "dharr", "DOWNWARDS HARPOON WITH BARB RIGHTWARDS" },
1776: { 0x21C3, "dharl", "DOWNWARDS HARPOON WITH BARB LEFTWARDS" },
1777: { 0x21C4, "rlarr2", "RIGHTWARDS ARROW OVER LEFTWARDS ARROW" },
1778: { 0x21C6, "lrarr2", "LEFTWARDS ARROW OVER RIGHTWARDS ARROW" },
1779: { 0x21C7, "larr2", "LEFTWARDS PAIRED ARROWS" },
1780: { 0x21C8, "uarr2", "UPWARDS PAIRED ARROWS" },
1781: { 0x21C9, "rarr2", "RIGHTWARDS PAIRED ARROWS" },
1782: { 0x21CA, "darr2", "DOWNWARDS PAIRED ARROWS" },
1783: { 0x21CB, "lrhar2", "LEFTWARDS HARPOON OVER RIGHTWARDS HARPOON" },
1784: { 0x21CC, "rlhar2", "RIGHTWARDS HARPOON OVER LEFTWARDS HARPOON" },
1785: { 0x21CD, "nlArr", "LEFTWARDS DOUBLE ARROW WITH STROKE" },
1786: { 0x21CE, "nhArr", "LEFT RIGHT DOUBLE ARROW WITH STROKE" },
1787: { 0x21CF, "nrArr", "RIGHTWARDS DOUBLE ARROW WITH STROKE" },
1788: { 0x21D0, "lArr", "LEFTWARDS ARROW" },
1789: { 0x21D0, "xlArr", "LEFTWARDS DOUBLE ARROW" },
1790: { 0x21D1, "uArr", "UPWARDS DOUBLE ARROW" },
1791: { 0x21D2, "rArr", "RIGHTWARDS ARROW" },
1792: { 0x21D2, "xrArr", "RIGHTWARDS DOUBLE ARROW" },
1793: { 0x21D3, "dArr", "DOWNWARDS DOUBLE ARROW" },
1794: { 0x21D4, "hArr", "" },
1795: { 0x21D4, "iff", "LEFT RIGHT DOUBLE ARROW" },
1796: { 0x21D5, "vArr", "UP DOWN DOUBLE ARROW" },
1797: { 0x21DA, "lAarr", "LEFTWARDS TRIPLE ARROW" },
1798: { 0x21DB, "rAarr", "RIGHTWARDS TRIPLE ARROW" },
1799: { 0x2200, "forall", "" },
1800: { 0x2201, "comp", "COMPLEMENT" },
1801: { 0x2202, "part", "" },
1802: { 0x2203, "exist", "" },
1803: { 0x2204, "nexist", "THERE DOES NOT EXIST" },
1804: { 0x2205, "empty", "" },
1805: { 0x2207, "nabla", "NABLA" },
1806: { 0x2209, "notin", "" },
1807: { 0x220A, "epsi", "" },
1808: { 0x220A, "epsis", "" },
1809: { 0x220A, "isin", "" },
1810: { 0x220D, "bepsi", "SMALL CONTAINS AS MEMBER" },
1811: { 0x220D, "ni", "" },
1812: { 0x220F, "prod", "N-ARY PRODUCT" },
1813: { 0x2210, "amalg", "N-ARY COPRODUCT" },
1814: { 0x2210, "coprod", "N-ARY COPRODUCT" },
1815: { 0x2210, "samalg", "" },
1816: { 0x2211, "sum", "N-ARY SUMMATION" },
1817: { 0x2212, "minus", "MINUS SIGN" },
1818: { 0x2213, "mnplus", "" },
1819: { 0x2214, "plusdo", "DOT PLUS" },
1820: { 0x2216, "setmn", "SET MINUS" },
1821: { 0x2216, "ssetmn", "SET MINUS" },
1822: { 0x2217, "lowast", "ASTERISK OPERATOR" },
1823: { 0x2218, "compfn", "RING OPERATOR" },
1824: { 0x221A, "radic", "" },
1825: { 0x221D, "prop", "" },
1826: { 0x221D, "vprop", "" },
1827: { 0x221E, "infin", "" },
1828: { 0x221F, "ang90", "RIGHT ANGLE" },
1829: { 0x2220, "ang", "ANGLE" },
1830: { 0x2221, "angmsd", "MEASURED ANGLE" },
1831: { 0x2222, "angsph", "" },
1832: { 0x2223, "mid", "" },
1833: { 0x2224, "nmid", "DOES NOT DIVIDE" },
1834: { 0x2225, "par", "PARALLEL TO" },
1835: { 0x2225, "spar", "PARALLEL TO" },
1836: { 0x2226, "npar", "NOT PARALLEL TO" },
1837: { 0x2226, "nspar", "NOT PARALLEL TO" },
1838: { 0x2227, "and", "" },
1839: { 0x2228, "or", "" },
1840: { 0x2229, "cap", "" },
1841: { 0x222A, "cup", "" },
1842: { 0x222B, "int", "" },
1843: { 0x222E, "conint", "" },
1844: { 0x2234, "there4", "" },
1845: { 0x2235, "becaus", "BECAUSE" },
1846: { 0x223C, "sim", "" },
1847: { 0x223C, "thksim", "TILDE OPERATOR" },
1848: { 0x223D, "bsim", "" },
1849: { 0x2240, "wreath", "WREATH PRODUCT" },
1850: { 0x2241, "nsim", "" },
1851: { 0x2243, "sime", "" },
1852: { 0x2244, "nsime", "" },
1853: { 0x2245, "cong", "" },
1854: { 0x2247, "ncong", "NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO" },
1855: { 0x2248, "ap", "" },
1856: { 0x2248, "thkap", "ALMOST EQUAL TO" },
1857: { 0x2249, "nap", "NOT ALMOST EQUAL TO" },
1858: { 0x224A, "ape", "" },
1859: { 0x224C, "bcong", "ALL EQUAL TO" },
1860: { 0x224D, "asymp", "EQUIVALENT TO" },
1861: { 0x224E, "bump", "" },
1862: { 0x224F, "bumpe", "" },
1863: { 0x2250, "esdot", "" },
1864: { 0x2251, "eDot", "" },
1865: { 0x2252, "efDot", "" },
1866: { 0x2253, "erDot", "" },
1867: { 0x2254, "colone", "" },
1868: { 0x2255, "ecolon", "" },
1869: { 0x2256, "ecir", "" },
1870: { 0x2257, "cire", "" },
1871: { 0x2259, "wedgeq", "ESTIMATES" },
1872: { 0x225C, "trie", "" },
1873: { 0x2260, "ne", "" },
1874: { 0x2261, "equiv", "" },
1875: { 0x2262, "nequiv", "NOT IDENTICAL TO" },
1876: { 0x2264, "le", "" },
1877: { 0x2264, "les", "LESS-THAN OR EQUAL TO" },
1878: { 0x2265, "ge", "GREATER-THAN OR EQUAL TO" },
1879: { 0x2265, "ges", "GREATER-THAN OR EQUAL TO" },
1880: { 0x2266, "lE", "" },
1881: { 0x2267, "gE", "" },
1882: { 0x2268, "lnE", "" },
1883: { 0x2268, "lne", "" },
1884: { 0x2268, "lvnE", "LESS-THAN BUT NOT EQUAL TO" },
1885: { 0x2269, "gnE", "" },
1886: { 0x2269, "gne", "" },
1887: { 0x2269, "gvnE", "GREATER-THAN BUT NOT EQUAL TO" },
1888: { 0x226A, "Lt", "MUCH LESS-THAN" },
1889: { 0x226B, "Gt", "MUCH GREATER-THAN" },
1890: { 0x226C, "twixt", "BETWEEN" },
1891: { 0x226E, "nlt", "NOT LESS-THAN" },
1892: { 0x226F, "ngt", "NOT GREATER-THAN" },
1893: { 0x2270, "nlE", "" },
1894: { 0x2270, "nle", "NEITHER LESS-THAN NOR EQUAL TO" },
1895: { 0x2270, "nles", "" },
1896: { 0x2271, "ngE", "" },
1897: { 0x2271, "nge", "NEITHER GREATER-THAN NOR EQUAL TO" },
1898: { 0x2271, "nges", "" },
1899: { 0x2272, "lap", "LESS-THAN OR EQUIVALENT TO" },
1900: { 0x2272, "lsim", "LESS-THAN OR EQUIVALENT TO" },
1901: { 0x2273, "gap", "GREATER-THAN OR EQUIVALENT TO" },
1902: { 0x2273, "gsim", "GREATER-THAN OR EQUIVALENT TO" },
1903: { 0x2276, "lg", "LESS-THAN OR GREATER-THAN" },
1904: { 0x2277, "gl", "" },
1905: { 0x227A, "pr", "" },
1906: { 0x227B, "sc", "" },
1907: { 0x227C, "cupre", "" },
1908: { 0x227C, "pre", "" },
1909: { 0x227D, "sccue", "" },
1910: { 0x227D, "sce", "" },
1911: { 0x227E, "prap", "" },
1912: { 0x227E, "prsim", "" },
1913: { 0x227F, "scap", "" },
1914: { 0x227F, "scsim", "" },
1915: { 0x2280, "npr", "DOES NOT PRECEDE" },
1916: { 0x2281, "nsc", "DOES NOT SUCCEED" },
1917: { 0x2282, "sub", "" },
1918: { 0x2283, "sup", "" },
1919: { 0x2284, "nsub", "NOT A SUBSET OF" },
1920: { 0x2285, "nsup", "NOT A SUPERSET OF" },
1921: { 0x2286, "subE", "" },
1922: { 0x2286, "sube", "" },
1923: { 0x2287, "supE", "" },
1924: { 0x2287, "supe", "" },
1925: { 0x2288, "nsubE", "" },
1926: { 0x2288, "nsube", "" },
1927: { 0x2289, "nsupE", "" },
1928: { 0x2289, "nsupe", "" },
1929: { 0x228A, "subne", "" },
1930: { 0x228A, "subnE", "SUBSET OF WITH NOT EQUAL TO" },
1931: { 0x228A, "vsubne", "SUBSET OF WITH NOT EQUAL TO" },
1932: { 0x228B, "supnE", "" },
1933: { 0x228B, "supne", "" },
1934: { 0x228B, "vsupnE", "SUPERSET OF WITH NOT EQUAL TO" },
1935: { 0x228B, "vsupne", "SUPERSET OF WITH NOT EQUAL TO" },
1936: { 0x228E, "uplus", "MULTISET UNION" },
1937: { 0x228F, "sqsub", "" },
1938: { 0x2290, "sqsup", "" },
1939: { 0x2291, "sqsube", "" },
1940: { 0x2292, "sqsupe", "" },
1941: { 0x2293, "sqcap", "SQUARE CAP" },
1942: { 0x2294, "sqcup", "SQUARE CUP" },
1943: { 0x2295, "oplus", "CIRCLED PLUS" },
1944: { 0x2296, "ominus", "CIRCLED MINUS" },
1945: { 0x2297, "otimes", "CIRCLED TIMES" },
1946: { 0x2298, "osol", "CIRCLED DIVISION SLASH" },
1947: { 0x2299, "odot", "CIRCLED DOT OPERATOR" },
1948: { 0x229A, "ocir", "CIRCLED RING OPERATOR" },
1949: { 0x229B, "oast", "CIRCLED ASTERISK OPERATOR" },
1950: { 0x229D, "odash", "CIRCLED DASH" },
1951: { 0x229E, "plusb", "SQUARED PLUS" },
1952: { 0x229F, "minusb", "SQUARED MINUS" },
1953: { 0x22A0, "timesb", "SQUARED TIMES" },
1954: { 0x22A1, "sdotb", "SQUARED DOT OPERATOR" },
1955: { 0x22A2, "vdash", "" },
1956: { 0x22A3, "dashv", "" },
1957: { 0x22A4, "top", "DOWN TACK" },
1958: { 0x22A5, "bottom", "" },
1959: { 0x22A5, "perp", "" },
1960: { 0x22A7, "models", "MODELS" },
1961: { 0x22A8, "vDash", "" },
1962: { 0x22A9, "Vdash", "" },
1963: { 0x22AA, "Vvdash", "" },
1964: { 0x22AC, "nvdash", "DOES NOT PROVE" },
1965: { 0x22AD, "nvDash", "NOT TRUE" },
1966: { 0x22AE, "nVdash", "DOES NOT FORCE" },
1967: { 0x22AF, "nVDash", "NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE" },
1968: { 0x22B2, "vltri", "" },
1969: { 0x22B3, "vrtri", "" },
1970: { 0x22B4, "ltrie", "" },
1971: { 0x22B5, "rtrie", "" },
1972: { 0x22B8, "mumap", "MULTIMAP" },
1973: { 0x22BA, "intcal", "INTERCALATE" },
1974: { 0x22BB, "veebar", "" },
1975: { 0x22BC, "barwed", "NAND" },
1976: { 0x22C4, "diam", "DIAMOND OPERATOR" },
1977: { 0x22C5, "sdot", "DOT OPERATOR" },
1978: { 0x22C6, "sstarf", "STAR OPERATOR" },
1979: { 0x22C6, "star", "STAR OPERATOR" },
1980: { 0x22C7, "divonx", "DIVISION TIMES" },
1981: { 0x22C8, "bowtie", "" },
1982: { 0x22C9, "ltimes", "LEFT NORMAL FACTOR SEMIDIRECT PRODUCT" },
1983: { 0x22CA, "rtimes", "RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT" },
1984: { 0x22CB, "lthree", "LEFT SEMIDIRECT PRODUCT" },
1985: { 0x22CC, "rthree", "RIGHT SEMIDIRECT PRODUCT" },
1986: { 0x22CD, "bsime", "" },
1987: { 0x22CE, "cuvee", "CURLY LOGICAL OR" },
1988: { 0x22CF, "cuwed", "CURLY LOGICAL AND" },
1989: { 0x22D0, "Sub", "" },
1990: { 0x22D1, "Sup", "" },
1991: { 0x22D2, "Cap", "DOUBLE INTERSECTION" },
1992: { 0x22D3, "Cup", "DOUBLE UNION" },
1993: { 0x22D4, "fork", "" },
1994: { 0x22D6, "ldot", "" },
1995: { 0x22D7, "gsdot", "" },
1996: { 0x22D8, "Ll", "" },
1997: { 0x22D9, "Gg", "VERY MUCH GREATER-THAN" },
1998: { 0x22DA, "lEg", "" },
1999: { 0x22DA, "leg", "" },
2000: { 0x22DB, "gEl", "" },
2001: { 0x22DB, "gel", "" },
2002: { 0x22DC, "els", "" },
2003: { 0x22DD, "egs", "" },
2004: { 0x22DE, "cuepr", "" },
2005: { 0x22DF, "cuesc", "" },
2006: { 0x22E0, "npre", "DOES NOT PRECEDE OR EQUAL" },
2007: { 0x22E1, "nsce", "DOES NOT SUCCEED OR EQUAL" },
2008: { 0x22E6, "lnsim", "" },
2009: { 0x22E7, "gnsim", "GREATER-THAN BUT NOT EQUIVALENT TO" },
2010: { 0x22E8, "prnap", "" },
2011: { 0x22E8, "prnsim", "" },
2012: { 0x22E9, "scnap", "" },
2013: { 0x22E9, "scnsim", "" },
2014: { 0x22EA, "nltri", "NOT NORMAL SUBGROUP OF" },
2015: { 0x22EB, "nrtri", "DOES NOT CONTAIN AS NORMAL SUBGROUP" },
2016: { 0x22EC, "nltrie", "NOT NORMAL SUBGROUP OF OR EQUAL TO" },
2017: { 0x22ED, "nrtrie", "DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL" },
2018: { 0x22EE, "vellip", "" },
2019: { 0x2306, "Barwed", "PERSPECTIVE" },
2020: { 0x2308, "lceil", "LEFT CEILING" },
2021: { 0x2309, "rceil", "RIGHT CEILING" },
2022: { 0x230A, "lfloor", "LEFT FLOOR" },
2023: { 0x230B, "rfloor", "RIGHT FLOOR" },
2024: { 0x230C, "drcrop", "BOTTOM RIGHT CROP" },
2025: { 0x230D, "dlcrop", "BOTTOM LEFT CROP" },
2026: { 0x230E, "urcrop", "TOP RIGHT CROP" },
2027: { 0x230F, "ulcrop", "TOP LEFT CROP" },
2028: { 0x2315, "telrec", "TELEPHONE RECORDER" },
2029: { 0x2316, "target", "POSITION INDICATOR" },
2030: { 0x231C, "ulcorn", "TOP LEFT CORNER" },
2031: { 0x231D, "urcorn", "TOP RIGHT CORNER" },
2032: { 0x231E, "dlcorn", "BOTTOM LEFT CORNER" },
2033: { 0x231F, "drcorn", "BOTTOM RIGHT CORNER" },
2034: { 0x2322, "frown", "" },
2035: { 0x2322, "sfrown", "FROWN" },
2036: { 0x2323, "smile", "" },
2037: { 0x2323, "ssmile", "SMILE" },
2038: { 0x2423, "blank", "OPEN BOX" },
2039: { 0x24C8, "oS", "CIRCLED LATIN CAPITAL LETTER S" },
2040: { 0x2500, "boxh", "BOX DRAWINGS LIGHT HORIZONTAL" },
2041: { 0x2502, "boxv", "BOX DRAWINGS LIGHT VERTICAL" },
2042: { 0x250C, "boxdr", "BOX DRAWINGS LIGHT DOWN AND RIGHT" },
2043: { 0x2510, "boxdl", "BOX DRAWINGS LIGHT DOWN AND LEFT" },
2044: { 0x2514, "boxur", "BOX DRAWINGS LIGHT UP AND RIGHT" },
2045: { 0x2518, "boxul", "BOX DRAWINGS LIGHT UP AND LEFT" },
2046: { 0x251C, "boxvr", "BOX DRAWINGS LIGHT VERTICAL AND RIGHT" },
2047: { 0x2524, "boxvl", "BOX DRAWINGS LIGHT VERTICAL AND LEFT" },
2048: { 0x252C, "boxhd", "BOX DRAWINGS LIGHT DOWN AND HORIZONTAL" },
2049: { 0x2534, "boxhu", "BOX DRAWINGS LIGHT UP AND HORIZONTAL" },
2050: { 0x253C, "boxvh", "BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL" },
2051: { 0x2550, "boxH", "BOX DRAWINGS DOUBLE HORIZONTAL" },
2052: { 0x2551, "boxV", "BOX DRAWINGS DOUBLE VERTICAL" },
2053: { 0x2552, "boxDR", "BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE" },
2054: { 0x2553, "boxDr", "BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE" },
2055: { 0x2554, "boxdR", "BOX DRAWINGS DOUBLE DOWN AND RIGHT" },
2056: { 0x2555, "boxDL", "BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE" },
2057: { 0x2556, "boxdL", "BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE" },
2058: { 0x2557, "boxDl", "BOX DRAWINGS DOUBLE DOWN AND LEFT" },
2059: { 0x2558, "boxUR", "BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE" },
2060: { 0x2559, "boxuR", "BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE" },
2061: { 0x255A, "boxUr", "BOX DRAWINGS DOUBLE UP AND RIGHT" },
2062: { 0x255B, "boxUL", "BOX DRAWINGS UP SINGLE AND LEFT DOUBLE" },
2063: { 0x255C, "boxUl", "BOX DRAWINGS UP DOUBLE AND LEFT SINGLE" },
2064: { 0x255D, "boxuL", "BOX DRAWINGS DOUBLE UP AND LEFT" },
2065: { 0x255E, "boxvR", "BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE" },
2066: { 0x255F, "boxVR", "BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE" },
2067: { 0x2560, "boxVr", "BOX DRAWINGS DOUBLE VERTICAL AND RIGHT" },
2068: { 0x2561, "boxvL", "BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE" },
2069: { 0x2562, "boxVL", "BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE" },
2070: { 0x2563, "boxVl", "BOX DRAWINGS DOUBLE VERTICAL AND LEFT" },
2071: { 0x2564, "boxhD", "BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE" },
2072: { 0x2565, "boxHD", "BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE" },
2073: { 0x2566, "boxHd", "BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL" },
2074: { 0x2567, "boxhU", "BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE" },
2075: { 0x2568, "boxHU", "BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE" },
2076: { 0x2569, "boxHu", "BOX DRAWINGS DOUBLE UP AND HORIZONTAL" },
2077: { 0x256A, "boxvH", "BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE" },
2078: { 0x256B, "boxVH", "BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE" },
2079: { 0x256C, "boxVh", "BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL" },
2080: { 0x2580, "uhblk", "UPPER HALF BLOCK" },
2081: { 0x2584, "lhblk", "LOWER HALF BLOCK" },
2082: { 0x2588, "block", "FULL BLOCK" },
2083: { 0x2591, "blk14", "LIGHT SHADE" },
2084: { 0x2592, "blk12", "MEDIUM SHADE" },
2085: { 0x2593, "blk34", "DARK SHADE" },
2086: { 0x25A1, "square", "WHITE SQUARE" },
2087: { 0x25A1, "squ", "WHITE SQUARE" },
2088: { 0x25AA, "squf", "" },
2089: { 0x25AD, "rect", "WHITE RECTANGLE" },
2090: { 0x25AE, "marker", "BLACK VERTICAL RECTANGLE" },
2091: { 0x25B3, "xutri", "WHITE UP-POINTING TRIANGLE" },
2092: { 0x25B4, "utrif", "BLACK UP-POINTING TRIANGLE" },
2093: { 0x25B5, "utri", "WHITE UP-POINTING TRIANGLE" },
2094: { 0x25B8, "rtrif", "BLACK RIGHT-POINTING TRIANGLE" },
2095: { 0x25B9, "rtri", "WHITE RIGHT-POINTING TRIANGLE" },
2096: { 0x25BD, "xdtri", "WHITE DOWN-POINTING TRIANGLE" },
2097: { 0x25BE, "dtrif", "BLACK DOWN-POINTING TRIANGLE" },
2098: { 0x25BF, "dtri", "WHITE DOWN-POINTING TRIANGLE" },
2099: { 0x25C2, "ltrif", "BLACK LEFT-POINTING TRIANGLE" },
2100: { 0x25C3, "ltri", "WHITE LEFT-POINTING TRIANGLE" },
2101: { 0x25CA, "loz", "LOZENGE" },
2102: { 0x25CB, "cir", "WHITE CIRCLE" },
2103: { 0x25CB, "xcirc", "WHITE CIRCLE" },
2104: { 0x2605, "starf", "BLACK STAR" },
2105: { 0x260E, "phone", "TELEPHONE SIGN" },
2106: { 0x2640, "female", "" },
2107: { 0x2642, "male", "MALE SIGN" },
2108: { 0x2660, "spades", "BLACK SPADE SUIT" },
2109: { 0x2663, "clubs", "BLACK CLUB SUIT" },
2110: { 0x2665, "hearts", "BLACK HEART SUIT" },
2111: { 0x2666, "diams", "BLACK DIAMOND SUIT" },
2112: { 0x2669, "sung", "" },
2113: { 0x266D, "flat", "MUSIC FLAT SIGN" },
2114: { 0x266E, "natur", "MUSIC NATURAL SIGN" },
2115: { 0x266F, "sharp", "MUSIC SHARP SIGN" },
2116: { 0x2713, "check", "CHECK MARK" },
2117: { 0x2717, "cross", "BALLOT X" },
2118: { 0x2720, "malt", "MALTESE CROSS" },
2119: { 0x2726, "lozf", "" },
2120: { 0x2736, "sext", "SIX POINTED BLACK STAR" },
2121: { 0x3008, "lang", "" },
2122: { 0x3009, "rang", "" },
2123: { 0xE291, "rpargt", "" },
2124: { 0xE2A2, "lnap", "" },
2125: { 0xE2AA, "nsmid", "" },
2126: { 0xE2B3, "prnE", "" },
2127: { 0xE2B5, "scnE", "" },
2128: { 0xE2B8, "vsubnE", "" },
2129: { 0xE301, "smid", "" },
2130: { 0xE411, "gnap", "" },
2131: { 0xFB00, "fflig", "" },
2132: { 0xFB01, "filig", "" },
2133: { 0xFB02, "fllig", "" },
2134: { 0xFB03, "ffilig", "" },
2135: { 0xFB04, "ffllig", "" },
2136: { 0xFE68, "sbsol", "SMALL REVERSE SOLIDUS" },
2137: };
2138:
2139: /************************************************************************
2140: * *
2141: * Commodity functions to handle entities *
2142: * *
2143: ************************************************************************/
2144:
2145: /*
2146:  * growBuffer(buffer): double buffer##_size and resize buffer to match.
2147:  * When xmlRealloc() gives back NULL, perror() is called and the function
2148:  * expanding the macro returns NULL; buffer itself is NULL at that point.
2149:  */
2150: #define growBuffer(buffer) {                                          \
2151:     buffer##_size = 2 * buffer##_size;                                \
2152:     buffer = (xmlChar *)                                              \
2153:         xmlRealloc(buffer, sizeof(xmlChar) * buffer##_size);          \
2154:     if (!buffer) { perror("realloc failed"); return(NULL); }          \
2155: }
2156:
2157: /**
2158:  * sgmlEntityLookup:
2159:  * @name:  the entity name
2160:  *
2161:  * Linear scan of docbookEntitiesTable comparing each entry name to @name.
2162:  *
2163:  * Returns the first entry whose name matches, NULL when there is none.
2164:  */
2165: sgmlEntityDescPtr
2166: sgmlEntityLookup(const xmlChar *name) {
2167:     sgmlEntityDescPtr found = NULL;
2168:     int idx;
2169:
2170:     for (idx = 0;idx < (sizeof(docbookEntitiesTable)/
2171:                         sizeof(docbookEntitiesTable[0]));idx++) {
2172:         if (xmlStrcmp(name, BAD_CAST docbookEntitiesTable[idx].name) != 0)
2173:             continue;
2174: #ifdef DEBUG
2175:         fprintf(stderr,"Found entity %s\n", name);
2176: #endif
2177:         found = &docbookEntitiesTable[idx];
2178:         break;
2179:     }
2180:     return(found);
2181: }
2182:
2183: /**
2184: * sgmlEntityValueLookup:
2185: * @value: the entity's unicode value
2186: *
2187: * Lookup the given entity in EntitiesTable
2188: *
2189: * TODO: the linear scan is really ugly, an hash table is really needed.
2190: *
2191: * Returns the associated sgmlEntityDescPtr if found, NULL otherwise.
2192: */
2193: sgmlEntityDescPtr
2194: sgmlEntityValueLookup(int value) {
2195: int i;
2196: #ifdef DEBUG
2197: int lv = 0;
2198: #endif
2199:
2200: for (i = 0;i < (sizeof(docbookEntitiesTable)/
2201: sizeof(docbookEntitiesTable[0]));i++) {
2202: if (docbookEntitiesTable[i].value >= value) {
2203: if (docbookEntitiesTable[i].value > value)
2204: break;
2205: #ifdef DEBUG
2206: fprintf(stderr,"Found entity %s\n", docbookEntitiesTable[i].name);
2207: #endif
2208: return(&docbookEntitiesTable[i]);
2209: }
2210: #ifdef DEBUG
2211: if (lv > docbookEntitiesTable[i].value) {
2212: fprintf(stderr, "docbookEntitiesTable[] is not sorted (%d > %d)!\n",
2213: lv, docbookEntitiesTable[i].value);
2214: }
2215: lv = docbookEntitiesTable[i].value;
2216: #endif
2217: }
2218: return(NULL);
2219: }
2220:
2221: /**
2222: * UTF8ToSgml:
2223: * @out: a pointer to an array of bytes to store the result
2224: * @outlen: the length of @out
2225: * @in: a pointer to an array of UTF-8 chars
2226: * @inlen: the length of @in
2227: *
2228: * Take a block of UTF-8 chars in and try to convert it to an ASCII
2229: * plus SGML entities block of chars out.
2230: *
2231: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2232: * The value of @inlen after return is the number of octets consumed
2233: * as the return value is positive, else unpredictiable.
2234: * The value of @outlen after return is the number of octets consumed.
2235: */
2236: int
2237: UTF8ToSgml(unsigned char* out, int *outlen,
2238: const unsigned char* in, int *inlen) {
2239: const unsigned char* processed = in;
2240: const unsigned char* outend;
2241: const unsigned char* outstart = out;
2242: const unsigned char* instart = in;
2243: const unsigned char* inend;
2244: unsigned int c, d;
2245: int trailing;
2246:
2247: if (in == NULL) {
2248: /*
2249: * initialization nothing to do
2250: */
2251: *outlen = 0;
2252: *inlen = 0;
2253: return(0);
2254: }
2255: inend = in + (*inlen);
2256: outend = out + (*outlen);
2257: while (in < inend) {
2258: d = *in++;
2259: if (d < 0x80) { c= d; trailing= 0; }
2260: else if (d < 0xC0) {
2261: /* trailing byte in leading position */
2262: *outlen = out - outstart;
2263: *inlen = processed - instart;
2264: return(-2);
2265: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2266: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2267: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2268: else {
2269: /* no chance for this in Ascii */
2270: *outlen = out - outstart;
2271: *inlen = processed - instart;
2272: return(-2);
2273: }
2274:
2275: if (inend - in < trailing) {
2276: break;
2277: }
2278:
2279: for ( ; trailing; trailing--) {
2280: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2281: break;
2282: c <<= 6;
2283: c |= d & 0x3F;
2284: }
2285:
2286: /* assertion: c is a single UTF-4 value */
2287: if (c < 0x80) {
2288: if (out + 1 >= outend)
2289: break;
2290: *out++ = c;
2291: } else {
2292: int len;
2293: sgmlEntityDescPtr ent;
2294:
2295: /*
2296: * Try to lookup a predefined SGML entity for it
2297: */
2298:
2299: ent = sgmlEntityValueLookup(c);
2300: if (ent == NULL) {
2301: /* no chance for this in Ascii */
2302: *outlen = out - outstart;
2303: *inlen = processed - instart;
2304: return(-2);
2305: }
2306: len = strlen(ent->name);
2307: if (out + 2 + len >= outend)
2308: break;
2309: *out++ = '&';
2310: memcpy(out, ent->name, len);
2311: out += len;
2312: *out++ = ';';
2313: }
2314: processed = in;
2315: }
2316: *outlen = out - outstart;
2317: *inlen = processed - instart;
2318: return(0);
2319: }
2320:
2321: /**
2322: * sgmlEncodeEntities:
2323: * @out: a pointer to an array of bytes to store the result
2324: * @outlen: the length of @out
2325: * @in: a pointer to an array of UTF-8 chars
2326: * @inlen: the length of @in
2327: * @quoteChar: the quote character to escape (' or ") or zero.
2328: *
2329: * Take a block of UTF-8 chars in and try to convert it to an ASCII
2330: * plus SGML entities block of chars out.
2331: *
2332: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2333: * The value of @inlen after return is the number of octets consumed
2334: * as the return value is positive, else unpredictiable.
2335: * The value of @outlen after return is the number of octets consumed.
2336: */
2337: int
2338: sgmlEncodeEntities(unsigned char* out, int *outlen,
2339: const unsigned char* in, int *inlen, int quoteChar) {
2340: const unsigned char* processed = in;
2341: const unsigned char* outend = out + (*outlen);
2342: const unsigned char* outstart = out;
2343: const unsigned char* instart = in;
2344: const unsigned char* inend = in + (*inlen);
2345: unsigned int c, d;
2346: int trailing;
2347:
2348: while (in < inend) {
2349: d = *in++;
2350: if (d < 0x80) { c= d; trailing= 0; }
2351: else if (d < 0xC0) {
2352: /* trailing byte in leading position */
2353: *outlen = out - outstart;
2354: *inlen = processed - instart;
2355: return(-2);
2356: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2357: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2358: else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2359: else {
2360: /* no chance for this in Ascii */
2361: *outlen = out - outstart;
2362: *inlen = processed - instart;
2363: return(-2);
2364: }
2365:
2366: if (inend - in < trailing)
2367: break;
2368:
2369: while (trailing--) {
2370: if (((d= *in++) & 0xC0) != 0x80) {
2371: *outlen = out - outstart;
2372: *inlen = processed - instart;
2373: return(-2);
2374: }
2375: c <<= 6;
2376: c |= d & 0x3F;
2377: }
2378:
2379: /* assertion: c is a single UTF-4 value */
2380: if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') {
2381: if (out >= outend)
2382: break;
2383: *out++ = c;
2384: } else {
2385: sgmlEntityDescPtr ent;
2386: const char *cp;
2387: char nbuf[16];
2388: int len;
2389:
2390: /*
2391: * Try to lookup a predefined SGML entity for it
2392: */
2393: ent = sgmlEntityValueLookup(c);
2394: if (ent == NULL) {
2395: sprintf(nbuf, "#%u", c);
2396: cp = nbuf;
2397: }
2398: else
2399: cp = ent->name;
2400: len = strlen(cp);
2401: if (out + 2 + len > outend)
2402: break;
2403: *out++ = '&';
2404: memcpy(out, cp, len);
2405: out += len;
2406: *out++ = ';';
2407: }
2408: processed = in;
2409: }
2410: *outlen = out - outstart;
2411: *inlen = processed - instart;
2412: return(0);
2413: }
2414:
2415: /**
2416: * sgmlDecodeEntities:
2417: * @ctxt: the parser context
2418: * @len: the len to decode (in bytes !), -1 for no size limit
2419: * @end: an end marker xmlChar, 0 if none
2420: * @end2: an end marker xmlChar, 0 if none
2421: * @end3: an end marker xmlChar, 0 if none
2422: *
2423: * Subtitute the SGML entities by their value
2424: *
2425: * DEPRECATED !!!!
2426: *
2427: * Returns A newly allocated string with the substitution done. The caller
2428: * must deallocate it !
2429: */
2430: xmlChar *
2431: sgmlDecodeEntities(sgmlParserCtxtPtr ctxt, int len,
2432: xmlChar end, xmlChar end2, xmlChar end3) {
2433: xmlChar *name = NULL;
2434: xmlChar *buffer = NULL;
2435: unsigned int buffer_size = 0;
2436: unsigned int nbchars = 0;
2437: sgmlEntityDescPtr ent;
2438: unsigned int max = (unsigned int) len;
2439: int c,l;
2440:
2441: if (ctxt->depth > 40) {
2442: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2443: ctxt->sax->error(ctxt->userData,
2444: "Detected entity reference loop\n");
2445: ctxt->wellFormed = 0;
2446: ctxt->disableSAX = 1;
2447: ctxt->errNo = XML_ERR_ENTITY_LOOP;
2448: return(NULL);
2449: }
2450:
2451: /*
2452: * allocate a translation buffer.
2453: */
2454: buffer_size = SGML_PARSER_BIG_BUFFER_SIZE;
2455: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2456: if (buffer == NULL) {
2457: perror("xmlDecodeEntities: malloc failed");
2458: return(NULL);
2459: }
2460:
2461: /*
2462: * Ok loop until we reach one of the ending char or a size limit.
2463: */
2464: c = CUR_CHAR(l);
2465: while ((nbchars < max) && (c != end) &&
2466: (c != end2) && (c != end3)) {
2467:
2468: if (c == 0) break;
2469: if (((c == '&') && (ctxt->token != '&')) && (NXT(1) == '#')) {
2470: int val = sgmlParseCharRef(ctxt);
2471: COPY_BUF(0,buffer,nbchars,val);
2472: NEXTL(l);
2473: } else if ((c == '&') && (ctxt->token != '&')) {
2474: ent = sgmlParseEntityRef(ctxt, &name);
2475: if (name != NULL) {
2476: if (ent != NULL) {
2477: int val = ent->value;
2478: COPY_BUF(0,buffer,nbchars,val);
2479: NEXTL(l);
2480: } else {
2481: const xmlChar *cur = name;
2482:
2483: buffer[nbchars++] = '&';
2484: if (nbchars > buffer_size - SGML_PARSER_BUFFER_SIZE) {
2485: growBuffer(buffer);
2486: }
2487: while (*cur != 0) {
2488: buffer[nbchars++] = *cur++;
2489: }
2490: buffer[nbchars++] = ';';
2491: }
2492: }
2493: } else {
2494: COPY_BUF(l,buffer,nbchars,c);
2495: NEXTL(l);
2496: if (nbchars > buffer_size - SGML_PARSER_BUFFER_SIZE) {
2497: growBuffer(buffer);
2498: }
2499: }
2500: c = CUR_CHAR(l);
2501: }
2502: buffer[nbchars++] = 0;
2503: return(buffer);
2504: }
2505:
2506: /************************************************************************
2507: * *
2508: * Commodity functions to handle streams *
2509: * *
2510: ************************************************************************/
2511:
2512: /**
2513: * sgmlFreeInputStream:
2514: * @input: an sgmlParserInputPtr
2515: *
2516: * Free up an input stream.
2517: */
2518: void
2519: sgmlFreeInputStream(sgmlParserInputPtr input) {
2520: if (input == NULL) return;
2521:
2522: if (input->filename != NULL) xmlFree((char *) input->filename);
2523: if (input->directory != NULL) xmlFree((char *) input->directory);
2524: if ((input->free != NULL) && (input->base != NULL))
2525: input->free((xmlChar *) input->base);
2526: if (input->buf != NULL)
2527: xmlFreeParserInputBuffer(input->buf);
2528: memset(input, -1, sizeof(sgmlParserInput));
2529: xmlFree(input);
2530: }
2531:
2532: /**
2533: * sgmlNewInputStream:
2534: * @ctxt: an SGML parser context
2535: *
2536: * Create a new input stream structure
2537: * Returns the new input stream or NULL
2538: */
2539: sgmlParserInputPtr
2540: sgmlNewInputStream(sgmlParserCtxtPtr ctxt) {
2541: sgmlParserInputPtr input;
2542:
2543: input = (xmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
2544: if (input == NULL) {
2545: ctxt->errNo = XML_ERR_NO_MEMORY;
2546: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2547: ctxt->sax->error(ctxt->userData,
2548: "malloc: couldn't allocate a new input stream\n");
2549: ctxt->errNo = XML_ERR_NO_MEMORY;
2550: return(NULL);
2551: }
2552: memset(input, 0, sizeof(sgmlParserInput));
2553: input->filename = NULL;
2554: input->directory = NULL;
2555: input->base = NULL;
2556: input->cur = NULL;
2557: input->buf = NULL;
2558: input->line = 1;
2559: input->col = 1;
2560: input->buf = NULL;
2561: input->free = NULL;
2562: input->version = NULL;
2563: input->consumed = 0;
2564: input->length = 0;
2565: return(input);
2566: }
2567:
2568:
2569: /************************************************************************
2570: * *
2571: * Commodity functions, cleanup needed ? *
2572: * *
2573: ************************************************************************/
2574:
2575: /**
2576: * areBlanks:
2577: * @ctxt: an SGML parser context
2578: * @str: a xmlChar *
2579: * @len: the size of @str
2580: *
2581: * Is this a sequence of blank chars that one can ignore ?
2582: *
2583: * Returns 1 if ignorable 0 otherwise.
2584: */
2585:
2586: static int areBlanks(sgmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2587: int i;
2588: xmlNodePtr lastChild;
2589:
2590: for (i = 0;i < len;i++)
2591: if (!(IS_BLANK(str[i]))) return(0);
2592:
2593: if (CUR == 0) return(1);
2594: if (CUR != '<') return(0);
2595: if (ctxt->name == NULL)
2596: return(1);
2597: #if 0
2598: if (!xmlStrcmp(ctxt->name, BAD_CAST"sgml"))
2599: return(1);
2600: if (!xmlStrcmp(ctxt->name, BAD_CAST"head"))
2601: return(1);
2602: if (!xmlStrcmp(ctxt->name, BAD_CAST"body"))
2603: return(1);
2604: #endif
2605: if (ctxt->node == NULL) return(0);
2606: lastChild = xmlGetLastChild(ctxt->node);
2607: if (lastChild == NULL) {
2608: if (ctxt->node->content != NULL) return(0);
2609: } else if (xmlNodeIsText(lastChild))
2610: return(0);
2611: return(1);
2612: }
2613:
2614: /**
2615: * sgmlHandleEntity:
2616: * @ctxt: an SGML parser context
2617: * @entity: an XML entity pointer.
2618: *
2619: * Default handling of an SGML entity, call the parser with the
2620: * substitution string
2621: */
2622:
2623: void
2624: sgmlHandleEntity(sgmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
2625: int len;
2626:
2627: if (entity->content == NULL) {
2628: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
2629: ctxt->sax->error(ctxt->userData, "sgmlHandleEntity %s: content == NULL\n",
2630: entity->name);
2631: ctxt->wellFormed = 0;
2632: return;
2633: }
2634: len = xmlStrlen(entity->content);
2635:
2636: /*
2637: * Just handle the content as a set of chars.
2638: */
2639: sgmlCheckParagraph(ctxt);
2640: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
2641: ctxt->sax->characters(ctxt->userData, entity->content, len);
2642:
2643: }
2644:
2645: /**
2646: * sgmlNewDocNoDtD:
2647: * @URI: URI for the dtd, or NULL
2648: * @ExternalID: the external ID of the DTD, or NULL
2649: *
2650: * Returns a new document, do not intialize the DTD if not provided
2651: */
2652: sgmlDocPtr
2653: sgmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2654: xmlDocPtr cur;
2655:
2656: /*
2657: * Allocate a new document and fill the fields.
2658: */
2659: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2660: if (cur == NULL) {
2661: fprintf(stderr, "xmlNewDoc : malloc failed\n");
2662: return(NULL);
2663: }
2664: memset(cur, 0, sizeof(xmlDoc));
2665:
2666: cur->type = XML_SGML_DOCUMENT_NODE;
2667: cur->version = NULL;
2668: cur->intSubset = NULL;
2669: if ((ExternalID != NULL) ||
2670: (URI != NULL))
2671: xmlCreateIntSubset(cur, BAD_CAST "SGML", ExternalID, URI);
2672: cur->doc = cur;
2673: cur->name = NULL;
2674: cur->children = NULL;
2675: cur->extSubset = NULL;
2676: cur->oldNs = NULL;
2677: cur->encoding = NULL;
2678: cur->standalone = 1;
2679: cur->compression = 0;
2680: cur->ids = NULL;
2681: cur->refs = NULL;
2682: #ifndef XML_WITHOUT_CORBA
2683: cur->_private = NULL;
2684: #endif
2685: return(cur);
2686: }
2687:
2688: /**
2689: * sgmlNewDoc:
2690: * @URI: URI for the dtd, or NULL
2691: * @ExternalID: the external ID of the DTD, or NULL
2692: *
2693: * Returns a new document
2694: */
2695: sgmlDocPtr
2696: sgmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2697: if ((URI == NULL) && (ExternalID == NULL))
2698: return(sgmlNewDocNoDtD(
2699: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
2700: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd"));
2701:
2702: return(sgmlNewDocNoDtD(URI, ExternalID));
2703: }
2704:
2705:
2706: /************************************************************************
2707: * *
2708: * The parser itself *
2709: * Relates to http://www.w3.org/TR/docbook *
2710: * *
2711: ************************************************************************/
2712:
2713: /************************************************************************
2714: * *
2715: * The parser itself *
2716: * *
2717: ************************************************************************/
2718:
2719: /**
2720: * sgmlParseSGMLName:
2721: * @ctxt: an SGML parser context
2722: *
2723: * parse an SGML tag or attribute name, note that we convert it to lowercase
2724: * since SGML names are not case-sensitive.
2725: *
2726: * Returns the Tag Name parsed or NULL
2727: */
2728:
2729: xmlChar *
2730: sgmlParseSGMLName(sgmlParserCtxtPtr ctxt) {
2731: xmlChar *ret = NULL;
2732: int i = 0;
2733: xmlChar loc[SGML_PARSER_BUFFER_SIZE];
2734:
2735: if (!IS_LETTER(CUR) && (CUR != '_') &&
2736: (CUR != ':')) return(NULL);
2737:
2738: while ((i < SGML_PARSER_BUFFER_SIZE) &&
2739: ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2740: (CUR == ':') || (CUR == '_'))) {
2741: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2742: else loc[i] = CUR;
2743: i++;
2744:
2745: NEXT;
2746: }
2747:
2748: ret = xmlStrndup(loc, i);
2749:
2750: return(ret);
2751: }
2752:
2753: /**
2754: * sgmlParseName:
2755: * @ctxt: an SGML parser context
2756: *
2757: * parse an SGML name, this routine is case sensistive.
2758: *
2759: * Returns the Name parsed or NULL
2760: */
2761:
2762: xmlChar *
2763: sgmlParseName(sgmlParserCtxtPtr ctxt) {
2764: xmlChar buf[SGML_MAX_NAMELEN];
2765: int len = 0;
2766:
2767: GROW;
2768: if (!IS_LETTER(CUR) && (CUR != '_')) {
2769: return(NULL);
2770: }
2771:
2772: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2773: (CUR == '.') || (CUR == '-') ||
2774: (CUR == '_') || (CUR == ':') ||
2775: (IS_COMBINING(CUR)) ||
2776: (IS_EXTENDER(CUR))) {
2777: buf[len++] = CUR;
2778: NEXT;
2779: if (len >= SGML_MAX_NAMELEN) {
2780: fprintf(stderr,
2781: "sgmlParseName: reached SGML_MAX_NAMELEN limit\n");
2782: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2783: (CUR == '.') || (CUR == '-') ||
2784: (CUR == '_') || (CUR == ':') ||
2785: (IS_COMBINING(CUR)) ||
2786: (IS_EXTENDER(CUR)))
2787: NEXT;
2788: break;
2789: }
2790: }
2791: return(xmlStrndup(buf, len));
2792: }
2793:
2794: /**
2795: * sgmlParseSGMLAttribute:
2796: * @ctxt: an SGML parser context
2797: * @stop: a char stop value
2798: *
2799: * parse an SGML attribute value till the stop (quote), if
2800: * stop is 0 then it stops at the first space
2801: *
2802: * Returns the attribute parsed or NULL
2803: */
2804:
2805: xmlChar *
2806: sgmlParseSGMLAttribute(sgmlParserCtxtPtr ctxt, const xmlChar stop) {
2807: #if 0
2808: xmlChar buf[SGML_MAX_NAMELEN];
2809: int len = 0;
2810:
2811: GROW;
2812: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
2813: if ((stop == 0) && (IS_BLANK(CUR))) break;
2814: buf[len++] = CUR;
2815: NEXT;
2816: if (len >= SGML_MAX_NAMELEN) {
2817: fprintf(stderr,
2818: "sgmlParseSGMLAttribute: reached SGML_MAX_NAMELEN limit\n");
2819: while ((!IS_BLANK(CUR)) && (CUR != '<') &&
2820: (CUR != '>') &&
2821: (CUR != '\'') && (CUR != '"'))
2822: NEXT;
2823: break;
2824: }
2825: }
2826: return(xmlStrndup(buf, len));
2827: #else
2828: xmlChar *buffer = NULL;
2829: int buffer_size = 0;
2830: xmlChar *out = NULL;
2831: xmlChar *name = NULL;
2832:
2833: xmlChar *cur = NULL;
2834: sgmlEntityDescPtr ent;
2835:
2836: /*
2837: * allocate a translation buffer.
2838: */
2839: buffer_size = SGML_PARSER_BIG_BUFFER_SIZE;
2840: buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
2841: if (buffer == NULL) {
2842: perror("sgmlParseSGMLAttribute: malloc failed");
2843: return(NULL);
2844: }
2845: out = buffer;
2846:
2847: /*
2848: * Ok loop until we reach one of the ending chars
2849: */
2850: while ((CUR != 0) && (CUR != stop) && (CUR != '>')) {
2851: if ((stop == 0) && (IS_BLANK(CUR))) break;
2852: if (CUR == '&') {
2853: if (NXT(1) == '#') {
2854: unsigned int c;
2855: int bits;
2856:
2857: c = sgmlParseCharRef(ctxt);
2858: if (c < 0x80)
2859: { *out++ = c; bits= -6; }
2860: else if (c < 0x800)
2861: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2862: else if (c < 0x10000)
2863: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2864: else
2865: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2866:
2867: for ( ; bits >= 0; bits-= 6) {
2868: *out++ = ((c >> bits) & 0x3F) | 0x80;
2869: }
2870: } else {
2871: ent = sgmlParseEntityRef(ctxt, &name);
2872: if (name == NULL) {
2873: *out++ = '&';
2874: if (out - buffer > buffer_size - 100) {
2875: int index = out - buffer;
2876:
2877: growBuffer(buffer);
2878: out = &buffer[index];
2879: }
2880: } else if (ent == NULL) {
2881: *out++ = '&';
2882: cur = name;
2883: while (*cur != 0) {
2884: if (out - buffer > buffer_size - 100) {
2885: int index = out - buffer;
2886:
2887: growBuffer(buffer);
2888: out = &buffer[index];
2889: }
2890: *out++ = *cur++;
2891: }
2892: xmlFree(name);
2893: } else {
2894: unsigned int c;
2895: int bits;
2896:
2897: if (out - buffer > buffer_size - 100) {
2898: int index = out - buffer;
2899:
2900: growBuffer(buffer);
2901: out = &buffer[index];
2902: }
2903: c = (xmlChar)ent->value;
2904: if (c < 0x80)
2905: { *out++ = c; bits= -6; }
2906: else if (c < 0x800)
2907: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2908: else if (c < 0x10000)
2909: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2910: else
2911: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2912:
2913: for ( ; bits >= 0; bits-= 6) {
2914: *out++ = ((c >> bits) & 0x3F) | 0x80;
2915: }
2916: xmlFree(name);
2917: }
2918: }
2919: } else {
2920: unsigned int c;
2921: int bits;
2922:
2923: if (out - buffer > buffer_size - 100) {
2924: int index = out - buffer;
2925:
2926: growBuffer(buffer);
2927: out = &buffer[index];
2928: }
2929: c = CUR;
2930: if (c < 0x80)
2931: { *out++ = c; bits= -6; }
2932: else if (c < 0x800)
2933: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2934: else if (c < 0x10000)
2935: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2936: else
2937: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2938:
2939: for ( ; bits >= 0; bits-= 6) {
2940: *out++ = ((c >> bits) & 0x3F) | 0x80;
2941: }
2942: NEXT;
2943: }
2944: }
2945: *out++ = 0;
2946: return(buffer);
2947: #endif
2948: }
2949:
2950: /**
2951: * sgmlParseNmtoken:
2952: * @ctxt: an SGML parser context
2953: *
2954: * parse an SGML Nmtoken.
2955: *
2956: * Returns the Nmtoken parsed or NULL
2957: */
2958:
2959: xmlChar *
2960: sgmlParseNmtoken(sgmlParserCtxtPtr ctxt) {
2961: xmlChar buf[SGML_MAX_NAMELEN];
2962: int len = 0;
2963:
2964: GROW;
2965: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2966: (CUR == '.') || (CUR == '-') ||
2967: (CUR == '_') || (CUR == ':') ||
2968: (IS_COMBINING(CUR)) ||
2969: (IS_EXTENDER(CUR))) {
2970: buf[len++] = CUR;
2971: NEXT;
2972: if (len >= SGML_MAX_NAMELEN) {
2973: fprintf(stderr,
2974: "sgmlParseNmtoken: reached SGML_MAX_NAMELEN limit\n");
2975: while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) ||
2976: (CUR == '.') || (CUR == '-') ||
2977: (CUR == '_') || (CUR == ':') ||
2978: (IS_COMBINING(CUR)) ||
2979: (IS_EXTENDER(CUR)))
2980: NEXT;
2981: break;
2982: }
2983: }
2984: return(xmlStrndup(buf, len));
2985: }
2986:
2987: /**
2988: * sgmlParseEntityRef:
2989: * @ctxt: an SGML parser context
2990: * @str: location to store the entity name
2991: *
2992: * parse an SGML ENTITY references
2993: *
2994: * [68] EntityRef ::= '&' Name ';'
2995: *
2996: * Returns the associated sgmlEntityDescPtr if found, or NULL otherwise,
2997: * if non-NULL *str will have to be freed by the caller.
2998: */
2999: sgmlEntityDescPtr
3000: sgmlParseEntityRef(sgmlParserCtxtPtr ctxt, xmlChar **str) {
3001: xmlChar *name;
3002: sgmlEntityDescPtr ent = NULL;
3003: *str = NULL;
3004:
3005: if (CUR == '&') {
3006: NEXT;
3007: name = sgmlParseName(ctxt);
3008: if (name == NULL) {
3009: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3010: ctxt->sax->error(ctxt->userData, "sgmlParseEntityRef: no name\n");
3011: ctxt->wellFormed = 0;
3012: } else {
3013: GROW;
3014: if (CUR == ';') {
3015: *str = name;
3016:
3017: /*
3018: * Lookup the entity in the table.
3019: */
3020: ent = sgmlEntityLookup(name);
3021: if (ent != NULL) /* OK that's ugly !!! */
3022: NEXT;
3023: } else {
3024: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3025: ctxt->sax->error(ctxt->userData,
3026: "sgmlParseEntityRef: expecting ';'\n");
3027: *str = name;
3028: }
3029: }
3030: }
3031: return(ent);
3032: }
3033:
3034: /**
3035: * sgmlParseAttValue:
3036: * @ctxt: an SGML parser context
3037: *
3038: * parse a value for an attribute
3039: * Note: the parser won't do substitution of entities here, this
3040: * will be handled later in xmlStringGetNodeList, unless it was
3041: * asked for ctxt->replaceEntities != 0
3042: *
3043: * Returns the AttValue parsed or NULL.
3044: */
3045:
3046: xmlChar *
3047: sgmlParseAttValue(sgmlParserCtxtPtr ctxt) {
3048: xmlChar *ret = NULL;
3049:
3050: if (CUR == '"') {
3051: NEXT;
3052: ret = sgmlParseSGMLAttribute(ctxt, '"');
3053: if (CUR != '"') {
3054: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3055: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
3056: ctxt->wellFormed = 0;
3057: } else
3058: NEXT;
3059: } else if (CUR == '\'') {
3060: NEXT;
3061: ret = sgmlParseSGMLAttribute(ctxt, '\'');
3062: if (CUR != '\'') {
3063: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3064: ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
3065: ctxt->wellFormed = 0;
3066: } else
3067: NEXT;
3068: } else {
3069: /*
3070: * That's an SGMLism, the attribute value may not be quoted
3071: */
3072: ret = sgmlParseSGMLAttribute(ctxt, 0);
3073: if (ret == NULL) {
3074: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3075: ctxt->sax->error(ctxt->userData, "AttValue: no value found\n");
3076: ctxt->wellFormed = 0;
3077: }
3078: }
3079: return(ret);
3080: }
3081:
3082: /**
3083: * sgmlParseSystemLiteral:
3084: * @ctxt: an SGML parser context
3085: *
3086: * parse an SGML Literal
3087: *
3088: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
3089: *
3090: * Returns the SystemLiteral parsed or NULL
3091: */
3092:
3093: xmlChar *
3094: sgmlParseSystemLiteral(sgmlParserCtxtPtr ctxt) {
3095: const xmlChar *q;
3096: xmlChar *ret = NULL;
3097:
3098: if (CUR == '"') {
3099: NEXT;
3100: q = CUR_PTR;
3101: while ((IS_CHAR(CUR)) && (CUR != '"'))
3102: NEXT;
3103: if (!IS_CHAR(CUR)) {
3104: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3105: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
3106: ctxt->wellFormed = 0;
3107: } else {
3108: ret = xmlStrndup(q, CUR_PTR - q);
3109: NEXT;
3110: }
3111: } else if (CUR == '\'') {
3112: NEXT;
3113: q = CUR_PTR;
3114: while ((IS_CHAR(CUR)) && (CUR != '\''))
3115: NEXT;
3116: if (!IS_CHAR(CUR)) {
3117: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3118: ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
3119: ctxt->wellFormed = 0;
3120: } else {
3121: ret = xmlStrndup(q, CUR_PTR - q);
3122: NEXT;
3123: }
3124: } else {
3125: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3126: ctxt->sax->error(ctxt->userData,
3127: "SystemLiteral \" or ' expected\n");
3128: ctxt->wellFormed = 0;
3129: }
3130:
3131: return(ret);
3132: }
3133:
3134: /**
3135: * sgmlParsePubidLiteral:
3136: * @ctxt: an SGML parser context
3137: *
3138: * parse an SGML public literal
3139: *
3140: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3141: *
3142: * Returns the PubidLiteral parsed or NULL.
3143: */
3144:
3145: xmlChar *
3146: sgmlParsePubidLiteral(sgmlParserCtxtPtr ctxt) {
3147: const xmlChar *q;
3148: xmlChar *ret = NULL;
3149: /*
3150: * Name ::= (Letter | '_') (NameChar)*
3151: */
3152: if (CUR == '"') {
3153: NEXT;
3154: q = CUR_PTR;
3155: while (IS_PUBIDCHAR(CUR)) NEXT;
3156: if (CUR != '"') {
3157: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3158: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
3159: ctxt->wellFormed = 0;
3160: } else {
3161: ret = xmlStrndup(q, CUR_PTR - q);
3162: NEXT;
3163: }
3164: } else if (CUR == '\'') {
3165: NEXT;
3166: q = CUR_PTR;
3167: while ((IS_LETTER(CUR)) && (CUR != '\''))
3168: NEXT;
3169: if (!IS_LETTER(CUR)) {
3170: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3171: ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n");
3172: ctxt->wellFormed = 0;
3173: } else {
3174: ret = xmlStrndup(q, CUR_PTR - q);
3175: NEXT;
3176: }
3177: } else {
3178: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3179: ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n");
3180: ctxt->wellFormed = 0;
3181: }
3182:
3183: return(ret);
3184: }
3185:
3186: /**
3187: * sgmlParseCharData:
3188: * @ctxt: an SGML parser context
3189: * @cdata: int indicating whether we are within a CDATA section
3190: *
3191: * parse a CharData section.
3192: * if we are within a CDATA section ']]>' marks an end of section.
3193: *
3194: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3195: */
3196:
3197: void
3198: sgmlParseCharData(sgmlParserCtxtPtr ctxt, int cdata) {
3199: xmlChar buf[SGML_PARSER_BIG_BUFFER_SIZE + 5];
3200: int nbchar = 0;
3201: int cur, l;
3202:
3203: SHRINK;
3204: cur = CUR_CHAR(l);
3205: while (((cur != '<') || (ctxt->token == '<')) &&
3206: ((cur != '&') || (ctxt->token == '&')) &&
3207: (IS_CHAR(cur))) {
3208: COPY_BUF(l,buf,nbchar,cur);
3209: if (nbchar >= SGML_PARSER_BIG_BUFFER_SIZE) {
3210: /*
3211: * Ok the segment is to be consumed as chars.
3212: */
3213: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3214: if (areBlanks(ctxt, buf, nbchar)) {
3215: if (ctxt->sax->ignorableWhitespace != NULL)
3216: ctxt->sax->ignorableWhitespace(ctxt->userData,
3217: buf, nbchar);
3218: } else {
3219: sgmlCheckParagraph(ctxt);
3220: if (ctxt->sax->characters != NULL)
3221: ctxt->sax->characters(ctxt->userData, buf, nbchar);
3222: }
3223: }
3224: nbchar = 0;
3225: }
3226: NEXTL(l);
3227: cur = CUR_CHAR(l);
3228: }
3229: if (nbchar != 0) {
3230: /*
3231: * Ok the segment is to be consumed as chars.
3232: */
3233: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3234: if (areBlanks(ctxt, buf, nbchar)) {
3235: if (ctxt->sax->ignorableWhitespace != NULL)
3236: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
3237: } else {
3238: sgmlCheckParagraph(ctxt);
3239: if (ctxt->sax->characters != NULL)
3240: ctxt->sax->characters(ctxt->userData, buf, nbchar);
3241: }
3242: }
3243: }
3244: }
3245:
3246: /**
3247: * sgmlParseExternalID:
3248: * @ctxt: an SGML parser context
3249: * @publicID: a xmlChar** receiving PubidLiteral
3250: * @strict: indicate whether we should restrict parsing to only
3251: * production [75], see NOTE below
3252: *
3253: * Parse an External ID or a Public ID
3254: *
3255: * NOTE: Productions [75] and [83] interract badly since [75] can generate
3256: * 'PUBLIC' S PubidLiteral S SystemLiteral
3257: *
3258: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3259: * | 'PUBLIC' S PubidLiteral S SystemLiteral
3260: *
3261: * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3262: *
3263: * Returns the function returns SystemLiteral and in the second
3264: * case publicID receives PubidLiteral, is strict is off
3265: * it is possible to return NULL and have publicID set.
3266: */
3267:
3268: xmlChar *
3269: sgmlParseExternalID(sgmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
3270: xmlChar *URI = NULL;
3271:
3272: if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3273: (UPP(2) == 'S') && (UPP(3) == 'T') &&
3274: (UPP(4) == 'E') && (UPP(5) == 'M')) {
3275: SKIP(6);
3276: if (!IS_BLANK(CUR)) {
3277: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3278: ctxt->sax->error(ctxt->userData,
3279: "Space required after 'SYSTEM'\n");
3280: ctxt->wellFormed = 0;
3281: }
3282: SKIP_BLANKS;
3283: URI = sgmlParseSystemLiteral(ctxt);
3284: if (URI == NULL) {
3285: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3286: ctxt->sax->error(ctxt->userData,
3287: "sgmlParseExternalID: SYSTEM, no URI\n");
3288: ctxt->wellFormed = 0;
3289: }
3290: } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3291: (UPP(2) == 'B') && (UPP(3) == 'L') &&
3292: (UPP(4) == 'I') && (UPP(5) == 'C')) {
3293: SKIP(6);
3294: if (!IS_BLANK(CUR)) {
3295: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3296: ctxt->sax->error(ctxt->userData,
3297: "Space required after 'PUBLIC'\n");
3298: ctxt->wellFormed = 0;
3299: }
3300: SKIP_BLANKS;
3301: *publicID = sgmlParsePubidLiteral(ctxt);
3302: if (*publicID == NULL) {
3303: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3304: ctxt->sax->error(ctxt->userData,
3305: "sgmlParseExternalID: PUBLIC, no Public Identifier\n");
3306: ctxt->wellFormed = 0;
3307: }
3308: SKIP_BLANKS;
3309: if ((CUR == '"') || (CUR == '\'')) {
3310: URI = sgmlParseSystemLiteral(ctxt);
3311: }
3312: }
3313: return(URI);
3314: }
3315:
3316: /**
3317: * sgmlParseComment:
3318: * @ctxt: an SGML parser context
3319: *
3320: * Parse an XML (SGML) comment <!-- .... -->
3321: *
3322: * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3323: */
3324: void
3325: sgmlParseComment(sgmlParserCtxtPtr ctxt) {
3326: xmlChar *buf = NULL;
3327: int len;
3328: int size = SGML_PARSER_BUFFER_SIZE;
3329: int q, ql;
3330: int r, rl;
3331: int cur, l;
3332: xmlParserInputState state;
3333:
3334: /*
3335: * Check that there is a comment right here.
3336: */
3337: if ((RAW != '<') || (NXT(1) != '!') ||
3338: (NXT(2) != '-') || (NXT(3) != '-')) return;
3339:
3340: state = ctxt->instate;
3341: ctxt->instate = XML_PARSER_COMMENT;
3342: SHRINK;
3343: SKIP(4);
3344: buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
3345: if (buf == NULL) {
3346: fprintf(stderr, "malloc of %d byte failed\n", size);
3347: ctxt->instate = state;
3348: return;
3349: }
3350: q = CUR_CHAR(ql);
3351: NEXTL(ql);
3352: r = CUR_CHAR(rl);
3353: NEXTL(rl);
3354: cur = CUR_CHAR(l);
3355: len = 0;
3356: while (IS_CHAR(cur) &&
3357: ((cur != '>') ||
3358: (r != '-') || (q != '-'))) {
3359: if (len + 5 >= size) {
3360: size *= 2;
3361: buf = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3362: if (buf == NULL) {
3363: fprintf(stderr, "realloc of %d byte failed\n", size);
3364: ctxt->instate = state;
3365: return;
3366: }
3367: }
3368: COPY_BUF(ql,buf,len,q);
3369: q = r;
3370: ql = rl;
3371: r = cur;
3372: rl = l;
3373: NEXTL(l);
3374: cur = CUR_CHAR(l);
3375: if (cur == 0) {
3376: SHRINK;
3377: GROW;
3378: cur = CUR_CHAR(l);
3379: }
3380: }
3381: buf[len] = 0;
3382: if (!IS_CHAR(cur)) {
3383: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3384: ctxt->sax->error(ctxt->userData,
3385: "Comment not terminated \n<!--%.50s\n", buf);
3386: ctxt->errNo = XML_ERR_COMMENT_NOT_FINISHED;
3387: ctxt->wellFormed = 0;
3388: xmlFree(buf);
3389: } else {
3390: NEXT;
3391: if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3392: (!ctxt->disableSAX))
3393: ctxt->sax->comment(ctxt->userData, buf);
3394: xmlFree(buf);
3395: }
3396: ctxt->instate = state;
3397: }
3398:
3399: /**
3400: * sgmlParseCharRef:
3401: * @ctxt: an SGML parser context
3402: *
3403: * parse Reference declarations
3404: *
3405: * [66] CharRef ::= '&#' [0-9]+ ';' |
3406: * '&#x' [0-9a-fA-F]+ ';'
3407: *
3408: * Returns the value parsed (as an int)
3409: */
3410: int
3411: sgmlParseCharRef(sgmlParserCtxtPtr ctxt) {
3412: int val = 0;
3413:
3414: if ((CUR == '&') && (NXT(1) == '#') &&
3415: (NXT(2) == 'x')) {
3416: SKIP(3);
3417: while (CUR != ';') {
3418: if ((CUR >= '0') && (CUR <= '9'))
3419: val = val * 16 + (CUR - '0');
3420: else if ((CUR >= 'a') && (CUR <= 'f'))
3421: val = val * 16 + (CUR - 'a') + 10;
3422: else if ((CUR >= 'A') && (CUR <= 'F'))
3423: val = val * 16 + (CUR - 'A') + 10;
3424: else {
3425: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3426: ctxt->sax->error(ctxt->userData,
3427: "sgmlParseCharRef: invalid hexadecimal value\n");
3428: ctxt->wellFormed = 0;
3429: val = 0;
3430: break;
3431: }
3432: NEXT;
3433: }
3434: if (CUR == ';')
3435: NEXT;
3436: } else if ((CUR == '&') && (NXT(1) == '#')) {
3437: SKIP(2);
3438: while (CUR != ';') {
3439: if ((CUR >= '0') && (CUR <= '9'))
3440: val = val * 10 + (CUR - '0');
3441: else {
3442: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3443: ctxt->sax->error(ctxt->userData,
3444: "sgmlParseCharRef: invalid decimal value\n");
3445: ctxt->wellFormed = 0;
3446: val = 0;
3447: break;
3448: }
3449: NEXT;
3450: }
3451: if (CUR == ';')
3452: NEXT;
3453: } else {
3454: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3455: ctxt->sax->error(ctxt->userData, "sgmlParseCharRef: invalid value\n");
3456: ctxt->wellFormed = 0;
3457: }
3458: /*
3459: * Check the value IS_CHAR ...
3460: */
3461: if (IS_CHAR(val)) {
3462: return(val);
3463: } else {
3464: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3465: ctxt->sax->error(ctxt->userData, "sgmlParseCharRef: invalid xmlChar value %d\n",
3466: val);
3467: ctxt->wellFormed = 0;
3468: }
3469: return(0);
3470: }
3471:
3472:
3473: /**
3474: * sgmlParseDocTypeDecl :
3475: * @ctxt: an SGML parser context
3476: *
3477: * parse a DOCTYPE declaration
3478: *
3479: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3480: * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3481: */
3482:
3483: void
3484: sgmlParseDocTypeDecl(sgmlParserCtxtPtr ctxt) {
3485: xmlChar *name;
3486: xmlChar *ExternalID = NULL;
3487: xmlChar *URI = NULL;
3488:
3489: /*
3490: * We know that '<!DOCTYPE' has been detected.
3491: */
3492: SKIP(9);
3493:
3494: SKIP_BLANKS;
3495:
3496: /*
3497: * Parse the DOCTYPE name.
3498: */
3499: name = sgmlParseName(ctxt);
3500: if (name == NULL) {
3501: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3502: ctxt->sax->error(ctxt->userData, "sgmlParseDocTypeDecl : no DOCTYPE name !\n");
3503: ctxt->wellFormed = 0;
3504: }
3505: /*
3506: * Check that upper(name) == "SGML" !!!!!!!!!!!!!
3507: */
3508:
3509: SKIP_BLANKS;
3510:
3511: /*
3512: * Check for SystemID and ExternalID
3513: */
3514: URI = sgmlParseExternalID(ctxt, &ExternalID, 0);
3515: SKIP_BLANKS;
3516:
3517: /*
1.2 veillard 3518: * Create or update the document accordingly to the DOCTYPE
3519: */
3520: if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3521: (!ctxt->disableSAX))
3522: ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3523:
3524: /*
3525: * Is there any internal subset declarations ?
3526: * they are handled separately in sgmlParseInternalSubset()
3527: */
3528: if (RAW == '[')
3529: return;
3530:
3531:
3532: /*
1.1 veillard 3533: * We should be at the end of the DOCTYPE declaration.
3534: */
3535: if (CUR != '>') {
3536: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3537: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
3538: ctxt->wellFormed = 0;
3539: /* We shouldn't try to resynchronize ... */
3540: }
3541: NEXT;
3542:
3543: /*
3544: * Cleanup, since we don't use all those identifiers
3545: */
3546: if (URI != NULL) xmlFree(URI);
3547: if (ExternalID != NULL) xmlFree(ExternalID);
3548: if (name != NULL) xmlFree(name);
3549: }
3550:
3551: /**
3552: * sgmlParseAttribute:
3553: * @ctxt: an SGML parser context
3554: * @value: a xmlChar ** used to store the value of the attribute
3555: *
3556: * parse an attribute
3557: *
3558: * [41] Attribute ::= Name Eq AttValue
3559: *
3560: * [25] Eq ::= S? '=' S?
3561: *
3562: * With namespace:
3563: *
3564: * [NS 11] Attribute ::= QName Eq AttValue
3565: *
3566: * Also the case QName == xmlns:??? is handled independently as a namespace
3567: * definition.
3568: *
3569: * Returns the attribute name, and the value in *value.
3570: */
3571:
3572: xmlChar *
3573: sgmlParseAttribute(sgmlParserCtxtPtr ctxt, xmlChar **value) {
3574: xmlChar *name, *val = NULL;
3575:
3576: *value = NULL;
3577: name = sgmlParseName(ctxt);
3578: if (name == NULL) {
3579: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3580: ctxt->sax->error(ctxt->userData, "error parsing attribute name\n");
3581: ctxt->wellFormed = 0;
3582: return(NULL);
3583: }
3584:
3585: /*
3586: * read the value
3587: */
3588: SKIP_BLANKS;
3589: if (CUR == '=') {
3590: NEXT;
3591: SKIP_BLANKS;
3592: val = sgmlParseAttValue(ctxt);
3593: /******
3594: } else {
3595: * TODO : some attribute must have values, some may not
3596: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3597: ctxt->sax->warning(ctxt->userData,
3598: "No value for attribute %s\n", name); */
3599: }
3600:
3601: *value = val;
3602: return(name);
3603: }
3604:
3605: /**
3606: * sgmlCheckEncoding:
3607: * @ctxt: an SGML parser context
3608: * @attvalue: the attribute value
3609: *
3610: * Checks an http-equiv attribute from a Meta tag to detect
3611: * the encoding
3612: * If a new encoding is detected the parser is switched to decode
3613: * it and pass UTF8
3614: */
3615: void
3616: sgmlCheckEncoding(sgmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3617: const xmlChar *encoding;
3618:
3619: if ((ctxt == NULL) || (attvalue == NULL))
3620: return;
3621:
3622: encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
3623: if (encoding == NULL)
3624: encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
3625: if (encoding == NULL)
3626: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
3627: if (encoding != NULL) {
3628: encoding += 8;
3629: } else {
3630: encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
3631: if (encoding == NULL)
3632: encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
3633: if (encoding == NULL)
3634: encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
3635: if (encoding != NULL)
3636: encoding += 9;
3637: }
3638: if (encoding != NULL) {
3639: xmlCharEncoding enc;
3640: xmlCharEncodingHandlerPtr handler;
3641:
3642: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3643:
3644: if (ctxt->input->encoding != NULL)
3645: xmlFree((xmlChar *) ctxt->input->encoding);
3646: ctxt->input->encoding = xmlStrdup(encoding);
3647:
3648: enc = xmlParseCharEncoding((const char *) encoding);
3649: /*
3650: * registered set of known encodings
3651: */
3652: if (enc != XML_CHAR_ENCODING_ERROR) {
3653: xmlSwitchEncoding(ctxt, enc);
3654: ctxt->charset = XML_CHAR_ENCODING_UTF8;
3655: } else {
3656: /*
3657: * fallback for unknown encodings
3658: */
3659: handler = xmlFindCharEncodingHandler((const char *) encoding);
3660: if (handler != NULL) {
3661: xmlSwitchToEncoding(ctxt, handler);
3662: ctxt->charset = XML_CHAR_ENCODING_UTF8;
3663: } else {
3664: ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3665: }
3666: }
3667:
3668: if ((ctxt->input->buf != NULL) &&
3669: (ctxt->input->buf->encoder != NULL) &&
3670: (ctxt->input->buf->raw != NULL) &&
3671: (ctxt->input->buf->buffer != NULL)) {
3672: int nbchars;
3673: int processed;
3674:
3675: /*
3676: * convert as much as possible to the parser reading buffer.
3677: */
3678: processed = ctxt->input->cur - ctxt->input->base;
3679: xmlBufferShrink(ctxt->input->buf->buffer, processed);
3680: nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3681: ctxt->input->buf->buffer,
3682: ctxt->input->buf->raw);
3683: if (nbchars < 0) {
3684: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3685: ctxt->sax->error(ctxt->userData,
3686: "sgmlCheckEncoding: encoder error\n");
3687: ctxt->errNo = XML_ERR_INVALID_ENCODING;
3688: }
3689: ctxt->input->base =
3690: ctxt->input->cur = ctxt->input->buf->buffer->content;
3691: }
3692: }
3693: }
3694:
3695: /**
3696: * sgmlCheckMeta:
3697: * @ctxt: an SGML parser context
3698: * @atts: the attributes values
3699: *
3700: * Checks an attributes from a Meta tag
3701: */
3702: void
3703: sgmlCheckMeta(sgmlParserCtxtPtr ctxt, const xmlChar **atts) {
3704: int i;
3705: const xmlChar *att, *value;
3706: int http = 0;
3707: const xmlChar *content = NULL;
3708:
3709: if ((ctxt == NULL) || (atts == NULL))
3710: return;
3711:
3712: i = 0;
3713: att = atts[i++];
3714: while (att != NULL) {
3715: value = atts[i++];
3716: if ((value != NULL) &&
3717: ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
3718: (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
3719: (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
3720: ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
3721: (!xmlStrcmp(value, BAD_CAST"content-type")) ||
3722: (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
3723: http = 1;
3724: else if ((value != NULL) &&
3725: ((!xmlStrcmp(att, BAD_CAST"content")) ||
3726: (!xmlStrcmp(att, BAD_CAST"Content")) ||
3727: (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
3728: content = value;
3729: att = atts[i++];
3730: }
3731: if ((http) && (content != NULL))
3732: sgmlCheckEncoding(ctxt, content);
3733:
3734: }
3735:
3736: /**
3737: * sgmlParseStartTag:
3738: * @ctxt: an SGML parser context
3739: *
3740: * parse a start of tag either for rule element or
3741: * EmptyElement. In both case we don't parse the tag closing chars.
3742: *
3743: * [40] STag ::= '<' Name (S Attribute)* S? '>'
3744: *
3745: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3746: *
3747: * With namespace:
3748: *
3749: * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3750: *
3751: * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3752: *
3753: */
3754:
3755: void
3756: sgmlParseStartTag(sgmlParserCtxtPtr ctxt) {
3757: xmlChar *name;
3758: xmlChar *attname;
3759: xmlChar *attvalue;
3760: const xmlChar **atts = NULL;
3761: int nbatts = 0;
3762: int maxatts = 0;
3763: int meta = 0;
3764: int i;
3765:
3766: if (CUR != '<') return;
3767: NEXT;
3768:
3769: GROW;
3770: name = sgmlParseSGMLName(ctxt);
3771: if (name == NULL) {
3772: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3773: ctxt->sax->error(ctxt->userData,
3774: "sgmlParseStartTag: invalid element name\n");
3775: ctxt->wellFormed = 0;
3776: return;
3777: }
3778: if (!xmlStrcmp(name, BAD_CAST"meta"))
3779: meta = 1;
3780:
3781: /*
3782: * Check for auto-closure of SGML elements.
3783: */
3784: sgmlAutoClose(ctxt, name);
3785:
3786: /*
3787: * Check for implied SGML elements.
3788: */
3789: sgmlCheckImplied(ctxt, name);
3790:
3791: /*
3792: * Now parse the attributes, it ends up with the ending
3793: *
3794: * (S Attribute)* S?
3795: */
3796: SKIP_BLANKS;
3797: while ((IS_CHAR(CUR)) &&
3798: (CUR != '>') &&
3799: ((CUR != '/') || (NXT(1) != '>'))) {
3800: long cons = ctxt->nbChars;
3801:
3802: GROW;
3803: attname = sgmlParseAttribute(ctxt, &attvalue);
3804: if (attname != NULL) {
3805:
3806: /*
3807: * Well formedness requires at most one declaration of an attribute
3808: */
3809: for (i = 0; i < nbatts;i += 2) {
3810: if (!xmlStrcmp(atts[i], attname)) {
3811: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3812: ctxt->sax->error(ctxt->userData,
3813: "Attribute %s redefined\n",
3814: attname);
3815: ctxt->wellFormed = 0;
3816: xmlFree(attname);
3817: if (attvalue != NULL)
3818: xmlFree(attvalue);
3819: goto failed;
3820: }
3821: }
3822:
3823: /*
3824: * Add the pair to atts
3825: */
3826: if (atts == NULL) {
3827: maxatts = 10;
3828: atts = (const xmlChar **) xmlMalloc(maxatts * sizeof(xmlChar *));
3829: if (atts == NULL) {
3830: fprintf(stderr, "malloc of %ld byte failed\n",
3831: maxatts * (long)sizeof(xmlChar *));
3832: if (name != NULL) xmlFree(name);
3833: return;
3834: }
3835: } else if (nbatts + 4 > maxatts) {
3836: maxatts *= 2;
3837: atts = (const xmlChar **) xmlRealloc(atts, maxatts * sizeof(xmlChar *));
3838: if (atts == NULL) {
3839: fprintf(stderr, "realloc of %ld byte failed\n",
3840: maxatts * (long)sizeof(xmlChar *));
3841: if (name != NULL) xmlFree(name);
3842: return;
3843: }
3844: }
3845: atts[nbatts++] = attname;
3846: atts[nbatts++] = attvalue;
3847: atts[nbatts] = NULL;
3848: atts[nbatts + 1] = NULL;
3849: }
3850:
3851: failed:
3852: SKIP_BLANKS;
3853: if (cons == ctxt->nbChars) {
3854: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3855: ctxt->sax->error(ctxt->userData,
3856: "sgmlParseStartTag: problem parsing attributes\n");
3857: ctxt->wellFormed = 0;
3858: break;
3859: }
3860: }
3861:
3862: /*
3863: * Handle specific association to the META tag
3864: */
3865: if (meta)
3866: sgmlCheckMeta(ctxt, atts);
3867:
3868: /*
3869: * SAX: Start of Element !
3870: */
3871: sgmlnamePush(ctxt, xmlStrdup(name));
3872: #ifdef DEBUG
3873: fprintf(stderr,"Start of element %s: pushed %s\n", name, ctxt->name);
3874: #endif
3875: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
3876: ctxt->sax->startElement(ctxt->userData, name, atts);
3877:
3878: if (atts != NULL) {
3879: for (i = 0;i < nbatts;i++) {
3880: if (atts[i] != NULL)
3881: xmlFree((xmlChar *) atts[i]);
3882: }
3883: xmlFree((void *) atts);
3884: }
3885: if (name != NULL) xmlFree(name);
3886: }
3887:
3888: /**
3889: * sgmlParseEndTag:
3890: * @ctxt: an SGML parser context
3891: *
3892: * parse an end of tag
3893: *
3894: * [42] ETag ::= '</' Name S? '>'
3895: *
3896: * With namespace
3897: *
3898: * [NS 9] ETag ::= '</' QName S? '>'
3899: */
3900:
3901: void
3902: sgmlParseEndTag(sgmlParserCtxtPtr ctxt) {
3903: xmlChar *name;
3904: xmlChar *oldname;
3905: int i;
3906:
3907: if ((CUR != '<') || (NXT(1) != '/')) {
3908: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3909: ctxt->sax->error(ctxt->userData, "sgmlParseEndTag: '</' not found\n");
3910: ctxt->wellFormed = 0;
3911: return;
3912: }
3913: SKIP(2);
3914:
3915: name = sgmlParseSGMLName(ctxt);
3916: if (name == NULL) {
3917: if (CUR == '>') {
3918: NEXT;
3919: oldname = sgmlnamePop(ctxt);
3920: if (oldname != NULL) {
3921: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3922: ctxt->sax->endElement(ctxt->userData, name);
3923: #ifdef DEBUG
3924: fprintf(stderr,"End of tag </>: popping out %s\n", oldname);
3925: #endif
3926: xmlFree(oldname);
3927: #ifdef DEBUG
3928: } else {
3929: fprintf(stderr,"End of tag </>: stack empty !!!\n");
3930: #endif
3931: }
3932: return;
3933: } else
3934: return;
3935: }
3936:
3937: /*
3938: * We should definitely be at the ending "S? '>'" part
3939: */
3940: SKIP_BLANKS;
3941: if ((!IS_CHAR(CUR)) || (CUR != '>')) {
3942: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3943: ctxt->sax->error(ctxt->userData, "End tag : expected '>'\n");
3944: ctxt->wellFormed = 0;
3945: } else
3946: NEXT;
3947:
3948: /*
3949: * If the name read is not one of the element in the parsing stack
3950: * then return, it's just an error.
3951: */
3952: for (i = (ctxt->nameNr - 1);i >= 0;i--) {
3953: if (!xmlStrcmp(name, ctxt->nameTab[i])) break;
3954: }
3955: if (i < 0) {
3956: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3957: ctxt->sax->error(ctxt->userData,
3958: "Unexpected end tag : %s\n", name);
3959: xmlFree(name);
3960: ctxt->wellFormed = 0;
3961: return;
3962: }
3963:
3964:
3965: /*
3966: * Check for auto-closure of SGML elements.
3967: */
3968:
3969: sgmlAutoCloseOnClose(ctxt, name);
3970:
3971: /*
3972: * Well formedness constraints, opening and closing must match.
3973: * With the exception that the autoclose may have popped stuff out
3974: * of the stack.
3975: */
3976: if (((name[0] != '/') || (name[1] != 0)) &&
3977: (xmlStrcmp(name, ctxt->name))) {
3978: #ifdef DEBUG
3979: fprintf(stderr,"End of tag %s: expecting %s\n", name, ctxt->name);
3980: #endif
3981: if ((ctxt->name != NULL) &&
3982: (xmlStrcmp(ctxt->name, name))) {
3983: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3984: ctxt->sax->error(ctxt->userData,
3985: "Opening and ending tag mismatch: %s and %s\n",
3986: name, ctxt->name);
3987: ctxt->wellFormed = 0;
3988: }
3989: }
3990:
3991: /*
3992: * SAX: End of Tag
3993: */
3994: oldname = ctxt->name;
3995: if (((name[0] == '/') && (name[1] == 0)) ||
3996: ((oldname != NULL) && (!xmlStrcmp(oldname, name)))) {
3997: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3998: ctxt->sax->endElement(ctxt->userData, name);
3999: oldname = sgmlnamePop(ctxt);
4000: if (oldname != NULL) {
4001: #ifdef DEBUG
4002: fprintf(stderr,"End of tag %s: popping out %s\n", name, oldname);
4003: #endif
4004: xmlFree(oldname);
4005: #ifdef DEBUG
4006: } else {
4007: fprintf(stderr,"End of tag %s: stack empty !!!\n", name);
4008: #endif
4009: }
4010: }
4011:
4012: if (name != NULL)
4013: xmlFree(name);
4014:
4015: return;
4016: }
4017:
4018:
4019: /**
4020: * sgmlParseReference:
4021: * @ctxt: an SGML parser context
4022: *
4023: * parse and handle entity references in content,
4024: * this will end-up in a call to character() since this is either a
4025: * CharRef, or a predefined entity.
4026: */
4027: void
4028: sgmlParseReference(sgmlParserCtxtPtr ctxt) {
4029: sgmlEntityDescPtr ent;
4030: xmlChar out[6];
4031: xmlChar *name;
4032: if (CUR != '&') return;
4033:
4034: if (NXT(1) == '#') {
4035: unsigned int c;
4036: int bits, i = 0;
4037:
4038: c = sgmlParseCharRef(ctxt);
4039: if (c < 0x80) { out[i++]= c; bits= -6; }
4040: else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4041: else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4042: else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4043:
4044: for ( ; bits >= 0; bits-= 6) {
4045: out[i++]= ((c >> bits) & 0x3F) | 0x80;
4046: }
4047: out[i] = 0;
4048:
4049: sgmlCheckParagraph(ctxt);
4050: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4051: ctxt->sax->characters(ctxt->userData, out, i);
4052: } else {
4053: ent = sgmlParseEntityRef(ctxt, &name);
4054: if (name == NULL) {
4055: sgmlCheckParagraph(ctxt);
4056: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4057: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4058: return;
4059: }
4060: if ((ent == NULL) || (ent->value <= 0)) {
4061: sgmlCheckParagraph(ctxt);
4062: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4063: ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4064: ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4065: /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4066: }
4067: } else {
4068: unsigned int c;
4069: int bits, i = 0;
4070:
4071: c = ent->value;
4072: if (c < 0x80)
4073: { out[i++]= c; bits= -6; }
4074: else if (c < 0x800)
4075: { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4076: else if (c < 0x10000)
4077: { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4078: else
4079: { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4080:
4081: for ( ; bits >= 0; bits-= 6) {
4082: out[i++]= ((c >> bits) & 0x3F) | 0x80;
4083: }
4084: out[i] = 0;
4085:
4086: sgmlCheckParagraph(ctxt);
4087: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4088: ctxt->sax->characters(ctxt->userData, out, i);
4089: }
4090: xmlFree(name);
4091: }
4092: }
4093:
4094: /**
4095: * sgmlParseContent:
4096: * @ctxt: an SGML parser context
4097: * @name: the node name
4098: *
4099: * Parse a content: comment, sub-element, reference or text.
4100: *
4101: */
4102:
4103: void
4104: sgmlParseContent(sgmlParserCtxtPtr ctxt) {
4105: xmlChar *currentNode;
4106: int depth;
4107:
4108: currentNode = xmlStrdup(ctxt->name);
4109: depth = ctxt->nameNr;
4110: while (1) {
4111: long cons = ctxt->nbChars;
4112:
4113: GROW;
4114: /*
4115: * Our tag or one of it's parent or children is ending.
4116: */
4117: if ((CUR == '<') && (NXT(1) == '/')) {
4118: sgmlParseEndTag(ctxt);
4119: if (currentNode != NULL) xmlFree(currentNode);
4120: return;
4121: }
4122:
4123: /*
4124: * Has this node been popped out during parsing of
4125: * the next element
4126: */
4127: if ((xmlStrcmp(currentNode, ctxt->name)) &&
4128: (depth >= ctxt->nameNr)) {
4129: if (currentNode != NULL) xmlFree(currentNode);
4130: return;
4131: }
4132:
4133: /*
4134: * Sometimes DOCTYPE arrives in the middle of the document
4135: */
4136: if ((CUR == '<') && (NXT(1) == '!') &&
4137: (UPP(2) == 'D') && (UPP(3) == 'O') &&
4138: (UPP(4) == 'C') && (UPP(5) == 'T') &&
4139: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4140: (UPP(8) == 'E')) {
4141: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4142: ctxt->sax->error(ctxt->userData,
4143: "Misplaced DOCTYPE declaration\n");
4144: ctxt->wellFormed = 0;
4145: sgmlParseDocTypeDecl(ctxt);
4146: }
4147:
4148: /*
4149: * First case : a comment
4150: */
4151: if ((CUR == '<') && (NXT(1) == '!') &&
4152: (NXT(2) == '-') && (NXT(3) == '-')) {
4153: sgmlParseComment(ctxt);
4154: }
4155:
4156: /*
4157: * Second case : a sub-element.
4158: */
4159: else if (CUR == '<') {
4160: sgmlParseElement(ctxt);
4161: }
4162:
4163: /*
4164: * Third case : a reference. If if has not been resolved,
4165: * parsing returns it's Name, create the node
4166: */
4167: else if (CUR == '&') {
4168: sgmlParseReference(ctxt);
4169: }
4170:
4171: /*
4172: * Fourth : end of the resource
4173: */
4174: else if (CUR == 0) {
4175: sgmlAutoClose(ctxt, NULL);
4176: }
4177:
4178: /*
4179: * Last case, text. Note that References are handled directly.
4180: */
4181: else {
4182: sgmlParseCharData(ctxt, 0);
4183: }
4184:
4185: if (cons == ctxt->nbChars) {
4186: if (ctxt->node != NULL) {
4187: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4188: ctxt->sax->error(ctxt->userData,
4189: "detected an error in element content\n");
4190: ctxt->wellFormed = 0;
4191: }
4192: break;
4193: }
4194:
4195: GROW;
4196: }
4197: if (currentNode != NULL) xmlFree(currentNode);
4198: }
4199:
4200: /**
4201: * sgmlParseElement:
4202: * @ctxt: an SGML parser context
4203: *
4204: * parse an SGML element, this is highly recursive
4205: *
4206: * [39] element ::= EmptyElemTag | STag content ETag
4207: *
4208: * [41] Attribute ::= Name Eq AttValue
4209: */
4210:
4211: void
4212: sgmlParseElement(sgmlParserCtxtPtr ctxt) {
4213: xmlChar *name;
4214: xmlChar *currentNode = NULL;
4215: sgmlElemDescPtr info;
4216: sgmlParserNodeInfo node_info;
4217: xmlChar *oldname;
4218: int depth = ctxt->nameNr;
4219:
4220: /* Capture start position */
4221: if (ctxt->record_info) {
4222: node_info.begin_pos = ctxt->input->consumed +
4223: (CUR_PTR - ctxt->input->base);
4224: node_info.begin_line = ctxt->input->line;
4225: }
4226:
4227: oldname = xmlStrdup(ctxt->name);
4228: sgmlParseStartTag(ctxt);
4229: name = ctxt->name;
4230: #ifdef DEBUG
4231: if (oldname == NULL)
4232: fprintf(stderr, "Start of element %s\n", name);
4233: else if (name == NULL)
4234: fprintf(stderr, "Start of element failed, was %s\n", oldname);
4235: else
4236: fprintf(stderr, "Start of element %s, was %s\n", name, oldname);
4237: #endif
4238: if (((depth == ctxt->nameNr) && (!xmlStrcmp(oldname, ctxt->name))) ||
4239: (name == NULL)) {
4240: if (CUR == '>')
4241: NEXT;
4242: if (oldname != NULL)
4243: xmlFree(oldname);
4244: return;
4245: }
4246: if (oldname != NULL)
4247: xmlFree(oldname);
4248:
4249: /*
4250: * Lookup the info for that element.
4251: */
4252: info = sgmlTagLookup(name);
4253: if (info == NULL) {
4254: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.4 veillard 4255: ctxt->sax->error(ctxt->userData, "Tag %s unknown\n",
1.1 veillard 4256: name);
4257: ctxt->wellFormed = 0;
4258: } else if (info->depr) {
4259: /***************************
4260: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
4261: ctxt->sax->warning(ctxt->userData, "Tag %s is deprecated\n",
4262: name);
4263: ***************************/
4264: }
4265:
4266: /*
4267: * Check for an Empty Element labelled the XML/SGML way
4268: */
4269: if ((CUR == '/') && (NXT(1) == '>')) {
4270: SKIP(2);
4271: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4272: ctxt->sax->endElement(ctxt->userData, name);
4273: oldname = sgmlnamePop(ctxt);
4274: #ifdef DEBUG
4275: fprintf(stderr,"End of tag the XML way: popping out %s\n", oldname);
4276: #endif
4277: if (oldname != NULL)
4278: xmlFree(oldname);
4279: return;
4280: }
4281:
4282: if (CUR == '>') {
4283: NEXT;
4284: } else {
4285: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4286: ctxt->sax->error(ctxt->userData,
4287: "Couldn't find end of Start Tag %s\n",
4288: name);
4289: ctxt->wellFormed = 0;
4290:
4291: /*
4292: * end of parsing of this node.
4293: */
4294: if (!xmlStrcmp(name, ctxt->name)) {
4295: nodePop(ctxt);
4296: oldname = sgmlnamePop(ctxt);
4297: #ifdef DEBUG
4298: fprintf(stderr,"End of start tag problem: popping out %s\n", oldname);
4299: #endif
4300: if (oldname != NULL)
4301: xmlFree(oldname);
4302: }
4303:
4304: /*
4305: * Capture end position and add node
4306: */
4307: if ( currentNode != NULL && ctxt->record_info ) {
4308: node_info.end_pos = ctxt->input->consumed +
4309: (CUR_PTR - ctxt->input->base);
4310: node_info.end_line = ctxt->input->line;
4311: node_info.node = ctxt->node;
4312: xmlParserAddNodeInfo(ctxt, &node_info);
4313: }
4314: return;
4315: }
4316:
4317: /*
4318: * Check for an Empty Element from DTD definition
4319: */
4320: if ((info != NULL) && (info->empty)) {
4321: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4322: ctxt->sax->endElement(ctxt->userData, name);
4323: oldname = sgmlnamePop(ctxt);
4324: #ifdef DEBUG
4325: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
4326: #endif
4327: if (oldname != NULL)
4328: xmlFree(oldname);
4329: return;
4330: }
4331:
4332: /*
4333: * Parse the content of the element:
4334: */
4335: currentNode = xmlStrdup(ctxt->name);
4336: depth = ctxt->nameNr;
4337: while (IS_CHAR(CUR)) {
4338: sgmlParseContent(ctxt);
4339: if (ctxt->nameNr < depth) break;
4340: }
4341:
4342: if (!IS_CHAR(CUR)) {
4343: /************
4344: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4345: ctxt->sax->error(ctxt->userData,
4346: "Premature end of data in tag %s\n", currentNode);
4347: ctxt->wellFormed = 0;
4348: *************/
4349:
4350: /*
4351: * end of parsing of this node.
4352: */
4353: nodePop(ctxt);
4354: oldname = sgmlnamePop(ctxt);
4355: #ifdef DEBUG
4356: fprintf(stderr,"Premature end of tag %s : popping out %s\n", name, oldname);
4357: #endif
4358: if (oldname != NULL)
4359: xmlFree(oldname);
4360: if (currentNode != NULL)
4361: xmlFree(currentNode);
4362: return;
4363: }
4364:
4365: /*
4366: * Capture end position and add node
4367: */
4368: if ( currentNode != NULL && ctxt->record_info ) {
4369: node_info.end_pos = ctxt->input->consumed +
4370: (CUR_PTR - ctxt->input->base);
4371: node_info.end_line = ctxt->input->line;
4372: node_info.node = ctxt->node;
4373: xmlParserAddNodeInfo(ctxt, &node_info);
4374: }
4375: if (currentNode != NULL)
4376: xmlFree(currentNode);
4377: }
4378:
4379: /**
1.3 veillard 4380: * sgmlParseEntityDecl:
4381: * @ctxt: an SGML parser context
4382: *
4383: * parse <!ENTITY declarations
4384: *
4385: */
4386:
4387: void
4388: sgmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
4389: xmlChar *name = NULL;
4390: xmlChar *value = NULL;
4391: xmlChar *URI = NULL, *literal = NULL;
4392: xmlChar *ndata = NULL;
4393: int isParameter = 0;
4394: xmlChar *orig = NULL;
4395:
4396: GROW;
4397: if ((RAW == '<') && (NXT(1) == '!') &&
4398: (NXT(2) == 'E') && (NXT(3) == 'N') &&
4399: (NXT(4) == 'T') && (NXT(5) == 'I') &&
4400: (NXT(6) == 'T') && (NXT(7) == 'Y')) {
4401: xmlParserInputPtr input = ctxt->input;
4402: ctxt->instate = XML_PARSER_ENTITY_DECL;
4403: SHRINK;
4404: SKIP(8);
4405: if (!IS_BLANK(CUR)) {
4406: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4407: ctxt->sax->error(ctxt->userData,
4408: "Space required after '<!ENTITY'\n");
4409: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
4410: ctxt->wellFormed = 0;
4411: ctxt->disableSAX = 1;
4412: }
4413: SKIP_BLANKS;
4414:
4415: if (RAW == '%') {
4416: NEXT;
4417: if (!IS_BLANK(CUR)) {
4418: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4419: ctxt->sax->error(ctxt->userData,
4420: "Space required after '%'\n");
4421: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
4422: ctxt->wellFormed = 0;
4423: ctxt->disableSAX = 1;
4424: }
4425: SKIP_BLANKS;
4426: isParameter = 1;
4427: }
4428:
4429: name = xmlParseName(ctxt);
4430: if (name == NULL) {
4431: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4432: ctxt->sax->error(ctxt->userData, "sgmlarseEntityDecl: no name\n");
4433: ctxt->errNo = XML_ERR_NAME_REQUIRED;
4434: ctxt->wellFormed = 0;
4435: ctxt->disableSAX = 1;
4436: return;
4437: }
4438: if (!IS_BLANK(CUR)) {
4439: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4440: ctxt->sax->error(ctxt->userData,
4441: "Space required after the entity name\n");
4442: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
4443: ctxt->wellFormed = 0;
4444: ctxt->disableSAX = 1;
4445: }
4446: SKIP_BLANKS;
4447:
4448: /*
4449: * handle the various case of definitions...
4450: */
4451: if (isParameter) {
4452: if ((RAW == '"') || (RAW == '\'')) {
4453: value = xmlParseEntityValue(ctxt, &orig);
4454: if (value) {
4455: if ((ctxt->sax != NULL) &&
4456: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
4457: ctxt->sax->entityDecl(ctxt->userData, name,
4458: XML_INTERNAL_PARAMETER_ENTITY,
4459: NULL, NULL, value);
4460: }
4461: } else {
4462: URI = xmlParseExternalID(ctxt, &literal, 1);
4463: if ((URI == NULL) && (literal == NULL)) {
4464: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4465: ctxt->sax->error(ctxt->userData,
4466: "Entity value required\n");
4467: ctxt->errNo = XML_ERR_VALUE_REQUIRED;
4468: ctxt->wellFormed = 0;
4469: ctxt->disableSAX = 1;
4470: }
4471: if (URI) {
4472: xmlURIPtr uri;
4473:
4474: uri = xmlParseURI((const char *) URI);
4475: if (uri == NULL) {
4476: if ((ctxt->sax != NULL) &&
4477: (!ctxt->disableSAX) &&
4478: (ctxt->sax->error != NULL))
4479: ctxt->sax->error(ctxt->userData,
4480: "Invalid URI: %s\n", URI);
4481: ctxt->wellFormed = 0;
4482: ctxt->errNo = XML_ERR_INVALID_URI;
4483: } else {
4484: if (uri->fragment != NULL) {
4485: if ((ctxt->sax != NULL) &&
4486: (!ctxt->disableSAX) &&
4487: (ctxt->sax->error != NULL))
4488: ctxt->sax->error(ctxt->userData,
4489: "Fragment not allowed: %s\n", URI);
4490: ctxt->wellFormed = 0;
4491: ctxt->errNo = XML_ERR_URI_FRAGMENT;
4492: } else {
4493: if ((ctxt->sax != NULL) &&
4494: (!ctxt->disableSAX) &&
4495: (ctxt->sax->entityDecl != NULL))
4496: ctxt->sax->entityDecl(ctxt->userData, name,
4497: XML_EXTERNAL_PARAMETER_ENTITY,
4498: literal, URI, NULL);
4499: }
4500: xmlFreeURI(uri);
4501: }
4502: }
4503: }
4504: } else {
4505: if ((RAW == '"') || (RAW == '\'')) {
4506: value = xmlParseEntityValue(ctxt, &orig);
4507: if ((ctxt->sax != NULL) &&
4508: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
4509: ctxt->sax->entityDecl(ctxt->userData, name,
4510: XML_INTERNAL_GENERAL_ENTITY,
4511: NULL, NULL, value);
4512: } else {
4513: URI = xmlParseExternalID(ctxt, &literal, 1);
4514: if ((URI == NULL) && (literal == NULL)) {
4515: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4516: ctxt->sax->error(ctxt->userData,
4517: "Entity value required\n");
4518: ctxt->errNo = XML_ERR_VALUE_REQUIRED;
4519: ctxt->wellFormed = 0;
4520: ctxt->disableSAX = 1;
4521: }
4522: if (URI) {
4523: xmlURIPtr uri;
4524:
4525: uri = xmlParseURI((const char *)URI);
4526: if (uri == NULL) {
4527: if ((ctxt->sax != NULL) &&
4528: (!ctxt->disableSAX) &&
4529: (ctxt->sax->error != NULL))
4530: ctxt->sax->error(ctxt->userData,
4531: "Invalid URI: %s\n", URI);
4532: ctxt->wellFormed = 0;
4533: ctxt->errNo = XML_ERR_INVALID_URI;
4534: } else {
4535: if (uri->fragment != NULL) {
4536: if ((ctxt->sax != NULL) &&
4537: (!ctxt->disableSAX) &&
4538: (ctxt->sax->error != NULL))
4539: ctxt->sax->error(ctxt->userData,
4540: "Fragment not allowed: %s\n", URI);
4541: ctxt->wellFormed = 0;
4542: ctxt->errNo = XML_ERR_URI_FRAGMENT;
4543: }
4544: xmlFreeURI(uri);
4545: }
4546: }
4547: if ((RAW != '>') && (!IS_BLANK(CUR))) {
4548: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4549: ctxt->sax->error(ctxt->userData,
4550: "Space required before content model\n");
4551: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
4552: ctxt->wellFormed = 0;
4553: ctxt->disableSAX = 1;
4554: }
4555: SKIP_BLANKS;
4556:
4557: /*
4558: * SGML specific: here we can get the content model
4559: */
4560: if (RAW != '>') {
4561: xmlChar *contmod;
4562:
4563: contmod = xmlParseName(ctxt);
4564:
4565: if (contmod == NULL) {
4566: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4567: ctxt->sax->error(ctxt->userData,
4568: "Could not parse entity content model\n");
4569: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
4570: ctxt->wellFormed = 0;
4571: ctxt->disableSAX = 1;
4572: } else {
4573: if (!xmlStrcmp(contmod, BAD_CAST"NDATA")) {
4574: if (!IS_BLANK(CUR)) {
4575: if ((ctxt->sax != NULL) &&
4576: (ctxt->sax->error != NULL))
4577: ctxt->sax->error(ctxt->userData,
4578: "Space required after 'NDATA'\n");
4579: ctxt->errNo = XML_ERR_SPACE_REQUIRED;
4580: ctxt->wellFormed = 0;
4581: ctxt->disableSAX = 1;
4582: }
4583: SKIP_BLANKS;
4584: ndata = xmlParseName(ctxt);
4585: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4586: (ctxt->sax->unparsedEntityDecl != NULL)) {
4587: ctxt->sax->unparsedEntityDecl(ctxt->userData,
4588: name, literal, URI, ndata);
4589: }
4590: } else if (!xmlStrcmp(contmod, BAD_CAST"SUBDOC")) {
4591: if ((ctxt->sax != NULL) &&
4592: (ctxt->sax->warning != NULL))
4593: ctxt->sax->warning(ctxt->userData,
4594: "SUBDOC entities are not supported\n");
4595: SKIP_BLANKS;
4596: ndata = xmlParseName(ctxt);
4597: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4598: (ctxt->sax->unparsedEntityDecl != NULL)) {
4599: ctxt->sax->unparsedEntityDecl(ctxt->userData,
4600: name, literal, URI, ndata);
4601: }
4602: } else if (!xmlStrcmp(contmod, BAD_CAST"CDATA")) {
4603: if ((ctxt->sax != NULL) &&
4604: (ctxt->sax->warning != NULL))
4605: ctxt->sax->warning(ctxt->userData,
4606: "CDATA entities are not supported\n");
4607: SKIP_BLANKS;
4608: ndata = xmlParseName(ctxt);
4609: if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4610: (ctxt->sax->unparsedEntityDecl != NULL)) {
4611: ctxt->sax->unparsedEntityDecl(ctxt->userData,
4612: name, literal, URI, ndata);
4613: }
4614: }
4615: xmlFree(contmod);
4616: }
4617: } else {
4618: if ((ctxt->sax != NULL) &&
4619: (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
4620: ctxt->sax->entityDecl(ctxt->userData, name,
4621: XML_EXTERNAL_GENERAL_PARSED_ENTITY,
4622: literal, URI, NULL);
4623: }
4624: }
4625: }
4626: SKIP_BLANKS;
4627: if (RAW != '>') {
4628: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4629: ctxt->sax->error(ctxt->userData,
4630: "sgmlParseEntityDecl: entity %s not terminated\n", name);
4631: ctxt->errNo = XML_ERR_ENTITY_NOT_FINISHED;
4632: ctxt->wellFormed = 0;
4633: ctxt->disableSAX = 1;
4634: } else {
4635: if (input != ctxt->input) {
4636: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4637: ctxt->sax->error(ctxt->userData,
4638: "Entity declaration doesn't start and stop in the same entity\n");
4639: ctxt->errNo = XML_ERR_ENTITY_BOUNDARY;
4640: ctxt->wellFormed = 0;
4641: ctxt->disableSAX = 1;
4642: }
4643: NEXT;
4644: }
4645: if (orig != NULL) {
4646: /*
4647: * Ugly mechanism to save the raw entity value.
4648: */
4649: xmlEntityPtr cur = NULL;
4650:
4651: if (isParameter) {
4652: if ((ctxt->sax != NULL) &&
4653: (ctxt->sax->getParameterEntity != NULL))
4654: cur = ctxt->sax->getParameterEntity(ctxt->userData, name);
4655: } else {
4656: if ((ctxt->sax != NULL) &&
4657: (ctxt->sax->getEntity != NULL))
4658: cur = ctxt->sax->getEntity(ctxt->userData, name);
4659: }
4660: if (cur != NULL) {
4661: if (cur->orig != NULL)
4662: xmlFree(orig);
4663: else
4664: cur->orig = orig;
4665: } else
4666: xmlFree(orig);
4667: }
4668: if (name != NULL) xmlFree(name);
4669: if (value != NULL) xmlFree(value);
4670: if (URI != NULL) xmlFree(URI);
4671: if (literal != NULL) xmlFree(literal);
4672: if (ndata != NULL) xmlFree(ndata);
4673: }
4674: }
4675:
4676: /**
4677: * sgmlParseMarkupDecl:
4678: * @ctxt: an SGML parser context
4679: *
4680: * parse Markup declarations
4681: *
4682: * [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl |
4683: * NotationDecl | PI | Comment
4684: */
4685: void
4686: sgmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
4687: GROW;
4688: xmlParseElementDecl(ctxt);
4689: xmlParseAttributeListDecl(ctxt);
4690: sgmlParseEntityDecl(ctxt);
4691: xmlParseNotationDecl(ctxt);
4692: xmlParsePI(ctxt);
4693: xmlParseComment(ctxt);
4694: /*
4695: * This is only for internal subset. On external entities,
4696: * the replacement is done before parsing stage
4697: */
4698: if ((ctxt->external == 0) && (ctxt->inputNr == 1))
4699: xmlParsePEReference(ctxt);
4700: ctxt->instate = XML_PARSER_DTD;
4701: }
4702:
4703: /**
4704: * sgmlParseInternalsubset:
4705: * @ctxt: an SGML parser context
4706: *
4707: * parse the internal subset declaration
4708: *
4709: * [28 end] ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
4710: */
4711:
4712: void
4713: sgmlParseInternalSubset(xmlParserCtxtPtr ctxt) {
4714: /*
4715: * Is there any DTD definition ?
4716: */
4717: if (RAW == '[') {
4718: ctxt->instate = XML_PARSER_DTD;
4719: NEXT;
4720: /*
4721: * Parse the succession of Markup declarations and
4722: * PEReferences.
4723: * Subsequence (markupdecl | PEReference | S)*
4724: */
4725: while (RAW != ']') {
4726: const xmlChar *check = CUR_PTR;
4727: int cons = ctxt->input->consumed;
4728:
4729: SKIP_BLANKS;
4730: sgmlParseMarkupDecl(ctxt);
4731: xmlParsePEReference(ctxt);
4732:
4733: /*
4734: * Pop-up of finished entities.
4735: */
4736: while ((RAW == 0) && (ctxt->inputNr > 1))
4737: xmlPopInput(ctxt);
4738:
4739: if ((CUR_PTR == check) && (cons == ctxt->input->consumed)) {
4740: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4741: ctxt->sax->error(ctxt->userData,
4742: "sgmlParseInternalSubset: error detected in Markup declaration\n");
4743: ctxt->wellFormed = 0;
4744: ctxt->disableSAX = 1;
4745: ctxt->errNo = XML_ERR_INTERNAL_ERROR;
4746: break;
4747: }
4748: }
4749: if (RAW == ']') {
4750: NEXT;
4751: SKIP_BLANKS;
4752: }
4753: }
4754:
4755: /*
4756: * We should be at the end of the DOCTYPE declaration.
4757: */
4758: if (RAW != '>') {
4759: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4760: ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
4761: ctxt->wellFormed = 0;
4762: ctxt->disableSAX = 1;
4763: ctxt->errNo = XML_ERR_DOCTYPE_NOT_FINISHED;
4764: }
4765: NEXT;
4766: }
4767:
4768: /**
1.2 veillard 4769: * sgmlParseMisc:
4770: * @ctxt: an XML parser context
4771: *
4772: * parse an XML Misc* optionnal field.
4773: *
4774: * [27] Misc ::= Comment | PI | S
4775: */
4776:
4777: void
4778: sgmlParseMisc(xmlParserCtxtPtr ctxt) {
4779: while (((RAW == '<') && (NXT(1) == '?')) ||
4780: ((RAW == '<') && (NXT(1) == '!') &&
4781: (NXT(2) == '-') && (NXT(3) == '-')) ||
4782: IS_BLANK(CUR)) {
4783: if ((RAW == '<') && (NXT(1) == '?')) {
4784: xmlParsePI(ctxt); /* TODO: SGML PIs differs */
4785: } else if (IS_BLANK(CUR)) {
4786: NEXT;
4787: } else
4788: xmlParseComment(ctxt);
4789: }
4790: }
4791:
4792: /**
1.1 veillard 4793: * sgmlParseDocument :
4794: * @ctxt: an SGML parser context
4795: *
4796: * parse an SGML document (and build a tree if using the standard SAX
4797: * interface).
4798: *
4799: * Returns 0, -1 in case of error. the parser context is augmented
4800: * as a result of the parsing.
4801: */
4802:
4803: int
4804: sgmlParseDocument(sgmlParserCtxtPtr ctxt) {
1.2 veillard 4805: xmlChar start[4];
4806: xmlCharEncoding enc;
1.1 veillard 4807: xmlDtdPtr dtd;
4808:
4809: sgmlDefaultSAXHandlerInit();
4810: ctxt->html = 2;
4811:
4812: GROW;
4813: /*
4814: * SAX: beginning of the document processing.
4815: */
4816: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4817: ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4818:
1.2 veillard 4819: /*
4820: * Get the 4 first bytes and decode the charset
4821: * if enc != XML_CHAR_ENCODING_NONE
4822: * plug some encoding conversion routines.
4823: */
4824: start[0] = RAW;
4825: start[1] = NXT(1);
4826: start[2] = NXT(2);
4827: start[3] = NXT(3);
4828: enc = xmlDetectCharEncoding(start, 4);
4829: if (enc != XML_CHAR_ENCODING_NONE) {
4830: xmlSwitchEncoding(ctxt, enc);
4831: }
4832:
1.1 veillard 4833: /*
4834: * Wipe out everything which is before the first '<'
4835: */
4836: SKIP_BLANKS;
4837: if (CUR == 0) {
4838: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
4839: ctxt->sax->error(ctxt->userData, "Document is empty\n");
4840: ctxt->wellFormed = 0;
4841: }
4842:
4843: if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4844: ctxt->sax->startDocument(ctxt->userData);
4845:
4846:
4847: /*
1.2 veillard 4848: * The Misc part of the Prolog
1.1 veillard 4849: */
1.2 veillard 4850: GROW;
4851: sgmlParseMisc(ctxt);
1.1 veillard 4852:
4853: /*
4854: * Then possibly doc type declaration(s) and more Misc
4855: * (doctypedecl Misc*)?
4856: */
1.2 veillard 4857: GROW;
4858: if ((RAW == '<') && (NXT(1) == '!') &&
4859: (NXT(2) == 'D') && (NXT(3) == 'O') &&
4860: (NXT(4) == 'C') && (NXT(5) == 'T') &&
4861: (NXT(6) == 'Y') && (NXT(7) == 'P') &&
4862: (NXT(8) == 'E')) {
4863:
4864: ctxt->inSubset = 1;
1.1 veillard 4865: sgmlParseDocTypeDecl(ctxt);
1.2 veillard 4866: if (RAW == '[') {
4867: ctxt->instate = XML_PARSER_DTD;
1.3 veillard 4868: sgmlParseInternalSubset(ctxt);
1.2 veillard 4869: }
4870:
4871: /*
4872: * Create and update the external subset.
4873: */
4874: ctxt->inSubset = 2;
4875: if ((ctxt->sax != NULL) && (ctxt->sax->externalSubset != NULL) &&
4876: (!ctxt->disableSAX))
4877: ctxt->sax->externalSubset(ctxt->userData, ctxt->intSubName,
4878: ctxt->extSubSystem, ctxt->extSubURI);
4879: ctxt->inSubset = 0;
4880:
4881:
4882: ctxt->instate = XML_PARSER_PROLOG;
4883: sgmlParseMisc(ctxt);
1.1 veillard 4884: }
4885:
4886: /*
4887: * Time to start parsing the tree itself
4888: */
4889: sgmlParseContent(ctxt);
4890:
4891: /*
4892: * autoclose
4893: */
4894: if (CUR == 0)
4895: sgmlAutoClose(ctxt, NULL);
4896:
4897:
4898: /*
4899: * SAX: end of the document processing.
4900: */
4901: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4902: ctxt->sax->endDocument(ctxt->userData);
4903:
4904: if (ctxt->myDoc != NULL) {
4905: dtd = xmlGetIntSubset(ctxt->myDoc);
4906: if (dtd == NULL)
4907: ctxt->myDoc->intSubset =
4908: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "SGML",
4909: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
4910: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd");
4911: }
4912: if (! ctxt->wellFormed) return(-1);
4913: return(0);
4914: }
4915:
4916:
4917: /************************************************************************
4918: * *
4919: * Parser contexts handling *
4920: * *
4921: ************************************************************************/
4922:
4923: /**
4924: * xmlInitParserCtxt:
4925: * @ctxt: an SGML parser context
4926: *
4927: * Initialize a parser context
4928: */
4929:
4930: void
4931: sgmlInitParserCtxt(sgmlParserCtxtPtr ctxt)
4932: {
4933: sgmlSAXHandler *sax;
4934:
4935: if (ctxt == NULL) return;
4936: memset(ctxt, 0, sizeof(sgmlParserCtxt));
4937:
4938: sax = (sgmlSAXHandler *) xmlMalloc(sizeof(sgmlSAXHandler));
4939: if (sax == NULL) {
4940: fprintf(stderr, "sgmlInitParserCtxt: out of memory\n");
4941: }
4942: memset(sax, 0, sizeof(sgmlSAXHandler));
4943:
4944: /* Allocate the Input stack */
4945: ctxt->inputTab = (sgmlParserInputPtr *)
4946: xmlMalloc(5 * sizeof(sgmlParserInputPtr));
4947: if (ctxt->inputTab == NULL) {
4948: fprintf(stderr, "sgmlInitParserCtxt: out of memory\n");
4949: }
4950: ctxt->inputNr = 0;
4951: ctxt->inputMax = 5;
4952: ctxt->input = NULL;
4953: ctxt->version = NULL;
4954: ctxt->encoding = NULL;
4955: ctxt->standalone = -1;
4956: ctxt->instate = XML_PARSER_START;
4957:
4958: /* Allocate the Node stack */
4959: ctxt->nodeTab = (sgmlNodePtr *) xmlMalloc(10 * sizeof(sgmlNodePtr));
4960: ctxt->nodeNr = 0;
4961: ctxt->nodeMax = 10;
4962: ctxt->node = NULL;
4963:
4964: /* Allocate the Name stack */
4965: ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4966: ctxt->nameNr = 0;
4967: ctxt->nameMax = 10;
4968: ctxt->name = NULL;
4969:
4970: if (sax == NULL) ctxt->sax = &sgmlDefaultSAXHandler;
4971: else {
4972: ctxt->sax = sax;
4973: memcpy(sax, &sgmlDefaultSAXHandler, sizeof(sgmlSAXHandler));
4974: }
4975: ctxt->userData = ctxt;
4976: ctxt->myDoc = NULL;
4977: ctxt->wellFormed = 1;
4978: ctxt->replaceEntities = 0;
4979: ctxt->html = 2;
4980: ctxt->record_info = 0;
4981: ctxt->validate = 0;
4982: ctxt->nbChars = 0;
4983: ctxt->checkIndex = 0;
4984: xmlInitNodeInfoSeq(&ctxt->node_seq);
4985: }
4986:
4987: /**
4988: * sgmlFreeParserCtxt:
4989: * @ctxt: an SGML parser context
4990: *
4991: * Free all the memory used by a parser context. However the parsed
4992: * document in ctxt->myDoc is not freed.
4993: */
4994:
4995: void
4996: sgmlFreeParserCtxt(sgmlParserCtxtPtr ctxt)
4997: {
4998: xmlFreeParserCtxt(ctxt);
4999: }
5000:
5001: /**
5002: * sgmlCreateDocParserCtxt :
5003: * @cur: a pointer to an array of xmlChar
5004: * @encoding: a free form C string describing the SGML document encoding, or NULL
5005: *
5006: * Create a parser context for an SGML document.
5007: *
5008: * Returns the new parser context or NULL
5009: */
5010: sgmlParserCtxtPtr
5011: sgmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
5012: sgmlParserCtxtPtr ctxt;
5013: sgmlParserInputPtr input;
5014: /* sgmlCharEncoding enc; */
5015:
5016: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5017: if (ctxt == NULL) {
5018: perror("malloc");
5019: return(NULL);
5020: }
5021: sgmlInitParserCtxt(ctxt);
5022: input = (sgmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
5023: if (input == NULL) {
5024: perror("malloc");
5025: xmlFree(ctxt);
5026: return(NULL);
5027: }
5028: memset(input, 0, sizeof(sgmlParserInput));
5029:
5030: input->line = 1;
5031: input->col = 1;
5032: input->base = cur;
5033: input->cur = cur;
5034:
5035: inputPush(ctxt, input);
5036: return(ctxt);
5037: }
5038:
5039: /************************************************************************
5040: * *
5041: * Progressive parsing interfaces *
5042: * *
5043: ************************************************************************/
5044:
5045: /**
5046: * sgmlParseLookupSequence:
5047: * @ctxt: an SGML parser context
5048: * @first: the first char to lookup
5049: * @next: the next char to lookup or zero
5050: * @third: the next char to lookup or zero
5051: *
5052: * Try to find if a sequence (first, next, third) or just (first next) or
5053: * (first) is available in the input stream.
5054: * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5055: * to avoid rescanning sequences of bytes, it DOES change the state of the
5056: * parser, do not use liberally.
5057: * This is basically similar to xmlParseLookupSequence()
5058: *
5059: * Returns the index to the current parsing point if the full sequence
5060: * is available, -1 otherwise.
5061: */
5062: int
5063: sgmlParseLookupSequence(sgmlParserCtxtPtr ctxt, xmlChar first,
5064: xmlChar next, xmlChar third) {
5065: int base, len;
5066: sgmlParserInputPtr in;
5067: const xmlChar *buf;
5068:
5069: in = ctxt->input;
5070: if (in == NULL) return(-1);
5071: base = in->cur - in->base;
5072: if (base < 0) return(-1);
5073: if (ctxt->checkIndex > base)
5074: base = ctxt->checkIndex;
5075: if (in->buf == NULL) {
5076: buf = in->base;
5077: len = in->length;
5078: } else {
5079: buf = in->buf->buffer->content;
5080: len = in->buf->buffer->use;
5081: }
5082: /* take into account the sequence length */
5083: if (third) len -= 2;
5084: else if (next) len --;
5085: for (;base < len;base++) {
5086: if (buf[base] == first) {
5087: if (third != 0) {
5088: if ((buf[base + 1] != next) ||
5089: (buf[base + 2] != third)) continue;
5090: } else if (next != 0) {
5091: if (buf[base + 1] != next) continue;
5092: }
5093: ctxt->checkIndex = 0;
5094: #ifdef DEBUG_PUSH
5095: if (next == 0)
5096: fprintf(stderr, "HPP: lookup '%c' found at %d\n",
5097: first, base);
5098: else if (third == 0)
5099: fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
5100: first, next, base);
5101: else
5102: fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
5103: first, next, third, base);
5104: #endif
5105: return(base - (in->cur - in->base));
5106: }
5107: }
5108: ctxt->checkIndex = base;
5109: #ifdef DEBUG_PUSH
5110: if (next == 0)
5111: fprintf(stderr, "HPP: lookup '%c' failed\n", first);
5112: else if (third == 0)
5113: fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
5114: else
5115: fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
5116: #endif
5117: return(-1);
5118: }
5119:
5120: /**
5121: * sgmlParseTryOrFinish:
5122: * @ctxt: an SGML parser context
5123: * @terminate: last chunk indicator
5124: *
5125: * Try to progress on parsing
5126: *
5127: * Returns zero if no parsing was possible
5128: */
5129: int
5130: sgmlParseTryOrFinish(sgmlParserCtxtPtr ctxt, int terminate) {
5131: int ret = 0;
5132: sgmlParserInputPtr in;
5133: int avail = 0;
5134: xmlChar cur, next;
5135:
5136: #ifdef DEBUG_PUSH
5137: switch (ctxt->instate) {
5138: case XML_PARSER_EOF:
5139: fprintf(stderr, "HPP: try EOF\n"); break;
5140: case XML_PARSER_START:
5141: fprintf(stderr, "HPP: try START\n"); break;
5142: case XML_PARSER_MISC:
5143: fprintf(stderr, "HPP: try MISC\n");break;
5144: case XML_PARSER_COMMENT:
5145: fprintf(stderr, "HPP: try COMMENT\n");break;
5146: case XML_PARSER_PROLOG:
5147: fprintf(stderr, "HPP: try PROLOG\n");break;
5148: case XML_PARSER_START_TAG:
5149: fprintf(stderr, "HPP: try START_TAG\n");break;
5150: case XML_PARSER_CONTENT:
5151: fprintf(stderr, "HPP: try CONTENT\n");break;
5152: case XML_PARSER_CDATA_SECTION:
5153: fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
5154: case XML_PARSER_END_TAG:
5155: fprintf(stderr, "HPP: try END_TAG\n");break;
5156: case XML_PARSER_ENTITY_DECL:
5157: fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
5158: case XML_PARSER_ENTITY_VALUE:
5159: fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
5160: case XML_PARSER_ATTRIBUTE_VALUE:
5161: fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
5162: case XML_PARSER_DTD:
5163: fprintf(stderr, "HPP: try DTD\n");break;
5164: case XML_PARSER_EPILOG:
5165: fprintf(stderr, "HPP: try EPILOG\n");break;
5166: case XML_PARSER_PI:
5167: fprintf(stderr, "HPP: try PI\n");break;
5168: }
5169: #endif
5170:
5171: while (1) {
5172:
5173: in = ctxt->input;
5174: if (in == NULL) break;
5175: if (in->buf == NULL)
5176: avail = in->length - (in->cur - in->base);
5177: else
5178: avail = in->buf->buffer->use - (in->cur - in->base);
5179: if ((avail == 0) && (terminate)) {
5180: sgmlAutoClose(ctxt, NULL);
5181: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5182: /*
5183: * SAX: end of the document processing.
5184: */
5185: ctxt->instate = XML_PARSER_EOF;
5186: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5187: ctxt->sax->endDocument(ctxt->userData);
5188: }
5189: }
5190: if (avail < 1)
5191: goto done;
5192: switch (ctxt->instate) {
5193: case XML_PARSER_EOF:
5194: /*
5195: * Document parsing is done !
5196: */
5197: goto done;
5198: case XML_PARSER_START:
5199: /*
5200: * Very first chars read from the document flow.
5201: */
5202: cur = in->cur[0];
5203: if (IS_BLANK(cur)) {
5204: SKIP_BLANKS;
5205: if (in->buf == NULL)
5206: avail = in->length - (in->cur - in->base);
5207: else
5208: avail = in->buf->buffer->use - (in->cur - in->base);
5209: }
5210: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5211: ctxt->sax->setDocumentLocator(ctxt->userData,
5212: &xmlDefaultSAXLocator);
5213: if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5214: (!ctxt->disableSAX))
5215: ctxt->sax->startDocument(ctxt->userData);
5216:
5217: cur = in->cur[0];
5218: next = in->cur[1];
5219: if ((cur == '<') && (next == '!') &&
5220: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5221: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5222: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5223: (UPP(8) == 'E')) {
5224: if ((!terminate) &&
5225: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5226: goto done;
5227: #ifdef DEBUG_PUSH
5228: fprintf(stderr, "HPP: Parsing internal subset\n");
5229: #endif
5230: sgmlParseDocTypeDecl(ctxt);
5231: ctxt->instate = XML_PARSER_PROLOG;
5232: #ifdef DEBUG_PUSH
5233: fprintf(stderr, "HPP: entering PROLOG\n");
5234: #endif
5235: } else {
5236: ctxt->instate = XML_PARSER_MISC;
5237: }
5238: #ifdef DEBUG_PUSH
5239: fprintf(stderr, "HPP: entering MISC\n");
5240: #endif
5241: break;
5242: case XML_PARSER_MISC:
5243: SKIP_BLANKS;
5244: if (in->buf == NULL)
5245: avail = in->length - (in->cur - in->base);
5246: else
5247: avail = in->buf->buffer->use - (in->cur - in->base);
5248: if (avail < 2)
5249: goto done;
5250: cur = in->cur[0];
5251: next = in->cur[1];
5252: if ((cur == '<') && (next == '!') &&
5253: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5254: if ((!terminate) &&
5255: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5256: goto done;
5257: #ifdef DEBUG_PUSH
5258: fprintf(stderr, "HPP: Parsing Comment\n");
5259: #endif
5260: sgmlParseComment(ctxt);
5261: ctxt->instate = XML_PARSER_MISC;
5262: } else if ((cur == '<') && (next == '!') &&
5263: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5264: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5265: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5266: (UPP(8) == 'E')) {
5267: if ((!terminate) &&
5268: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5269: goto done;
5270: #ifdef DEBUG_PUSH
5271: fprintf(stderr, "HPP: Parsing internal subset\n");
5272: #endif
5273: sgmlParseDocTypeDecl(ctxt);
5274: ctxt->instate = XML_PARSER_PROLOG;
5275: #ifdef DEBUG_PUSH
5276: fprintf(stderr, "HPP: entering PROLOG\n");
5277: #endif
5278: } else if ((cur == '<') && (next == '!') &&
5279: (avail < 9)) {
5280: goto done;
5281: } else {
5282: ctxt->instate = XML_PARSER_START_TAG;
5283: #ifdef DEBUG_PUSH
5284: fprintf(stderr, "HPP: entering START_TAG\n");
5285: #endif
5286: }
5287: break;
5288: case XML_PARSER_PROLOG:
5289: SKIP_BLANKS;
5290: if (in->buf == NULL)
5291: avail = in->length - (in->cur - in->base);
5292: else
5293: avail = in->buf->buffer->use - (in->cur - in->base);
5294: if (avail < 2)
5295: goto done;
5296: cur = in->cur[0];
5297: next = in->cur[1];
5298: if ((cur == '<') && (next == '!') &&
5299: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5300: if ((!terminate) &&
5301: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5302: goto done;
5303: #ifdef DEBUG_PUSH
5304: fprintf(stderr, "HPP: Parsing Comment\n");
5305: #endif
5306: sgmlParseComment(ctxt);
5307: ctxt->instate = XML_PARSER_PROLOG;
5308: } else if ((cur == '<') && (next == '!') &&
5309: (avail < 4)) {
5310: goto done;
5311: } else {
5312: ctxt->instate = XML_PARSER_START_TAG;
5313: #ifdef DEBUG_PUSH
5314: fprintf(stderr, "HPP: entering START_TAG\n");
5315: #endif
5316: }
5317: break;
5318: case XML_PARSER_EPILOG:
5319: if (in->buf == NULL)
5320: avail = in->length - (in->cur - in->base);
5321: else
5322: avail = in->buf->buffer->use - (in->cur - in->base);
5323: if (avail < 1)
5324: goto done;
5325: cur = in->cur[0];
5326: if (IS_BLANK(cur)) {
5327: sgmlParseCharData(ctxt, 0);
5328: goto done;
5329: }
5330: if (avail < 2)
5331: goto done;
5332: next = in->cur[1];
5333: if ((cur == '<') && (next == '!') &&
5334: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5335: if ((!terminate) &&
5336: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5337: goto done;
5338: #ifdef DEBUG_PUSH
5339: fprintf(stderr, "HPP: Parsing Comment\n");
5340: #endif
5341: sgmlParseComment(ctxt);
5342: ctxt->instate = XML_PARSER_EPILOG;
5343: } else if ((cur == '<') && (next == '!') &&
5344: (avail < 4)) {
5345: goto done;
5346: } else {
5347: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5348: ctxt->sax->error(ctxt->userData,
5349: "Extra content at the end of the document\n");
5350: ctxt->wellFormed = 0;
5351: ctxt->errNo = XML_ERR_DOCUMENT_END;
5352: ctxt->instate = XML_PARSER_EOF;
5353: #ifdef DEBUG_PUSH
5354: fprintf(stderr, "HPP: entering EOF\n");
5355: #endif
5356: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5357: ctxt->sax->endDocument(ctxt->userData);
5358: goto done;
5359: }
5360: break;
5361: case XML_PARSER_START_TAG: {
5362: xmlChar *name, *oldname;
5363: int depth = ctxt->nameNr;
5364: sgmlElemDescPtr info;
5365:
5366: if (avail < 2)
5367: goto done;
5368: cur = in->cur[0];
5369: if (cur != '<') {
5370: ctxt->instate = XML_PARSER_CONTENT;
5371: #ifdef DEBUG_PUSH
5372: fprintf(stderr, "HPP: entering CONTENT\n");
5373: #endif
5374: break;
5375: }
5376: if ((!terminate) &&
5377: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5378: goto done;
5379:
5380: oldname = xmlStrdup(ctxt->name);
5381: sgmlParseStartTag(ctxt);
5382: name = ctxt->name;
5383: #ifdef DEBUG
5384: if (oldname == NULL)
5385: fprintf(stderr, "Start of element %s\n", name);
5386: else if (name == NULL)
5387: fprintf(stderr, "Start of element failed, was %s\n",
5388: oldname);
5389: else
5390: fprintf(stderr, "Start of element %s, was %s\n",
5391: name, oldname);
5392: #endif
5393: if (((depth == ctxt->nameNr) &&
5394: (!xmlStrcmp(oldname, ctxt->name))) ||
5395: (name == NULL)) {
5396: if (CUR == '>')
5397: NEXT;
5398: if (oldname != NULL)
5399: xmlFree(oldname);
5400: break;
5401: }
5402: if (oldname != NULL)
5403: xmlFree(oldname);
5404:
5405: /*
5406: * Lookup the info for that element.
5407: */
5408: info = sgmlTagLookup(name);
5409: if (info == NULL) {
5410: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
1.4 veillard 5411: ctxt->sax->error(ctxt->userData, "Tag %s unknown\n",
1.1 veillard 5412: name);
5413: ctxt->wellFormed = 0;
5414: } else if (info->depr) {
5415: /***************************
5416: if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
5417: ctxt->sax->warning(ctxt->userData,
5418: "Tag %s is deprecated\n",
5419: name);
5420: ***************************/
5421: }
5422:
5423: /*
5424: * Check for an Empty Element labelled the XML/SGML way
5425: */
5426: if ((CUR == '/') && (NXT(1) == '>')) {
5427: SKIP(2);
5428: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5429: ctxt->sax->endElement(ctxt->userData, name);
5430: oldname = sgmlnamePop(ctxt);
5431: #ifdef DEBUG
5432: fprintf(stderr,"End of tag the XML way: popping out %s\n",
5433: oldname);
5434: #endif
5435: if (oldname != NULL)
5436: xmlFree(oldname);
5437: ctxt->instate = XML_PARSER_CONTENT;
5438: #ifdef DEBUG_PUSH
5439: fprintf(stderr, "HPP: entering CONTENT\n");
5440: #endif
5441: break;
5442: }
5443:
5444: if (CUR == '>') {
5445: NEXT;
5446: } else {
5447: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5448: ctxt->sax->error(ctxt->userData,
5449: "Couldn't find end of Start Tag %s\n",
5450: name);
5451: ctxt->wellFormed = 0;
5452:
5453: /*
5454: * end of parsing of this node.
5455: */
5456: if (!xmlStrcmp(name, ctxt->name)) {
5457: nodePop(ctxt);
5458: oldname = sgmlnamePop(ctxt);
5459: #ifdef DEBUG
5460: fprintf(stderr,
5461: "End of start tag problem: popping out %s\n", oldname);
5462: #endif
5463: if (oldname != NULL)
5464: xmlFree(oldname);
5465: }
5466:
5467: ctxt->instate = XML_PARSER_CONTENT;
5468: #ifdef DEBUG_PUSH
5469: fprintf(stderr, "HPP: entering CONTENT\n");
5470: #endif
5471: break;
5472: }
5473:
5474: /*
5475: * Check for an Empty Element from DTD definition
5476: */
5477: if ((info != NULL) && (info->empty)) {
5478: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5479: ctxt->sax->endElement(ctxt->userData, name);
5480: oldname = sgmlnamePop(ctxt);
5481: #ifdef DEBUG
5482: fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
5483: #endif
5484: if (oldname != NULL)
5485: xmlFree(oldname);
5486: }
5487: ctxt->instate = XML_PARSER_CONTENT;
5488: #ifdef DEBUG_PUSH
5489: fprintf(stderr, "HPP: entering CONTENT\n");
5490: #endif
5491: break;
5492: }
5493: case XML_PARSER_CONTENT: {
5494: long cons;
5495: /*
5496: * Handle preparsed entities and charRef
5497: */
5498: if (ctxt->token != 0) {
5499: xmlChar chr[2] = { 0 , 0 } ;
5500:
5501: chr[0] = (xmlChar) ctxt->token;
5502: sgmlCheckParagraph(ctxt);
5503: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5504: ctxt->sax->characters(ctxt->userData, chr, 1);
5505: ctxt->token = 0;
5506: ctxt->checkIndex = 0;
5507: }
5508: if ((avail == 1) && (terminate)) {
5509: cur = in->cur[0];
5510: if ((cur != '<') && (cur != '&')) {
5511: if (ctxt->sax != NULL) {
5512: if (IS_BLANK(cur)) {
5513: if (ctxt->sax->ignorableWhitespace != NULL)
5514: ctxt->sax->ignorableWhitespace(
5515: ctxt->userData, &cur, 1);
5516: } else {
5517: sgmlCheckParagraph(ctxt);
5518: if (ctxt->sax->characters != NULL)
5519: ctxt->sax->characters(
5520: ctxt->userData, &cur, 1);
5521: }
5522: }
5523: ctxt->token = 0;
5524: ctxt->checkIndex = 0;
5525: NEXT;
5526: }
5527: break;
5528: }
5529: if (avail < 2)
5530: goto done;
5531: cur = in->cur[0];
5532: next = in->cur[1];
5533: cons = ctxt->nbChars;
5534: /*
5535: * Sometimes DOCTYPE arrives in the middle of the document
5536: */
5537: if ((cur == '<') && (next == '!') &&
5538: (UPP(2) == 'D') && (UPP(3) == 'O') &&
5539: (UPP(4) == 'C') && (UPP(5) == 'T') &&
5540: (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5541: (UPP(8) == 'E')) {
5542: if ((!terminate) &&
5543: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5544: goto done;
5545: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5546: ctxt->sax->error(ctxt->userData,
5547: "Misplaced DOCTYPE declaration\n");
5548: ctxt->wellFormed = 0;
5549: sgmlParseDocTypeDecl(ctxt);
5550: } else if ((cur == '<') && (next == '!') &&
5551: (in->cur[2] == '-') && (in->cur[3] == '-')) {
5552: if ((!terminate) &&
5553: (sgmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
5554: goto done;
5555: #ifdef DEBUG_PUSH
5556: fprintf(stderr, "HPP: Parsing Comment\n");
5557: #endif
5558: sgmlParseComment(ctxt);
5559: ctxt->instate = XML_PARSER_CONTENT;
5560: } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5561: goto done;
5562: } else if ((cur == '<') && (next == '/')) {
5563: ctxt->instate = XML_PARSER_END_TAG;
5564: ctxt->checkIndex = 0;
5565: #ifdef DEBUG_PUSH
5566: fprintf(stderr, "HPP: entering END_TAG\n");
5567: #endif
5568: break;
5569: } else if (cur == '<') {
5570: ctxt->instate = XML_PARSER_START_TAG;
5571: ctxt->checkIndex = 0;
5572: #ifdef DEBUG_PUSH
5573: fprintf(stderr, "HPP: entering START_TAG\n");
5574: #endif
5575: break;
5576: } else if (cur == '&') {
5577: if ((!terminate) &&
5578: (sgmlParseLookupSequence(ctxt, ';', 0, 0) < 0))
5579: goto done;
5580: #ifdef DEBUG_PUSH
5581: fprintf(stderr, "HPP: Parsing Reference\n");
5582: #endif
5583: /* TODO: check generation of subtrees if noent !!! */
5584: sgmlParseReference(ctxt);
5585: } else {
5586: /* TODO Avoid the extra copy, handle directly !!!!!! */
5587: /*
5588: * Goal of the following test is :
5589: * - minimize calls to the SAX 'character' callback
5590: * when they are mergeable
5591: */
5592: if ((ctxt->inputNr == 1) &&
5593: (avail < SGML_PARSER_BIG_BUFFER_SIZE)) {
5594: if ((!terminate) &&
5595: (sgmlParseLookupSequence(ctxt, '<', 0, 0) < 0))
5596: goto done;
5597: }
5598: ctxt->checkIndex = 0;
5599: #ifdef DEBUG_PUSH
5600: fprintf(stderr, "HPP: Parsing char data\n");
5601: #endif
5602: sgmlParseCharData(ctxt, 0);
5603: }
5604: if (cons == ctxt->nbChars) {
5605: if (ctxt->node != NULL) {
5606: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5607: ctxt->sax->error(ctxt->userData,
5608: "detected an error in element content\n");
5609: ctxt->wellFormed = 0;
5610: NEXT;
5611: }
5612: break;
5613: }
5614:
5615: break;
5616: }
5617: case XML_PARSER_END_TAG:
5618: if (avail < 2)
5619: goto done;
5620: if ((!terminate) &&
5621: (sgmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
5622: goto done;
5623: sgmlParseEndTag(ctxt);
5624: if (ctxt->nameNr == 0) {
5625: ctxt->instate = XML_PARSER_EPILOG;
5626: } else {
5627: ctxt->instate = XML_PARSER_CONTENT;
5628: }
5629: ctxt->checkIndex = 0;
5630: #ifdef DEBUG_PUSH
5631: fprintf(stderr, "HPP: entering CONTENT\n");
5632: #endif
5633: break;
5634: case XML_PARSER_CDATA_SECTION:
5635: fprintf(stderr, "HPP: internal error, state == CDATA\n");
5636: ctxt->instate = XML_PARSER_CONTENT;
5637: ctxt->checkIndex = 0;
5638: #ifdef DEBUG_PUSH
5639: fprintf(stderr, "HPP: entering CONTENT\n");
5640: #endif
5641: break;
5642: case XML_PARSER_DTD:
5643: fprintf(stderr, "HPP: internal error, state == DTD\n");
5644: ctxt->instate = XML_PARSER_CONTENT;
5645: ctxt->checkIndex = 0;
5646: #ifdef DEBUG_PUSH
5647: fprintf(stderr, "HPP: entering CONTENT\n");
5648: #endif
5649: break;
5650: case XML_PARSER_COMMENT:
5651: fprintf(stderr, "HPP: internal error, state == COMMENT\n");
5652: ctxt->instate = XML_PARSER_CONTENT;
5653: ctxt->checkIndex = 0;
5654: #ifdef DEBUG_PUSH
5655: fprintf(stderr, "HPP: entering CONTENT\n");
5656: #endif
5657: break;
5658: case XML_PARSER_PI:
5659: fprintf(stderr, "HPP: internal error, state == PI\n");
5660: ctxt->instate = XML_PARSER_CONTENT;
5661: ctxt->checkIndex = 0;
5662: #ifdef DEBUG_PUSH
5663: fprintf(stderr, "HPP: entering CONTENT\n");
5664: #endif
5665: break;
5666: case XML_PARSER_ENTITY_DECL:
5667: fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
5668: ctxt->instate = XML_PARSER_CONTENT;
5669: ctxt->checkIndex = 0;
5670: #ifdef DEBUG_PUSH
5671: fprintf(stderr, "HPP: entering CONTENT\n");
5672: #endif
5673: break;
5674: case XML_PARSER_ENTITY_VALUE:
5675: fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
5676: ctxt->instate = XML_PARSER_CONTENT;
5677: ctxt->checkIndex = 0;
5678: #ifdef DEBUG_PUSH
5679: fprintf(stderr, "HPP: entering DTD\n");
5680: #endif
5681: break;
5682: case XML_PARSER_ATTRIBUTE_VALUE:
5683: fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
5684: ctxt->instate = XML_PARSER_START_TAG;
5685: ctxt->checkIndex = 0;
5686: #ifdef DEBUG_PUSH
5687: fprintf(stderr, "HPP: entering START_TAG\n");
5688: #endif
5689: break;
5690: case XML_PARSER_SYSTEM_LITERAL:
5691: fprintf(stderr, "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n");
5692: ctxt->instate = XML_PARSER_CONTENT;
5693: ctxt->checkIndex = 0;
5694: #ifdef DEBUG_PUSH
5695: fprintf(stderr, "HPP: entering CONTENT\n");
5696: #endif
5697: break;
5698: }
5699: }
5700: done:
5701: if ((avail == 0) && (terminate)) {
5702: sgmlAutoClose(ctxt, NULL);
5703: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5704: /*
5705: * SAX: end of the document processing.
5706: */
5707: ctxt->instate = XML_PARSER_EOF;
5708: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5709: ctxt->sax->endDocument(ctxt->userData);
5710: }
5711: }
5712: if ((ctxt->myDoc != NULL) &&
5713: ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5714: (ctxt->instate == XML_PARSER_EPILOG))) {
5715: xmlDtdPtr dtd;
5716: dtd = xmlGetIntSubset(ctxt->myDoc);
5717: if (dtd == NULL)
5718: ctxt->myDoc->intSubset =
5719: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "SGML",
5720: BAD_CAST "-//W3C//DTD SGML 4.0 Transitional//EN",
5721: BAD_CAST "http://www.w3.org/TR/REC-docbook/loose.dtd");
5722: }
5723: #ifdef DEBUG_PUSH
5724: fprintf(stderr, "HPP: done %d\n", ret);
5725: #endif
5726: return(ret);
5727: }
5728:
5729: /**
5730: * sgmlParseTry:
5731: * @ctxt: an SGML parser context
5732: *
5733: * Try to progress on parsing
5734: *
5735: * Returns zero if no parsing was possible
5736: */
5737: int
5738: sgmlParseTry(sgmlParserCtxtPtr ctxt) {
5739: return(sgmlParseTryOrFinish(ctxt, 0));
5740: }
5741:
5742: /**
5743: * sgmlParseChunk:
5744: * @ctxt: an XML parser context
5745: * @chunk: an char array
5746: * @size: the size in byte of the chunk
5747: * @terminate: last chunk indicator
5748: *
5749: * Parse a Chunk of memory
5750: *
5751: * Returns zero if no error, the xmlParserErrors otherwise.
5752: */
5753: int
5754: sgmlParseChunk(sgmlParserCtxtPtr ctxt, const char *chunk, int size,
5755: int terminate) {
5756: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5757: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5758: int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5759: int cur = ctxt->input->cur - ctxt->input->base;
5760:
5761: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5762: ctxt->input->base = ctxt->input->buf->buffer->content + base;
5763: ctxt->input->cur = ctxt->input->base + cur;
5764: #ifdef DEBUG_PUSH
5765: fprintf(stderr, "HPP: pushed %d\n", size);
5766: #endif
5767:
5768: if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5769: sgmlParseTryOrFinish(ctxt, terminate);
5770: } else if (ctxt->instate != XML_PARSER_EOF) {
5771: xmlParserInputBufferPush(ctxt->input->buf, 0, "");
5772: sgmlParseTryOrFinish(ctxt, terminate);
5773: }
5774: if (terminate) {
5775: if ((ctxt->instate != XML_PARSER_EOF) &&
5776: (ctxt->instate != XML_PARSER_EPILOG) &&
5777: (ctxt->instate != XML_PARSER_MISC)) {
5778: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
5779: ctxt->sax->error(ctxt->userData,
5780: "Extra content at the end of the document\n");
5781: ctxt->wellFormed = 0;
5782: ctxt->errNo = XML_ERR_DOCUMENT_END;
5783: }
5784: if (ctxt->instate != XML_PARSER_EOF) {
5785: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5786: ctxt->sax->endDocument(ctxt->userData);
5787: }
5788: ctxt->instate = XML_PARSER_EOF;
5789: }
5790: return((xmlParserErrors) ctxt->errNo);
5791: }
5792:
5793: /************************************************************************
5794: * *
5795: * User entry points *
5796: * *
5797: ************************************************************************/
5798:
5799: /**
5800: * sgmlCreatePushParserCtxt :
5801: * @sax: a SAX handler
5802: * @user_data: The user data returned on SAX callbacks
5803: * @chunk: a pointer to an array of chars
5804: * @size: number of chars in the array
5805: * @filename: an optional file name or URI
5806: * @enc: an optional encoding
5807: *
5808: * Create a parser context for using the SGML parser in push mode
5809: * To allow content encoding detection, @size should be >= 4
5810: * The value of @filename is used for fetching external entities
5811: * and error/warning reports.
5812: *
5813: * Returns the new parser context or NULL
5814: */
5815: sgmlParserCtxtPtr
5816: sgmlCreatePushParserCtxt(sgmlSAXHandlerPtr sax, void *user_data,
5817: const char *chunk, int size, const char *filename,
5818: xmlCharEncoding enc) {
5819: sgmlParserCtxtPtr ctxt;
5820: sgmlParserInputPtr inputStream;
5821: xmlParserInputBufferPtr buf;
5822:
5823: buf = xmlAllocParserInputBuffer(enc);
5824: if (buf == NULL) return(NULL);
5825:
5826: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5827: if (ctxt == NULL) {
5828: xmlFree(buf);
5829: return(NULL);
5830: }
5831: memset(ctxt, 0, sizeof(sgmlParserCtxt));
5832: sgmlInitParserCtxt(ctxt);
5833: if (sax != NULL) {
5834: if (ctxt->sax != &sgmlDefaultSAXHandler)
5835: xmlFree(ctxt->sax);
5836: ctxt->sax = (sgmlSAXHandlerPtr) xmlMalloc(sizeof(sgmlSAXHandler));
5837: if (ctxt->sax == NULL) {
5838: xmlFree(buf);
5839: xmlFree(ctxt);
5840: return(NULL);
5841: }
5842: memcpy(ctxt->sax, sax, sizeof(sgmlSAXHandler));
5843: if (user_data != NULL)
5844: ctxt->userData = user_data;
5845: }
5846: if (filename == NULL) {
5847: ctxt->directory = NULL;
5848: } else {
5849: ctxt->directory = xmlParserGetDirectory(filename);
5850: }
5851:
5852: inputStream = sgmlNewInputStream(ctxt);
5853: if (inputStream == NULL) {
5854: xmlFreeParserCtxt(ctxt);
5855: return(NULL);
5856: }
5857:
5858: if (filename == NULL)
5859: inputStream->filename = NULL;
5860: else
5861: inputStream->filename = xmlMemStrdup(filename);
5862: inputStream->buf = buf;
5863: inputStream->base = inputStream->buf->buffer->content;
5864: inputStream->cur = inputStream->buf->buffer->content;
5865:
5866: inputPush(ctxt, inputStream);
5867:
5868: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5869: (ctxt->input->buf != NULL)) {
5870: xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5871: #ifdef DEBUG_PUSH
5872: fprintf(stderr, "HPP: pushed %d\n", size);
5873: #endif
5874: }
5875:
5876: return(ctxt);
5877: }
5878:
5879: /**
5880: * sgmlSAXParseDoc :
5881: * @cur: a pointer to an array of xmlChar
5882: * @encoding: a free form C string describing the SGML document encoding, or NULL
5883: * @sax: the SAX handler block
5884: * @userData: if using SAX, this pointer will be provided on callbacks.
5885: *
5886: * parse an SGML in-memory document and build a tree.
5887: * It use the given SAX function block to handle the parsing callback.
5888: * If sax is NULL, fallback to the default DOM tree building routines.
5889: *
5890: * Returns the resulting document tree
5891: */
5892:
5893: sgmlDocPtr
5894: sgmlSAXParseDoc(xmlChar *cur, const char *encoding, sgmlSAXHandlerPtr sax, void *userData) {
5895: sgmlDocPtr ret;
5896: sgmlParserCtxtPtr ctxt;
5897:
5898: if (cur == NULL) return(NULL);
5899:
5900:
5901: ctxt = sgmlCreateDocParserCtxt(cur, encoding);
5902: if (ctxt == NULL) return(NULL);
5903: if (sax != NULL) {
5904: ctxt->sax = sax;
5905: ctxt->userData = userData;
5906: }
5907:
5908: sgmlParseDocument(ctxt);
5909: ret = ctxt->myDoc;
5910: if (sax != NULL) {
5911: ctxt->sax = NULL;
5912: ctxt->userData = NULL;
5913: }
5914: sgmlFreeParserCtxt(ctxt);
5915:
5916: return(ret);
5917: }
5918:
5919: /**
5920: * sgmlParseDoc :
5921: * @cur: a pointer to an array of xmlChar
5922: * @encoding: a free form C string describing the SGML document encoding, or NULL
5923: *
5924: * parse an SGML in-memory document and build a tree.
5925: *
5926: * Returns the resulting document tree
5927: */
5928:
5929: sgmlDocPtr
5930: sgmlParseDoc(xmlChar *cur, const char *encoding) {
5931: return(sgmlSAXParseDoc(cur, encoding, NULL, NULL));
5932: }
5933:
5934:
5935: /**
5936: * sgmlCreateFileParserCtxt :
5937: * @filename: the filename
5938: * @encoding: a free form C string describing the SGML document encoding, or NULL
5939: *
5940: * Create a parser context for a file content.
5941: * Automatic support for ZLIB/Compress compressed document is provided
5942: * by default if found at compile-time.
5943: *
5944: * Returns the new parser context or NULL
5945: */
5946: sgmlParserCtxtPtr
5947: sgmlCreateFileParserCtxt(const char *filename, const char *encoding)
5948: {
5949: sgmlParserCtxtPtr ctxt;
5950: sgmlParserInputPtr inputStream;
5951: xmlParserInputBufferPtr buf;
5952: /* sgmlCharEncoding enc; */
5953:
5954: buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
5955: if (buf == NULL) return(NULL);
5956:
5957: ctxt = (sgmlParserCtxtPtr) xmlMalloc(sizeof(sgmlParserCtxt));
5958: if (ctxt == NULL) {
5959: perror("malloc");
5960: return(NULL);
5961: }
5962: memset(ctxt, 0, sizeof(sgmlParserCtxt));
5963: sgmlInitParserCtxt(ctxt);
5964: inputStream = (sgmlParserInputPtr) xmlMalloc(sizeof(sgmlParserInput));
5965: if (inputStream == NULL) {
5966: perror("malloc");
5967: xmlFree(ctxt);
5968: return(NULL);
5969: }
5970: memset(inputStream, 0, sizeof(sgmlParserInput));
5971:
5972: inputStream->filename = xmlMemStrdup(filename);
5973: inputStream->line = 1;
5974: inputStream->col = 1;
5975: inputStream->buf = buf;
5976: inputStream->directory = NULL;
5977:
5978: inputStream->base = inputStream->buf->buffer->content;
5979: inputStream->cur = inputStream->buf->buffer->content;
5980: inputStream->free = NULL;
5981:
5982: inputPush(ctxt, inputStream);
5983: return(ctxt);
5984: }
5985:
5986: /**
5987: * sgmlSAXParseFile :
5988: * @filename: the filename
5989: * @encoding: a free form C string describing the SGML document encoding, or NULL
5990: * @sax: the SAX handler block
5991: * @userData: if using SAX, this pointer will be provided on callbacks.
5992: *
5993: * parse an SGML file and build a tree. Automatic support for ZLIB/Compress
5994: * compressed document is provided by default if found at compile-time.
5995: * It use the given SAX function block to handle the parsing callback.
5996: * If sax is NULL, fallback to the default DOM tree building routines.
5997: *
5998: * Returns the resulting document tree
5999: */
6000:
6001: sgmlDocPtr
6002: sgmlSAXParseFile(const char *filename, const char *encoding, sgmlSAXHandlerPtr sax,
6003: void *userData) {
6004: sgmlDocPtr ret;
6005: sgmlParserCtxtPtr ctxt;
6006: sgmlSAXHandlerPtr oldsax = NULL;
6007:
6008: ctxt = sgmlCreateFileParserCtxt(filename, encoding);
6009: if (ctxt == NULL) return(NULL);
6010: if (sax != NULL) {
6011: oldsax = ctxt->sax;
6012: ctxt->sax = sax;
6013: ctxt->userData = userData;
6014: }
6015:
6016: sgmlParseDocument(ctxt);
6017:
6018: ret = ctxt->myDoc;
6019: if (sax != NULL) {
6020: ctxt->sax = oldsax;
6021: ctxt->userData = NULL;
6022: }
6023: sgmlFreeParserCtxt(ctxt);
6024:
6025: return(ret);
6026: }
6027:
6028: /**
6029: * sgmlParseFile :
6030: * @filename: the filename
6031: * @encoding: a free form C string describing the SGML document encoding, or NULL
6032: *
6033: * parse an SGML file and build a tree. Automatic support for ZLIB/Compress
6034: * compressed document is provided by default if found at compile-time.
6035: *
6036: * Returns the resulting document tree
6037: */
6038:
6039: sgmlDocPtr
6040: sgmlParseFile(const char *filename, const char *encoding) {
6041: return(sgmlSAXParseFile(filename, encoding, NULL, NULL));
6042: }
6043:
6044: #endif /* LIBXML_SGML_ENABLED */
Webmaster