Annotation of 2001/rfc2html/rfc2html.pl, revision 1.9
#!/usr/local/bin/perl
# $Id: rfc2html.pl,v 1.8 2004/09/01 13:21:38 connolly Exp $
#
# formerly:
#   http://www.w3.org/Protocols/rfc2616/rfc2html.pl
#   Id: rfc2html.pl,v 1.13 2000/08/02 09:43:05 ylafon Exp
#
# Usage:
#   perl rfc2html.pl --title '...' --stitle '...' ... (@@see getopt call below)
#        in_rfc.txt >rfcNNN.html
#   also creates $base-secN.html for each section N
#
# Features
#   splits by section
#   marks up TOC with hypertext links
#     using rel=Section per HTML 4.0
#   marks up section headings with anchors
#   marks up indented sections as <pre>
#   marks up paragraphs and definition lists
#   marks up cross references and bibliographic references
#     using rel=xref, rel=bibref
#     (some false matches)
#   marks up references section with links to other RFCs and docs
#   creates well-formed XML output
#
#
# TODO
#   markup ul, ol in body text as such rather than as <pre>
#   generalize &convert() params: title, short title, basename, bibsection
#     for other RFCs
#
# BY
#   Dan Connolly <connolly@w3.org>
#   http://www.w3.org/People/Connolly/
#
#   with thanks to Pete Whiting for a fix on 19 Jan 2000
#
# LICENSE
#
# Copyright (c) 1999-2001 World Wide Web Consortium (W3C, http://www.w3.org/),
# (Massachusetts Institute of Technology, Institut National de
# Recherche en Informatique et en Automatique, Keio University). All
# Rights Reserved.
#
# Permission to use, copy, modify, and distribute this software
# and its documentation for any purpose and without fee or
# royalty is hereby granted, per the terms and conditions in
#
# W3C Intellectual Property Notice and Legal Disclaimers
# http://www.w3.org/COPYRIGHT
# 1999/07/28 13:54:29
52:
use strict;
use Getopt::Long;

# XML namespace written on the <html> element of every generated page.
my($xmlns) = 'http://www.w3.org/1999/xhtml';

# Command-line options and their placeholder defaults:
#   --title   full document title (index page <title>, section back-links)
#   --stitle  short title (section page <title>s, colophon)
#   --base    basename for output files: $base-secN.html
#   --bibsec  number of the section holding the bibliography
#   --by      author string; input lines starting with it are skipped
#   --docno   document number; input lines starting with it are skipped
my($title)  = 'TITLE';
my($stitle) = 'STITLE';
my($base)   = 'rfcNNNN';
my($bibsec) = 20;
my($by)     = 'AUTHORS';
my($docno)  = 'RFC NNNN';

GetOptions('title=s'  => \$title,
           'stitle=s' => \$stitle,
           'base=s'   => \$base,
           'bibsec=s' => \$bibsec,
           'by=s'     => \$by,
           'docno=s'  => \$docno);

# Remaining arguments (or STDIN) are read through <> inside convert().
convert($title, $stitle, $base, $bibsec, $by, $docno);
71:
# convert($title, $stitle, $base, $bibsec, $by, $docno)
#
# Reads the RFC text from <> line by line and drives a small state machine
# that writes XHTML to the currently selected output handle.  The index
# page (banner, title, table of contents) goes to whatever handle is
# selected on entry; startSection() re-selects a per-section file each time
# a new top-level section begins.
#
# States: start, banner, title, body, toc, block, p, note, dd, bib, pre.
#
# NOTE(review): the leading-whitespace counts in several patterns below
# (e.g. /^ \S/, /^ (Note:)/, /^ ?\S/) look like they were collapsed to a
# single space somewhere along the way -- a comment further down mentions a
# "three space rule", and as written two branches can never be reached.
# The patterns are kept exactly as found; confirm against a pristine copy
# of the script before relying on the paragraph/dl/pre classification.
sub convert {
    my($title, $stitle, $base, $bibsec, $by, $docno) = @_;
    my($state);
    my($firstLine, $list, $toclevel, $citation);

    $state = 'start';

    print '<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
    print "\n<html xmlns='$xmlns'>\n";
    print "<head><title>$title</title></head><body>\n";

    while(<>){
        # warn "$state $_";

        # Escape markup-significant characters so the output stays
        # well-formed; '&' must be handled first.
        s/&/&amp;/g;
        s/</&lt;/g;

        # Skip page header/footer lines.  $by and $docno are plain text
        # from the command line, so quote them rather than treating them
        # as patterns.
        if(/^\Q$by\E / or /^\Q$docno\E /){
            #print STDERR "skipped", $_;
            next;
        }

        #print STDERR " $state: $_";
        #print STDERR "$state: note: [[^$_]]\n" if (/^ (Note:)/);

        s/^(\d+)\. /$1 /; # copyright statement header is goofy

        # --- start: wait for the first non-blank line, open the banner
        if($state eq 'start'){
            if(/\S/){
                print "<pre>\n";
                print $_;
                $state = 'banner';
            }
        }
        # --- banner: copy lines verbatim until the first blank line
        elsif($state eq 'banner'){
            if(/^\s*$/){ # blank line
                $state = 'title';
                print "</pre>\n";
                print "<h1>\n";
            }else{
                print $_;
            }
        }
        # --- title: the first non-blank line after the banner is the <h1>
        elsif($state eq 'title'){
            if(/\S/){
                print $_;
                print "</h1>\n";
                $state = 'body';
            }
        }

        # --- body: between blocks; classify the next non-blank line
        elsif($state eq 'body'){
            if(/^\s*$/){ # blank line
                # nothing
            }
            elsif(/^Table of Contents/){
                print "<div class='toc'><h2>Table of Contents</h2\n>";
                $state = 'toc';
                $toclevel = 0;
            }
            elsif(/^ ((\d+)(\.\d+)?([\.\d]+)?)\s+([^\.]+)\.\./){
                # A TOC entry seen without a "Table of Contents" heading;
                # only a top-level entry is acceptable here.
                my($num, $sec, $sub, $subsub, $heading) = ($1, $2, $3, $4, $5);
                die "bad toc $toclevel: $_" if ($sub || $subsub);

                print "<ol class='toc'>\n";
                print "<li><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num\n";
                $state = 'toc';
                $toclevel = 1;
            }
            elsif(/^(([\dA-Z]+)((\.\d+)*))\.?\s+([^\n]*)/){
                # Numbered (or lettered) heading at the left margin.
                my($sec, $sub, $heading) = ($2, $3, $5);

                changeSection($sec, $sub, $heading,
                              $title, $stitle, $base, $docno, $by);
            }
            elsif(/^\S/){
                # Any other flush-left line is an unnumbered heading.
                print "<h2>\n";
                print $_;
                print "</h2>\n";
            }
            elsif(/^ (\[(\d+)\])\s+(.*)/){
                # First bibliography entry; cite() is called once the
                # whole (possibly multi-line) entry has been collected.
                $citation = $_;
                print "<dl class='bib'>\n";
                $state = 'bib';
            }
            elsif(/^ (Note:)/){
                $_ = addrefs($base, $bibsec, $_);

                # Re-match on the linkified text so the references on the
                # note's first line are actually emitted.  addrefs() only
                # rewrites digits and [n], so the "Note:" prefix is intact.
                my($label, $rest) = /^ (Note:)(.*)/s;
                print "<p><strong>$label</strong>$rest";
                $list = undef;
                $state = 'note';
            }
            elsif(/^ \S/){
                $_ = addrefs($base, $bibsec, $_);

                # Hold the line back: the next line decides whether this
                # is a paragraph or a definition-list term.
                $firstLine = $_;
                $list = undef;
                $state = 'block';
            }
            elsif(/^ /){
                die "$state: unflushed $firstLine [[$firstLine]]" if $firstLine;

                print "<pre>$_";
                $state = 'pre';
            }
            else{
                die "$state: what? [[$_]]";
            }
        }

        # --- toc: nested <ol> lists, up to three levels deep
        elsif($state eq 'toc'){
            if(/^\s*$/ # skip blank lines in TOC
               || /^IMAP4rev1/ || /^Appendices/){
                print STDERR "state: $state skipped", $_;
                next;
            }

            if(/\.\d+\s*$/ &&
               /^( )?(([A-Z\d]+)(\.\d+)?([\.\d]+)?\.?)\s+([^\.]+)/){
                # A TOC entry: ends in ".<page>" and starts with a number.
                my($num, $sec, $sub, $subsub, $heading) = ($2, $3, $4, $5, $6);
                $num =~ s/\.$//;
                print STDERR "@@ TOC: $num, $sec, $sub, $subsub, $heading\n";

                # $endli closes the previous <li>; it is blanked whenever
                # a nested <ol> has just been opened inside that <li>.
                my($endli);
                $endli = "</li>";

                if($toclevel == 0){
                    print "<ol>\n";
                    $toclevel = 1;
                    $endli = '';
                }
                elsif($toclevel == 1){
                    if($sub){
                        print "\n<ol>\n";
                        $toclevel = 2;
                        $endli = '';
                        if($subsub){
                            warn "skipping a TOC level...";
                            print "<li><span>@@ missing</span>\n";

                            print "\n<ol>\n";
                            $toclevel = 3;
                            $endli = '';
                        }
                    }
                }
                elsif($toclevel == 2){
                    if($sub){
                        if($subsub){
                            print "\n<ol>\n";
                            $toclevel = 3;
                            $endli = '';
                        }
                    }
                    else{
                        print "</li>\n</ol>\n";
                        $toclevel = 1;
                    }
                }elsif($toclevel == 3){
                    if($subsub){
                        # stay at level 3
                    }else{
                        print "</li>\n</ol>\n";
                        $toclevel = 2;
                        if($sub){
                        }else{
                            print "</li>\n</ol>\n";
                            $toclevel = 1;
                        }
                    }
                }

                print $endli;
                print "\n";
                print " " x $toclevel;
                print "<li><span><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num</span>";
            }
            elsif(/^(([\dA-Z]+)((\.\d+)*))\.?\s+([^\n]*)/){
                # First real section heading: the TOC is over.  Finish the
                # index page and switch output to the section's own file.
                my($sec, $sub, $heading) = ($2, $3, $5);

                if($toclevel == 1){
                    print "</li></ol></div\n>";
                    $state = 'body';
                    colophon($base, $docno, $stitle, $by);
                    startSection($sec, $sub, $heading,
                                 $title, $stitle, $base, $docno, $by);
                }else{
                    warn "blank line in toc level $toclevel";
                }
            }

            else{
                die "$state: what? [[$_]]";
            }
        }


        # --- block: one line is held in $firstLine; decide what it was
        elsif($state eq 'block'){
            if(/^((\d+)((\.\d+)*))\s+([^\n]*)/){
                my($sec, $sub, $heading) = ($2, $3, $5);

                print "</dl>\n";
                changeSection($sec, $sub, $heading,
                              $title, $stitle, $base, $docno, $by);
                $state = 'body';
            }
            elsif(/^\S/){
                print "</dl>\n";
                print "<h2>$_</h2>\n";
                $state = 'body';
            }
            elsif(/^ \S/){
                # warn "hit this $_";
                if($firstLine){
                    # Same indent as the held line: it was a paragraph.
                    print "</$list>\n" if $list;

                    print "<p>\n";
                    print $firstLine;
                    $firstLine = undef;
                    print $_;
                    $state = 'p';
                }else{
                    $firstLine = $_;
                }
            }
            # NOTE(review): with the patterns as written this branch and
            # the next are shadowed by /^ \S/ above -- see header note.
            elsif(/^ (Note:)/){
                if($list eq 'dl'){ print "<dd>" };
                print "<p><strong>$1</strong>$'";
                $state = 'note';
            }
            elsif(/^ ?\S/){
                # Deeper indent than the held line: the held line was a
                # definition term and this line starts its definition.
                if($list ne 'dl'){
                    print "</$list>\n" if $list;
                    print "<dl>\n";
                }

                print " <dt>$firstLine</dt>";
                $firstLine = undef;
                print " <dd>$_";
                $state = 'dd';
            }
            elsif(/^\s*$/){ # added the $ - don't want to drop a line with text
                if($firstLine){
                    # A one-line paragraph.
                    print "<p>\n";
                    print $firstLine;
                    $firstLine = undef;
                    print "</p>\n";
                    $state = 'body';
                }
            }
            elsif(/^\s*\S/){
                # this missed all of the above, but it looks like valid text, so
                # lets just use it like it was a normal paragraph. I wanted to keep
                # this separate from the three space rule, but it has the exact same
                # behavior.
                if($firstLine){
                    print "</$list>\n" if $list;

                    print "<p>\n";
                    print $firstLine;
                    $firstLine = undef;
                    print $_;
                    $state = 'p';
                }else{
                    $firstLine = $_;
                }
            }
            else{
                die "$state: what? [[$_]]\n firstline: [[$firstLine]] list: [[$list]]";
            }
        }

        # --- p: inside a paragraph, until a blank line
        elsif($state eq 'p'){
            $_ = addrefs($base, $bibsec, $_);

            if(/^ *\S/){
                print $_;
            }
            elsif(/^\s*$/){
                print "</p>\n";
                $state = 'body';
            }
            else{
                die "$state: what? [[$_]]";
            }
        }

        # --- note: inside a "Note:" paragraph, until a blank line
        elsif($state eq 'note'){
            $_ = addrefs($base, $bibsec, $_);

            if(/^ \S/){
                print $_;
            }
            elsif(/^\s*$/){
                print "</p>\n";
                if($list eq 'dl'){ print "</dd>" };
                # Return to the enclosing list if there is one.
                $state = ($list ? 'block' : 'body');
            }
            else{
                die "$state: what? [[$_]]";
            }
        }

        # --- dd: inside a definition, until a blank line
        elsif($state eq 'dd'){
            $_ = addrefs($base, $bibsec, $_);

            if(/^ ?\S/){
                print $_;
            }
            # NOTE(review): shadowed by the branch above as written.
            elsif(/^ \S/){
                print STDERR "$state: pre: [[$_]]\n";
                print "<pre>$_</pre>";
            }
            elsif(/^\s*$/){
                $list = 'dl' unless $list;
                print "</dd>\n";
                $state = 'block';
            }
            else{
                die "$state: what? [[$_]]";
            }
        }

        # --- bib: collecting bibliography entries into $citation
        elsif($state eq 'bib'){
            if(/^ (\[(\d+)\])\s+(.*)/){
                # A new entry starts: emit the one collected so far.
                cite($citation) if $citation;

                $citation = $_;
            }
            elsif(/^((\d+)((\.\d+)*))\s+([^\n]*)/){
                my($sec, $sub, $heading) = ($2, $3, $5);

                cite($citation) if $citation;
                print "</dl>\n";

                changeSection($sec, $sub, $heading,
                              $title, $stitle, $base, $docno, $by);
                $state = 'body';
            }
            else{
                # Continuation line of the current entry.
                $citation = $citation . $_;
            }
        }

        # --- pre: preformatted block, until a blank line
        elsif($state eq 'pre'){
            $_ = addrefs($base, $bibsec, $_);

            die "firstline: $_" if $firstLine;

            if(/^\s*$/){
                print "</pre>\n";
                $state = 'body';
            }else{
                print $_;
            }
        }

        else{
            die "unkown state $state";
        }
    }

    # Closes whichever page is currently selected (the last section file
    # once startSection() has been called).
    print "</body></html>\n";

}
440:
# addrefs($base, $bibsec, $line)
#
# Returns a copy of $line with hypertext links added:
#   * a space followed by a dotted section number such as 3.2.1 becomes
#     a rel='xref' link into $base-sec<top-level-number>.html;
#   * a bracketed number such as [12] becomes a rel='bibref' link into
#     the bibliography page $base-sec$bibsec.html.
# Matching is purely textual, so any " N.M" in running text is linked,
# whether or not it is really a section number.
sub addrefs {
    my($base, $bibsec, $line) = @_;

    # $1 is the full dotted number, $2 its leading (top-level) component.
    $line =~ s{ ((\d+)(\.\d+)+)}{ <a rel='xref' href='${base}-sec$2.html#sec$1'>$1</a>}g;

    # $1 is the bracketed label, $2 the bare number.
    $line =~ s{(\[(\d+)\])}{<a rel='bibref' href='${base}-sec${bibsec}.html#bib$2'>$1</a>}g;

    return $line;
}
449:
# changeSection($sec, $sub, $heading, $title, $stitle, $base, $docno, $by)
#
# Handles a numbered heading found in the body text.
#   * With a non-empty $sub (e.g. ".2.1") it is a subsection: an <h3>
#     with anchor id 'sec$sec$sub' is printed on the current page.
#   * Otherwise a new top-level section begins: the current page is
#     closed and startSection() opens the page for section $sec.
# All output goes to the currently selected handle.
sub changeSection {
    my($sec, $sub, $heading,
       $title, $stitle, $base, $docno, $by) = @_;

    #print STDERR "section: [[$sec]][[$sub]][[$heading]]\n";

    if($sub){
        print "<h3><a id='sec$sec$sub'>$sec$sub</a> $heading</h3>\n";
    }else{
        print "</body></html>\n";

        startSection($sec, $sub, $heading,
                     $title, $stitle, $base, $docno, $by);
    }
}
465:
# startSection($sec, $sub, $heading, $title, $stitle, $base, $docno, $by)
#
# Opens $base-sec$sec.html for writing, makes it the selected output
# handle (so every later bare print lands in that file), and writes the
# page preamble: doctype, <head>, a back-link to $base.html and the <h2>
# section heading with anchor id 'sec$sec$sub'.
#
# Dies if the file cannot be created.  The handle is deliberately the
# global SECTION: re-opening it for the next section closes the previous
# file.
sub startSection {
    my($sec, $sub, $heading,
       $title, $stitle, $base, $docno, $by) = @_;

    #print STDERR "section: [[$sec]][[$sub]][[$heading]]\n";

    my $path = "$base-sec$sec.html";

    # Three-argument open so characters in $base cannot change the mode,
    # and a checked result so a failure is not silently ignored.
    open(SECTION, '>', $path) or die "cannot write $path: $!";
    select(SECTION);

    print '<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
    print "\n<html xmlns='$xmlns'>\n";
    print "<head><title>$stitle: $heading</title></head>\n";

    print "<body><address>part of <a rev='Section' href='$base.html'>$title</a><br />\n";
    print "$docno $by</address>\n";
    print "<h2><a id='sec$sec$sub'>$sec$sub</a> $heading</h2>\n";
}
485:
# cite($citation)
#
# Prints one bibliography entry as a <dt>/<dd> pair on the currently
# selected handle.  $citation is the raw text of the entry, possibly
# spanning several lines, expected to look like
#     [n] Authors, "Title", rest...
# Text that does not start with a [n] label produces no output.
#
# The title is linked when an address can be derived: an "RFC nnnn"
# mention gives the ietf.org text URL, and an explicit ftp/http/https URL
# in the entry takes precedence over that.
sub cite {
    my($citation) = @_;

    my $text = $citation;
    my($num, $label, $by, $title, $addr);

    # Normalise whitespace so a multi-line entry becomes one line.
    $text =~ s/^\s*//;
    $text =~ s/\s+/ /g;

    $text =~ s/HTTPLat ency.html/HTTPLatency.html/; # URL split across lines

    if($text =~ s/^(\[(\d+)\])\s*//){
        ($num, $label) = ($2, $1);

        # Everything before the quoted title is the author list.
        if($text =~ s/^([^\"]*)\"([^\"]+)\"//){
            ($by, $title) = ($1, $2);
        }

        if($text =~ /RFC (\d+)/){
            my($RFCAddrFormat) = "http://www.ietf.org/rfc/rfc%04d.txt";
            $addr = sprintf("$RFCAddrFormat", $1);
        }

        if($text =~ m;((?:ftp|https?)://[^,> ]+);){
            $addr = $1;
            $addr =~ s/\.$//; # period at the end of a URL is probably punctuation
        }

        print "<dt><a id='bib$num'>$label</a></dt>\n";
        if($addr){
            print "<dd>$by <cite><a href='$addr'>$title</a></cite> $text</dd>\n";
        }else{
            print "<dd>$by <cite>$title</cite> $text</dd>\n";
        }
    }
}
523:
# colophon($base, $docno, $stitle, $by)
#
# Prints the closing <address> block of the index page -- a link to the
# source RFC text at ietf.org plus the revision of this script -- and then
# closes the page's <body> and <html>.  Output goes to the currently
# selected handle.
sub colophon {
    my($base, $docno, $stitle, $by) = @_;
    my($revdate);

    # CVS keyword string; the '$' delimiters are stripped for display.
    $revdate = '$Revision: 1.8 $ $Date: 2004/09/01 13:21:38 $';
    $revdate =~ s/\$//g;

    print "<address>\n";
    print "derived from <cite><a rel='derived-from' href='http://www.ietf.org/rfc/$base.txt'>$stitle</a></cite>, Internet $docno, $by<br class=''/>\n";
    print "using <a href='http://dev.w3.org/cvsweb/2001/rfc2html/'>rfc2html</a> ", $revdate, " by ";
    print "<a href='http://www.w3.org/People/Connolly/'>Dan Connolly</a>\n";
    print "</address>\n";
    print "</body></html>\n";
}
537:
Webmaster