Annotation of 2001/rfc2html/rfc2html.pl, revision 1.4
1.1 connolly 1: #!/usr/local/bin/perl
1.4 ! connolly 2: # $Id: rfc2html.pl,v 1.3 2001/06/28 14:55:02 connolly Exp $
1.1 connolly 3: #
4: # formerly:
5: # http://www.w3.org/Protocols/rfc2616/rfc2html.pl
6: # Id: rfc2html.pl,v 1.13 2000/08/02 09:43:05 ylafon Exp
7: #
8: # Usage:
9: # perl rfc2html.pl --title '...' --stitle '...' ... (@@see getopt call below)
10: # in_rfc.txt >rfcNNN.html
11: # also creates $base-secN.html for each section N
12: #
13: # Features
14: # splits by section
15: # marks up TOC with hypertext links
16: # using rel=Section per HTML 4.0
17: # marks up section headings with anchors
18: # marks up indented sections as <pre>
19: # marks up paragraphs and definition lists
20: # marks up cross references and bibliographic references
21: # using rel=xref, rel=bibref
22: # (some false matches)
23: # marks up references section with links to other RFCs and docs
24: # creates well-formed XML output
25: #
26: #
27: # TODO
28: # markup ul, ol in body text as such rather than as <pre>
29: # generalize &convert() params: title, short title, basename, bibsection
30: # for other RFCs
31: #
32: # BY
33: # Dan Connolly <connolly@w3.org>
34: # http://www.w3.org/People/Connolly/
35: #
36: # with thanks to Pete Whiting for a fix on 19 Jan 2000
37: #
38: # LICENSE
39: #
40: # Copyright (c) 1999-2001 World Wide Web Consortium (W3C, http://www.w3.org/),
41: # (Massachusetts Institute of Technology, Institut National de
42: # Recherche en Informatique et en Automatique, Keio University). All
43: # Rights Reserved.
44: #
45: # Permission to use, copy, modify, and distribute this software
46: # and its documentation for any purpose and without fee or
47: # royalty is hereby granted, per the terms and conditions in
48: #
49: # W3C Intellectual Property Notice and Legal Disclaimers
50: # http://www.w3.org/COPYRIGHT
51: # 1999/07/28 13:54:29
52:
53: use strict;
54: use Getopt::Long;
55:
# XHTML namespace URI written on the root element of every generated page.
# File-scoped on purpose: convert() and startSection() both read it.
my($xmlns) = 'http://www.w3.org/1999/xhtml';

# Placeholder defaults; each one can be overridden by the command-line
# option of the same name.
my($title) = 'TITLE';       # full document title
my($stitle) = 'STITLE';     # short title, used in per-section page titles
my($base) = 'rfcNNNN';      # basename of the generated files
my($bibsec) = 20;           # number of the section holding the references
my($by) = 'AUTHORS';        # author string
my($docno) = 'RFC NNNN';    # document number

# GetOptions returns false when the command line could not be parsed (it has
# already reported the details on STDERR).  Stop here rather than converting
# with placeholder values.
GetOptions('title=s' => \$title,
           'stitle=s' => \$stitle,
           'base=s' => \$base,
           'bibsec=s' => \$bibsec,
           'by=s' => \$by,
           'docno=s' => \$docno)
    or die "usage: perl rfc2html.pl [--title T] [--stitle T] [--base NAME] [--bibsec N] [--by AUTHORS] [--docno DOCNO] in_rfc.txt >rfcNNN.html\n";

convert($title, $stitle, $base, $bibsec, $by, $docno);
71:
# convert: read the text on <> line by line and print HTML to the currently
# selected output handle, driven by a line-oriented state machine.
#
# Parameters (positional):
#   $title  - document title, printed in the <title> of the first page
#   $stitle - short title, passed on to changeSection/startSection/colophon
#   $base   - basename used when building links and per-section files
#   $bibsec - section number used by addrefs for bibliographic links
#   $by     - author string; also used below as a pattern for lines to skip
#   $docno  - document number; also used below as a pattern for lines to skip
#
# States: start, banner, title, body, toc, block, p, note, dd, bib, pre.
# Any line that a state does not recognise makes the sub die.
#
# NOTE(review): several patterns below that should differ only in the amount
# of leading white space look identical in this copy, which leaves some later
# branches unreachable as written. Presumably runs of spaces were collapsed
# when this listing was produced - confirm against the repository copy.
72: sub convert{
73: my($title, $stitle, $base, $bibsec, $by, $docno) = @_;
74: my($state);
# $firstLine: an indented line held back until the next line shows what it is
# $list:      name of the list element currently open in block context
# $toclevel:  current nesting depth of the table of contents (0 to 3)
# $citation:  text of the bibliography entry being accumulated
75: my($firstLine, $list, $toclevel, $citation);
76:
77: $state = 'start';
78:
79: print "<html xmlns='$xmlns'>\n";
80: print "<head><title>$title</title></head><body>\n";
81:
82: while(<>){
83: # warn "$state $_";
# NOTE(review): as written, each of the next two substitutions replaces a
# character with itself. Presumably the replacements were HTML entities that
# were lost when this listing was produced - confirm against the repository.
84: s/&/&/g;
85: s/</</g;
86:
# Drop lines that start with the author string or the document number
# followed by a space. Both values are interpolated as regex source.
87: if(/^$by / or /^$docno /){
1.2 connolly 88: #print STDERR "skipped", $_;
1.1 connolly 89: next;
90: }
91:
1.2 connolly 92: #print STDERR " $state: $_";
93: #print STDERR "$state: note: [[^$_]]\n" if (/^ (Note:)/);
1.1 connolly 94:
95: s/^(\d+)\. /$1 /; # copyright statement header is goofy
96:
# start: wait for the first non-blank line, then open the banner <pre>.
# The line that triggers the transition is not itself printed.
97: if($state eq 'start'){
98: if(/\S/){
99: print "<pre>\n";
100: $state = 'banner';
101: }
102: }
# banner: copy lines through until a blank line, then open the <h1>.
103: elsif($state eq 'banner'){
104: if(/^\s*$/){ # blank line
105: $state = 'title';
106: print "</pre>\n";
107: print "<h1>\n";
108: }else{
109: print $_;
110: }
111: }
# title: the first non-blank line becomes the whole <h1> content.
112: elsif($state eq 'title'){
113: if(/\S/){
114: print $_;
115: print "</h1>\n";
116: $state = 'body';
117: }
118: }
119:
# body: between blocks; classify the line and pick the next state.
120: elsif($state eq 'body'){
121: if(/^\s*$/){ # blank line
122: # nothing
123: }
124: elsif(/^Table of Contents/){
125: print "<div class='toc'><h2>Table of Contents</h2\n>";
126: $state = 'toc';
127: $toclevel = 0;
128: }
# An indented, numbered line followed by dots is taken as a TOC entry that
# appears without a Table of Contents heading; only top-level numbers are
# accepted here.
129: elsif(/^ ((\d+)(\.\d+)?([\.\d]+)?)\s+([^\.]+)\.\./){
130: my($num, $sec, $sub, $subsub, $heading) = ($1, $2, $3, $4, $5);
131: die "bad toc $toclevel: $_" if ($sub || $subsub);
132:
133: print "<ol class='toc'>\n";
1.2 connolly 134: print "<li><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num\n";
1.1 connolly 135: $state = 'toc';
136: $toclevel = 1;
137: }
# Numbered (or lettered) heading at the left margin: section change.
138: elsif(/^(([\dA-Z]+)((\.\d+)*))\.?\s+([^
]*)/){
139: my($sec, $sub, $heading) = ($2, $3, $5);
140:
141: &changeSection($sec, $sub, $heading,
142: $title, $stitle, $base, $docno, $by);
143: }
# Any other line starting at the left margin is printed as an <h2>.
144: elsif(/^\S/){
145: print "<h2>\n";
146: print $_;
147: print "</h2>\n";
148: }
# Bracketed number: first entry of a bibliography list.
149: elsif(/^ (\[(\d+)\])\s+(.*)/){
150: $citation = $_;
151: print "<dl class='bib'>\n";
152: $state = 'bib';
153: }
# NOTE(review): the text printed here comes from $1 and the post-match
# variable of the elsif test, not from the addrefs result stored in $_ -
# confirm whether links were intended on this first line of a note.
154: elsif(/^ (Note:)/){
155: $_ = &addrefs($base, $bibsec, $_);
156:
157: print "<p><strong>$1</strong>$'";
158: $list = undef;
159: $state = 'note';
160: }
# Indented text: hold the line; the block state decides what it starts.
161: elsif(/^ \S/){
162: $_ = &addrefs($base, $bibsec, $_);
163:
164: $firstLine = $_;
165: $list = undef;
166: $state = 'block';
167: }
# Remaining lines that start with a space open a <pre> block.
168: elsif(/^ /){
169: die "$state: unflushed $firstLine [[$firstLine]]" if $firstLine;
170:
171: print "<pre>$_";
172: $state = 'pre';
173: }
174: else{
175: die "$state: what? [[$_]]";
176: }
177: }
178:
# toc: inside the table of contents.
179: elsif($state eq 'toc'){
1.2 connolly 180: if(/^\s*$/ # skip blank lines in TOC
181: || /^IMAP4rev1/ || /^Appendices/){
182: print STDERR "state: $state skipped", $_;
1.1 connolly 183: next;
184: }
185:
# A TOC entry must end in a dot followed by digits and start with a section
# number; $sub and $subsub are set for second and deeper numbering levels.
1.2 connolly 186: if(/\.\d+\s*$/ &&
187: /^( )?(([A-Z\d]+)(\.\d+)?([\.\d]+)?\.?)\s+([^\.]+)/){
1.1 connolly 188: my($num, $sec, $sub, $subsub, $heading) = ($2, $3, $4, $5, $6);
189: $num =~ s/\.$//;
190: print STDERR "@@ TOC: $num, $sec, $sub, $subsub, $heading\n";
191:
# $endli closes the previous item. It is blanked whenever a nested <ol> has
# just been opened, so that the parent item stays open around the sublist.
1.2 connolly 192: my($endli);
193: $endli = "</li>";
194:
195: if($toclevel == 0){
196: print "<ol>\n";
197: $toclevel = 1;
198: $endli = '';
199: }
200: elsif($toclevel == 1){
1.1 connolly 201: if($sub){
1.2 connolly 202: print "\n<ol>\n";
1.1 connolly 203: $toclevel = 2;
1.2 connolly 204: $endli = '';
# Jump from level 1 straight to level 3: emit a placeholder level 2 item.
1.1 connolly 205: if($subsub){
1.2 connolly 206: warn "skipping a TOC level...";
207: print "<li><span>@@ missing</span>\n";
208:
209: print "\n<ol>\n";
1.1 connolly 210: $toclevel = 3;
1.2 connolly 211: $endli = '';
1.1 connolly 212: }
213: }
214: }
215: elsif($toclevel == 2){
216: if($sub){
217: if($subsub){
1.2 connolly 218: print "\n<ol>\n";
1.1 connolly 219: $toclevel = 3;
1.2 connolly 220: $endli = '';
1.1 connolly 221: }
222: }
223: else{
1.2 connolly 224: print "</li>\n</ol>\n";
1.1 connolly 225: $toclevel = 1;
226: }
227: }elsif($toclevel == 3){
228: if($subsub){
229: # stay at level 3
230: }else{
# Leaving level 3: close one list, and a second one if the new entry is a
# top-level section.
1.2 connolly 231: print "</li>\n</ol>\n";
1.1 connolly 232: $toclevel = 2;
233: if($sub){
234: }else{
1.2 connolly 235: print "</li>\n</ol>\n";
1.1 connolly 236: $toclevel = 1;
237: }
238: }
239: }
240:
1.2 connolly 241: print $endli;
242: print "\n";
243: print " " x $toclevel;
244: print "<li><span><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num</span>";
1.1 connolly 245: }
# A section heading ends the TOC. Only nesting level 1 is closed here; at any
# other level the line is reported with a warning and the state stays toc.
1.2 connolly 246: elsif(/^(([\dA-Z]+)((\.\d+)*))\.?\s+([^
]*)/){
247: my($sec, $sub, $heading) = ($2, $3, $5);
248:
1.1 connolly 249: if($toclevel == 1){
1.2 connolly 250: print "</li></ol></div\n>";
1.1 connolly 251: $state = 'body';
# colophon finishes the front page; startSection then opens the first
# section file and selects it for output.
1.4 ! connolly 252: &colophon($base, $docno, $stitle, $by);
1.2 connolly 253: &startSection($sec, $sub, $heading,
254: $title, $stitle, $base, $docno, $by);
1.1 connolly 255: }else{
256: warn "blank line in toc level $toclevel";
257: }
258: }
1.2 connolly 259:
1.1 connolly 260: else{
261: die "$state: what? [[$_]]";
262: }
263: }
264:
265:
# block: an indented line may be pending in $firstLine; the current line
# decides whether it starts a paragraph, a note or a definition list entry.
266: elsif($state eq 'block'){
267: if(/^((\d+)((\.\d+)*))\s+([^
]*)/){
268: my($sec, $sub, $heading) = ($2, $3, $5);
269:
270: print "</dl>\n";
271: &changeSection($sec, $sub, $heading,
272: $title, $stitle, $base, $docno, $by);
273: $state = 'body';
274: }
275: elsif(/^\S/){
276: print "</dl>\n";
277: print "<h2>$_</h2>\n";
278: $state = 'body';
279: }
# Same indent as the pending line: both lines belong to one paragraph.
280: elsif(/^ \S/){
281: # warn "hit this $_";
282: if($firstLine){
283: print "</$list>\n" if $list;
284:
285: print "<p>\n";
286: print $firstLine;
287: $firstLine = undef;
288: print $_;
289: $state = 'p';
290: }else{
291: $firstLine = $_;
292: }
293: }
294: elsif(/^ (Note:)/){
295: print "<p><strong>$1</strong>$'";
296: $state = 'note';
297: }
# The pending line becomes a <dt> and this line starts its <dd>; a <dl> is
# opened first unless one is already the current list.
298: elsif(/^ ?\S/){
299: if($list ne 'dl'){
300: print "</$list>\n" if $list;
301: print "<dl>\n";
302: }
303:
304: print " <dt>$firstLine</dt>";
305: $firstLine = undef;
306: print " <dd>$_";
307: $state = 'dd';
308: }
# Blank line: a pending line is flushed as a one-line paragraph. With nothing
# pending the state stays block.
309: elsif(/^\s*$/){ # added the $ - don't want to drop a line with text
310: if($firstLine){
311: print "<p>\n";
312: print $firstLine;
313: $firstLine = undef;
314: print "</p>\n";
315: $state = 'body';
316: }
317: }
318: elsif(/^\s*\S/){
319: # this missed all of the above, but it looks like valid text, so
320: # lets just use it like it was a normal paragraph. I wanted to keep
321: # this separate from the three space rule, but it has the exact same
322: # behavior.
323: if($firstLine){
324: print "</$list>\n" if $list;
325:
326: print "<p>\n";
327: print $firstLine;
328: $firstLine = undef;
329: print $_;
330: $state = 'p';
331: }else{
332: $firstLine = $_;
333: }
334: }
335: else{
336: die "$state: what? [[$_]]\n firstline: [[$firstLine]] list: [[$list]]";
337: }
338: }
339:
# p: inside a paragraph; copy linked-up lines until a blank line closes it.
340: elsif($state eq 'p'){
341: $_ = &addrefs($base, $bibsec, $_);
342:
343: if(/^ *\S/){
344: print $_;
345: }
346: elsif(/^\s*$/){
347: print "</p>\n";
348: $state = 'body';
349: }
350: else{
351: die "$state: what? [[$_]]";
352: }
353: }
354:
# note: inside a Note paragraph; a blank line closes it and returns to block
# when a list is open, otherwise to body.
355: elsif($state eq 'note'){
356: $_ = &addrefs($base, $bibsec, $_);
357:
358: if(/^ \S/){
359: print $_;
360: }
361: elsif(/^\s*$/){
362: print "</p>\n";
363: $state = ($list ? 'block' : 'body');
364: }
365: else{
366: die "$state: what? [[$_]]";
367: }
368: }
369:
# dd: inside a definition description; a blank line closes the <dd>, records
# dl as the open list if none is recorded, and returns to block.
370: elsif($state eq 'dd'){
371: $_ = &addrefs($base, $bibsec, $_);
372:
373: if(/^ ?\S/){
374: print $_;
375: }
376: elsif(/^ \S/){
377: print STDERR "$state: pre: [[$_]]\n";
378: print "<pre>$_</pre>";
379: }
380: elsif(/^\s*$/){
381: $list = 'dl' unless $list;
382: print "</dd>\n";
383: $state = 'block';
384: }
385: else{
386: die "$state: what? [[$_]]";
387: }
388: }
389:
# bib: accumulating bibliography entries. Each new bracketed number or a
# section heading hands the finished entry to cite; other lines are appended.
# NOTE(review): nothing after the main loop passes a still pending $citation
# to cite or closes the <dl>, so an entry that ends the input is not printed.
390: elsif($state eq 'bib'){
391: if(/^ (\[(\d+)\])\s+(.*)/){
392: my($label, $num, $rest) = ($1, $2, $3);
393:
394: &cite($citation) if $citation;
395:
396: $citation = $_;
397: }
# Unlike the heading pattern used in the body state, this one accepts only
# numeric section numbers.
398: elsif(/^((\d+)((\.\d+)*))\s+([^
]*)/){
399: my($sec, $sub, $heading) = ($2, $3, $5);
400:
401: &cite($citation) if $citation;
402: print "</dl>\n";
403:
404: &changeSection($sec, $sub, $heading,
405: $title, $stitle, $base, $docno, $by);
406: $state = 'body';
407: }
408: else{
409: $citation = $citation . $_;
410: }
411: }
412:
# pre: inside preformatted text; copy linked-up lines until a blank line.
413: elsif($state eq 'pre'){
414: $_ = &addrefs($base, $bibsec, $_);
415:
416: die "firstline: $_" if $firstLine;
417:
418: if(/^\s*$/){
419: print "</pre>\n";
420: $state = 'body';
421: }else{
422: print $_;
423: }
424: }
425:
426: else{
427: die "unkown state $state";
428: }
429: }
430:
# Close whichever page is currently selected for output.
431: print "</body></html>\n";
432:
433: }
434:
# addrefs: return a copy of one line of text with hypertext links added.
#
#   $base   - basename of the generated files; link targets are built as
#             "$base-secN.html"
#   $bibsec - number of the section that holds the bibliography
#   text    - the line to mark up (third positional argument)
#
# Two kinds of reference are linked:
#   * a space followed by a dotted number such as " 3.2.1" becomes a
#     rel='xref' link into the file of its leading section number;
#   * a bracketed number such as "[5]" becomes a rel='bibref' link into the
#     bibliography section file.
# The caller's string is not modified.
sub addrefs{
    my($base, $bibsec, $text) = @_;

    # Cross references first; the markup they add contains no square
    # brackets, so the citation pass below cannot match inside it.
    $text =~ s{ ((\d+)(\.\d+)+)}{ <a rel='xref' href='$base-sec$2.html#sec$1'>$1</a>}g;

    $text =~ s{(\[(\d+)\])}{<a rel='bibref' href='$base-sec$bibsec.html#bib$2'>$1</a>}g;

    return $text;
}
443:
# changeSection: react to a section heading found in the body text.
#
#   $sec     - top-level section number (or appendix letter)
#   $sub     - the dotted remainder of the number, e.g. ".2.1"; empty for a
#              top-level section
#   $heading - heading text
#   $title, $stitle, $base, $docno, $by - passed through to startSection
#
# A subsection only gets an <h3> with a named anchor on the current page.
# A top-level section closes the current page and lets startSection begin
# the next one.
sub changeSection{
    my($sec, $sub, $heading,
       $title, $stitle, $base, $docno, $by) = @_;

    #print STDERR "section: [[$sec]][[$sub]][[$heading]]\n";

    unless($sub){
        # finish the page that is currently selected for output
        print "</body></html>\n";

        return startSection($sec, $sub, $heading,
                            $title, $stitle, $base, $docno, $by);
    }

    return print "<h3><a name='sec$sec$sub'>$sec$sub</a> $heading</h3>\n";
}
459:
# startSection: begin the output file for one top-level section.
#
#   $sec     - top-level section number (or appendix letter); names the file
#              "$base-sec$sec.html"
#   $sub     - dotted remainder of the section number (normally empty here)
#   $heading - heading text
#   $title   - full document title, used as the text of the link back
#   $stitle  - short title, used in the page <title>
#   $base    - basename of the generated files
#   $docno, $by - document number and authors, printed in the page header
#
# Side effects: (re)opens the global SECTION handle on the new file, which
# implicitly closes the previous section file, and select()s it so that
# every later bare print goes to this section.  Dies if the file cannot be
# created.
sub startSection{
    my($sec, $sub, $heading,
       $title, $stitle, $base, $docno, $by) = @_;

    #print STDERR "section: [[$sec]][[$sub]][[$heading]]\n";

    # Three-argument open keeps odd characters in $base or $sec from being
    # read as part of the open mode; failing loudly beats printing the whole
    # section to a handle that never opened.
    my $file = "$base-sec$sec.html";
    open(SECTION, '>', $file)
        or die "cannot create $file: $!";
    select(SECTION);

    print "<html xmlns='$xmlns'>\n";
    print "<head><title>$stitle: $heading</title></head>\n";

    print "<body><address><p>part of <a rev='Section' href='$base.html'>$title</a><br />\n";
    print "$docno $by</p></address>\n";
    print "<h2><a name='sec$sec$sub'>$sec$sub</a> $heading</h2>\n";
}
476:
# cite: print one bibliography entry as a <dt>/<dd> pair on the currently
# selected output handle.
#
#   $citation - the raw text of the entry, possibly spanning several lines,
#               starting (after optional white space) with "[N]"
#
# The entry is split into the label "[N]", the text before the first quoted
# string (authors), the quoted string (title) and the remainder.  A link
# target is taken from an "RFC nnnn" mention, or from an explicit ftp/http
# URL, which takes precedence.  Text that does not start with "[N]" prints
# nothing.
sub cite{
    my($citation) = @_;
    local($_);
    $_ = $citation;

    my($num, $label, $addr);
    # Empty rather than undef, so an entry without a quoted title still
    # interpolates cleanly below.
    my($by, $title) = ('', '');

    s/^\s*//;
    s/\s+/ /g;       # join continuation lines into a single line

    s/HTTPLat ency\.html/HTTPLatency.html/; # URL split across lines

    return unless s/^(\[(\d+)\])\s*//;
    ($num, $label) = ($2, $1);

    if(s/^([^\"]*)\"([^\"]+)\"//){
        ($by, $title) = ($1, $2);
    }

    if(/RFC (\d+)/){
        $addr = sprintf("http://www.ietf.org/rfc/rfc%04d.txt", $1);
    }

    if(m;((ftp|http)://[^,> ]+);){
        $addr = $1;
        $addr =~ s/\.$//; # period at the end of a URL is probably punctuation
    }

    print "<dt><a name='bib$num'>$label</a></dt>\n";

    # Only emit <cite> when there is a title to put in it; otherwise the
    # output would carry an empty element or a link with no text.
    my $cited = '';
    if($title ne ''){
        $cited = $addr ? "<cite><a href='$addr'>$title</a></cite>"
                       : "<cite>$title</cite>";
    }
    print "<dd>$by $cited $_</dd>\n";
}
514:
# colophon: print the closing <address> block of the front page, followed by
# the end of that page, on the currently selected output handle.
#
#   $base   - basename of the RFC; used to build the link to the source text
#             at http://www.ietf.org/rfc/$base.txt
#   $docno  - document number, e.g. "RFC 2616"
#   $stitle - short title, used as the text of the link to the source
#   $by     - author string
sub colophon{
    my($base, $docno, $stitle, $by) = @_;

    # Revision control keywords of this script, with the dollar delimiters
    # stripped for display.
    my($revdate);
    $revdate = '$Revision: 1.3 $ $Date: 2001/06/28 14:55:02 $';
    $revdate =~ s/\$//g;

    print "<address>\n";
    # <cite> wraps the linked title only and is closed right after it, so
    # the page stays well-formed.
    print "derived from <cite><a rel='derived-from' href='http://www.ietf.org/rfc/$base.txt'>$stitle</a></cite>, Internet $docno, $by<br class=''/>\n";
    print "using <a href='http://dev.w3.org/cvsweb/2001/rfc2html/'>rfc2html</a> ", $revdate, " by ";
    print "<a href='http://www.w3.org/People/Connolly/'>Dan Connolly</a>\n";
    print "</address>\n";
    print "</body></html>\n";
}
528:
Webmaster