Annotation of 2001/rfc2html/rfc2html.pl, revision 1.9

1.1       connolly    1: #!/usr/local/bin/perl
1.9     ! connolly    2: # $Id: rfc2html.pl,v 1.8 2004/09/01 13:21:38 connolly Exp $
1.1       connolly    3: #
                      4: # formerly:
                      5: # http://www.w3.org/Protocols/rfc2616/rfc2html.pl
                      6: # Id: rfc2html.pl,v 1.13 2000/08/02 09:43:05 ylafon Exp 
                      7: #
                      8: # Usage:
                      9: #  perl rfc2html.pl --title '...' --stitle '...' ... (@@see getopt call below)
                     10: #    in_rfc.txt >rfcNNN.html
                     11: #   also creates $base-secN.html for each section N
                     12: #
                     13: # Features
                     14: #  splits by section
                     15: #  marks up TOC with hypertext links
                     16: #    using rel=Section per HTML 4.0
                     17: #  marks up section headings with anchors
                     18: #  marks up indented sections as <pre>
                     19: #  marks up paragraphs and definition lists
                     20: #  marks up cross references and bibliographic references
                     21: #   using rel=xref, rel=bibref
                     22: #   (some false matches)
                     23: #  marks up references section with links to other RFCs and docs
                     24: #  creates well-formed XML output
                     25: #
                     26: #
                     27: # TODO
                     28: #  markup ul, ol in body text as such rather than as <pre>
                     29: #  generalize &convert() params: title, short title, basename, bibsection
                     30: #     for other RFCs
                     31: #
                     32: # BY
                     33: #  Dan Connolly <connolly@w3.org>
                     34: #  http://www.w3.org/People/Connolly/
                     35: #  
                     36: #  with thanks to Pete Whiting for a fix on 19 Jan 2000
                     37: #
                     38: # LICENSE
                     39: #
                     40: # Copyright (c) 1999-2001 World Wide Web Consortium (W3C, http://www.w3.org/),
                     41: # (Massachusetts Institute of Technology, Institut National de
                     42: # Recherche en Informatique et en Automatique, Keio University). All
                     43: # Rights Reserved. 
                     44: #
                     45: # Permission to use, copy, modify, and distribute this software
                     46: # and its documentation for any purpose and without fee or
                     47: # royalty is hereby granted, per the terms and conditions in
                     48: #
                     49: # W3C Intellectual Property Notice and Legal Disclaimers
                     50: # http://www.w3.org/COPYRIGHT
                     51: # 1999/07/28 13:54:29
                     52: 
                     53: use strict;
                     54: use Getopt::Long;
                     55: 
                     56: my($xmlns) = 'http://www.w3.org/1999/xhtml';
                     57: 
                     58: my($title) = 'TITLE';
                     59: my($stitle) = 'STITLE';
                     60: my($base) = 'rfcNNNN';
                     61: my($bibsec) = 20;
                     62: my($by) = 'AUTHORS';
                     63: my($docno) = 'RFC NNNN';
                     64: GetOptions('title=s' => \$title,
                     65:           'stitle=s' => \$stitle,
                     66:           'base=s' => \$base,
                     67:           'bibsec=s' => \$bibsec,
                     68:           'by=s' => \$by,
                     69:           'docno=s' => \$docno);
                     70: &convert($title, $stitle, $base, $bibsec, $by, $docno);
                     71: 
                      72: sub convert{
                            # Convert the RFC text read from <> into XHTML, one line at a time.
                            #
                            # Positional arguments:
                            #   $title  - text of the <title> element of the first page; also
                            #             passed on to changeSection and startSection
                            #   $stitle - passed on to the section helpers and to colophon
                            #   $base   - prefix of the generated links ($base-secN.html)
                            #   $bibsec - section number passed to addrefs for [n] links
                            #   $by     - used as a pattern for lines to drop, and passed on
                            #   $docno  - used as a pattern for lines to drop, and passed on
                            #
                            # The loop is a state machine driven by $state:
                            #   start -> banner -> title -> body
                            #   body  -> toc | bib | note | block | pre
                            #   block -> p | dd | note | body
                            #   toc, p, bib, pre return to body; dd returns to block;
                            #   note returns to block or body depending on $list.
                            #
                            # All output uses plain print, so it goes to whichever handle is
                            # currently selected.  startSection selects a new file handle, so
                            # once the first section starts the output lands in section files.
                            #
                            # NOTE(review): nothing after the loop flushes $citation or
                            # $firstLine, so it looks like text still pending when the input
                            # ends is never printed - confirm against a real RFC.
                      73:   my($title, $stitle, $base, $bibsec, $by, $docno) = @_;
                      74:   my($state);
                            # $firstLine: a buffered line whose markup depends on the next line
                            # $list:      name of the list element currently open (only dl is set here)
                            # $toclevel:  nesting depth of the <ol> lists in the table of contents
                            # $citation:  text of the bibliography entry collected so far
                      75:   my($firstLine, $list, $toclevel, $citation);
                      76: 
                      77:   $state = 'start';
                      78: 
1.8       connolly   79:   print '<!DOCTYPE html 
                      80:      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
                      81:      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
                      82:   print "\n<html xmlns='$xmlns'>\n";
1.1       connolly   83:   print "<head><title>$title</title></head><body>\n";
                      84: 
                      85:   while(<>){
                      86: #      warn "$state $_";
                              # Escape the two characters that would otherwise start markup.
                      87:     s/&/&amp;/g;
                      88:     s/</&lt;/g;
                      89: 
                              # Drop lines that start with the author string or the document
                              # number followed by four spaces (presumably the running page
                              # headers and footers of the RFC - TODO confirm).  Both values
                              # are interpolated as regular expressions, not literal text.
                      90:     if(/^$by    / or /^$docno    /){
1.2       connolly   91:       #print STDERR "skipped", $_;
1.1       connolly   92:       next;
                      93:     }
                      94: 
1.2       connolly   95:     #print STDERR " $state: $_";
                      96:     #print STDERR "$state: note: [[^$_]]\n" if (/^       (Note:)/);
1.1       connolly   97: 
                              # Remove the period after a leading number so that the heading
                              # patterns below see it like any other numbered heading.
                      98:     s/^(\d+)\. /$1 /; # copyright statement header is goofy
                      99: 
                              # start: the first non-blank line opens the <pre> banner.
                     100:     if($state eq 'start'){
                     101:       if(/\S/){
                     102:        print "<pre>\n";
1.6       connolly  103:        print $_;
1.1       connolly  104:        $state = 'banner';
                     105:       }
                     106:     }
                              # banner: copy lines until a blank line, then open the <h1>.
                     107:     elsif($state eq 'banner'){
                     108:       if(/^\s*$/){ # blank line
                     109:        $state = 'title';
                     110:        print "</pre>\n";
                     111:        print "<h1>\n";
                     112:       }else{
                     113:        print $_;
                     114:       }
                     115:     }
                              # title: the first non-blank line is the whole <h1> content.
                     116:     elsif($state eq 'title'){
                     117:       if(/\S/){
                     118:        print $_;
                     119:        print "</h1>\n";
                     120:        $state = 'body';
                     121:       }
                     122:     }
                     123: 
                              # body: classify the line and pick the next state.  The order
                              # of the tests matters because the later patterns are broader.
                     124:     elsif($state eq 'body'){
                     125:       if(/^\s*$/){ # blank line
                     126:        # nothing
                     127:       }
                     128:       elsif(/^Table of Contents/){
                     129:        print "<div class='toc'><h2>Table of Contents</h2\n>";
                     130:        $state = 'toc';
                     131:        $toclevel = 0;
                     132:       }
                                # A three-space-indented numbered line followed by dots: a
                                # table of contents entry seen without the heading line.
                                # Only top-level entries are accepted here.
                     133:       elsif(/^   ((\d+)(\.\d+)?([\.\d]+)?)\s+([^\.]+)\.\./){
                     134:        my($num, $sec, $sub, $subsub, $heading) = ($1, $2, $3, $4, $5);
                     135:        die "bad toc $toclevel: $_" if ($sub || $subsub);
                     136: 
                     137:        print "<ol class='toc'>\n";
1.2       connolly  138:        print "<li><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num\n";
1.1       connolly  139:        $state = 'toc';
                     140:        $toclevel = 1;
                     141:       }
                                # Numbered (or letter-labelled) heading at column zero.
                                # NOTE(review): in this listing the negated character class
                                # spans two lines; presumably the original file holds a raw
                                # control character there - TODO confirm which one.
                     142:       elsif(/^(([\dA-Z]+)((\.\d+)*))\.?\s+([^
]*)/){
                     143:        my($sec, $sub, $heading) = ($2, $3, $5);
                     144: 
                     145:        &changeSection($sec, $sub, $heading,
                     146:                       $title, $stitle, $base, $docno, $by);
                     147:       }
                                # Any other line starting at column zero becomes an <h2>.
                     148:       elsif(/^\S/){
                     149:        print "<h2>\n";
                     150:        print $_;
                     151:        print "</h2>\n";
                     152:       }
                                # First bibliography entry: start collecting it in $citation.
                     153:       elsif(/^   (\[(\d+)\])\s+(.*)/){
                     154:        $citation = $_;
                     155:        print "<dl class='bib'>\n";
                     156:        $state = 'bib';
                     157:       }
                     158:       elsif(/^       (Note:)/){
                     159:        $_ = &addrefs($base, $bibsec, $_);
                     160: 
                     161:        print "<p><strong>$1</strong>$'";
                     162:        $list = undef;
                     163:        $state = 'note';
                     164:       }
                                # Three-space indent: buffer the line; the block state decides
                                # from the following line whether it is a paragraph or a <dt>.
                     165:       elsif(/^   \S/){
                     166:        $_ = &addrefs($base, $bibsec, $_);
                     167: 
                     168:        $firstLine = $_;
                     169:        $list = undef;
                     170:        $state = 'block';
                     171:       }
                                # Four or more leading spaces: preformatted text.
                     172:       elsif(/^    /){
                     173:        die "$state: unflushed $firstLine [[$firstLine]]" if $firstLine;
                     174: 
                     175:        print "<pre>$_";
                     176:        $state = 'pre';
                     177:       }
                     178:       else{
                     179:        die "$state: what? [[$_]]";
                     180:       }
                     181:     }
                     182: 
                              # toc: turn table of contents lines into nested <ol> lists.
                     183:     elsif($state eq 'toc'){
                                # Blank lines and two hard-coded line prefixes are skipped
                                # (the prefixes are presumably specific to one RFC).
1.2       connolly  184:       if(/^\s*$/ # skip blank lines in TOC
                     185:         || /^IMAP4rev1/ || /^Appendices/){
                     186:        print STDERR "state: $state skipped", $_;
1.1       connolly  187:        next;
                     188:       }
                     189: 
                                # A TOC entry must end in a dot followed by digits (the page
                                # number part) and start with a section number.
1.2       connolly  190:       if(/\.\d+\s*$/ &&
                     191:         /^(   )?(([A-Z\d]+)(\.\d+)?([\.\d]+)?\.?)\s+([^\.]+)/){
1.1       connolly  192:        my($num, $sec, $sub, $subsub, $heading) = ($2, $3, $4, $5, $6);
                     193:        $num =~ s/\.$//;
                     194:        print STDERR "@@ TOC:  $num, $sec, $sub, $subsub, $heading\n";
                     195: 
                                 # $endli closes the previous <li>; it is emptied whenever a
                                 # nested <ol> is opened inside that still-open item.
1.2       connolly  196:        my($endli);
                     197:        $endli = "</li>";
                     198:        
                     199:        if($toclevel == 0){
                     200:          print "<ol>\n";
                     201:          $toclevel = 1;
                     202:          $endli = '';
                     203:        }
                     204:        elsif($toclevel == 1){
1.1       connolly  205:          if($sub){
1.2       connolly  206:            print "\n<ol>\n";
1.1       connolly  207:            $toclevel = 2;
1.2       connolly  208:            $endli = '';
1.1       connolly  209:            if($subsub){
1.2       connolly  210:              warn "skipping a TOC level...";
                     211:              print "<li><span>@@ missing</span>\n";
                     212:              
                     213:              print "\n<ol>\n";
1.1       connolly  214:              $toclevel = 3;
1.2       connolly  215:              $endli = '';
1.1       connolly  216:            }
                     217:          }
                     218:        }
                     219:        elsif($toclevel == 2){
                     220:          if($sub){
                     221:            if($subsub){
1.2       connolly  222:              print "\n<ol>\n";
1.1       connolly  223:              $toclevel = 3;
1.2       connolly  224:              $endli = '';
1.1       connolly  225:            }
                     226:          }
                     227:          else{
1.2       connolly  228:            print "</li>\n</ol>\n";
1.1       connolly  229:            $toclevel = 1;
                     230:          }
                     231:        }elsif($toclevel == 3){
                     232:          if($subsub){
                     233:            # stay at level 3
                     234:          }else{
1.2       connolly  235:            print "</li>\n</ol>\n";
1.1       connolly  236:            $toclevel = 2;
                     237:            if($sub){
                     238:            }else{
1.2       connolly  239:              print "</li>\n</ol>\n";
1.1       connolly  240:              $toclevel = 1;
                     241:            }
                     242:          }
                     243:        }
                     244: 
1.2       connolly  245:        print $endli;
                     246:        print "\n";
                     247:        print "   " x $toclevel;
                     248:        print "<li><span><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num</span>";
1.1       connolly  249:       }
                                # A heading line that is not a TOC entry ends the table of
                                # contents: close the list, finish the first page with the
                                # colophon and start the first section file.  This is only
                                # done when the list is back at level 1.
1.2       connolly  250:       elsif(/^(([\dA-Z]+)((\.\d+)*))\.?\s+([^
]*)/){
                     251:        my($sec, $sub, $heading) = ($2, $3, $5);
                     252: 
1.1       connolly  253:        if($toclevel == 1){
1.2       connolly  254:          print "</li></ol></div\n>";
1.1       connolly  255:          $state = 'body';
1.4       connolly  256:          &colophon($base, $docno, $stitle, $by);
1.2       connolly  257:          &startSection($sec, $sub, $heading,
                     258:                        $title, $stitle, $base, $docno, $by);
1.1       connolly  259:        }else{
                     260:          warn "blank line in toc level $toclevel";
                     261:        }
                     262:       }
1.2       connolly  263: 
1.1       connolly  264:       else{
                     265:        die "$state: what? [[$_]]";
                     266:       }
                     267:     }
                     268: 
                     269: 
                              # block: one line may be waiting in $firstLine; the current
                              # line decides whether it was a paragraph start or a <dt>.
                     270:     elsif($state eq 'block'){
                     271:       if(/^((\d+)((\.\d+)*))\s+([^
]*)/){
                     272:        my($sec, $sub, $heading) = ($2, $3, $5);
                     273:        
                     274:        print "</dl>\n";
                     275:        &changeSection($sec, $sub, $heading,
                     276:                       $title, $stitle, $base, $docno, $by);
                     277:        $state = 'body';
                     278:       }
                     279:       elsif(/^\S/){
                     280:        print "</dl>\n";
                     281:        print "<h2>$_</h2>\n";
                     282:        $state = 'body';
                     283:       }
                                # Same indent as the buffered line: both belong to a paragraph.
                     284:       elsif(/^   \S/){
                     285: #          warn "hit this $_";
                     286:        if($firstLine){
                     287:          print "</$list>\n" if $list;
                     288:          
                     289:          print "<p>\n";
                     290:          print $firstLine;
                     291:          $firstLine = undef;
                     292:          print $_;
                     293:          $state = 'p';
                     294:        }else{
                     295:          $firstLine = $_;
                     296:        }
                     297:       }
                     298:       elsif(/^       (Note:)/){
1.7       connolly  299:        if($list eq 'dl'){ print "<dd>" };
1.1       connolly  300:        print "<p><strong>$1</strong>$'";
                     301:        $state = 'note';
                     302:       }
                                # Deeper indent (six or seven spaces): the buffered line was a
                                # term and this line starts its definition.
                     303:       elsif(/^       ?\S/){
                     304:        if($list ne 'dl'){
                     305:          print "</$list>\n" if $list;
                     306:          print "<dl>\n";
                     307:        }
                     308:        
                     309:        print " <dt>$firstLine</dt>";
                     310:        $firstLine = undef;
                     311:        print " <dd>$_";
                     312:        $state = 'dd';
                     313:       }
                                # Blank line: a buffered line stands alone as a paragraph.
                     314:       elsif(/^\s*$/){  # added the $ - don't want to drop a line with text 
                     315:        if($firstLine){
                     316:          print "<p>\n";
                     317:          print $firstLine;
                     318:          $firstLine = undef;
                     319:          print "</p>\n";
                     320:          $state = 'body';
                     321:        }
                     322:       }
                     323:       elsif(/^\s*\S/){
                     324: # this missed all of the above, but it looks like valid text,  so
                     325: # lets just use it like it was a normal paragraph.  I wanted to keep
                     326: # this separate from the three space rule, but it has the exact same
                     327: # behavior.
                     328:         if($firstLine){
                     329:           print "</$list>\n" if $list;
                     330:               
                     331:           print "<p>\n";
                     332:           print $firstLine;
                     333:           $firstLine = undef;
                     334:           print $_;
                     335:           $state = 'p';
                     336:         }else{
                     337:           $firstLine = $_;
                     338:         }
                     339:       }
                     340:       else{
                     341:         die "$state: what? [[$_]]\n firstline: [[$firstLine]] list: [[$list]]";
                     342:       }
                     343:     }
                     344: 
                              # p: copy linked-up lines until a blank line closes the paragraph.
                     345:     elsif($state eq 'p'){
                     346:       $_ = &addrefs($base, $bibsec, $_);
                     347: 
                     348:       if(/^ *\S/){ 
                     349:        print $_;
                     350:       }
                     351:       elsif(/^\s*$/){
                     352:        print "</p>\n";
                     353:        $state = 'body';
                     354:       }
                     355:       else{
                     356:        die "$state: what? [[$_]]";
                     357:       }
                     358:     }
                     359: 
                              # note: continuation lines need exactly seven leading spaces;
                              # a blank line ends the note and returns to block when a list
                              # is open, otherwise to body.
                     360:     elsif($state eq 'note'){
                     361:       $_ = &addrefs($base, $bibsec, $_);
                     362: 
                     363:       if(/^       \S/){
                     364:        print $_;
                     365:       }
                     366:       elsif(/^\s*$/){
                     367:        print "</p>\n";
1.7       connolly  368:        if($list eq 'dl'){ print "</dd>" };
1.1       connolly  369:        $state = ($list ? 'block' : 'body');
                     370:       }
                     371:       else{
                     372:        die "$state: what? [[$_]]";
                     373:       }
                     374:     }
                     375: 
                              # dd: body of a definition; nine-space lines are wrapped in
                              # their own <pre>, a blank line closes the <dd>.
                     376:     elsif($state eq 'dd'){
                     377:       $_ = &addrefs($base, $bibsec, $_);
                     378: 
                     379:       if(/^       ?\S/){
                     380:        print $_;
                     381:       }
                     382:       elsif(/^         \S/){
                     383:        print STDERR "$state: pre: [[$_]]\n";
                     384:        print "<pre>$_</pre>";
                     385:       }
                     386:       elsif(/^\s*$/){
                     387:        $list = 'dl' unless $list;
                     388:        print "</dd>\n";
                     389:        $state = 'block';
                     390:       }
                     391:       else{
                     392:        die "$state: what? [[$_]]";
                     393:       }
                     394:     }
                     395: 
                              # bib: collect the lines of one entry in $citation and emit the
                              # entry through cite when the next entry or a heading arrives.
                     396:     elsif($state eq 'bib'){
                     397:       if(/^   (\[(\d+)\])\s+(.*)/){
                     398:         my($label, $num, $rest) = ($1, $2, $3);
                     399: 
                     400:        &cite($citation) if $citation;
                     401: 
                     402:        $citation = $_;
                     403:       }
                     404:       elsif(/^((\d+)((\.\d+)*))\s+([^
]*)/){
                     405:        my($sec, $sub, $heading) = ($2, $3, $5);
                     406: 
                     407:        &cite($citation) if $citation;
                     408:        print "</dl>\n";
                     409: 
                     410:        &changeSection($sec, $sub, $heading,
                     411:                       $title, $stitle, $base, $docno, $by);
                     412:        $state = 'body';
                     413:       }
                     414:       else{
                     415:        $citation = $citation . $_;
                     416:       }
                     417:     }
                     418: 
                              # pre: copy lines verbatim until a blank line closes the <pre>.
                     419:     elsif($state eq 'pre'){
                     420:       $_ = &addrefs($base, $bibsec, $_);
                     421: 
                     422:       die "firstline: $_" if $firstLine;
                     423: 
                     424:       if(/^\s*$/){
                     425:        print "</pre>\n";
                     426:        $state = 'body';
                     427:       }else{
                     428:        print $_;
                     429:       }
                     430:     }
                     431: 
                     432:     else{
                                # NOTE(review): the message below misspells unknown; it is a
                                # runtime string, so it is left untouched here.
                     433:       die "unkown state $state";
                     434:     }
                     435:   }
                     436: 
                            # Close the page that is being written when the input ends.
                     437:   print "</body></html>\n";
                     438: 
                     439: }
                    440: 
                    441: sub addrefs{
                    442:   my($base, $bibsec, $l) = @_;
                    443: 
                    444:   $l =~ s, ((\d+)(\.\d+)+), <a rel='xref' href='$base-sec$2.html#sec$1'>$1</a>,g;
                    445:   $l =~ s,(\[(\d+)\]),<a rel='bibref' href='$base-sec$bibsec.html#bib$2'>$1</a>,g;
                    446: 
                    447:   return $l;
                    448: }
                    449: 
                    450: sub changeSection{
                    451:   my($sec, $sub, $heading,
                    452:     $title, $stitle, $base, $docno, $by) = @_;
                    453:   
1.2       connolly  454:   #print STDERR "section: [[$sec]][[$sub]][[$heading]]\n";
1.1       connolly  455: 
                    456:   if($sub){
1.7       connolly  457:     print "<h3><a id='sec$sec$sub'>$sec$sub</a> $heading</h3>\n";
1.1       connolly  458:   }else{
                    459:     print "</body></html>\n";
                    460: 
1.2       connolly  461:     startSection($sec, $sub, $heading,
                    462:                 $title, $stitle, $base, $docno, $by);
                    463:   }
                    464: }
                    465: 
                    466: sub startSection{
                    467:   my($sec, $sub, $heading,
                    468:     $title, $stitle, $base, $docno, $by) = @_;
                    469:   
                    470:   #print STDERR "section: [[$sec]][[$sub]][[$heading]]\n";
1.1       connolly  471: 
1.2       connolly  472:   open(SECTION, ">$base-sec$sec.html");
                    473:   select(SECTION);
1.1       connolly  474: 
1.9     ! connolly  475:   print '<!DOCTYPE html 
        !           476:      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
        !           477:      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
        !           478:   print "\n<html xmlns='$xmlns'>\n";
1.2       connolly  479:   print "<head><title>$stitle: $heading</title></head>\n";
                    480:   
1.7       connolly  481:   print "<body><address>part of <a rev='Section' href='$base.html'>$title</a><br />\n";
                    482:   print "$docno $by</address>\n";
                    483:   print "<h2><a id='sec$sec$sub'>$sec$sub</a> $heading</h2>\n";
1.1       connolly  484: }
                    485: 
                    486: sub cite{
                    487:   my($citation) = @_;
                    488:   local($_);
                    489:   $_ = $citation;
                    490: 
                    491:   my($num, $label, $by, $title, $addr);
                    492: 
                    493:   s/^\s*//;
                    494:   s/\s+/ /g;
                    495: 
                    496:   s/HTTPLat ency.html/HTTPLatency.html/; # URL split across lines
                    497: 
                    498:   if(s/^(\[(\d+)\])\s*//){
                    499:     ($num, $label) = ($2, $1);
                    500: 
                    501:     if(s/^([^\"]*)\"([^\"]+)\"//){
                    502:       ($by, $title) = ($1, $2);
                    503:     }
                    504: 
                    505:     if(/RFC (\d+)/){
                    506:       my($RFCAddrFormat) = "http://www.ietf.org/rfc/rfc%04d.txt";
                    507:       $addr = sprintf("$RFCAddrFormat", $1);
                    508:     }
                    509: 
                    510:     if(m;((ftp|http)://[^,> ]+);){
                    511:       $addr = $1;
                    512:       $addr =~ s/\.$//; # period at the end of a URL is probably punctuation
                    513:     }
                    514: 
1.7       connolly  515:     print "<dt><a id='bib$num'>$label</a></dt>\n";
1.1       connolly  516:     if($addr){
                    517:       print "<dd>$by <cite><a href='$addr'>$title</a></cite> $_</dd>\n";
                    518:     }else{
                    519:       print "<dd>$by <cite>$title</cite> $_</dd>\n";
                    520:     }
                    521:   }
                    522: }
                    523: 
                    524: sub colophon{
1.4       connolly  525:   my($base, $docno, $stitle, $by) = @_;
1.2       connolly  526:   my($revdate);
1.9     ! connolly  527:   $revdate = '$Revision: 1.8 $ $Date: 2004/09/01 13:21:38 $';
1.2       connolly  528:   $revdate =~ s/\$//g;
                    529: 
1.1       connolly  530:     print "<address>\n";
1.5       connolly  531:     print "derived from <cite><a rel='derived-from' href='http://www.ietf.org/rfc/$base.txt'>$stitle</a></cite>, Internet $docno, $by<br class=''/>\n";
1.4       connolly  532:     print "using <a href='http://dev.w3.org/cvsweb/2001/rfc2html/'>rfc2html</a> ", $revdate, " by ";
1.3       connolly  533:     print "<a href='http://www.w3.org/People/Connolly/'>Dan Connolly</a>\n";
1.1       connolly  534:     print "</address>\n";
1.2       connolly  535:     print "</body></html>\n";
1.1       connolly  536: }
                    537: 

Webmaster