Annotation of 2001/rfc2html/rfc2html.pl, revision 1.4

1.1       connolly    1: #!/usr/local/bin/perl
1.4     ! connolly    2: # $Id: rfc2html.pl,v 1.3 2001/06/28 14:55:02 connolly Exp $
1.1       connolly    3: #
                      4: # formerly:
                      5: # http://www.w3.org/Protocols/rfc2616/rfc2html.pl
                      6: # Id: rfc2html.pl,v 1.13 2000/08/02 09:43:05 ylafon Exp 
                      7: #
                      8: # Usage:
                      9: #  perl rfc2html.pl --title '...' --stitle '...' ... (@@see getopt call below)
                     10: #    in_rfc.txt >rfcNNN.html
                     11: #   also creates $base-secN.html for each section N
                     12: #
                     13: # Features
                     14: #  splits by section
                     15: #  marks up TOC with hypertext links
                     16: #    using rel=Section per HTML 4.0
                     17: #  marks up section headings with anchors
                     18: #  marks up indented sections as <pre>
                     19: #  marks up paragraphs and definition lists
                     20: #  marks up cross references and bibliographic references
                     21: #   using rel=xref, rel=bibref
                     22: #   (some false matches)
                     23: #  marks up references section with links to other RFCs and docs
                     24: #  creates well-formed XML output
                     25: #
                     26: #
                     27: # TODO
                     28: #  markup ul, ol in body text as such rather than as <pre>
                     29: #  generalize &convert() params: title, short title, basename, bibsection
                     30: #     for other RFCs
                     31: #
                     32: # BY
                     33: #  Dan Connolly <connolly@w3.org>
                     34: #  http://www.w3.org/People/Connolly/
                     35: #  
                     36: #  with thanks to Pete Whiting for a fix on 19 Jan 2000
                     37: #
                     38: # LICENSE
                     39: #
                     40: # Copyright (c) 1999-2001 World Wide Web Consortium (W3C, http://www.w3.org/),
                     41: # (Massachusetts Institute of Technology, Institut National de
                     42: # Recherche en Informatique et en Automatique, Keio University). All
                     43: # Rights Reserved. 
                     44: #
                     45: # Permission to use, copy, modify, and distribute this software
                     46: # and its documentation for any purpose and without fee or
                     47: # royalty is hereby granted, per the terms and conditions in
                     48: #
                     49: # W3C Intellectual Property Notice and Legal Disclaimers
                     50: # http://www.w3.org/COPYRIGHT
                     51: # 1999/07/28 13:54:29
                     52: 
                     53: use strict;
                     54: use Getopt::Long;
                     55: 
                     # XHTML namespace URI; printed as the xmlns attribute of every <html>
                     # element this script writes.
                     my($xmlns) = 'http://www.w3.org/1999/xhtml';

                     # Placeholder defaults; each one can be overridden by the command-line
                     # option of the same name.
                     my($title) = 'TITLE';
                     my($stitle) = 'STITLE';
                     my($base) = 'rfcNNNN';
                     my($bibsec) = 20;
                     my($by) = 'AUTHORS';
                     my($docno) = 'RFC NNNN';

                     # GetOptions returns false (after printing its own diagnostics) when the
                     # command line is malformed.  Stop there instead of converting with a mix
                     # of real and placeholder values.
                     GetOptions('title=s' => \$title,
                                'stitle=s' => \$stitle,
                                'base=s' => \$base,
                                'bibsec=s' => \$bibsec,
                                'by=s' => \$by,
                                'docno=s' => \$docno)
                       or die "Usage: $0 [--title TITLE] [--stitle SHORT-TITLE] [--base BASENAME] [--bibsec N] [--by AUTHORS] [--docno 'RFC NNNN'] in_rfc.txt >rfcNNNN.html\n";

                     convert($title, $stitle, $base, $bibsec, $by, $docno);
                     71: 
                     # convert($title, $stitle, $base, $bibsec, $by, $docno)
                     #
                     # Line-oriented state machine: reads plain-text RFC input through <> and
                     # prints XHTML on the currently selected output handle.  The handle
                     # changes whenever startSection() is called.
                     #
                     #   $title, $stitle  document title and short title; used in <title> and
                     #                    handed on to startSection()/colophon()
                     #   $base            basename used to build "$base-secN.html" links
                     #   $bibsec          section number that [n] citation links point into
                     #   $by, $docno      literal prefixes of page footer / header lines; an
                     #                    input line starting with either one followed by
                     #                    four spaces is dropped
                     #
                     # States: start -> banner -> title -> body; from body the machine enters
                     # toc, block, p, note, dd, bib and pre and returns to body (or block).
                     # The body, toc, block, p, note and dd states die on a line that none of
                     # their patterns recognises.
                     #
                     # Changes relative to the previous revision:
                     #   * $by and $docno are matched literally (\Q..\E) rather than being
                     #     compiled as regular expressions.
                     #   * In body state the first line of a "Note:" paragraph keeps the links
                     #     produced by addrefs().  The old code printed $', which belongs to
                     #     the match made before addrefs() ran, so the links were discarded.
                     #   * Section headings are captured without their line terminator.  The
                     #     old pattern held a literal control character (presumably a carriage
                     #     return) in the negated class; excluding both CR and LF gives a
                     #     clean heading for either line-ending convention.
                     #   * "unkown state" typo in the final die message.
                     sub convert{
                       my($title, $stitle, $base, $bibsec, $by, $docno) = @_;
                       my($state);
                       my($firstLine, $list, $toclevel, $citation);

                       # Left-margin section headings.  After a match: $2 is the section,
                       # $3 the subsection suffix (".1.2", possibly empty), $5 the heading.
                       # $anyHeading also accepts lettered sections and a trailing dot.
                       my($anyHeading) = qr/^(([\dA-Z]+)((\.\d+)*))\.?\s+([^\r\n]*)/;
                       my($numHeading) = qr/^((\d+)((\.\d+)*))\s+([^\r\n]*)/;

                       $state = 'start';

                       print "<html xmlns='$xmlns'>\n";
                       print "<head><title>$title</title></head><body>\n";

                       while(<>){
                         # Escape markup characters before anything is printed.
                         s/&/&amp;/g;
                         s/</&lt;/g;

                         # Drop page header / footer lines.
                         if(/^\Q$by\E    / or /^\Q$docno\E    /){
                           next;
                         }

                         s/^(\d+)\. /$1 /; # copyright statement header is goofy

                         if($state eq 'start'){
                           # Skip leading blank lines; the first text line opens the banner.
                           # (That first line itself is not printed.)
                           if(/\S/){
                             print "<pre>\n";
                             $state = 'banner';
                           }
                         }
                         elsif($state eq 'banner'){
                           if(/^\s*$/){ # blank line
                             $state = 'title';
                             print "</pre>\n";
                             print "<h1>\n";
                           }else{
                             print $_;
                           }
                         }
                         elsif($state eq 'title'){
                           # The first non-blank line after the banner is the title.
                           if(/\S/){
                             print $_;
                             print "</h1>\n";
                             $state = 'body';
                           }
                         }

                         elsif($state eq 'body'){
                           if(/^\s*$/){ # blank line
                             # nothing
                           }
                           elsif(/^Table of Contents/){
                             print "<div class='toc'><h2>Table of Contents</h2\n>";
                             $state = 'toc';
                             $toclevel = 0;
                           }
                           elsif(/^   ((\d+)(\.\d+)?([\.\d]+)?)\s+([^\.]+)\.\./){
                             # A TOC entry met without a "Table of Contents" heading.
                             my($num, $sec, $sub, $subsub, $heading) = ($1, $2, $3, $4, $5);
                             die "bad toc $toclevel: $_" if ($sub || $subsub);

                             print "<ol class='toc'>\n";
                             print "<li><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num\n";
                             $state = 'toc';
                             $toclevel = 1;
                           }
                           elsif(/$anyHeading/){
                             my($sec, $sub, $heading) = ($2, $3, $5);

                             changeSection($sec, $sub, $heading,
                                           $title, $stitle, $base, $docno, $by);
                           }
                           elsif(/^\S/){
                             # Unnumbered heading at the left margin.
                             print "<h2>\n";
                             print $_;
                             print "</h2>\n";
                           }
                           elsif(/^   (\[(\d+)\])\s+(.*)/){
                             # First bibliography entry; it is emitted once complete.
                             $citation = $_;
                             print "<dl class='bib'>\n";
                             $state = 'bib';
                           }
                           elsif(/^       (Note:)(.*)/s){
                             # Link only the text after the label and print the linked
                             # version.
                             my($label) = $1;
                             my($rest) = $2;
                             $rest = addrefs($base, $bibsec, $rest);

                             print "<p><strong>$label</strong>$rest";
                             $list = undef;
                             $state = 'note';
                           }
                           elsif(/^   \S/){
                             # Could be a paragraph or a definition term; the 'block' state
                             # decides once it has seen the following line.
                             $_ = addrefs($base, $bibsec, $_);

                             $firstLine = $_;
                             $list = undef;
                             $state = 'block';
                           }
                           elsif(/^    /){
                             die "$state: unflushed $firstLine [[$firstLine]]" if $firstLine;

                             print "<pre>$_";
                             $state = 'pre';
                           }
                           else{
                             die "$state: what? [[$_]]";
                           }
                         }

                         elsif($state eq 'toc'){
                           if(/^\s*$/ # skip blank lines in TOC
                              || /^IMAP4rev1/ || /^Appendices/){
                             print STDERR "state: $state skipped", $_;
                             next;
                           }

                           # A TOC entry ends in ".<page>"; the captures used below come
                           # from the second pattern.
                           if(/\.\d+\s*$/ &&
                              /^(   )?(([A-Z\d]+)(\.\d+)?([\.\d]+)?\.?)\s+([^\.]+)/){
                             my($num, $sec, $sub, $subsub, $heading) = ($2, $3, $4, $5, $6);
                             $num =~ s/\.$//;
                             print STDERR "@@ TOC:  $num, $sec, $sub, $subsub, $heading\n";

                             # $endli closes the previous <li>; it is emptied whenever a
                             # nested <ol> has just been opened inside that <li>.
                             my($endli);
                             $endli = "</li>";

                             if($toclevel == 0){
                               print "<ol>\n";
                               $toclevel = 1;
                               $endli = '';
                             }
                             elsif($toclevel == 1){
                               if($sub){
                                 print "\n<ol>\n";
                                 $toclevel = 2;
                                 $endli = '';
                                 if($subsub){
                                   warn "skipping a TOC level...";
                                   print "<li><span>@@ missing</span>\n";

                                   print "\n<ol>\n";
                                   $toclevel = 3;
                                   $endli = '';
                                 }
                               }
                             }
                             elsif($toclevel == 2){
                               if($sub){
                                 if($subsub){
                                   print "\n<ol>\n";
                                   $toclevel = 3;
                                   $endli = '';
                                 }
                               }
                               else{
                                 print "</li>\n</ol>\n";
                                 $toclevel = 1;
                               }
                             }elsif($toclevel == 3){
                               if($subsub){
                                 # stay at level 3
                               }else{
                                 print "</li>\n</ol>\n";
                                 $toclevel = 2;
                                 if($sub){
                                 }else{
                                   print "</li>\n</ol>\n";
                                   $toclevel = 1;
                                 }
                               }
                             }

                             print $endli;
                             print "\n";
                             print "   " x $toclevel;
                             print "<li><span><a rel='Section' href='$base-sec$sec.html#sec$num'>$heading</a> ... $num</span>";
                           }
                           elsif(/$anyHeading/){
                             # First real section heading: close the TOC, finish the main
                             # page and switch output to the first section file.
                             my($sec, $sub, $heading) = ($2, $3, $5);

                             if($toclevel == 1){
                               print "</li></ol></div\n>";
                               $state = 'body';
                               colophon($base, $docno, $stitle, $by);
                               startSection($sec, $sub, $heading,
                                            $title, $stitle, $base, $docno, $by);
                             }else{
                               warn "blank line in toc level $toclevel";
                             }
                           }

                           else{
                             die "$state: what? [[$_]]";
                           }
                         }


                         elsif($state eq 'block'){
                           if(/$numHeading/){
                             my($sec, $sub, $heading) = ($2, $3, $5);

                             print "</dl>\n";
                             changeSection($sec, $sub, $heading,
                                           $title, $stitle, $base, $docno, $by);
                             $state = 'body';
                           }
                           elsif(/^\S/){
                             print "</dl>\n";
                             print "<h2>$_</h2>\n";
                             $state = 'body';
                           }
                           elsif(/^   \S/){
                             if($firstLine){
                               # Two lines at paragraph indent: an ordinary paragraph.
                               print "</$list>\n" if $list;

                               print "<p>\n";
                               print $firstLine;
                               $firstLine = undef;
                               print $_;
                               $state = 'p';
                             }else{
                               $firstLine = $_;
                             }
                           }
                           elsif(/^       (Note:)(.*)/s){
                             # $2 is the remainder of the line, newline included.
                             print "<p><strong>$1</strong>$2";
                             $state = 'note';
                           }
                           elsif(/^       ?\S/){
                             # A deeper-indented line after the held line: treat the held
                             # line as a term and this one as its definition.
                             if($list ne 'dl'){
                               print "</$list>\n" if $list;
                               print "<dl>\n";
                             }

                             print " <dt>$firstLine</dt>";
                             $firstLine = undef;
                             print " <dd>$_";
                             $state = 'dd';
                           }
                           elsif(/^\s*$/){  # added the $ - don't want to drop a line with text
                             if($firstLine){
                               # A held line followed by a blank: one-line paragraph.
                               print "<p>\n";
                               print $firstLine;
                               $firstLine = undef;
                               print "</p>\n";
                               $state = 'body';
                             }
                           }
                           elsif(/^\s*\S/){
                             # this missed all of the above, but it looks like valid text, so
                             # lets just use it like it was a normal paragraph.  I wanted to
                             # keep this separate from the three space rule, but it has the
                             # exact same behavior.
                             if($firstLine){
                               print "</$list>\n" if $list;

                               print "<p>\n";
                               print $firstLine;
                               $firstLine = undef;
                               print $_;
                               $state = 'p';
                             }else{
                               $firstLine = $_;
                             }
                           }
                           else{
                             die "$state: what? [[$_]]\n firstline: [[$firstLine]] list: [[$list]]";
                           }
                         }

                         elsif($state eq 'p'){
                           $_ = addrefs($base, $bibsec, $_);

                           if(/^ *\S/){
                             print $_;
                           }
                           elsif(/^\s*$/){
                             print "</p>\n";
                             $state = 'body';
                           }
                           else{
                             die "$state: what? [[$_]]";
                           }
                         }

                         elsif($state eq 'note'){
                           $_ = addrefs($base, $bibsec, $_);

                           if(/^       \S/){
                             print $_;
                           }
                           elsif(/^\s*$/){
                             print "</p>\n";
                             # Resume the enclosing list if one is open.
                             $state = ($list ? 'block' : 'body');
                           }
                           else{
                             die "$state: what? [[$_]]";
                           }
                         }

                         elsif($state eq 'dd'){
                           $_ = addrefs($base, $bibsec, $_);

                           if(/^       ?\S/){
                             print $_;
                           }
                           elsif(/^         \S/){
                             print STDERR "$state: pre: [[$_]]\n";
                             print "<pre>$_</pre>";
                           }
                           elsif(/^\s*$/){
                             $list = 'dl' unless $list;
                             print "</dd>\n";
                             $state = 'block';
                           }
                           else{
                             die "$state: what? [[$_]]";
                           }
                         }

                         elsif($state eq 'bib'){
                           if(/^   (\[(\d+)\])\s+(.*)/){
                             # A new entry starts: emit the one collected so far.
                             cite($citation) if $citation;

                             $citation = $_;
                           }
                           elsif(/$numHeading/){
                             my($sec, $sub, $heading) = ($2, $3, $5);

                             cite($citation) if $citation;
                             print "</dl>\n";

                             changeSection($sec, $sub, $heading,
                                           $title, $stitle, $base, $docno, $by);
                             $state = 'body';
                           }
                           else{
                             # Continuation line of the current entry.
                             $citation = $citation . $_;
                           }
                         }

                         elsif($state eq 'pre'){
                           $_ = addrefs($base, $bibsec, $_);

                           die "firstline: $_" if $firstLine;

                           if(/^\s*$/){
                             print "</pre>\n";
                             $state = 'body';
                           }else{
                             print $_;
                           }
                         }

                         else{
                           die "unknown state $state";
                         }
                       }

                       print "</body></html>\n";

                     }
                    434: 
                    # addrefs($base, $bibsec, $text)
                    #
                    # Returns a copy of $text with two kinds of reference turned into links:
                    #   * a space followed by a dotted section number ("14.9", "3.2.1")
                    #     becomes an rel='xref' link into "$base-sec<top-level>.html";
                    #   * a bracketed number ("[12]") becomes an rel='bibref' link into
                    #     "$base-sec$bibsec.html".
                    # The argument itself is not modified.
                    sub addrefs {
                      my($base, $bibsec, $text) = @_;

                      # Cross references: $1 is the full dotted number, $2 its first part.
                      $text =~ s{ ((\d+)(\.\d+)+)}{ <a rel='xref' href='$base-sec$2.html#sec$1'>$1</a>}g;

                      # Bibliographic references: $1 is "[n]", $2 is n.
                      $text =~ s{(\[(\d+)\])}{<a rel='bibref' href='$base-sec$bibsec.html#bib$2'>$1</a>}g;

                      return $text;
                    }
                    443: 
                    # changeSection($sec, $sub, $heading,
                    #               $title, $stitle, $base, $docno, $by)
                    #
                    # Emits the markup for a section heading met in the body text.
                    #   * Top-level section ($sub false): closes the current page with
                    #     </body></html> and hands every argument on to startSection().
                    #   * Subsection ($sub true): prints an <h3> with an anchor named
                    #     "sec$sec$sub" on the current output handle.
                    sub changeSection {
                      my($sec, $sub, $heading,
                         $title, $stitle, $base, $docno, $by) = @_;

                      if(!$sub){
                        print "</body></html>\n";

                        return startSection($sec, $sub, $heading,
                                            $title, $stitle, $base, $docno, $by);
                      }

                      my($number) = "$sec$sub";
                      print "<h3><a name='sec$number'>$number</a> $heading</h3>\n";
                    }
                    459: 
                    # startSection($sec, $sub, $heading,
                    #              $title, $stitle, $base, $docno, $by)
                    #
                    # Starts the page for section $sec: opens "$base-sec$sec.html" for
                    # writing on the SECTION handle, makes SECTION the default output handle
                    # (so later bare prints go to the new file), and writes the page header,
                    # a link back to "$base.html" and the section's <h2>.
                    #
                    # Dies if the file cannot be opened.  Re-opening SECTION closes the file
                    # of the previous section.
                    sub startSection{
                      my($sec, $sub, $heading,
                         $title, $stitle, $base, $docno, $by) = @_;

                      # Three-argument open keeps the file name out of the mode string;
                      # the result is checked so a failure is not silently ignored.
                      my($file) = "$base-sec$sec.html";
                      open(SECTION, '>', $file)
                        or die "cannot open $file for writing: $!";
                      select(SECTION);

                      print "<html xmlns='$xmlns'>\n";
                      print "<head><title>$stitle: $heading</title></head>\n";

                      print "<body><address><p>part of <a rev='Section' href='$base.html'>$title</a><br />\n";
                      print "$docno $by</p></address>\n";
                      print "<h2><a name='sec$sec$sub'>$sec$sub</a> $heading</h2>\n";
                    }
                    476: 
                    # cite($citation)
                    #
                    # Prints one bibliography entry as a <dt>/<dd> pair on the current output
                    # handle.  $citation is the raw text of the entry (possibly several
                    # lines); it must start, after optional whitespace, with "[n]", otherwise
                    # nothing is printed.
                    #
                    # Text before the first double-quoted string is taken as the author list
                    # and the quoted string as the title.  The link target is an "RFC nnnn"
                    # mention (mapped to the IETF text URL) unless an explicit ftp/http URL
                    # is present, which takes precedence.
                    #
                    # An entry without a quoted title used to produce an empty
                    # <cite><a ...></a></cite>, i.e. an invisible link, and interpolated
                    # undefined values.  Such an entry is now printed as plain text, wrapped
                    # in the link when a target is known.  Output for entries with a title
                    # is unchanged.
                    sub cite{
                      my($citation) = @_;
                      local($_);
                      $_ = $citation;

                      my($num, $label, $addr);
                      my($by, $title) = ('', '');

                      # Normalise whitespace so the entry is a single line.
                      s/^\s*//;
                      s/\s+/ /g;

                      s/HTTPLat ency.html/HTTPLatency.html/; # URL split across lines

                      if(s/^(\[(\d+)\])\s*//){
                        ($num, $label) = ($2, $1);

                        # Strip 'authors "title"' from the front; $_ keeps the remainder.
                        if(s/^([^\"]*)\"([^\"]+)\"//){
                          ($by, $title) = ($1, $2);
                        }

                        if(/RFC (\d+)/){
                          my($RFCAddrFormat) = "http://www.ietf.org/rfc/rfc%04d.txt";
                          $addr = sprintf("$RFCAddrFormat", $1);
                        }

                        if(m;((ftp|http)://[^,> ]+);){
                          $addr = $1;
                          $addr =~ s/\.$//; # period at the end of a URL is probably punctuation
                        }

                        print "<dt><a name='bib$num'>$label</a></dt>\n";
                        if($title eq ''){
                          # No quoted title: there is nothing sensible to put in <cite>.
                          if($addr){
                            print "<dd><a href='$addr'>$_</a></dd>\n";
                          }else{
                            print "<dd>$_</dd>\n";
                          }
                        }
                        elsif($addr){
                          print "<dd>$by <cite><a href='$addr'>$title</a></cite> $_</dd>\n";
                        }else{
                          print "<dd>$by <cite>$title</cite> $_</dd>\n";
                        }
                      }
                    }
                    514: 
                    # colophon($base, $docno, $stitle, $by)
                    #
                    # Prints the closing <address> block of a page on the current output
                    # handle, followed by </body></html>.  The block credits the source
                    # document (linked at http://www.ietf.org/rfc/$base.txt) and this tool.
                    #
                    # The <cite> element is now closed right after the source link; it used
                    # to be left open, which made the page ill-formed XML.
                    sub colophon{
                      my($base, $docno, $stitle, $by) = @_;
                      my($revdate);
                      # CVS keyword string; the dollar signs are stripped for display.
                      $revdate = '$Revision: 1.3 $ $Date: 2001/06/28 14:55:02 $';
                      $revdate =~ s/\$//g;

                      print "<address>\n";
                      print "derived from <cite><a rel='derived-from' href='http://www.ietf.org/rfc/$base.txt'>$stitle</a></cite>, Internet $docno, $by<br class=''/>\n";
                      print "using <a href='http://dev.w3.org/cvsweb/2001/rfc2html/'>rfc2html</a> ", $revdate, " by ";
                      print "<a href='http://www.w3.org/People/Connolly/'>Dan Connolly</a>\n";
                      print "</address>\n";
                      print "</body></html>\n";
                    }
                    528: 

Webmaster