# author: Anton Tagunov # 5 April 2002 # version 0.023 our $myV="\n\n formatted to HTML from plain text\n". " by scientific htmlizer version 0.023\n". " by Anton Tagunov \n". " http://tagunov.tripod.com\n". " \n". " Scientific Research Center\n". " Moscow State University\n". " http://srcc.msu.su\n". " The author is willing to communicate :-)\n\n"; # # converts text to HTML # # look at survey.txt and survey.html # and figure out what it does # # inspired by perl pod :-) use strict; use warnings; our $DBG_HV; sub NORM(){ 0 }; sub VERB(){ 1 }; sub REFR(){ 2 }; sub HEDR(){ 3 }; sub GLUE(){ 4 }; #sub GLUEdbg1(){ 5 }; #sub GLUEdbg2(){ 6 }; sub hl_get_line(;$); our @GlobGLUE = ( GLUE ); #our @GlobGLUEdbg1 = ( GLUEdbg1 ); #our @GlobGLUEdbg2 = ( GLUEdbg2 ); our $RefrRE = '[ .[:alnum:]_:-]{3,32}'; sub refr_tr($){ $_[0] =~ tr/ .:-/____/;} sub cntmrk($); sub cntmrk2($); our $txt; $txt = $ARGV[0]; die "Usage: htmlizer [-fix | -v]\n". " -fix rewrite the source file, regenerate table". " of contents\n". " -v add comments to the generated HTML\n". " that explain why certain paragraphs have\n". " been formatted verbatim or not\n" unless defined $txt; die "File $txt not found: $!\n" unless -f $txt; open F, $txt or die "Can not open file $txt: $!\n"; my ($ttitle,$te,$tp1,$tp2,$tmain, $pttitle,$pte,$ptp1,$ptp2,$ptmain); my $addon; if ( defined $ARGV[1] ){ if ( $ARGV[1] eq '-fix' ){ ($pttitle,$pte,$ptp1,$ptp2,$ptmain)= \($ttitle,$te,$tp1,$tp2,$tmain); }elsif( $ARGV[1] eq '-v' ){ $::DBG_HV = 1; }elsif( $ARGV[1] =~ /^-/ ){ die "Unknown option $ARGV[1]\n"; }else{ open (ADDON, '<', $ARGV[1]) or die "can not open '$ARGV[1]' for reading: $!\n"; $addon = 1; } } my $title = &get_block($pttitle); my ($p1,$p2) = (&get_block($ptp1),&get_block($ptp2)); #&html_toc(&get_block()); &check_it_toc(&get_block(undef,$pte)); #swallow contents my @doc; for(;;){ my @blocks = &get_par($ptmain); last unless defined $blocks[0]; push @doc, @blocks; } close F; if ( defined $ARGV[1] && $ARGV[1] eq '-fix' ){ print "Rewriting '$txt', putting TABLE OF CONTENTS...\n"; my $bckp = '~'.$txt; print "creating backup '$bckp'...\n"; open O, '>', $bckp or die "Can't open '$bckp' for writing: $!\n"; open F, '<', $txt or die "Can't open '$txt' for reading: $!\n"; my ($modtm, $now) = ((stat F)[9],time); while(){ chomp; print O $_,"\n" or die "Failed writing to '$txt': $!\n" } close F; close O; print "rewriting '$txt'...\n"; open F, '>', $txt or die "Can not write '$txt': $!\n"; print F $ttitle or die "Can not write '$txt': $!\n"; print F $tp1 or die "Can not write '$txt': $!\n"; print F $tp2 or die "Can not write '$txt': $!\n"; if (defined $te) { print F $te or die "Can not write '$txt': $!\n"; } &txt_toc(*F,\@doc); print F $tmain or die "Can not write '$txt': $!\n"; close F; utime $now,$modtm,$bckp or warn "Failed to set ". "modification time on ".$bckp."to ".(gmtime $modtm)."\n"; utime $now,$modtm,$txt or warn "Failed to set ". "modification time on ".$txt."to ".(gmtime $modtm)."\n"; print "Done.\n"; }else{ &html_start($title); &html_preamble($p1,$p2); &html_toc(\@doc); for my $block (@doc){ my $type = $$block[0]; &html_verb($block) if ( $type == VERB ); &html_norm($block) if ( $type == NORM ); &html_refr($block) if ( $type == REFR ); &html_hedr($block) if ( $type == HEDR ); print "

\n" if ( $type == GLUE ); #print "

\n" if ( $type == GLUEdbg1 ); #print "

\n" if ( $type == GLUEdbg2 ); } &html_end(); } #======================================================== =pod =head formats NORM data VERB data HEDR id level label data REFR id label data =head recognizes - aaa bbb - cc -d style lists and converts them to UL note html_ hedr/refr/norm/verb all spoil the orig arrays =cut sub html_start{ my $title = shift; print <<"EOT"; EOT print ''; html_plain( $title ); print <<'EOT';
EOT print '

'; html_plain( $title ); print "

\n"; } sub html_preamble{ my ($a, $b, $tm) = (shift, shift, scalar gmtime); my $txttm = gmtime ((stat($txt))[9]); $txttm =~ s/\S+\s+(\S+)\s+(\d+)\s+\d+:\d+:\d+\s+(\d+)/$1 $2 $3/; print <<"EOT"; EOT print '\n
'; html_full_br( $a ); print ""; html_full_br( $b ); my $tml = ($tm) . " GMT"; print <<"EOT"
$txttm text version of this document
EOT } sub html_toc($){ my $block = shift; return unless defined $block; print <<'EOT';

TABLE OF CONTENTS

EOT for (@$block){ next unless $$_[0] == HEDR; my ($id,$level,$word,$label) = @$_[1..4]; next unless defined($id); $level+=2; $word = ucfirst lc $word; print "$word$label "; html_hedr_engine( $_, 5 ); print "\n"; } print "
\n"; } sub txt_toc(*$){ *F = shift; die "txt_toc failure: F not defined" unless defined *F{IO}; my $block = shift; return unless defined $block; print F "TABLE OF CONTENTS\n" or die "Failed writing to '$txt': $!\n"; for (@$block){ next unless $$_[0] == HEDR; my ($id,$level,$word,$label) = @$_[1..4]; next unless defined($id); $word = ucfirst lc $word; print F ' ' x $level or die "Failed writing to '$txt': $!\n"; print F $word,$label,' ' or die "Failed writing to '$txt': $!\n"; my $r; if ($word eq '' && $level == 0 && ( $r = 3 - length $label ) > 0 ){ print F ' ' x $r or die "Failed writing to '$txt': $!\n"; } for ( @$_[5..$#$_-1] ) { print F $_,' ' or die "Failed writing to '$txt': $!\n"; } print F $$_[-1],"\n" or die "Failed writing to '$txt': $!\n"; } print F "\n" or die "Failed writing to '$txt': $!\n"; } # my ($id,$level,$label,$tail) = &addr( &chchp($_) ); # next unless defined($id); sub html_end{ print <<"EOT";
EOT } sub html_plain($;$){ my ($block, $thr) = (shift, shift); return unless defined $block; my $s; $thr = 0 unless defined $thr; for (@$block[$thr..$#$block-1]){ $s = $_; html_min_proc($s); print $s,"\n"; } $s = $$block[-1]; html_min_proc( $s ); print $s; } sub html_hedr_engine($;$){ my ($block, $thr) = (shift, shift); return unless defined $block; my $s; $thr = 0 unless defined $thr; for (@$block[$thr..$#$block-1]){ $s = $_; html_min_proc($s); print $s,"\n"; } $s = $$block[-1]; html_hedr_proc( $s ); print $s; } sub html_full_br($){ my $block = shift; return unless defined $block; my $s; for (@$block[0..$#$block-1]){ $s = $_; html_full_proc($s); print $s,"
\n"; } $s = $$block[-1]; html_full_proc( $s ); print $s; } sub find_sub_block($$){ my $block = shift; return unless defined $block; my ($tha,$thb)=(shift); return unless defined $tha; for(;;++$tha){ return unless defined $$block[$tha]; last if $$block[$tha] =~ /\S/ } $thb=$tha+1; for(; defined $$block[$thb] && $$block[$thb] =~ /\S/; ++$thb){ } ($tha,$thb); } sub html_verb($){ my $first_sub = 1; my $block = shift; return unless defined $block; my $s; my ($ntha,$nthb) = find_sub_block($block,1); return unless defined $nthb; my ($ntbl,$nmsg) = heurist_verb($block,$ntha,$nthb); for(;;){ my ($tha,$thb,$tbl,$msg)=($ntha,$nthb,$ntbl,$nmsg); last unless defined $thb; # print STDERR "1.#$tha#$thb#\n"; for(;;){ ($ntha,$nthb) = find_sub_block($block,$thb); last unless defined $nthb; ($ntbl,$nmsg) = heurist_verb($block,$ntha,$nthb); last; #turning this feature off: #the spaces creted by empty lines in #PRE look too long #last unless $tbl && $ntbl; #$thb=$nthb; #print "\n

".$tbl."

\n" # if $tbl && $DBG_HV; #$tbl=$ntbl; #undef $ntbl; } print "\n

".$msg."

\n" if $DBG_HV; # print STDERR "2.#$tha#$thb#\n"; print "

\n" unless $first_sub; undef $first_sub; if ($tbl){ print '
';
            for (@$block[$tha..$thb-2]){
                $s = $_; &html_verb_proc($s);  
                print $s,"\n"; 
            }
            $s = $$block[$thb-1];  &html_verb_proc( $s ); 
            print $s,"
\n"; }else{ my ($lev,$o,$wl) = (0); # list level #

open?, was line? for (@$block[$tha..$thb-1]){ my $oldlev=$lev; if ( /\S/ ){ print "

" unless $o; $o = 1; my ($cnt,$mrk); ($cnt,$mrk,$s) = cntmrk2 $_; $cnt = $cnt/2; if ($lev == 0 && $cnt + $mrk> 0){ print "

"; } undef $wl if $lev > $cnt + 1; for(;$lev>$cnt+1;--$lev){ print "\n\n"; } if($lev>$cnt){ print ""; if ($mrk == 0){ print "\n"; $lev--; }else{ print "\n" if $wl; undef $wl; print "
  • "; $mrk--; } } if ($lev<$cnt+$mrk){ print "\n" if $wl; undef $wl; } for(;$lev<$cnt+$mrk;++$lev){ print $lev == 0 ? "
    • ": "
      • "; } if ($lev == 0 && $oldlev > 0){ print "

        "; } &html_full_proc($s); print "\n" if $wl; print $s; $wl = 1; }else{ for(;$lev>0;$lev--){ print "

      • \n
      \n" } if ($oldlev > 0){ print "

      " } print "

      \n" if $o; undef $o; undef $wl; } } if ( $o ){ my $oldlev = $lev; for(;$lev>0;$lev--){ print "
    \n" } if ($oldlev > 0){ print "

    " } print "

    \n"; } } } print "\n"; } sub html_hedr($){ my $block = shift; return unless defined $block; my $s; my ($id,$level,$word,$label)=(@$block[1..4]); $level+=2; $word = ucfirst lc $word; print "\n\n$word$label "; html_hedr_engine($block,5); #for(@$block[5..$#$block-1]){ # my $s = $_; # &html_hedr_proc($s); # print $s,"\n"; #} #$s = $$block[-1]; &html_hedr_proc( $s ); #print $s; print "\n"; } sub html_norm($){ my $block = shift; return unless defined $block; my ($lev,$v,$i,$u)=(0); print '

    '; for($i=1; $i<=$#$block; $i++){ $v = $$block[$i]; if (ref $v){# print STDERR "marker $$v\n"; my $old = $lev; if ( $lev == 0 && $$v > 0 ){ print "

    \n"; } if ( $lev == $$v && $lev > 0 ){ print "
  • \n
  • "; } if ( $lev < $$v ){ print "
      \n" unless $u; undef $u; print "
    • "; $lev++; } while( $lev < $$v ){ print "
      • "; $lev++; } while( $lev > $$v+1 ){ print "
      • \n
      \n"; $lev--; } if ( $lev > $$v ){ print "
    • \n"; $u = 1; $lev--; } }else{ &html_full_proc($v); #insert .., #conver http://, a@a.b if ( $u ){ print "
    \n"; print "

    " if $lev == 0; undef $u; } print $v; print "\n" if $i < $#$block; } } my $lm = $u || $lev > 0; #list mode print "\n\n" if $u; while( $lev > 0 ){ print "

  • \n"; $lev--; print "\n" if $lev > 0; } print "

    " if $lm; #for better vertical margins print "

    \n\n"; } sub html_refr($){ my $block = shift; return unless defined $block; my $s; my ($id,$label)=($$block[1], $$block[2]); print "\n\n
    \n\n
    [$label]
    \n"; print "
    "; for(@$block[3..$#$block-1]){ $s = $_; &html_full_proc($s); print $s,"
    \n"; } $s=$$block[-1]; &html_full_proc( $s ); print $s,"
    \n"; } sub check_it_toc($){ my $block = shift; die "TABLE OF CONTENTS not found in '$txt'" unless defined $block && defined $$block[0]; die "TABLE OF CONTENTS expected '$$block[0]' found" unless $$block[0]=~/^\s*TABLE\s+OF\s+CONTENTS\s*$/i; for (@$block[1..$#$block]){ die "Table contents entry expected, '$_' found" unless /^\s*(?:\d+\.|\d+(?:\.\d+)+)\s+ | ^\s*Appendix\s+[[:alpha:]]+\.\s+/xi; } } sub heurist_any($$;$$){ my ($block,$qr,$tha,$thb) = (shift,shift,shift,shift); return undef unless defined $block && $#$block > 0 && defined $qr; $tha = 1 unless defined $tha; $thb = $#$block+1 unless defined $thb; for(@$block[$tha..$thb-1]){ return 1 if $_ =~ $qr; } undef; } sub heurist_all($$;$$){ my ($block,$qr,$tha,$thb) = (shift,shift,shift,shift); return undef unless defined $block && $#$block > 0 && defined $qr; $tha = 1 unless defined $tha; $thb = $#$block+1 unless defined $thb; for(@$block[$tha..$thb-1]){ return undef unless $_ =~ $qr; } 1; } INIT{ our $DBG_HV = 0; } sub heur_minmax_violation($$){ defined $_[0] && ( $_[0] * 10 < $_[1] *6 ); } #try to guess if this is a table or a citation? # #conservative: considers table if in doubt # #ret: table? sub heurist_verb($;$$){ my ($block,$tha,$thb) = (shift,shift,shift); return unless defined $block; my $s; $tha = 1 unless defined $tha; $thb = $#$block+1 unless defined $thb; return (1,"#more then 3 spaces in a row\n") if (heurist_any($block,qr{\S\s{3,}\S},$tha,$thb)); return (2,"#special for Thanks section: this looks like". " a list of names") if (heurist_all($block,qr{ ^\s*(?: (?: [[:upper:]][[:alpha:]:-]* | [\d-]+ | [[:alpha:]]{1,3} ) (?: [.,:\s-]+ | \z ) )* \z }x,$tha,$thb)); # check if this is a list? see get_par my $lev; my $min; my $max; my $l; $lev = 0; for (@$block[$tha..$thb-1]){ my ($reg,$cnt,$mrk) = ( scalar(/\S/) , cntmrk $_); if ($reg){ if ($cnt % 2){ print return (1, "uneven number of spaces in list structure\n"); } if ( ($cnt = $cnt / 2) > $lev ){ return (1, "hierarchy rules violated, ". "going vcerbatim\n"); } } unless ( $reg && $lev == $cnt && $mrk == 0 ){ #print "

    cleanup:====cnt=$cnt, mrk=$mrk,". # "lev = $lev\n

    "; last if heur_minmax_violation $min, $max; undef $l; undef $min; undef $max; }else{ #print "

    bypass:====cnt=$cnt, mrk=$mrk,". #" lev = $lev\n

    "; if ( defined $l ){ $min = $l unless defined $min && $min <= $l; } } if ( $reg ){ $l = length; $max = $l unless defined $max && $max >= $l; } $lev = $cnt + $mrk; } if ( heur_minmax_violation $min, $max ){ return (1,"#min=$min, max=$max#\n"); } my ($first,$spec,$tot,$msg)=(1,0,0,''); for (@$block[$tha..$thb-1]){ if ($first && m{ #is this a header? \G( \[[[:alnum:]:,."'\s+=-]+\]:?\s* \d+(?:\.\d+)*\.?\s* | \[[[:alnum:]:,."'\s+=-]+\]:?\s* | (?:\d+(?:\.\d+)*\.?\s*)? ) }gx){ $tot++; $msg.="<0>$1</0>". "\n" if $DBG_HV; } undef $first; while (m{\G (?: ( #word with two [[:upper:]]'s [\]\[/\\()[:lower:]:,."'+=-]* [[:upper:]] [\]\[/\\()[:lower:]:,."'+=-]* [[:upper:]] \S*\s* ) #$1 | ( #word with two [-+=]'s [\]\[/\\()[:alpha:]:,."']* [[+=-]] [\]\[/\\()[:alpha:]:,."']* [[+=-]] \S*\s* ) #$2 | ( #word with a special or a digit [\]\[/\\()[:alpha:]:,."'+=-]* [^\]\[/\\()[:alpha:],."'\s+=-] \S*\s* ) #$3 | ( #simple word [\]\[/\\()[:alpha:]:,."'+=-]+ (?: \s+ | \z ) ) #$4 ) }gx){ $msg.="<0>$4</0>". "\n" if ($DBG_HV && $4); $msg.="".( $1 && "<1>$1</1>" || $2 && "<2>$2</2>" || $3 && "<3>$3</3>" )."\n" if ($DBG_HV && !$4); $tot++; $spec++ unless $4; } } my $ret = "#$spec#".($msg?"($msg)":''). "#$tot# - signals " if $DBG_HV; if ( $spec*2 >= $tot ){ $ret .= "verbatim\n" if $DBG_HV; #print STDERR "\t\t$spec\t$tot\n"; return (1, $ret); }else{ $ret .= "normal\n" if $DBG_HV; #print STDERR "$spec\t$tot\n"; return (0, $ret); } } =pod =head2 liner_get_block returns (\array) this array has block_type as its first element NORM,VERB or REFR. NORM: | |This is a normal |block | VERB | | This is a verbatim | block | REFR | |[ECMA 35] Character Code Str... | Standard ECMA-35 | Equal number of spaces is cut of from VERB and body of REFR block. For example the two blocks above will be returned as |This is a verbatim | block |Character Code Str... |Standard ECMA-35 the first space prepended line in a block starts a special block the first non-prepeneded line in a special starts normal block zero-length lines end any kind of block between prepended lines are part of the special block otherwise block separators GLUE is inserted when a NORM switches to VERB or back without blank lines GLUE is usefull for CSS trick =cut our $buf; my @intg=(0); #array to keep integers. will use to build refs sub hl_get_line(;$){ my $accum = shift; my $ret = ; return unless defined $ret; chomp $ret; ($$accum .= $ret).="\n" if defined $accum; $ret =~ s/\s+$//; return $ret unless $ret =~ s/\\$//; my $go; do{ my $hl_buf = ; last unless defined $hl_buf; chomp $hl_buf; ($$accum .= $hl_buf).="\n" if defined $accum; $hl_buf =~ s/\s+$//; $go = $hl_buf =~ s/\\$//; $ret .= $hl_buf; }while($go); $ret; } my $glue_flag; sub get_par($){ my $str = shift; my $old_glue = $glue_flag; undef $glue_flag; my $v = $buf; undef $buf; unless (defined $v){ while( defined ($v=hl_get_line $str) ){ last if $v =~ /\S/; }} return unless ( defined $v ); my $verb = $v =~ /^\s/; #$verb - input in verb mode? my @accum; my @ret = (\@accum); @accum=( VERB ) if $verb; unless( $verb ){ my ($rid, $rlabel, $rtail) = &refr($v); my $rvv = hl_get_line $str if defined $rid; my ($hid, $hlevel, $hword, $hlabel, $htail) = &addr($v) unless defined $rid; if ( defined $rid && ( !defined $rvv || $rvv !~ /\S/ ) ){ #do we allow one-line REFR-s? No. @accum = ( NORM, $v ); }elsif ( defined $rid && $rvv =~ /^\s/ ){ #okay, this is REFR #setup for if ($verb) part to run @accum = ( REFR, $rid, $rlabel); push @accum, $rtail if defined $rtail; $v = $rvv; $verb=1; }elsif ( defined $hid ){ @accum = ( HEDR, $hid, $hlevel, $hword, $hlabel, $htail ); while ( defined ($buf = hl_get_line $str) ){ unless ( $buf =~ /\S/ ){ undef $buf; last; } #okay, we'll accept the /^\s/ lines :-) #last if $buf =~ /^\s/; $buf =~ s/^\s+//; push @accum, $buf; } }else{ #!defined($id) || $vv=~/^\S/ @accum = (NORM); my $lev = 0; #for UL-s my $ls = 0; for(;;){ my $s; if (defined $v){ $s = $v; $v = $rvv; undef $rvv;} unless ( defined $s ){ last unless defined ($buf = hl_get_line $str); $s = $buf; } unless ( $s =~ /\S/ ){ undef $buf; last } my ($cnt,$mrk); unless (do{#check level, leave $buf ($cnt,$mrk,$s) = cntmrk2 $s; if ( $cnt % 2 != 0 ){ 0; #not allowed! :-) }elsif( ($cnt = $cnt / 2) > $lev ){ 0; }else{ # 0 < $cnt <= $lev if ( $cnt == $lev - 1 && $mrk == 1 ){ push @accum, \($intg[$lev] = $lev); }else{ if ( $cnt < $lev ){ push @accum, \($intg[$cnt]=$cnt); } $lev = $cnt + $mrk; if ( $mrk > 0 ){ push @accum, \($intg[$lev]=$lev); } } 1; } }){ #push @ret, \@GlobGLUEdbg1; push @ret, \@GlobGLUE; $glue_flag = 1; last; } if ( $cnt == 0 && $mrk == 0 ){ if ( $old_glue && $ls == 1 ){# Add an option to #toggle this? my $v = '- ' . $accum[-2]; splice @accum, -3, 2, $v; } $ls = 0; }else{ $ls++; } push @accum, $s if $s =~ /\S/; } if ( $old_glue && $ls == 1 ){ #add an option? my $v = '- ' . $accum[-1]; splice @accum, -2, 2, $v; } } } if ( $verb ){ #this will work for VERB and NORM my $min = &spaces( $v ); push @accum, $v; my $thr = $#accum; #threshold my $linger = 0; while( defined ($buf = hl_get_line $str )){ if ($buf =~ /^\S/){ #push @ret, \@GlobGLUEdbg2 unless $linger; push @ret, \@GlobGLUE unless $linger; $glue_flag =1 unless $linger; last; } $v = $buf; undef $buf; if ( $v =~ /^\s*$/ ){ $linger++; }else{ my $new_min = &spaces( $v ); $min = $new_min if $new_min < $min; for (1..$linger){ push @accum, ''; } $linger = 0; push @accum, $v; } } for $v (@accum[$thr..$#accum]){ substr( $v, 0, $min ) = '' if length $v; } } @ret; } #just swallow all non-empty lines sub get_block{ my $v; my @accum; my ($str,$e) = (shift, shift); while(defined ($v = hl_get_line $str )){ if ( $v =~ /^\s*\*[\s*]*$/ || #drop markers $#accum < 0 && $v !~ /\S/){ ($$e .= $v ) .= "\n" if defined $e; next; } last unless $v =~ /\S/; push @accum, $v; } $#accum >=0 ? \@accum : undef; } =pod =head2 sub html_verb_proc fetches http:// ftp:// mail@mail.com, (2.2): and converts them to A HREF= called directly for verbatim (special) paragraphs and idirectly for normal paragraphs =cut sub html_verb_proc{ &html_min_proc($_[0]); $_[0] =~ s{((?:http|ftp)://[.a-zA-Z0-9/?%_-]+)} {$1}g; $_[0] =~ s{([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+)} {$1}g; $_[0] =~ s{\((\d+(?:\.\d+)*)\)}{($1)}g; undef; } =pod =head2 sub html_full_proc fetches tokens like RFC 2222 JIS X 0208:1266 JIS_C6226-1111-jp latin1 US-ASCII fitting chars are [[:alnum:]_*$-]. ' ','.' and ':' are permitted as part separators '' quotations _ * $ anywhere withing word two C's, more then 3 chars total digit,leter | digit,- - more then 3 chars total =cut sub par_val_proc($){ $_[0]=~s{(")|(<)|(>)}{ $1 && '"' || $2 && '<' || $3 && '>' }gex; } sub html_full_proc{ $_[0] =~ s{ ((?:http|ftp)://[.a-zA-Z0-9/?%_-]+) | #1 ( [a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+ | (?:"[^"]+"\s*)? < [a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+ > ) | #2 \((\d+(?:\.\d+)*)\) | #3 \[($RefrRE)\] | #4 (\t) | #5 (\s) | #6 (&) | #7 (\d+-[[:alpha:]]+) | #8, save this from becoming (?]* ) >(?![[:alnum:]_]) |#9 (?) | #14 (?:(?i:(appendixes))\s+([[:upper:]]+(?: (?: ,\s* | \s+and\s+ | \s+or\s+ ) [[:upper:]]+ )* ) ) | #15,16 (?:(?i:(appendix))\s+([[:upper:]]+)) | #17,18 (!=) | #19 L<([^<>|]*)> | #20 L<([^<>|]*) \| ( [^<>|]* (?:< [^<>|]* > [^<>|]*)? )> #21,22 }{ $1 && "$1" || $2 && do{ my $addr = $2; par_val_proc $addr; "$addr" } || $3 && "($3)" || $5 && ' ' || $4 && do{ my ($id,$label) = ($4,$4); refr_tr($id); $label =~ s/(?<=\S)\s(?=\S)/ /g if length $label < 16; "[$label]" } || $6 && ' ' || $7 && '&' || $8 && $8 || $9 && '<'.$9.'>'|| $10 && "'$10'"|| $11 && "$11" || $12 && "$12" || $13 && '<' || $14 && '>' || $15 && $15.' '.do{ my $s = $16; my $res; for(;;){ last unless $s=~/\G([[:upper:]]+)([^[:upper:]]*)/g; ($res .= "$1").=$2 } $res; } || $17 && $17." $18" || $19 && '≠' || $20 && "$20" || ($21 || $22) && do{ my ($text,$addr) = ($21, $22); par_val_proc $text; par_val_proc $addr; "$text" } }gex; undef; } sub html_min_proc{ $_[0] =~ s{\t}{ }g; $_[0] =~ s{\s}{ }g; $_[0] =~ s{&}{&}g; $_[0] =~ s{<}{<}g; $_[0] =~ s{>}{>}g; undef; } sub html_hedr_proc{ $_[0] =~ s{ (\t) | #1 (\s) | #2 (&) | #3 (<) | #4 (>) | #5 (!=) #6 - this may turn bad if # c or Perl code is processed # introduce an option to toggle # this behavior? }{ $1 && ' ' || $2 && ' ' || $3 && '&' || $4 && '<' || $5 && '>' || $6 && '≠' }gex; undef; } sub level($){ my $id = shift; return -1 unless defined $id; my $level=0; while($id=~/\./g){ $level++; } $level; } sub addr($){ my $v = shift; return unless defined $v; my ($i1,$i2,$tail) = $v =~ /^\s*(?:(\d+(?:\.\d+)+)|(\d+)\.)\s*(.*)$/; my $item = $i1?$i1:$i2; my $level = level $item if defined $item; return ('A'.$item, $level, '', $item=~/\./? $item : $item.'.', $tail) if defined($item); my $word; #print STDERR "***$v***\n" if $v =~ /appendix/i; ($word,$item,$tail) = $v =~ /^(APPENDIX)\s+(\w+)\.\s+(.*)$/i; return ('B'.$item, 0, $word.' ', $item.'.', $tail) if defined($item); (undef,'',$v); } sub refr($){ my $v = shift; return unless defined $v; my ($label,$tail) = $v =~ /^\[ ( $RefrRE ) \]\s*([[:upper:]].+)?$/xo; if (defined $label){ my $id = $label; refr_tr($id); return ('R'.$id, $label, $tail); } undef; } sub spaces{ my ($v,$cnt) = (shift,0); $cnt++ while( $v =~ /\G\s/g ); $cnt; } sub chchp{ my $v = shift; return unless defined $v; ($v) =~ /^\s*(.*?)\s*$/; $v; } sub cntmrk($){ my ($cnt,$mrk)=(0,0); $cnt++ while $_[0]=~/\G\s/gc; $mrk++ while $_[0]=~/\G- /g; ($cnt,$mrk); } sub cntmrk2($){ my ($cnt,$mrk,$s)=(0,0,shift); $cnt++ while $s=~s/^\s//; $mrk++ while $s=~s/^-\s//; ($cnt,$mrk,$s); }