#!/usr/bin/perl -w
# gnuhtml2latex html to latex converter
# Copyright (c) 1999 Tomasz Wegrzanowski <maniek@beer.com>
#
# gnuhtml2latex is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#
# THIS IS VERY ALPHA

use strict;
use Getopt::Std;

# Parse the command line into %main::opts.  The switches -a, -f, -h, -o
# and -t take a value; -c, -n, -p and -s are plain flags.
getopts( 'a:cf:h:no:pst:', \%main::opts );

# Supply defaults for the value options that were not given: the document
# style defaults to `{article}', header and footer default to nothing.
$main::opts{o} = '{article}' if not defined $main::opts{o};
$main::opts{h} = ''          if not defined $main::opts{h};
$main::opts{f} = ''          if not defined $main::opts{f};

# Suffix appended to every sectioning command: empty (numbered sections)
# when -n was given, `*' (unnumbered sections) otherwise.
if ( $main::opts{n} ) {
    $main::num = '';
}
else {
    $main::num = '*';
}

{
# LaTeX text printed when a supported HTML element is opened.  Elements
# that do not appear here produce no output of their own.
my %tagstable_start = (

    # Paragraph and line breaks.
    p  => '\\par ',
    br => '\\\\',

    # Physical and logical character styles, each opened as a brace group.
    b      => '{\\bf ',
    i      => '{\\it ',
    u      => '\\underline{',
    em     => '{\\it ',
    tt     => '{\\tt ',
    kbd    => '{\\tt\\bf ',
    var    => '{\\it ',
    dfn    => '{\\bf\\it ',
    cite   => '{\\sc ',
    samp   => '{\\tt ',
    strong => '{\\bf ',

    # Headings; $main::num carries the optional `*' that suppresses
    # section numbering.  h5 and h6 both map to \subparagraph.
    h1 => '\\section' . $main::num . '{',
    h2 => '\\subsection' . $main::num . '{',
    h3 => '\\subsubsection' . $main::num . '{',
    h4 => '\\paragraph' . $main::num . '{',
    h5 => '\\subparagraph' . $main::num . '{',
    h6 => '\\subparagraph' . $main::num . '{',

    # Lists.  A definition term opens `\item[' and the following <dd>
    # supplies the closing bracket.
    ul => '\\begin{itemize}',
    ol => '\\begin{enumerate}',
    dl => '\\begin{description}',
    li => '\\item ',
    dt => '\\item[',
    dd => ']',

    # Verbatim text.
    listing => '\\begin{verbatim}',
);

# LaTeX text printed when a supported HTML element is closed.  Elements
# that do not appear here (p, br, li, dt, dd, ...) need no closing text.
my %tagstable_end = (

    # Character styles and headings are opened as a brace group, so a
    # single closing brace ends each of them.
    (
        map { ( $_, '}' ) }
            qw( b i u em h1 h2 h3 h4 h5 h6 tt kbd var dfn cite samp strong )
    ),

    # Environments opened with \begin{...} are closed with the matching
    # \end{...}.
    ul      => '\\end{itemize}',
    ol      => '\\end{enumerate}',
    dl      => '\\end{description}',
    listing => '\\end{verbatim}',
);

# Parser state shared by the handler subs in this block:
#   0 - outside <html>
#   1 - inside <html> (also the state after </head> or </body>)
#   2 - inside <head>
#   3 - inside <body>; the only state in which tags and text are printed
my $mode = 0;
# Everything up to the end of the enclosing block is compiled into
# HTML::LatexMaker, a subclass of HTML::Parser whose start/end/text
# methods print LaTeX.
package HTML::LatexMaker;
use HTML::Parser;
use HTML::Entities;
@HTML::LatexMaker::ISA = ( "HTML::Parser" );
1;

# Handler for an opening tag.
#   $self - the parser object (unused)
#   $tag  - name of the element being opened
# <html>, <head> and <body> only switch the parser state through
# start_mode().  Any other tag prints its entry from %tagstable_start,
# provided we are inside <body> (state 3) and the tag has an entry.
sub start {
    my ( $self, $tag ) = @_;

    # State entered by each structural tag.
    my %state_for = ( html => 1, head => 2, body => 3 );
    if ( exists $state_for{$tag} ) {
        start_mode( $state_for{$tag} );
        return;
    }

    return if $mode != 3;
    return if not defined $tagstable_start{$tag};
    print $tagstable_start{$tag};
}

# Handler for a closing tag.
#   $self - the parser object (unused)
#   $tag  - name of the element being closed
# </html>, </head> and </body> only switch the parser state through
# end_mode().  Any other tag prints its entry from %tagstable_end,
# provided we are inside <body> (state 3) and the tag has an entry.
sub end {
    my ( $self, $tag ) = @_;

    # State to fall back to when each structural element is closed.
    my %state_after = ( html => 0, head => 1, body => 1 );
    if ( exists $state_after{$tag} ) {
        end_mode( $state_after{$tag} );
        return;
    }

    return if $mode != 3;
    return if not defined $tagstable_end{$tag};
    print $tagstable_end{$tag};
}

# Handler for a run of document text.
#   $self - the parser object (unused)
#   $text - the raw text, possibly containing HTML entities
# Outside <body> (state other than 3) the text is dropped.  Otherwise the
# entities are decoded, characters that are special to LaTeX are escaped,
# and the result is printed.
#
# Translation works in two passes: constructs that need special treatment
# are first rewritten to private `!NAME;' markers, and the markers are
# turned into LaTeX only after the generic escaping has run, so that the
# LaTeX we generate is not escaped again.
sub text {
    my ( $self, $text ) = @_;
    return unless ( $mode == 3 );

    # Handle some things that decode_entities doesn't.
    # (This needs to be done *before* calling decode_entities: otherwise
    # there'd be no way of distinguishing `&FOO;' from `&amp;FOO;'.)

    # We use `!' for internal purposes during entity translation, so every
    # real exclamation mark (literal or numeric reference) is hidden first.
    $text =~ s/!|&\#(?:0*33|x0*21);/!bang;/g;

    # Handle `&lsquo;&ldquo;', `&ndash;&mdash;' and so on by inserting
    # thin space between the translations in such cases.
    $text =~ s/&\#(?:x0*2d|0*45);/-/g;
    $text =~ s/(&mdash;|&ndash;|-)(?=(?:&mdash;|&ndash;|-))/$1!thinsp;/g;
    $text =~ s/(&[lr][sd]quo;)(?=(?:&[lr][sd]quo;))/$1!thinsp;/g;

    # There are many things that decode_entities doesn't handle.
    # A few of those things we handle ourselves.  The final replacement
    # happens later (so that we correctly handle the various quotes
    # whether they're literal, numeric character ref, or symbolic ref).
    # In the meantime we change from `&FOO;' to `!FOO;'.
    $text =~ s/&([mn]dash|[lr][sd]quo|hellip);/!$1;/g;

    decode_entities($text);

    # Escape what LaTeX treats specially.  Backslashes are parked as a
    # marker first so that the backslashes added below are not touched.
    $text =~ s/\\/!backslash;/g;
    $text =~ s/([~`'"]+)/!verb|$1|/g;  #`;
    $text =~ s/([_&%\{\}#])/\\$1/g;
    $text =~ s/\$/\\\$/g;
    $text =~ s/\^/\\^{}/g;

    # Expand the private markers into their LaTeX form.
    $text =~ s/!backslash;/\$\\backslash\$/g;
    $text =~ s/!mdash;/---/g;
    $text =~ s/!ndash;/--/g;
    $text =~ s/!lsquo;/`/g;  #`;
    $text =~ s/!rsquo;/'/g;  #';
    $text =~ s/!ldquo;/``/g;
    $text =~ s/!rdquo;/''/g;

    # The backslash must be doubled: the replacement part is interpolated
    # like a double-quoted string, where a single `\l' means "lowercase the
    # next character" and would leave just `dots{}' in the output.
    $text =~ s/!hellip;/\\ldots{}/g;
    $text =~ s/!thinsp;/\$\\,\$/g;
    $text =~ s/!verb\|/\\verb|/g;
    $text =~ s/!bang;/!/g;

    # A non-breaking space becomes a LaTeX tie.
    $text =~ s/\xa0/~/g;
    print $text;
}

# Enter parser state $mode_new and print the LaTeX that goes with it.
#   $mode_new - 1 when <html> is opened, 2 for <head>, 3 for <body>
# State 1 prints the file banner and the \documentstyle line (-o).
# State 3 prints \begin{document} and the header (-h), then, as requested
# on the command line, the title block (-t / -a), the table of contents
# (-c) and a page break (-p).  State 2 prints nothing.
sub start_mode {
    my ( $mode_new ) = @_;

    if ( $mode_new == 1 ) {
        print '% This file was converted from HTML to LaTeX with' . "\n"
            . '% Tomasz Wegrzanowski\'s <maniek@beer.com> gnuhtml2latex program' . "\n"
            . '% Version : ' . $main::version . "\n"
            . '\\documentstyle' . $main::opts{o} . "\n";
    }

    if ( $mode_new == 3 ) {
        print '\\begin{document}' . "\n" . $main::opts{h};

        if ( defined $main::opts{a} or defined $main::opts{t} ) {
            # \maketitle stops with "No \title given" when no \title was
            # declared, so always emit one (empty if -t was not used).
            # `defined' rather than truth, so that a title or author of
            # `0' is kept.
            my $title  = defined $main::opts{t} ? $main::opts{t} : '';
            my $author = defined $main::opts{a} ? $main::opts{a} : '';
            print '\\title{' . $title . '}';
            print '\\author{' . $author . "}\n\\maketitle";
        }
        if ( $main::opts{c} ) { print "\n\\tableofcontents" }
        if ( $main::opts{p} ) { print "\n\\newpage" }
    }

    $mode = $mode_new;
}

# Leave the current parser state and switch to $mode_new.
#   $mode_new - state to continue in (0 after </html>, 1 after </head>
#               or </body>)
# When the state being left is 3 (inside <body>), the footer (-f) and
# \end{document} are printed first.
sub end_mode {
    my ( $mode_new ) = @_;

    my $leaving_body = ( $mode == 3 );
    print( $main::opts{f} . '\end{document}' . "\n" ) if $leaving_body;

    $mode = $mode_new;
}

}

$main::version = '0.1';

# Convert every file named on the command line.  Unless -s was given, the
# output for foo.html / foo.htm goes to foo.tex (any other name just gets
# `.tex' appended); with -s everything is written to standard output.
# A file that cannot be opened is reported on standard error and skipped.
foreach my $filename (@ARGV) {
    my $input;
    if ( $filename eq '-' ) {
        # Two-argument open used to treat `-' as standard input; keep that.
        $input = \*STDIN;
    }
    elsif ( !open( $input, '<', $filename ) ) {
        # Three-argument open: the name is never interpreted as a mode or
        # as a command to run.
        warn "$0: cannot open $filename: $!\n";
        next;
    }

    unless ( $main::opts{s} ) {
        # Work on a copy: $filename is an alias for the @ARGV element.
        ( my $outfile = $filename ) =~ s/\.html?$//;
        $outfile .= '.tex';
        unless ( open( STDOUT, '>', $outfile ) ) {
            # Do not let this document leak into whatever STDOUT was.
            warn "$0: cannot write $outfile: $!\n";
            close $input;
            next;
        }
    }

    my $doc = HTML::LatexMaker->new;
    $doc->parse_file($input);
    close $input;
}

=head1 NAME

gnuhtml2latex - html to latex converter

=head1 SYNOPSIS

B<gnuhtml2latex> F<[options]> F<filename>

F<-a> F<[author]> - specify author of document

F<-c>          - use table of contents

F<-f> F<[string]> - specify footer (inserted just before the end of the document)

F<-h> F<[string]> - specify header

F<-n>          - use numbered sections

F<-o> F<[string]> - specify document style

F<-p>          - break page after title / table of contents

F<-s>          - write to stdout

F<-t> F<[title]>  - specify title of document

=head1 DESCRIPTION

This aims to be a replacement for html2latex.

The program takes an HTML file foo.html or foo.htm
and produces the LaTeX file foo.tex from it.

=head1 NOT VERY AMBITIOUS TODO

For people who want only functionality of original html2latex

 bugfixes - I'm sure there are plenty of bugs inside
 clueful backslash escaping
 more entities from outside of iso-8859-1
 tables
 performance boost
 and a lot more

=head1 MORE AMBITIOUS TODO

For people who want a real tool

 make it part of some html processor
 rewrite in flex

=head1 FUTURE OF THIS PACKAGE

It is quite possible that the functionality of this package will be
included in some more general project. This package was made mainly to
make the world a bit more free.

=cut
