#!/usr/bin/perl -w

die "i don't want this to run accidentally on my web server";
#
# Copyright (C) 2005  Adrien "Krunch" Kunysz
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

=head1 NAME

mbox2html - Translate a Unix mailbox file to HTML.

=head1 SYNOPSIS

mbox2html [-T template_part=file] [-s MESSAGE_TRAILER] [-t PAGE_TITLE]
MBOX_FILE [> OUTPUT_FILE]

=head1 DESCRIPTION

Convert a Unix mailbox file (mbox) to HTML with nice threads organization.

=head1 OPTIONS

=over 4

=item -s --signature

If that string is found at end of message, it will be removed. This is
useful for example when your mailing list software automatically add a
signature to the end of each message. The string must be a valid Perl
regular expression. Remember to escape "@". Default is "\s*".

=item -t --title

Specify the page title to use. Default is "mbox2html".

=item -T --Template

Tell mbox2html to use a custom template for a certain part. Valid parts are:

=back

=over 8

=item mail

This one defines how a mail is printed. Valid variables are MAIL_SUBJECT,
MAIL_FROM, MAIL_DATE, MAIL_ID and MAIL_BODY.

=item thread_prefix

What to print just before a thread. There is no variable.

=item thread_postfix

What to print just after a thread. There is no variable.

=item footer

What to print at the end of the file. There is no variable.

=item header

What to print at the beginning of the file. Valid variables are CHARSET,
TITLE and the loop SUBJECTS_THREADS which contains ID and SUBJECT.

=back

=head1 EXAMPLE

mbox2html -t 'Example mailing list archive' -T mail=mail.tpl -T header=head.tpl
-s '(\s*--\nTo unsubscribe send a mail to unsubscribe\@example\.org\s*)+'
example.mbox > example.html

=head1 BUGS

=over 4

=item

Multipart mails are not supported yet. Notably, OpenPGP signatures
are not checked.

=item

This is too slow and it takes too much memory.

=item

It should be possible to keep only threads covering a specified time span.

=item

Encoding conversion will be supported when Mail::Message::Body::Encode
will support it. See <http://rt.cpan.org/Public/Bug/Display.html?id=20301>.

=item

There is no "-o" options: you have to use the shell to pipe the output to a file.

=item

It should be possible to put each thread in a different file.

=item

md5 hashes in URL is not very nice.

=item

There should be an example of template.

=item

It should be possible to use a configuration file.

=item

The -s option should be checked before being passed to the regexp engine.

=back

=head1 HISTORY

First version was started around september 2005. It used Mail::MboxParser,
was probably faster, only used the In-Reply-To header field to build
threads and didn't have any template support. This version (november 2005)
is a complete rewrite using Mail::Box, Mail::Thread and HTML::Template. It
has a lot more dependancies and is probably slower but smarter.

The -s option and documentation for the -t option were added in July 2006.

=head1 AUTHOR

Adrien "Krunch" Kunysz, <a_kunysz@yahoo.com>

The latest version of this software should be available at
L<http://krunch.servebeer.com/~krunch/>.

=cut

use strict;
use Mail::Box::Mbox;
use Mail::Thread;
use HTML::Template;
use Digest::MD5('md5_hex');  # should get rid of this at some point
use Getopt::Long qw(:config no_ignore_case);

### globals
my $page_title = 'mbox2html';
my $page_charset = 'iso-8859-15';
#my $page_charset = 'utf-8';
#my $msgtrailer = '(\s*--\nPour vous desabonner envoyez un mail a public-unsubscribe\@lilit\.be\n*)+';
my $msgtrailer = '\s*';
my %templates = (footer => { default => '</body></html>' },
				 thread_prefix  => { default => '<div class="thread">' },
				 thread_postfix => { default => '</div>' },
                 mail   => { default => <<EOT
<div class="mailHeader">
	<a href="#<TMPL_VAR NAME=MAIL_ID>"><TMPL_VAR NAME=MAIL_SUBJECT></a><br />
	From: <TMPL_VAR NAME=MAIL_FROM><br />
	Date: <TMPL_VAR NAME=MAIL_DATE><br />
</div>
<div class="mailBody">
	<TMPL_VAR NAME=MAIL_BODY>
</div>
EOT
},
                 header => { default => <<EOT
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
	<meta http-equiv="Content-Type" content="text/html; charset=<TMPL_VAR NAME=CHARSET>" />
	<title><TMPL_VAR NAME=TITLE></title>
	<style type="text/css">
		.mailHeader {
			background-color: #BBB;
			border: 1px dotted black;
		}
		.mailBody {
			border-left: 1px solid black;
			border-bottom: 1px solid black;
		}
		.mail {
			margin-left: 1em;
			background-color: #CCC;
		}
		.thread {
			margin-bottom: 2em;
		}
	</style>
</head>
<body>
<ul>
	<TMPL_LOOP NAME="THREADS_SUBJECTS">
		<li><a href="#<TMPL_VAR NAME=ID>"><TMPL_VAR NAME=SUBJECT></a></li>
	</TMPL_LOOP>
</ul>
EOT
});

### subs
sub usage {
	print STDERR "Usage:\n\t$0 MBOX_FILE\n";
	exit -1;
}

sub html_encode($) {
	my $string = shift;
	$string =~ s/&/&amp;/g;
	$string =~ s/</&lt;/g;
	$string =~ s/>/&gt;/g;
	$string =~ s/"/&quot;/g;
	return $string;
}

# First arg is the template name as found in %templates.
# Second arg is a hash reference to pass to HTML::Template->param.
sub print_templated($$) {
	my $template_name = shift;
	my $template_values = shift;
	return undef unless defined $templates{$template_name};

	my $template;
	if (defined $templates{$template_name}{'file'}) {
		$template = HTML::Template->new(filename => $templates{$template_name}{'file'});
	}
	else {
		$template = HTML::Template->new(scalarref => \$templates{$template_name}{'default'});
	}
	$template->param($template_values) if defined $template_values;
	print $template->output;
}

# Convert a mail ID to a valid HTML id.
sub id2html {
	# can't start with a number
	return 'mid' . md5_hex($_[0]);
}

# arg is a Mail::Message::Body
sub body2string($);
sub body2string($) {
	my $body = shift;
	if (!$body->isMultipart) {
		return undef unless $body->isText;
		# charset conversion is not implemented yet in Mail::Message::Body
		return $body->encode(mime_type => 'text/plain',
		                     charset => $page_charset,
		                     transfer_encoding => 'none');
	}
	for ($body->parts) {
		my $retval = body2string($_->body);
		return $retval if defined $retval;
	}
	return undef;
}

# The only arg is a Mail::Thread::Container.
# Recursively print mail and replies using $mail_template.
sub print_mail($);
sub print_mail($) {
	my $container = shift;
	return unless defined $container && defined $container->message;

	my $msg = $container->message;
	my $body = body2string($msg->body);
	$body =~ s/$msgtrailer$// if defined $msgtrailer;
	$body = html_encode($body);
	$body =~ s/\n/<br \/>/g;
	my $id = id2html($msg->messageId);
	my $from = $msg->head->study('from') || $msg->get('from');

	my $template;
	if (defined $templates{'mail'}{'file'}) {
		$template = HTML::Template->new(filename => $templates{'mail'}{'file'});
	}
	else {
		$template = HTML::Template->new(scalarref => \$templates{'mail'}{'default'});
	}
	print "<div class=\"mail\" id=\"$id\">";
	print_templated('mail', { MAIL_SUBJECT => html_encode($msg->head->study('subject')),
	                          MAIL_FROM    => html_encode($from),
	                          MAIL_DATE    => $msg->head->study('date'),
	                          MAIL_BODY    => $body,
	                          MAIL_ID      => $id});
	print_mail($container->child) if $container->child;
	print '</div>';
	print_mail($container->next) if $container->next;
}

# This sub will be passed to Mail::Thread->order.
# It will just sort by date (as returned by Mail::Message->timestamp).
my $cmp_date = sub {
	sub cmpdate ($$) {
		return -1 if !defined $_[0]->message;
		return  1 if !defined $_[1]->message;
		return $_[0]->message->timestamp <=> $_[1]->message->timestamp;
	}
	return sort cmpdate @_;
};

### actual program starts here
GetOptions('Template=s%' => sub {
                               usage() if (@_ != 3 || !$templates{$_[1]});
                               $templates{$_[1]}{'file'} = $_[2];
                           },
           'title=s'     => \$page_title,
		   'charset=s'   => \$page_charset,
		   'signature=s'   => \$msgtrailer
);
my $mbox_file = $ARGV[0] || usage();

my $mbox = Mail::Box::Mbox->new(folder => $mbox_file);
my $threader = Mail::Thread->new($mbox->messages);
$threader->thread;
$threader->order($cmp_date);

my @threads_subjects = ();
for ($threader->rootset) {
	next unless defined $_->message;
	my $subject = $_->message->head->study('subject');
	my $id = id2html($_->message->messageId);
	push @threads_subjects, { SUBJECT => html_encode($subject),
                              ID      => $id};
}

print_templated('header', { CHARSET => $page_charset,
                            TITLE   => $page_title,
                            THREADS_SUBJECTS => \@threads_subjects });

for ($threader->rootset) {
	&print_templated('thread_prefix');
	print_mail($_);
	&print_templated('thread_postfix');
}

&print_templated('footer');
