#!/usr/bin/perl -W
 
#
# I found one or two scripts to translate html code to phpBB but
# they didn't do enough things for me.
# 
# Script by Salagir, 2005
# GPL licence
# 
 
use strict qw(subs vars refs);
 
if (!defined $ARGV[0]) {
	print "\n Use:  ./html2bbcode.pl  htmlfile.html \n";
	exit;
}
 
my $file = $ARGV[0];
$file =~ s/\.x?html?$//;
 
print "     html2bbcode.pl is running. Using '$ARGV[0]', we generate '$file.bb' \n";
 
open (BUFFY, $ARGV[0]) or die (" Can't find file '$ARGV[0]', stopped.\n");
my @allf = <BUFFY>;
my $all = "";
foreach(@allf) {
	$_ =~ s/[\r\n]//;
	$all .= " ".$_;
}
close(BUFFY);
 
 
#kill unnecessary
$all =~ s/<head.*<\/head>//i;
$all =~ s/<!DOCTYPE[^>]*>//;
$all =~ s/<\/?(html|body)[^>]*>//ig;
 
# unnecessary spaces
$all =~ s/\s+/ /g;
 
# regexp part for any attributes in a tag, which we don't care about
my $rt = '(?:\s[^>]*)?';
 
# tags remplacments
my %remplacements = qw(b b  strong b  i i  em i  u u  blockquote quote  pre code  ul list);
while (my($k,$v) = each(%remplacements)) {
	$all =~ s/<$k$rt>(.*?)<\/$k>/\[$v\]$1\[\/$v\]/ig;
}
$all =~ s/<ol$rt>(.*?)<\/ol>/\[list=1\]$1\[\/list\]\n/ig;
$all =~ s/<li$rt>(.*?)(<\/?li>|\[\/list\])/lis($1,$2)/eig;
$all =~ s/<\/li>//ig; # pb [A] resolution
 
sub lis {
	my ($content, $nexttag) = @_;
	return '[*]'.$content.'[/list]' if ($nexttag eq '[/list]');
	return "[*]$content" if ($nexttag eq '</li>');
	return "[*]$content [*]"; # pb [A] : what of the </li> which <li> we just turned into '[*]', if it exists?
}
 
#special chars
%remplacements = ('nbsp', ' ',   'amp', '&',   'copy', '(c)',   'quot', '"' );
while (my($k,$v) = each(%remplacements)) {
	$all =~ s/&$k;/$v/ig;
}
$all =~ s/&(a|e|i|o|u|y)(grave|acute|circ|uml|tilde|ring);/$1/gi; # if I was not lazy I'd do all the cases and keep the accents... but who still uses these codes anyway? You? Well... patch it then :)
 
# links
$all =~ s/<a\s[^>]*href=["']?([^"'>]+)["']?[^>]*>(.*?)<\/a>/\[url=$1\]$2\[\/url\]/ig;
# case where a "mailto" was in the link (BBcode doesn't accept these links)
$all =~ s/\[url=mailto:(.*?)\](.*?)\[\/url\]/$2 ( $1 )/g;
 
# images (does not handle relative links [of course] )
$all =~ s/<img\s[^>]*src=["']?([^"'>]+)["']?[^>]*>/\[img\]$1\[\/img\]/ig;
 
# font size and headers
my $size;
for(my $i=1; $i<6; $i++) {
	$size = (5-$i) * 4 + 8;
	$all =~ s/<h$i(?:\s[^>]*)?>(.*?)<\/h$i>/\r\r\[size=$size\]$1\[\/size\]\r\r/ig;
}
$all =~ s/<small(?:\s[^>]*)?>(.*?)<\/small>/\[size=9\]$1\[\/size\]\r\r/ig;
$all =~ s/<big>(.*?)<\/big>/\[size=18\]$1\[\/size\]\r\r/ig;
#people who use <font size= color=> should be killed and that's hard to do so WHATEVER
#but i'm little hypocrit because I don't handle css, making no way to use colorized text... colorized text is for dummies!! WHATEVER!!! :p
 
# new lines and paragraphs
# paragraphs that are not closed but followed by another <p> (correct in html, not xhtml. Can also be closed by other tags but... I don't care )
$all =~ s/<p(?:\s[^>]*)?>(.*?)(<\/?p>)/paragraph($1,$2)/eig;
$all =~ s/<p(\s[^>]*)?>(.*?)<\/p>/\r$2\r\r/ig;
$all =~ s/<div(\s[^>]*)?>(.*?)<\/div>/\r$2\r/ig;
$all =~ s/<br ?\/?>\s*/\r/ig;
 
sub paragraph {
	my ($content, $nexttag) = @_;
	return "\r$content\r\r" if ($nexttag eq '</p>');
	return "\r".$content."\r<p>";
}
 
 
# all others tags
$all =~ s/<[^\/][^>]*>(.*?)<\/[^>]*>/$1/ig;
 
$all =~ s/\r/\n/g;
 
open (BUFFY, ">$file.bb") or die (" Can't open for writing file '$file.bb', stopped.\n");
print BUFFY $all;
close(BUFFY);