#!/usr/bin/perl -W
#
# I found one or two scripts to translate html code to phpBB but
# they didn't do enough things for me.
#
# Script by Salagir, 2005
# GPL licence
#
use strict qw(subs vars refs);
if (!defined $ARGV[0]) {
print "\n Use: ./html2bbcode.pl htmlfile.html \n";
exit;
}
my $file = $ARGV[0];
$file =~ s/\.x?html?$//;
print " html2bbcode.pl is running. Using '$ARGV[0]', we generate '$file.bb' \n";
open (BUFFY, $ARGV[0]) or die (" Can't find file '$ARGV[0]', stopped.\n");
my @allf = <BUFFY>;
my $all = "";
foreach(@allf) {
$_ =~ s/[\r\n]//;
$all .= " ".$_;
}
close(BUFFY);
#kill unnecessary
$all =~ s/<head.*<\/head>//i;
$all =~ s/<!DOCTYPE[^>]*>//;
$all =~ s/<\/?(html|body)[^>]*>//ig;
# unnecessary spaces
$all =~ s/\s+/ /g;
# regexp part for any attributes in a tag, which we don't care about
my $rt = '(?:\s[^>]*)?';
# tags remplacments
my %remplacements = qw(b b strong b i i em i u u blockquote quote pre code ul list);
while (my($k,$v) = each(%remplacements)) {
$all =~ s/<$k$rt>(.*?)<\/$k>/\[$v\]$1\[\/$v\]/ig;
}
$all =~ s/<ol$rt>(.*?)<\/ol>/\[list=1\]$1\[\/list\]\n/ig;
$all =~ s/<li$rt>(.*?)(<\/?li>|\[\/list\])/lis($1,$2)/eig;
$all =~ s/<\/li>//ig; # pb [A] resolution
sub lis {
my ($content, $nexttag) = @_;
return '[*]'.$content.'[/list]' if ($nexttag eq '[/list]');
return "[*]$content" if ($nexttag eq '</li>');
return "[*]$content [*]"; # pb [A] : what of the </li> which <li> we just turned into '[*]', if it exists?
}
#special chars
%remplacements = ('nbsp', ' ', 'amp', '&', 'copy', '(c)', 'quot', '"' );
while (my($k,$v) = each(%remplacements)) {
$all =~ s/&$k;/$v/ig;
}
$all =~ s/&(a|e|i|o|u|y)(grave|acute|circ|uml|tilde|ring);/$1/gi; # if I was not lazy I'd do all the cases and keep the accents... but who still uses these codes anyway? You? Well... patch it then :)
# links
$all =~ s/<a\s[^>]*href=["']?([^"'>]+)["']?[^>]*>(.*?)<\/a>/\[url=$1\]$2\[\/url\]/ig;
# case where a "mailto" was in the link (BBcode doesn't accept these links)
$all =~ s/\[url=mailto:(.*?)\](.*?)\[\/url\]/$2 ( $1 )/g;
# images (does not handle relative links [of course] )
$all =~ s/<img\s[^>]*src=["']?([^"'>]+)["']?[^>]*>/\[img\]$1\[\/img\]/ig;
# font size and headers
my $size;
for(my $i=1; $i<6; $i++) {
$size = (5-$i) * 4 + 8;
$all =~ s/<h$i(?:\s[^>]*)?>(.*?)<\/h$i>/\r\r\[size=$size\]$1\[\/size\]\r\r/ig;
}
$all =~ s/<small(?:\s[^>]*)?>(.*?)<\/small>/\[size=9\]$1\[\/size\]\r\r/ig;
$all =~ s/<big>(.*?)<\/big>/\[size=18\]$1\[\/size\]\r\r/ig;
#people who use <font size= color=> should be killed and that's hard to do so WHATEVER
#but i'm little hypocrit because I don't handle css, making no way to use colorized text... colorized text is for dummies!! WHATEVER!!! :p
# new lines and paragraphs
# paragraphs that are not closed but followed by another <p> (correct in html, not xhtml. Can also be closed by other tags but... I don't care )
$all =~ s/<p(?:\s[^>]*)?>(.*?)(<\/?p>)/paragraph($1,$2)/eig;
$all =~ s/<p(\s[^>]*)?>(.*?)<\/p>/\r$2\r\r/ig;
$all =~ s/<div(\s[^>]*)?>(.*?)<\/div>/\r$2\r/ig;
$all =~ s/<br ?\/?>\s*/\r/ig;
sub paragraph {
my ($content, $nexttag) = @_;
return "\r$content\r\r" if ($nexttag eq '</p>');
return "\r".$content."\r<p>";
}
# all others tags
$all =~ s/<[^\/][^>]*>(.*?)<\/[^>]*>/$1/ig;
$all =~ s/\r/\n/g;
open (BUFFY, ">$file.bb") or die (" Can't open for writing file '$file.bb', stopped.\n");
print BUFFY $all;
close(BUFFY);