A mutiliated version of Jamie Zawinksi's irc2html.pl that creates basic MediaWiki tables instead of HTML. Used by the Accessibility Team to post IRC meeting logs. Works great with irssi log format; can't speak for anything else.
#!/usr/bin/perl -w # irc2wiki.pl --- Hacked up irc2html.pl that spits plain MediaWiki tables # Sandy Armstrong <sanfordarmstrong@gmail.com> # irc2html.pl --- converts raw chat logs to a readable HTML table-ized form. # Copyright © 1998-2006 Jamie Zawinski <jwz@jwz.org> # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # Created: 11-Jun-98. # # usage: irc2html.pl [--verbose] [--strip] [--no-color] infile outfile # Or, just install it as a CGI on your web server. # # The format it expects from the logs is the format written by, e.g., tkirc: # each line has a \t on it, and the part before the tab is the person talking, # or the action happening ("[ join ]" and so on) and the part after the tab # is the actual text. # # A number of other formats are accepted as well, e.g., # # [17:01:17] <user> blah blah # and # 13:25 [xxx(~xxx@00.11.222.333)] blah blah # 13:25 [msg(xxx)] blah blah # # If the line doesn't match any of the known patterns, then we assume that # the stuff before the first space is the action and the rest is the text. # # The lines alternate white and gray, changing each time the person talking # changes (so multiple lines by the same person are colored the same.) # Or, if --no-color is specified, the talkers are separated by solid # horizontal lines. # # If a line ends in something that looks like a timestamp, that goes in # column 3. # # The --strip option means to strip out non-talking lines, like "[ join ]". # # URLs are made clickable. # # Color codes, bold, etc. are also supported, thanks to # Andy Brezinsky <andy@mbrez.com> and Juerd <juerd@juerd.nl>. # # # This script also works as a CGI: just make a symlink (e.g., irc2html.cgi) # and it will present you with a text area where you can paste your char log. # When you submit, it will return the HTML document. This might save some # time messing around with temporary files. # # 5-May-2006: jwz: updated to use CSS instead of <FONT> tags. require 5; use diagnostics; use strict; my $progname = $0; $progname =~ s@^.*/([^/]+)$@$1@; my $version = q{ $Revision: 1.23 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/; my $verbose = 0; my $strip_seconds = 1; # whether to keep timestamps in hh:mm only. my $strip_timestamps = 0; # whether to omit any third column entirely. my $strip_noise = 0; # whether to strip "join", "leave", etc. my $colorize_p = 1; # whether to do background colors. my $separators_p = 0; # whether to do horizontal lines. my $chunk_size = 200; # start a new table after this many lines. my $url_re = q{\b(s?https?|ftp|file|gopher|s?news|telnet|mailbox):} . q{(//[-A-Z0-9_.]+:\d*)?} . q{[-A-Z0-9_=?\#\$\@~\`%&*+|\/.,;\240]+}; my $fg_color = "#000"; my $bg_color1 = "#FFF"; my $bg_color2 = "#EEE"; $ENV{USER} = ($ENV{REMOTE_ADDR} || '?') unless defined ($ENV{USER}); my $doc_head = ( "" ); my $stylesheet = ("<!-- This style sheet can be omitted if you don't care\n" . " about the colored text that was used below. -->\n" . "<STYLE TYPE=\"text/css\">\n" . " <!--\n" . " .fc0 { color:white } .bc0 { background:white }\n" . " .fc1 { color:black } .bc1 { background:black }\n" . " .fc2 { color:navy } .bc2 { background:navy }\n" . " .fc3 { color:green } .bc3 { background:green }\n" . " .fc4 { color:red } .bc4 { background:red }\n" . " .fc5 { color:darkred } .bc5 { background:darkred }\n" . " .fc6 { color:purple } .bc6 { background:purple }\n" . " .fc7 { color:orange } .bc7 { background:orange }\n" . " .fc8 { color:yellow } .bc8 { background:yellow }\n" . " .fc9 { color:lime } .bc9 { background:lime }\n" . " .fc10 { color:aqua } .bc10 { background:aqua }\n" . " .fc11 { color:C0C0FF } .bc11 { background:C0C0FF }\n" . " .fc12 { color:blue } .bc12 { background:blue }\n" . " .fc13 { color:magenta } .bc13 { background:magenta }\n" . " .fc14 { color:gray } .bc14 { background:gray }\n" . " .fc15 { color:silver } .bc15 { background:silver }\n" . # " .fixed {font-family: fixedsys, courier new, courier, fixed;\n" . # " font-size: xx-small; }\n" . " -->\n" . "</STYLE>\n" . "\n"); my $margin_style = " STYLE=\"border-top: 1px solid;\""; # used for separators my $input_lines = 0; my $output_lines = 0; sub convert_line_body($) { my ($line) = @_; # don't allow any single word to be longer than N chars -- insert spaces. # (actually, insert non-breaking spaces (240) which we convert later.) 1 while ($line =~ s/^(.*?)([^ \t\n\240]{40})([^ \t\n\240].*)$/$1$2\240$3/s); $line =~ s/&/&/g; $line =~ s/</</g; $line =~ s/>/>/g; # $line =~ s/\"/"/g; # make urls clickable. #$line =~ s@($url_re)@<A HREF="$1">$1</A>@gi; # if the url appeared to end in punctuation, back up over it. #$line =~ s@([-.,!;?\#]+)(\">)@$2@gs; #$line =~ s@([-.,!;?\#]+)(</A>)@$2$1@gs; # if we inserted any non-breaking spaces inside URLs, just nuke them. #1 while ($line =~ s@(<A HREF=\"[^\"\240]*)\240([^\"]*\">)@$1$2@gi); # convert remaining non-breaking spaces to real spaces. $line =~ s/\240/ /g; #if ($verbose) { # $_ = $line; # while (s@<A HREF=\"(.*?)\">@@i) { # print STDERR "$progname: line $input_lines: URL: $1\n"; # } #} # if there are any ctl characters in it, do color-decoding $line = convert_color ($line) if ($line =~ m/[\000-\010\013\014\016-\037]/); return $line; } my $prev_head = ""; my $prev_stamp = ""; my $bg_color = $bg_color1; my $line_tick = 0; my $any_styles = 0; # whether any red hot style sheet action is needed sub convert_line($) { ($_) = @_; # alternate log format... s@^[[(](\d?\d:\d\d(:\d\d)?)[])] (.*)$@$3 {$1}@gs; # leading timestamp s@^(\d?\d:\d\d(:\d\d)? ?([AP]M)?) (.*)$@$4 {$1}@gsi; # leading timestamp # leading time/date stamp s@^\[\d\d?/\d\d/\d\d \@ (\d?\d:\d\d(:\d\d)?)\]\s+(.*)$@$3 {$1}@gs; # IRSSI leading full date log format s@^\d\d?-...-\d\d\d\d (\d\d?:\d\d(:\d\d)?)\s+(.*)$@$3 {$1}@gs; # GAIM inline timestamps (alone on a line with leading whitespace) s@^ +((\d\d?:)?\d\d?:\d\d)$@*** $1@gs; # goofy name+host format... s@^\[msg\(([^()\[\] \t]+)\)\] @>> $1\t@gsi; s@^\[([^()\[\] \t]+)\(([^()\[\] \t]+)\)\] @<$1>\t@gsi; if (m@^\s*$@) { $prev_head = ""; $bg_color = $bg_color1; my $color_style = ''; if ($colorize_p) { $color_style = " STYLE=\"background:$bg_color\""; } return "<TR><TD COLSPAN=2$color_style><BR></TD></TR>\n"; } my ($head, $body) = m@^([^\t]+)[\t]+(.*)$@s; my $stamp = undef; if (!defined($head)) { # it's best if the head and body are separated by a tab; but if we can't # get that, try for it separated by spaces. # if (m@^(<[^> \t\n]+>) (.*)$@s) { if (m@^(<[\@\+ ]?[^> \t\n]+>) (.*)$@s) { $head = $1; $body = $2; } elsif (m@^(\* [^> \t\n]+) (.*)$@s) { $head = $1; $body = $2; } elsif (m@^(\[ *[^\]]+\]) (.*)$@s) { $head = $1; $body = $2; } elsif (m@^([^\[\]<> \t\n:]+:) (.*)$@s) { $head = $1; $body = $2; } elsif (m@^(>) (.*)$@s) { $head = $1; $body = $2; } elsif (m@^(\*+ *[^\[\]<> \t\n:]+) (.*)$@s) { $head = $1; $body = $2; } else { $head = ""; $body = $_; print STDERR "$progname: unparsable line $input_lines: $_\n" if $verbose; } } # If there's a timestamp at the end of the line, put it in column 3. # Otherwise, there is no column 3. if ($body =~ m@^(.*?)\s*[({]?\s*(\d\d?:\d\d(:\d\d)?( ?[AP]M)?)\s*[)}]?\s*$@i) { $body = $1; $stamp = $2; if ($strip_timestamps) { $stamp = ''; } elsif ($strip_seconds) { $stamp =~ s@^(\d?\d:\d\d):\d\d$@$1@; } if ($stamp eq $prev_stamp) { $stamp = undef; } else { $prev_stamp = $stamp; } } if ($strip_noise && $head =~ m@^(\[|\*\*\*)@) { print STDERR "$progname: deleting noise line $input_lines: $head\n" if $verbose; return ""; } $head = convert_line_body ($head); $body = convert_line_body ($body); my $continuation = 1; my $color_changed_p = 0; if ($head ne $prev_head) { $prev_head = $head; $continuation = 0; $color_changed_p = 1; if ($bg_color eq $bg_color1) { $bg_color = $bg_color2; } else { $bg_color = $bg_color1; } } $head .= " "; if ($head =~ m@^(\[|\*\*\*)@ || $head =~ m@\(\s*\d\d\d+\s*\)@) { $head = "''$head''"; $body = "''$body''"; } elsif ($continuation) { $head = (""); } else { $head = "'''$head'''"; } # Specify the foreground color in the table cells too, in case this # html gets pasted somwehere with conflicting document colors. # my $color_style = ''; if ($separators_p && $color_changed_p && $output_lines != 0) { $color_style = $margin_style; } elsif ($colorize_p) { $color_style = (" STYLE=\"color:$fg_color" . ($bg_color eq $bg_color1 ? '' : "; background:$bg_color") . "\""); } my $result = "|-\n" . "| $head\n" . "| $body\n"; if ($stamp) { $result .= "| $stamp\n"; } if ($line_tick++ > $chunk_size) { $line_tick = 0; $color_style = ''; if ($bg_color ne $bg_color1) { if ($separators_p) { $color_style = $margin_style; } elsif ($colorize_p) { $color_style = " STYLE=\"background:$bg_color1\""; } } } $result =~ s@<TD VALIGN=TOP NOWRAP></TD>@<TD></TD>@g; # simplify... $result =~ s@ NOWRAP(>[^\s]+?</TD>)@$1@g; # if no spaces, don't need nowrap $output_lines++; return $result; } sub convert_color($) { my ($str) = @_; $str =~ s/\e\[.*?[a-zA-Z]//g; if ($str =~ /(<BR>)/i) { $str =~ s//\cO$1/g; } else { $str .= "\cO"; } $str =~ s/\cV(.*?)\cV/\cC0,1$1\cO/g; $str =~ s/\cV/\cC0,1/; my @chars = (split(//, $str)); my $ret = ''; my ($CCfc, $CCbc, $CCnc); my $CCb = 0; my $CCu = 0; while (@chars) { my $char = shift @chars; if ($char eq "\cB") { $ret .= ($CCb ? '</b>' : '<b>'); $CCb = !$CCb; } elsif ($char eq "\c_") { $ret .= ($CCu ? '</u>' : '<u>'); $CCu = !$CCu; } elsif ($char eq "\cO") { if ($CCb) { $CCb = 0; $ret .= '</b>'; } if ($CCu) { $CCu = 0; $ret .= '</u>'; } if ($CCnc) { $CCbc= 0; $ret .= '</SPAN>' x $CCnc; $CCfc= 0; $CCnc = 0; } } elsif ($char eq "\cC") { my $CCcs = ''; while (($char = shift @chars) =~ /[\d,]/) { $CCcs .= $char; } if ($char =~ /[^\d,]/) { unshift @chars, $char; } if ($CCcs eq '') { if ($CCbc) { $ret .= '</SPAN>'; $CCnc--; } if ($CCfc) { $ret .= '</SPAN>'; $CCnc--; } $CCbc = undef; $CCfc = undef; } else { my @k = split /,/, $CCcs; if (defined($k[0]) && $k[0] ne '') { $CCfc = $k[0]; $ret .= "<SPAN CLASS=fc$CCfc>"; $any_styles++; $CCnc++; } if (defined($k[1]) && $k[1] ne '') { $CCbc = $k[1]; $ret .= "<SPAN CLASS=bc$CCbc>"; $any_styles++; $CCnc++; } } } else { $ret .= $char; } } return $ret; } my $buffered_line = undef; sub convert($) { my($line) = @_; if ($line =~ m/^[A-Za-z]/ && $line !~ m/^[^ \t\r\n:]+: /) { $buffered_line = '' unless defined ($buffered_line); $buffered_line .= $line; return ""; } else { my $ret = (defined ($buffered_line) ? convert_line ($buffered_line) : ""); $buffered_line = $line; return $ret; } } # convert everything in the given string. sub convert_body($) { my ($body) = @_; my $color_style = ''; if ($colorize_p) { $color_style = " STYLE=\"background:$bg_color\""; } my $output = "{|border=\"0\" cellpadding=\"5\" cellspacing=\"0\" align=\"center\"\n"; $input_lines = 0; foreach (split ("\n", $body)) { $input_lines++; $output .= convert ($_); } $output .= convert (""); $output .= "|-\n|}"; my $ss = ($any_styles ? $stylesheet : ""); $_ = $doc_head; s/%%STYLESHEET%%/$ss/; $output = $_ . $output; return $output; } # This is the main loop when we seem to have been invoked as a CGI script. # sub do_cgi() { error (400, "malformed URL") if ($ENV{PATH_INFO} || $ENV{QUERY_STRING}); if ($ENV{REQUEST_METHOD} eq 'GET') { # # GET means this is the top-level invocation: just display the HTML # for the form in which the user can paste an IRC log. # print STDOUT "Content-Type: text/html\n\n"; print STDOUT ("<HTML>\n" . " <HEAD>\n" . " <TITLE>$progname</TITLE>\n" . " </HEAD>\n" . " <BODY>\n" . " <FORM METHOD=POST>\n" . " <TABLE>\n" . " <TR>\n" . " <TD COLSPAN=2 VALIGN=TOP ALIGN=CENTER>\n" . " <DIV ALIGN=CENTER>\n" . " <B><SPAN STYLE='font-size:larger'>Chat log to HTML Converter<BR></SPAN>\n" . " by <A HREF=\"http://www.jwz.org/\">" . "Jamie Zawinski</A><BR>\n" . " Version <A HREF=\"http://www.jwz.org/hacks/irc2html.pl\">" . "$version</A>\n" . " </SPAN>\n" . " Paste your IRC/AIM log here:</B><BR>\n" . " <TEXTAREA NAME=\"body\" ROWS=10 COLS=60 WRAP=OFF></TEXTAREA>\n" . " </TD>\n" . " </TR>\n" . " <TR>\n" . " <TD VALIGN=TOP ALIGN=LEFT NOWRAP>\n" . " <INPUT NAME=\"strip_status\" TYPE=CHECKBOX>" . " Strip Status Lines<BR>\n" . " <INPUT NAME=\"strip_timestamps\" TYPE=CHECKBOX>" . " Strip Timestamps<BR>\n" . " <INPUT NAME=\"colorize\" TYPE=CHECKBOX CHECKED> Colorize\n" . " </TD>\n" . " <TD VALIGN=MIDDLE ALIGN=RIGHT NOWRAP>\n" . " <INPUT TYPE=SUBMIT VALUE=\"Generate HTML\">\n" . " \n" . " <INPUT TYPE=RESET VALUE=\"Clear\">\n" . " </TD>\n" . " </TR>\n" . " </TABLE>\n" . " </FORM>\n" . " </BODY>\n" . "</HTML>\n"); } elsif ($ENV{REQUEST_METHOD} eq 'POST') { # # POST means that the form has been submitted; read the log text and # print out the corresponding HTML. # my $args = ''; while (<>) { $args .= $_; } $colorize_p = 0; # for cgi, must default to 0 my $body = ''; foreach (split ('&', $args)) { my ($key, $val) = m/^(.*?)=(.*)$/s; if ($key eq 'body') { $body .= $val; } elsif ($key eq 'strip_status') { $strip_noise = ($val eq 'on'); } elsif ($key eq 'strip_timestamps') { $strip_timestamps = ($val eq 'on');} elsif ($key eq 'colorize') { $colorize_p = ($val eq 'on'); } else { error (400, "unknown option $key"); } } $separators_p = !$colorize_p; # $colorize_p = 1;#### $body =~ s/\+/ /g; # spaces are encoded $body =~ s/%([\dA-F]{2})/chr(hex($1))/gexi; # de-hexilate $body =~ s@\r\n@\n@sg; # CRLF -> LF $body =~ s@\r@\n@sg; # CR -> LF print STDOUT "Content-Type: text/html\n\n"; print STDOUT convert_body ($body); } else { error (400, "$ENV{REQUEST_METHOD} not supported"); } } sub error($$) { my ($http_status, $err) = @_; if (defined($ENV{REQUEST_URI})) { print "Status: $http_status\n"; print "Content-Type: text/html\n"; print "\n<TITLE>Error</TITLE>\n"; print "<H1>$http_status</H1><P>\n"; $err =~ s/&/&/g; $err =~ s/</</g; $err =~ s/>/>/g; print "$err\n<P>\n"; exit (0); } else { print STDERR "$progname: $err\n"; exit 1; } } sub usage() { print STDERR "usage: $progname [--verbose] [--strip] [--no-color]\n" . " [input-chat-log] [output-html-file]\n"; exit 1; } sub main() { my $in = undef; my $out = undef; if (defined ($ENV{REQUEST_URI})) { return do_cgi(); } while ($_ = $ARGV[0]) { shift @ARGV; if ($_ eq "--verbose") { $verbose++; } elsif ($_ eq "--strip") { $strip_noise++; } elsif ($_ eq "--no-time") { $strip_timestamps++; } elsif ($_ eq "--no-color") { $colorize_p = 0; $separators_p = 1; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^-./) { usage; } elsif (!defined($in)) { $in = $_; } elsif (!defined($out)) { $out = $_; } else { usage; } } $in = "-" unless $in; $out = "-" unless $out; local *IN; local *OUT; open (IN, "<$in") || die "$progname: unable to read $in: $!\n"; open (OUT, ">$out") || die "$progname: unable to write $out: $!\n"; $in = "stdin" if $in eq "-"; $out = "stdout" if $out eq "-"; if ($verbose) { print STDERR "$progname: converting $in to $out...\n"; } my $body = ''; $body .= $_ while (<IN>); print OUT convert_body ($body); close OUT; close IN; if ($verbose) { print STDERR "$progname: wrote $out ($output_lines lines).\n"; } } main(); exit 0;