htmlbalance: new plugin that balances tags by parsing and re-serializing

Simon McVittie 2008-11-16 18:11:39 +00:00
parent 408d483dc2
commit e7a840ed9a
5 changed files with 84 additions and 4 deletions

View File

@ -0,0 +1,57 @@
#!/usr/bin/perl
package IkiWiki::Plugin::htmlbalance;
# htmlbalance: Parse and re-serialize HTML to ensure balanced tags
#
# Copyright 2008 Simon McVittie <http://smcv.pseudorandom.co.uk/>
# Licensed under the GNU GPL, version 2, or any later version published by the
# Free Software Foundation
use warnings;
use strict;
use IkiWiki 2.00;
sub import { #{{{
	# Called when this module is loaded with "use"; registers each of the
	# plugin's callbacks through hook(), all under the id "htmlbalance".
	my @registrations = (
		[ getsetup => \&getsetup ],
		[ sanitize => \&sanitize ],
	);
	foreach my $registration (@registrations) {
		my ($type, $callback) = @$registration;
		hook(type => $type, id => "htmlbalance", call => $callback);
	}
} # }}}
sub getsetup () { #{{{
	# Describe this plugin for the "getsetup" hook.
	# Returns a two-element list: the key "plugin" followed by a hash
	# reference holding safe => 1 and rebuild => undef.
	# NOTE(review): the meaning of "safe" and "rebuild" is defined by
	# ikiwiki's setup machinery, not here -- see its plugin documentation.
	my %plugin_info = (
		safe => 1,
		rebuild => undef,
	);
	return (plugin => \%plugin_info);
} #}}}
sub sanitize (@) { #{{{
	# Sanitize hook: parse the HTML in $params{content} with
	# HTML::TreeBuilder and serialize it again, so that the returned
	# markup has balanced tags.
	#
	# Parameters (named, passed as a flat key/value list):
	#   content - the HTML fragment to rebalance.
	# Returns the re-serialized fragment as a string.
	my %params=@_;
	my $ret = '';

	# Load the parser modules lazily. This must be a *string* eval: a
	# "use" inside a block eval is still executed at compile time, so a
	# missing module would abort compilation of this whole file and $@
	# would never be set here.
	eval q{
		use HTML::TreeBuilder;
		use XML::Atom::Util qw(encode_xml);
	};
	if ($@) {
		error($@);
		# NOTE(review): error() presumably does not return; if it ever
		# does, fall back to handing the content back untouched.
		return $params{content};
	}

	my $tree = HTML::TreeBuilder->new_from_content($params{content});

	# disembowel() yields the parsed content nodes detached from the
	# wrapper tree: element objects for tags, plain strings for text.
	my @nodes = $tree->disembowel();
	foreach my $node (@nodes) {
		if (ref $node) {
			# Element node: serialize it (as_XML closes every open
			# tag), strip a trailing newline from the accumulated
			# output, then release the node.
			$ret .= $node->as_XML();
			chomp $ret;
			$node->delete();
		}
		else {
			# Bare text node: escape it for inclusion in markup.
			$ret .= encode_xml($node);
		}
	}
	$tree->delete();
	return $ret;
} # }}}
1 # A module file must end by evaluating to a true value when it is loaded.

View File

@ -9,9 +9,9 @@ New users of aggregate should enable the `aggregateinternal => 1` option in the
.setup file. If you don't do so, you will need to enable the [[html]] plugin
as well as aggregate itself, since feed entries will be stored as HTML.
The [[meta]] and [[tag]] plugins are also recommended. The
[[htmltidy]] plugin is suggested, since feeds can easily contain html
problems, some of which tidy can fix.
The [[meta]] and [[tag]] plugins are also recommended. Either the
[[htmltidy]] or [[htmlbalance]] plugin is suggested, since feeds can easily
contain html problems, some of which these plugins can fix.
You will need to run ikiwiki periodically from a cron job, passing it the
--aggregate parameter, to make it check for new posts. Here's an example

View File

@ -0,0 +1,9 @@
[[!template id=plugin name=htmlbalance author="Simon McVittie"]]
[[!tag type/html]]
This plugin ensures that the HTML emitted by ikiwiki contains well-balanced
HTML tags, by parsing it with HTML::TreeBuilder and re-serializing it. This
acts as a lighter-weight alternative to [[plugins/htmltidy]]; it doesn't
ensure validity, but it does at least ensure that formatting from a
blog post pulled in by \[[![[ikiwiki/directive/inline]]]] doesn't
leak into the rest of the page.

View File

@ -7,4 +7,5 @@ emitted by ikiwiki. Besides being nicely formatted, this helps ensure that
even if users enter suboptimal html, your wiki generates valid html.
Note that since tidy is an external program that is run each time a page
is built, this plugin will slow ikiwiki down somewhat.
is built, this plugin will slow ikiwiki down somewhat. [[plugins/htmlbalance]]
might provide a faster alternative.

13
t/htmlbalance.t 100755
View File

@ -0,0 +1,13 @@
#!/usr/bin/perl
# Tests for IkiWiki::Plugin::htmlbalance: each case feeds a small HTML
# fragment to sanitize() and checks the exact re-serialized output.
use warnings;
use strict;
use Test::More tests => 7;

BEGIN { use_ok("IkiWiki::Plugin::htmlbalance"); }

# Each case is [ input, expected output, description ]. The description is
# printed with the result, so a failing case can be identified by name
# rather than only by its number.
my @cases = (
	[ '<br></br>', '<br />',
	  'empty br pair becomes a single self-closed tag' ],
	[ '<div><p b="c">hello world</div>', '<div><p b="c">hello world</p></div>',
	  'unclosed p inside div gets its closing tag' ],
	[ '<a></a></a>', '<a></a>',
	  'surplus closing tag is dropped' ],
	[ '<b>foo <a</b>', '<b>foo </b>',
	  'malformed opening tag is discarded' ],
	[ '<b> foo <a</a></b>', '<b> foo </b>',
	  'malformed tag pair is discarded and leading whitespace kept' ],
	[ 'a>', 'a&gt;',
	  'bare > in text is escaped as an entity' ],
);

foreach my $case (@cases) {
	my ($input, $expected, $description) = @$case;
	is(IkiWiki::Plugin::htmlbalance::sanitize(content => $input),
		$expected, $description);
}