2006-05-05 07:41:11 +02:00
|
|
|
#!/usr/bin/perl
|
|
|
|
package IkiWiki::Plugin::htmlscrubber;
|
|
|
|
|
|
|
|
use warnings;
|
|
|
|
use strict;
|
2007-04-27 04:55:52 +02:00
|
|
|
use IkiWiki 2.00;
|
2006-05-05 07:41:11 +02:00
|
|
|
|
2008-02-10 23:07:21 +01:00
|
|
|
# This regexp matches urls that are in a known safe scheme.
|
|
|
|
# Feel free to use it from other plugins.
|
|
|
|
# Declared here without a value; import() compiles the pattern and assigns it.
our $safe_url_regexp;
|
|
|
|
|
2006-05-05 07:41:11 +02:00
|
|
|
sub import { #{{{
	# Registers this plugin's sanitize hook and compiles the package
	# variable $safe_url_regexp, which scrubber() applies to url-valued
	# attributes. Takes no arguments of interest and returns nothing
	# useful.
	hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);

	# Only known uri schemes are allowed to avoid all the ways of
	# embedding javascript.
	# List at http://en.wikipedia.org/wiki/URI_scheme
	my $uri_schemes=join("|", map quotemeta,
		# IANA registered schemes
		"http", "https", "ftp", "mailto", "file", "telnet", "gopher",
		"aaa", "aaas", "acap", "cap", "cid", "crid",
		"dav", "dict", "dns", "fax", "go", "h323", "im", "imap",
		"ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
		"sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
		"z39.50r", "z39.50s",
		# Selected unofficial schemes
		"aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
		"irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
		"msnim", "notes", "rsync", "secondlife", "skype", "ssh",
		"sftp", "smb", "sms", "snews", "webcal", "ymsgr",
	);

	# The pattern accepts a url when any one of these holds:
	#
	# 1. It starts with one of the schemes listed above, followed by
	#    a colon.
	# 2. It is a data: url holding a png, jpeg or gif image. data is a
	#    special case: data:text/javascript must be refused, and so is
	#    data:image/svg+xml, since svg can carry script. The media
	#    type has to be followed by ";" or "," so that a longer,
	#    unlisted subtype cannot sneak in by prefix.
	# 3. It is a relative reference in which a "/", "?" or "#" occurs
	#    before any colon. A colon after that point is only part of
	#    the path, query or fragment (eg "#t12:30", "page?at=1:2"),
	#    not a scheme separator.
	# 4. It is non-empty and contains no colon at all.
	$safe_url_regexp=qr/^(?:(?:$uri_schemes):|data:image\/(?:png|jpeg|gif)[;,]|[^:\/?#]*[\/?#]|[^:]+$)/i;
} # }}}
|
|
|
|
|
|
|
|
sub sanitize (@) { #{{{
	# Sanitize hook: takes named parameters and returns the value of
	# the "content" parameter after passing it through the shared
	# HTML::Scrubber object's scrub method.
	my %args=@_;
	my $cleaner=scrubber();
	return $cleaner->scrub($args{content});
} # }}}
|
|
|
|
|
|
|
|
# Holds the HTML::Scrubber object once scrubber() has built it.
my $_scrubber;
sub scrubber { #{{{
	# Returns the HTML::Scrubber object, constructing it on the first
	# call and handing back the same object on later calls.
	# HTML::Scrubber is loaded at that point; a load failure is passed
	# to error().
	unless (defined $_scrubber) {
		eval q{use HTML::Scrubber};
		error($@) if $@;

		# Lists based on http://feedparser.org/docs/html-sanitization.html
		# With html 5 video and audio tags added.
		my @allowed_tags=qw{
			a abbr acronym address area b big blockquote br br/
			button caption center cite code col colgroup dd del
			dfn dir div dl dt em fieldset font form h1 h2 h3 h4
			h5 h6 hr hr/ i img input ins kbd label legend li map
			menu ol optgroup option p p/ pre q s samp select small
			span strike strong sub sup table tbody td textarea
			tfoot th thead tr tt u ul var
			video audio
		};

		# Attributes that are let through whatever their value.
		my @plain_attrs=qw{
			abbr accept accept-charset accesskey
			align alt axis border cellpadding cellspacing
			char charoff charset checked class
			clear cols colspan color compact coords
			datetime dir disabled enctype for frame
			headers height hreflang hspace id ismap
			label lang maxlength media method
			multiple name nohref noshade nowrap prompt
			readonly rel rev rows rowspan rules scope
			selected shape size span start summary
			tabindex target title type valign
			value vspace width
			autoplay loopstart loopend end
			playcount controls
		};

		# Attributes whose value is checked against $safe_url_regexp.
		my @url_attrs=qw{
			href src action cite longdesc poster usemap
		};

		my %attr_rules=(
			(map { $_ => 1 } @plain_attrs),
			"/" => 1, # emit proper <hr /> XHTML
			(map { $_ => $safe_url_regexp } @url_attrs),
		);

		$_scrubber=HTML::Scrubber->new(
			allow => \@allowed_tags,
			default => [undef, \%attr_rules],
		);
	}

	return $_scrubber;
} # }}}
|
|
|
|
|
|
|
|
1
|