2006-05-05 07:41:11 +02:00
|
|
|
#!/usr/bin/perl
|
|
|
|
package IkiWiki::Plugin::htmlscrubber;
|
|
|
|
|
|
|
|
use warnings;
|
|
|
|
use strict;
|
2008-12-23 22:34:19 +01:00
|
|
|
use IkiWiki 3.00;
|
2006-05-05 07:41:11 +02:00
|
|
|
|
2008-02-10 23:07:21 +01:00
|
|
|
# This regexp matches URLs that use a known safe scheme, as well as
# scheme-less (relative) URLs.
|
|
|
|
# Feel free to use it from other plugins.
|
|
|
|
our $safe_url_regexp;
|
|
|
|
|
2008-12-17 21:22:16 +01:00
|
|
|
# Plugin entry point. Registers the "getsetup" and "sanitize" hooks under
# the id "htmlscrubber" and compiles $safe_url_regexp, the whitelist that
# scrubber() applies to URL-valued attributes (href, src, ...).
sub import {
    hook(type => "getsetup", id => "htmlscrubber", call => \&getsetup);
    hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);

    # Only known uri schemes are allowed to avoid all the ways of
    # embedding javascript.
    # List at http://en.wikipedia.org/wiki/URI_scheme
    my $uri_schemes=join("|", map quotemeta,
        # IANA registered schemes
        "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
        "aaa", "aaas", "acap", "cap", "cid", "crid",
        "dav", "dict", "dns", "fax", "go", "h323", "im", "imap",
        "ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
        "sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
        "z39.50r", "z39.50s",
        # Selected unofficial schemes
        "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
        "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
        "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
        "sftp", "smb", "sms", "snews", "webcal", "ymsgr",
    );

    # A URL is considered safe when it is one of:
    #  * a whitelisted scheme followed by a colon;
    #  * data: is a special case. Allow only a few raster data:image/
    #    types; disallow data:text/javascript, data:image/svg+xml (svg
    #    can carry script) and everything else;
    #  * a fragment-only reference ("#anchor"), whatever follows the "#";
    #  * a scheme-less (relative) reference: no colon may appear before
    #    the first "/", "?" or "#", since a colon there would be read as
    #    terminating a scheme name. Colons after that point are fine.
    $safe_url_regexp=qr/^(?:(?:$uri_schemes):|data:image\/(?:png|jpeg|gif)|#|[^:]+(?:$|[\/?#]))/i;
}
|
2008-02-10 23:07:21 +01:00
|
|
|
|
2008-12-17 21:22:16 +01:00
|
|
|
# "getsetup" hook: describes this plugin and its one configuration option.
# Returns a flat, ordered key/value list: first the "plugin" entry, then
# the "htmlscrubber_skip" option, each mapped to a fresh hash reference.
sub getsetup () {
    # Entry describing the plugin itself.
    my %plugin_info = (
        safe    => 1,
        rebuild => undef,
    );

    # Entry describing the htmlscrubber_skip option, which sanitize()
    # consults to decide which pages are left unscrubbed.
    my %skip_option = (
        type        => "pagespec",
        example     => "!*/Discussion",
        description => "PageSpec specifying pages not to scrub",
        link        => "ikiwiki/PageSpec",
        safe        => 1,
        rebuild     => undef,
    );

    return (
        plugin            => \%plugin_info,
        htmlscrubber_skip => \%skip_option,
    );
}
|
2008-08-03 22:40:12 +02:00
|
|
|
|
2008-12-17 21:22:16 +01:00
|
|
|
# "sanitize" hook: returns the page content with unsafe HTML removed.
#
# Named parameters (passed as a flat key/value list):
#   content  - the HTML to sanitize
#   destpage - optional; the page tested against htmlscrubber_skip
#
# When the htmlscrubber_skip option is set to a non-empty PageSpec and
# destpage matches it, the content is returned untouched. Otherwise the
# content is passed through the shared scrubber object.
sub sanitize (@) {
    my %params=@_;
    my $html=$params{content};

    # Evaluated left to right with short-circuiting, so pagespec_match
    # is only called when a non-empty spec and a destpage are present.
    my $leave_unscrubbed=exists $config{htmlscrubber_skip}
        && length $config{htmlscrubber_skip}
        && exists $params{destpage}
        && pagespec_match($params{destpage}, $config{htmlscrubber_skip});

    return $leave_unscrubbed ? $html : scrubber()->scrub($html);
}
|
2008-02-10 23:07:21 +01:00
|
|
|
|
|
|
|
# Cached HTML::Scrubber instance; built on the first call to scrubber().
my $_scrubber;

# Returns the shared HTML::Scrubber object, constructing it on first use.
# HTML::Scrubber is loaded lazily here; a load failure is reported
# through error().
sub scrubber {
    unless (defined $_scrubber) {
        eval q{use HTML::Scrubber};
        error($@) if $@;

        # Lists based on http://feedparser.org/docs/html-sanitization.html
        # With html 5 video and audio tags added.
        my @allowed_tags=qw{
            a abbr acronym address area b big blockquote br br/
            button caption center cite code col colgroup dd del
            dfn dir div dl dt em fieldset font form h1 h2 h3 h4
            h5 h6 hr hr/ i img input ins kbd label legend li map
            menu ol optgroup option p p/ pre q s samp select small
            span strike strong sub sup table tbody td textarea
            tfoot th thead tr tt u ul var
            video audio
        };

        # Attributes allowed on any tag regardless of their value.
        my @plain_attributes=qw{
            abbr accept accept-charset accesskey
            align alt axis border cellpadding cellspacing
            char charoff charset checked class
            clear cols colspan color compact coords
            datetime dir disabled enctype for frame
            headers height hreflang hspace id ismap
            label lang maxlength media method
            multiple name nohref noshade nowrap prompt
            readonly rel rev rows rowspan rules scope
            selected shape size span start summary
            tabindex target title type valign
            value vspace width
            autoplay loopstart loopend end
            playcount controls
        };

        # Attributes that hold URLs; allowed only when the value
        # matches $safe_url_regexp.
        my @url_attributes=qw{
            href src action cite longdesc poster usemap
        };

        my %attribute_rules=(
            (map { $_ => 1 } @plain_attributes),
            "/" => 1, # emit proper <hr /> XHTML
            (map { $_ => $safe_url_regexp } @url_attributes),
        );

        $_scrubber=HTML::Scrubber->new(
            allow => \@allowed_tags,
            default => [undef, \%attribute_rules],
        );
    }

    return $_scrubber;
}
|
2006-05-05 07:41:11 +02:00
|
|
|
|
|
|
|
1
|