ikiwiki/IkiWiki/Plugin/htmlscrubber.pm

#!/usr/bin/perl
package IkiWiki::Plugin::htmlscrubber;

use warnings;
use strict;
use IkiWiki 3.00;

# This regexp matches urls that are in a known safe scheme.
# Feel free to use it from other plugins.
our $safe_url_regexp;

# Called when the plugin is loaded: registers the plugin's hooks and
# compiles $safe_url_regexp from the whitelist of URI schemes below.
sub import {
	hook(type => "getsetup", id => "htmlscrubber", call => \&getsetup);
	hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);

	# Only known uri schemes are allowed, to avoid all the ways of
	# embedding javascript.
	# List at http://en.wikipedia.org/wiki/URI_scheme

	# IANA registered schemes
	my @registered_schemes=qw{
		http https ftp mailto file telnet gopher
		aaa aaas acap cap cid crid
		dav dict dns fax go h323 im imap
		ldap mid news nfs nntp pop pres
		sip sips snmp tel urn wais xmpp
		z39.50r z39.50s
	};
	# Selected unofficial schemes
	my @unofficial_schemes=qw{
		aim callto cvs ed2k feed fish gg
		irc ircs lastfm ldaps magnet mms
		msnim notes rsync secondlife skype ssh
		sftp smb sms snews webcal ymsgr
		bitcoin git svn bzr darcs hg
	};

	# Escape each scheme (some contain a literal ".") before joining
	# them into a single regexp alternation.
	my $uri_schemes=join("|",
		map { quotemeta($_) } @registered_schemes, @unofficial_schemes);

	# The compiled pattern is case-insensitive and accepts:
	#  - a whitelisted scheme followed by a colon;
	#  - data is a special case: a few data:image/ types are allowed,
	#    but data:text/javascript and everything else is not;
	#  - text with no colon before the end of the string or before the
	#    first "/", "?" or "#" (i.e. no scheme at all);
	#  - anything starting with "#".
	$safe_url_regexp=qr/^(?:(?:$uri_schemes):|data:image\/(?:png|jpeg|gif)|[^:]+(?:$|[\/\?#]))|^#/i;
}

# getsetup hook: returns a flat list of (name => description hashref)
# pairs. The first pair describes the plugin itself, the second the
# htmlscrubber_skip setting, which is declared as a pagespec.
sub getsetup () {
	# Entry describing the plugin as a whole.
	my %plugin_info=(
		safe => 1,
		rebuild => undef,
		section => "core",
	);

	# Entry describing the htmlscrubber_skip setting.
	my %skip_setting=(
		type => "pagespec",
		example => "!*/Discussion",
		description => "PageSpec specifying pages not to scrub",
		link => "ikiwiki/PageSpec",
		safe => 1,
		rebuild => undef,
	);

	return (
		plugin => \%plugin_info,
		htmlscrubber_skip => \%skip_setting,
	);
}

# sanitize hook: takes named parameters (this sub reads "page" and
# "content") and returns the content to use.
#
# The content is returned untouched when htmlscrubber_skip is set to a
# non-empty value, a page was passed, and that page matches the
# configured PageSpec. Otherwise the content is passed through the
# shared scrubber and the scrubbed result is returned.
sub sanitize (@) {
	my %params=@_;

	# An absent or empty setting means no page is ever skipped.
	my $skip_configured=(exists $config{htmlscrubber_skip}
		&& length $config{htmlscrubber_skip});

	if ($skip_configured && exists $params{page}) {
		return $params{content}
			if pagespec_match($params{page}, $config{htmlscrubber_skip});
	}

	return scrubber()->scrub($params{content});
}

# Cached HTML::Scrubber object; stays undefined until scrubber() is
# first called.
my $_scrubber;

# Returns the HTML::Scrubber object used by sanitize(). The object is
# built on the first call and the same one is handed back afterwards.
# HTML::Scrubber is loaded lazily here; a load failure is passed to
# error().
sub scrubber {
	if (! defined $_scrubber) {
		eval q{use HTML::Scrubber};
		error($@) if $@;

		# Lists based on http://feedparser.org/docs/html-sanitization.html
		# With html5 tags added.
		my @allowed_tags=qw{
			a abbr acronym address area b big blockquote br br/
			button caption center cite code col colgroup dd del
			dfn dir div dl dt em fieldset font form h1 h2 h3 h4
			h5 h6 hr hr/ i img input ins kbd label legend li map
			menu ol optgroup option p p/ pre q s samp select small
			span strike strong sub sup table tbody td textarea
			tfoot th thead tr tt u ul var

			video audio source section nav article aside hgroup
			header footer figure figcaption time mark canvas
			datalist progress meter ruby rt rp details summary
		};

		# Attributes that are allowed whatever their value is.
		my @plain_attributes=qw{
			abbr accept accept-charset accesskey
			align alt axis border cellpadding cellspacing
			char charoff charset checked class
			clear cols colspan color compact coords
			datetime dir disabled enctype for frame
			headers height hreflang hspace id ismap
			label lang maxlength media method
			multiple name nohref noshade nowrap prompt
			readonly rel rev rows rowspan rules scope
			selected shape size span start summary
			tabindex target title type valign
			value vspace width
			autofocus autoplay preload loopstart
			loopend end playcount controls pubdate
			loop muted
			placeholder min max step low high optimum
			form required autocomplete novalidate pattern
			list formenctype formmethod formnovalidate
			formtarget reversed spellcheck open hidden
		};

		# Attributes that can carry a URL; their rule is
		# $safe_url_regexp rather than a plain 1.
		my @url_attributes=qw{
			href src action formaction
			cite longdesc poster usemap
		};

		my %attribute_rules=map { ($_ => 1) } @plain_attributes;
		$attribute_rules{"/"}=1; # emit proper <hr /> XHTML
		$attribute_rules{$_}=$safe_url_regexp foreach @url_attributes;

		$_scrubber=HTML::Scrubber->new(
			allow => \@allowed_tags,
			default => [undef, \%attribute_rules],
		);
	}

	return $_scrubber;
}

1
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`#!/usr/bin/perl`
			`package IkiWiki::Plugin::htmlscrubber;`

			`use warnings;`
			`use strict;`
finalise version 3.00 of the plugin api 2008-12-23 22:34:19 +01:00			`use IkiWiki 3.00;`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00
export $safe_url_regexp 2008-02-10 23:07:21 +01:00			`# This regexp matches urls that are in a known safe scheme.`
			`# Feel free to use it from other plugins.`
			`our $safe_url_regexp;`

Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`sub import {`
add plugin safe/rebuild info (part 1 of 2) too many plugins.. brain exploding.. 2008-08-03 22:40:12 +02:00			`hook(type => "getsetup", id => "htmlscrubber", call => \&getsetup);`
* Work on firming up the plugin interface: - Plugins should not need to load IkiWiki::Render to get commonly used functions, so moved some functions from there to IkiWiki. - Picked out the set of functions and variables that most plugins use, documented them, and made IkiWiki export them by default, like a proper perl module should. - Use the other functions at your own risk. - This is not quite complete, I still have to decide whether to export some other things. * Changed all plugins included in ikiwiki to not use "IkiWiki::" when referring to stuff now exported by the IkiWiki module. * Anyone with a third-party ikiwiki plugin is strongly enrouraged to make like changes to it and avoid use of non-exported symboles from "IkiWiki::". * Link debian/changelog and debian/news to NEWS and CHANGELOG. * Support hyperestradier version 1.4.2, which adds a new required phraseform setting. 2006-09-10 00:50:27 +02:00			`hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`# Only known uri schemes are allowed to avoid all the ways of`
			`# embedding javascrpt.`
			`# List at http://en.wikipedia.org/wiki/URI_scheme`
use quotemeta when building the regexp 2008-02-11 01:02:12 +01:00			`my $uri_schemes=join("\|", map quotemeta,`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`# IANA registered schemes`
			`"http", "https", "ftp", "mailto", "file", "telnet", "gopher",`
			`"aaa", "aaas", "acap", "cap", "cid", "crid",`
			`"dav", "dict", "dns", "fax", "go", "h323", "im", "imap",`
			`"ldap", "mid", "news", "nfs", "nntp", "pop", "pres",`
			`"sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",`
use quotemeta when building the regexp 2008-02-11 01:02:12 +01:00			`"z39.50r", "z39.50s",`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`# Selected unofficial schemes`
Do not allow the about: URI scheme Some browsers interpret about: URIs like a limited version of data: URIs. In particular, some versions of Internet Explorer interpret arbitrary HTML content in about: URIs. 2008-02-10 22:23:28 +01:00			`"aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`"irc", "ircs", "lastfm", "ldaps", "magnet", "mms",`
			`"msnim", "notes", "rsync", "secondlife", "skype", "ssh",`
htmlscrubber: Allow the URI schemes of major VCS's. 2013-01-05 22:24:40 +01:00			`"sftp", "smb", "sms", "snews", "webcal", "ymsgr",`
			`"bitcoin", "git", "svn", "bzr", "darcs", "hg"`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`);`
htmlscrubber: Security fix: In data:image/* uris, only allow a few whitelisted image types. No svg. 2010-03-12 20:49:13 +01:00			`# data is a special case. Allow a few data:image/ types,`
			`# but disallow data:text/javascript and everything else.`
htmlscrubber: Do not scrub url anchors that contain colons. 2010-08-19 19:59:31 +02:00			`$safe_url_regexp=qr/^(?:(?:$uri_schemes):\|data:image\/(?:png\|jpeg\|gif)\|[^:]+(?:$\|[\/\?#]))\|^#/i;`
Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`}`
export $safe_url_regexp 2008-02-10 23:07:21 +01:00
Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`sub getsetup () {`
add plugin safe/rebuild info (part 1 of 2) too many plugins.. brain exploding.. 2008-08-03 22:40:12 +02:00			`return`
			`plugin => {`
			`safe => 1,`
			`rebuild => undef,`
Group related plugins into sections in the setup file, and drop unused rcs plugins from the setup file. 2010-02-12 04:24:15 +01:00			`section => "core",`
add plugin safe/rebuild info (part 1 of 2) too many plugins.. brain exploding.. 2008-08-03 22:40:12 +02:00			`},`
htmlscrubber: Add a config setting that can be used to disable the scrubber acting on a set of pages. 2008-09-27 00:05:36 +02:00			`htmlscrubber_skip => {`
			`type => "pagespec",`
			`example => "!*/Discussion",`
			`description => "PageSpec specifying pages not to scrub",`
			`link => "ikiwiki/PageSpec",`
			`safe => 1,`
			`rebuild => undef,`
			`},`
Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`}`
add plugin safe/rebuild info (part 1 of 2) too many plugins.. brain exploding.. 2008-08-03 22:40:12 +02:00
Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`sub sanitize (@) {`
export $safe_url_regexp 2008-02-10 23:07:21 +01:00			`my %params=@_;`
htmlscrubber: Add a config setting that can be used to disable the scrubber acting on a set of pages. 2008-09-27 00:05:36 +02:00
			`if (exists $config{htmlscrubber_skip} &&`
			`length $config{htmlscrubber_skip} &&`
Fix htmlscrubber_skip to be matched on the source page, not the page it is inlined into. Should allow setting to "* and !comment(*)" to scrub comments, but leave your blog posts unscrubbed, etc. 2010-11-12 04:59:24 +01:00			`exists $params{page} &&`
			`pagespec_match($params{page}, $config{htmlscrubber_skip})) {`
htmlscrubber: Add a config setting that can be used to disable the scrubber acting on a set of pages. 2008-09-27 00:05:36 +02:00			`return $params{content};`
			`}`

export $safe_url_regexp 2008-02-10 23:07:21 +01:00			`return scrubber()->scrub($params{content});`
Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`}`
export $safe_url_regexp 2008-02-10 23:07:21 +01:00
			`my $_scrubber;`
Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`sub scrubber {`
export $safe_url_regexp 2008-02-10 23:07:21 +01:00			`return $_scrubber if defined $_scrubber;`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`eval q{use HTML::Scrubber};`
* Make sure to check for errors from every eval. 2006-11-08 22:03:33 +01:00			`error($@) if $@;`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`# Lists based on http://feedparser.org/docs/html-sanitization.html`
htmlscrubber: Allow html5 semantic tags: section nav article aside hgroup header footer time mark 2010-05-01 22:34:47 +02:00			`# With html5 tags added.`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`$_scrubber = HTML::Scrubber->new(`
			`allow => [qw{`
* htmlscrubber: Further work around #365971 by adding tags for 'br/', 'hr/' and 'p/'. 2008-01-08 00:32:50 +01:00			`a abbr acronym address area b big blockquote br br/`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`button caption center cite code col colgroup dd del`
			`dfn dir div dl dt em fieldset font form h1 h2 h3 h4`
* htmlscrubber: Further work around #365971 by adding tags for 'br/', 'hr/' and 'p/'. 2008-01-08 00:32:50 +01:00			`h5 h6 hr hr/ i img input ins kbd label legend li map`
			`menu ol optgroup option p p/ pre q s samp select small`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`span strike strong sub sup table tbody td textarea`
			`tfoot th thead tr tt u ul var`
htmlscrubber: Allow html5 semantic tags: section nav article aside hgroup header footer time mark 2010-05-01 22:34:47 +02:00
more html5 * htmlscrubber: Also allow html5 canvas tags. * htmlscrubber: Round out html5 video support with the preload attribute and the source tag. 2010-05-01 23:56:35 +02:00			`video audio source section nav article aside hgroup`
add figure and figcaption 2010-05-02 00:31:33 +02:00			`header footer figure figcaption time mark canvas`
htmlscrubber: Also allow some other html5 tags: canvas, progress, meter, ruby, rt, rp, details, summary. 2010-05-02 01:28:28 +02:00			`datalist progress meter ruby rt rp details summary`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`}],`
* Allow simple alphanumeric style attribute values in the htmlscrubber. This should be safe from javascript attacks. 2007-07-11 18:50:59 +02:00			`default => [undef, { (`
			`map { $_ => 1 } qw{`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`abbr accept accept-charset accesskey`
* Allow simple alphanumeric style attribute values in the htmlscrubber. This should be safe from javascript attacks. 2007-07-11 18:50:59 +02:00			`align alt axis border cellpadding cellspacing`
Also filter the attributes cite, longdesc, and usemap, which can contain URIs 2008-02-10 22:59:37 +01:00			`char charoff charset checked class`
* Allow simple alphanumeric style attribute values in the htmlscrubber. This should be safe from javascript attacks. 2007-07-11 18:50:59 +02:00			`clear cols colspan color compact coords`
			`datetime dir disabled enctype for frame`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`headers height hreflang hspace id ismap`
Also filter the attributes cite, longdesc, and usemap, which can contain URIs 2008-02-10 22:59:37 +01:00			`label lang maxlength media method`
* Allow simple alphanumeric style attribute values in the htmlscrubber. This should be safe from javascript attacks. 2007-07-11 18:50:59 +02:00			`multiple name nohref noshade nowrap prompt`
			`readonly rel rev rows rowspan rules scope`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`selected shape size span start summary`
Also filter the attributes cite, longdesc, and usemap, which can contain URIs 2008-02-10 22:59:37 +01:00			`tabindex target title type valign`
* Allow simple alphanumeric style attribute values in the htmlscrubber. This should be safe from javascript attacks. 2007-07-11 18:50:59 +02:00			`value vspace width`
htmlscrubber: Allow the html5 form attributes: placeholder autofocus, min, max, step. 2010-05-02 00:27:53 +02:00			`autofocus autoplay preload loopstart`
			`loopend end playcount controls pubdate`
htmlscrubber: Add support for the video tag's loop and muted attributes. Those were not in the original html5 spec, but have been added in the whatwg html living standard and have wide browser support. This commit was sponsored by John Peloquin on Patreon. 2017-07-11 21:51:44 +02:00			`loop muted`
htmlscrubber: Also allow some other html5 tags: canvas, progress, meter, ruby, rt, rp, details, summary. 2010-05-02 01:28:28 +02:00			`placeholder min max step low high optimum`
			`form required autocomplete novalidate pattern`
			`list formenctype formmethod formnovalidate`
enable hidden attribute 2010-05-02 01:59:16 +02:00			`formtarget reversed spellcheck open hidden`
* Allow simple alphanumeric style attribute values in the htmlscrubber. This should be safe from javascript attacks. 2007-07-11 18:50:59 +02:00			`} ),`
			`"/" => 1, # emit proper <hr /> XHTML`
export $safe_url_regexp 2008-02-10 23:07:21 +01:00			`href => $safe_url_regexp,`
			`src => $safe_url_regexp,`
			`action => $safe_url_regexp,`
more html5 attributes 2010-05-02 01:11:03 +02:00			`formaction => $safe_url_regexp,`
export $safe_url_regexp 2008-02-10 23:07:21 +01:00			`cite => $safe_url_regexp,`
			`longdesc => $safe_url_regexp,`
			`poster => $safe_url_regexp,`
			`usemap => $safe_url_regexp,`
* htmlscrubber security fix: Block javascript in uris. * Add htmlscrubber test suite. 2008-02-10 19:16:40 +01:00			`}],`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00			`);`
			`return $_scrubber;`
Coding style change: Remove explcit vim folding markers. 2008-12-17 21:22:16 +01:00			`}`
* Removed --sanitize and --no-sanitize, replaced with --plugin htmlscrubber and --disable-plugin htmlscrubber. 2006-05-05 07:41:11 +02:00
			`1`