useragent: Automatically choose whether to use LWPx::ParanoidAgent

The simple implementation of this, which I'd prefer to use, would be:
if we can import LWPx::ParanoidAgent, use it; otherwise, use
LWP::UserAgent.

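However, aggregate has historically worked with proxies, and
LWPx::ParanoidAgent quite reasonably refuses to work with proxies
(because it can't know whether those proxies are going to do the same
filtering that LWPx::ParanoidAgent would). So useragent() instead
returns a plain LWP::UserAgent that is forced through the configured
proxy whenever one applies, and only uses LWPx::ParanoidAgent (if it
is installed) when no proxy is in use.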

Signed-off-by: Simon McVittie <smcv@debian.org>
master
Simon McVittie 2019-02-10 17:22:06 +00:00
parent 67543ce1d6
commit d283e4ca1a
6 changed files with 458 additions and 36 deletions

View File

@ -2469,16 +2469,131 @@ sub add_autofile ($$$) {
$autofiles{$file}{generator}=$generator; $autofiles{$file}{generator}=$generator;
} }
# Return the first non-empty value found among the named environment
# variables, or undef if none of them is set to a non-empty string.
# An empty value (as left behind by "export http_proxy=") is treated
# as "no proxy configured" rather than as a proxy URL.
sub _useragent_env_proxy {
    foreach my $name (@_) {
        my $value = $ENV{$name};
        return $value if defined $value && length $value;
    }
    return;
}

# Construct the HTTP user-agent that plugins should use for outgoing
# requests.
#
# Parameters (named):
#   for_url => the URL that is about to be fetched, or undef if the
#              caller does not know yet which URL(s) it will fetch.
# Any other parameter is rejected with an error.
#
# Returns:
#   - an LWP::UserAgent with explicit proxies set, if a proxy from the
#     environment (http_proxy, https_proxy, HTTPS_PROXY) applies;
#   - otherwise an LWPx::ParanoidAgent if that module can be loaded;
#   - otherwise a plain LWP::UserAgent (with a warning on STDERR).
#
# Calls error() if LWP (or URI, when for_url is given) cannot be
# loaded.
sub useragent (@) {
    my %params = @_;
    my $for_url = delete $params{for_url};
    # Fail safe, in case a plugin calling this function is relying on
    # a future parameter to make the UA more strict
    foreach my $key (keys %params) {
        error("Internal error: useragent(\"$key\" => ...) not understood");
    }

    eval q{use LWP};
    error($@) if $@;

    my %args = (
        agent => $config{useragent},
        cookie_jar => $config{cookiejar},
        # Proxies are chosen below; never let LWP read the
        # environment on its own.
        env_proxy => 0,
        protocols_allowed => [qw(http https)],
    );
    my %proxies;

    if (defined $for_url) {
        # We know which URL we're going to fetch, so we can choose
        # whether it's going to go through a proxy or not.
        #
        # We reimplement http_proxy, https_proxy and no_proxy here, so
        # that we are not relying on LWP implementing them exactly the
        # same way we do.
        eval q{use URI};
        error($@) if $@;

        my $uri = URI->new($for_url);
        # A URL without a scheme yields undef here; treat it like any
        # other scheme we have no proxy for.
        my $scheme = $uri->scheme;
        $scheme = '' unless defined $scheme;

        my $proxy;
        if ($scheme eq 'http') {
            # HTTP_PROXY is deliberately not implemented
            # because the HTTP_* namespace is also used by CGI
            $proxy = _useragent_env_proxy('http_proxy');
        }
        elsif ($scheme eq 'https') {
            $proxy = _useragent_env_proxy('https_proxy', 'HTTPS_PROXY');
        }

        # no_proxy can only ever cancel a proxy, so it is consulted
        # only when there is a proxy to cancel. At this point the
        # scheme is http or https, so the URI object has a host method.
        if (defined $proxy) {
            my $host = $uri->host;
            $host = '' unless defined $host;

            NO_PROXY: foreach my $var (qw(no_proxy NO_PROXY)) {
                my $no_proxy = $ENV{$var};
                next unless defined $no_proxy;
                $no_proxy =~ s/^\s+//;
                $no_proxy =~ s/\s+$//;

                foreach my $domain (split /\s*,\s*/, $no_proxy) {
                    my $matched;
                    if ($domain =~ s/^\*?\.//) {
                        # no_proxy="*.example.com" or
                        # ".example.com": match suffix
                        # against .example.com
                        # (a bare "." or "*." has no suffix left
                        # and matches nothing)
                        $matched = length $domain
                            && $host =~ m/(^|\.)\Q$domain\E$/i;
                    }
                    else {
                        # no_proxy="example.com":
                        # match exactly example.com
                        $matched = length $domain
                            && lc $host eq lc $domain;
                    }
                    if ($matched) {
                        $proxy = undef;
                        last NO_PROXY;
                    }
                }
            }
        }

        if (defined $proxy) {
            $proxies{$scheme} = $proxy;
            # Paranoia: make sure we can't bypass the proxy
            $args{protocols_allowed} = [$scheme];
        }
    }
    else {
        # The plugin doesn't know yet which URL(s) it's going to
        # fetch, so we have to make some conservative assumptions.
        my $http_proxy = _useragent_env_proxy('http_proxy');
        my $https_proxy = _useragent_env_proxy('https_proxy', 'HTTPS_PROXY');

        # We don't respect no_proxy here: if we are not using the
        # paranoid user-agent, then we need to give the proxy the
        # opportunity to reject undesirable requests.

        # If we have one, we need the other: otherwise, neither
        # LWPx::ParanoidAgent nor the proxy would have the
        # opportunity to filter requests for the other protocol.
        if (defined $http_proxy || defined $https_proxy) {
            %proxies = (
                http => defined $http_proxy ? $http_proxy : $https_proxy,
                https => defined $https_proxy ? $https_proxy : $http_proxy,
            );
        }
    }

    if (scalar keys %proxies) {
        # The configured proxy is responsible for deciding which
        # URLs are acceptable to fetch and which URLs are not.
        my $ua = LWP::UserAgent->new(%args);
        foreach my $allowed (@{$ua->protocols_allowed}) {
            unless ($proxies{$allowed}) {
                error("internal error: $allowed is allowed but has no proxy");
            }
        }
        # We can't pass the proxies in %args because that only
        # works since LWP 6.24.
        foreach my $proxied (keys %proxies) {
            $ua->proxy($proxied, $proxies{$proxied});
        }
        return $ua;
    }

    # No proxy applies, so filtering of undesirable destinations has to
    # happen locally if at all possible.
    eval q{use LWPx::ParanoidAgent};
    if ($@) {
        print STDERR "warning: installing LWPx::ParanoidAgent is recommended\n";
        return LWP::UserAgent->new(%args);
    }
    return LWPx::ParanoidAgent->new(%args);
}
sub sortspec_translate ($$) { sub sortspec_translate ($$) {

View File

@ -513,7 +513,10 @@ sub aggregate (@) {
} }
$feed->{feedurl}=pop @urls; $feed->{feedurl}=pop @urls;
} }
my $ua=useragent(); # Using the for_url parameter makes sure we crash if used
# with an older IkiWiki.pm that didn't automatically try
# to use LWPx::ParanoidAgent.
my $ua=useragent(for_url => $feed->{feedurl});
my $res=URI::Fetch->fetch($feed->{feedurl}, UserAgent=>$ua); my $res=URI::Fetch->fetch($feed->{feedurl}, UserAgent=>$ua);
if (! $res) { if (! $res) {
$feed->{message}=URI::Fetch->errstr; $feed->{message}=URI::Fetch->errstr;

View File

@ -57,18 +57,10 @@ sub checkconfig () {
}; };
error $@ if $@; error $@ if $@;
eval q{use LWPx::ParanoidAgent}; # Using the for_url parameter makes sure we crash if used
if (!$@) { # with an older IkiWiki.pm that didn't automatically try
$client=LWPx::ParanoidAgent->new(agent => $config{useragent}); # to use LWPx::ParanoidAgent.
} $client=useragent(for_url => $config{blogspam_server});
else {
eval q{use LWP};
if ($@) {
error $@;
return;
}
$client=useragent();
}
} }
sub checkcontent (@) { sub checkcontent (@) {

View File

@ -219,14 +219,10 @@ sub getobj ($$) {
eval q{use Net::OpenID::Consumer}; eval q{use Net::OpenID::Consumer};
error($@) if $@; error($@) if $@;
my $ua; # We pass the for_url parameter, even though it's undef, because
eval q{use LWPx::ParanoidAgent}; # that will make sure we crash if used with an older IkiWiki.pm
if (! $@) { # that didn't automatically try to use LWPx::ParanoidAgent.
$ua=LWPx::ParanoidAgent->new(agent => $config{useragent}); my $ua=useragent(for_url => undef);
}
else {
$ua=useragent();
}
# Store the secret in the session. # Store the secret in the session.
my $secret=$session->param("openid_secret"); my $secret=$session->param("openid_secret");

View File

@ -70,17 +70,16 @@ sub ping {
eval q{use Net::INET6Glue::INET_is_INET6}; # may not be available eval q{use Net::INET6Glue::INET_is_INET6}; # may not be available
my $ua; my $ua;
eval q{use LWPx::ParanoidAgent}; eval {
if (!$@) { # We pass the for_url parameter, even though it's
$ua=LWPx::ParanoidAgent->new(agent => $config{useragent}); # undef, because that will make sure we crash if used
} # with an older IkiWiki.pm that didn't automatically
else { # try to use LWPx::ParanoidAgent.
eval q{use LWP}; $ua=useragent(for_url => undef);
if ($@) { };
debug(gettext("LWP not found, not pinging")); if ($@) {
return; debug(gettext("LWP not found, not pinging").": $@");
} return;
$ua=useragent();
} }
$ua->timeout($config{pinger_timeout} || 15); $ua->timeout($config{pinger_timeout} || 15);

317
t/useragent.t 100755
View File

@ -0,0 +1,317 @@
#!/usr/bin/perl
use warnings;
use strict;
use Test::More;
# True if LWPx::ParanoidAgent could be loaded; assigned in the BEGIN block
# below and used to skip the tests that expect a paranoid agent.
my $have_paranoid_agent;
BEGIN {
# Skip the whole test file when LWP cannot be loaded.
plan(skip_all => 'LWP not available')
unless eval q{
use LWP qw(); 1;
};
# Load the module under test; use_ok reports the load as a test result.
use_ok("IkiWiki");
# Probe for the optional paranoid agent without importing anything from it.
$have_paranoid_agent = eval q{
use LWPx::ParanoidAgent qw(); 1;
};
}
# Parameters that useragent() does not understand must be rejected, so
# that a caller relying on a newer, stricter option fails loudly.
eval { useragent(future_feature => 1); };
ok($@, 'future features should cause useragent to fail');

diag "==== No proxy ====";
delete @ENV{qw(http_proxy https_proxy no_proxy HTTPS_PROXY NO_PROXY)};

# $ua is declared at file scope because the sections further down
# assign to it as well.
my $ua;

# With no proxy in the environment the result must be the same whether
# or not the caller says which URL it is going to fetch.
# Each row: [diag heading, value passed as for_url]
foreach my $case (
    ['Unspecified URL', undef],
    ['Specified URL', 'http://example.com'],
) {
    my ($heading, $url) = @$case;
    diag "---- $heading ----";
    $ua = useragent(for_url => $url);
    SKIP: {
        skip 'paranoid agent not available', 1 unless $have_paranoid_agent;
        ok($ua->isa('LWPx::ParanoidAgent'), 'uses ParanoidAgent if possible');
    }
    is_deeply([sort @{$ua->protocols_allowed}], [qw(http https)],
        "$heading: http and https are both allowed");
    is($ua->proxy('http'), undef, 'No http proxy');
    is($ua->proxy('https'), undef, 'No https proxy');
}
diag "==== Proxy for everything ====";
$ENV{http_proxy} = 'http://proxy:8080';
$ENV{https_proxy} = 'http://sproxy:8080';
delete @ENV{qw(no_proxy HTTPS_PROXY NO_PROXY)};

diag "---- Unspecified URL ----";
{
    # Without a URL both schemes stay allowed, each through its own proxy.
    my $agent = useragent(for_url => undef);
    ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
    is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
        'unspecified URL: http and https are both allowed');
    is($agent->proxy('http'), 'http://proxy:8080', 'should use proxy');
    is($agent->proxy('https'), 'http://sproxy:8080', 'should use CONNECT proxy');
}

# With a URL only that URL's scheme is allowed, so that the proxy
# cannot be bypassed. The proxy configured for the other scheme is not
# checked because it cannot be used.
# Each row: [URL to fetch, its scheme, expected proxy, description]
foreach my $case (
    ['http://example.com', 'http', 'http://proxy:8080', 'should use proxy'],
    ['https://example.com', 'https', 'http://sproxy:8080', 'should use CONNECT proxy'],
) {
    my ($url, $scheme, $expected_proxy, $description) = @$case;
    diag "---- Specified URL ($url) ----";
    my $agent = useragent(for_url => $url);
    ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
    is_deeply([sort @{$agent->protocols_allowed}], [$scheme],
        "$url: only $scheme is allowed");
    is($agent->proxy($scheme), $expected_proxy, $description);
}
diag "==== Selective proxy ====";
$ENV{http_proxy} = 'http://proxy:8080';
$ENV{https_proxy} = 'http://sproxy:8080';
$ENV{no_proxy} = '*.example.net,example.com,.example.org';
delete @ENV{qw(HTTPS_PROXY NO_PROXY)};

diag "---- Unspecified URL ----";
{
    # no_proxy is not consulted when the URL is unknown: both schemes
    # are expected to go through their proxies.
    my $agent = useragent(for_url => undef);
    ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
    is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
        'unspecified URL: http and https are both allowed');
    is($agent->proxy('http'), 'http://proxy:8080', 'should use proxy');
    is($agent->proxy('https'), 'http://sproxy:8080', 'should use CONNECT proxy');
}

# Each row: [diag heading, URL to fetch, expected proxy for that URL].
# An expected proxy of undef means the host is covered by no_proxy, so
# the request is made directly and no proxy is configured.
foreach my $case (
    ['Exact match for no_proxy', 'http://example.com', undef],
    ['Subdomain of exact domain in no_proxy', 'http://sub.example.com', 'http://proxy:8080'],
    ['example.net matches *.example.net', 'https://example.net', undef],
    ['sub.example.net matches *.example.net', 'https://sub.example.net', undef],
    ['badexample.net does not match *.example.net', 'https://badexample.net', 'http://sproxy:8080'],
    ['example.org matches .example.org', 'https://example.org', undef],
    ['sub.example.org matches .example.org', 'https://sub.example.org', undef],
    ['badexample.org does not match .example.org', 'https://badexample.org', 'http://sproxy:8080'],
) {
    my ($heading, $url, $expected_proxy) = @$case;
    my ($scheme) = $url =~ m{^(https?)://};
    diag "---- $heading ----";
    my $agent = useragent(for_url => $url);
    if (defined $expected_proxy) {
        ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
        is_deeply([sort @{$agent->protocols_allowed}], [$scheme],
            "$url: only $scheme is allowed");
        is($agent->proxy($scheme), $expected_proxy, 'should use proxy');
    }
    else {
        SKIP: {
            skip 'paranoid agent not available', 1 unless $have_paranoid_agent;
            ok($agent->isa('LWPx::ParanoidAgent'), 'uses ParanoidAgent if possible');
        }
        is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
            "$url: http and https are both allowed");
        is($agent->proxy('http'), undef, "$url: no http proxy");
        is($agent->proxy('https'), undef, "$url: no https proxy");
    }
}
diag "==== Selective proxy (alternate variables) ====";
# Same scenario as the plain selective-proxy case, but configured through
# the upper-case spellings HTTPS_PROXY and NO_PROXY.
$ENV{http_proxy} = 'http://proxy:8080';
delete $ENV{https_proxy};
$ENV{HTTPS_PROXY} = 'http://sproxy:8080';
delete $ENV{no_proxy};
$ENV{NO_PROXY} = '*.example.net,example.com,.example.org';

diag "---- Unspecified URL ----";
{
    # NO_PROXY is not consulted when the URL is unknown: both schemes
    # are expected to go through their proxies.
    my $agent = useragent(for_url => undef);
    ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
    is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
        'unspecified URL: http and https are both allowed');
    is($agent->proxy('http'), 'http://proxy:8080', 'should use proxy');
    is($agent->proxy('https'), 'http://sproxy:8080', 'should use CONNECT proxy');
}

# Each row: [diag heading, URL to fetch, expected proxy for that URL].
# An expected proxy of undef means the host is covered by NO_PROXY, so
# the request is made directly and no proxy is configured.
foreach my $case (
    ['Exact match for no_proxy', 'http://example.com', undef],
    ['Subdomain of exact domain in no_proxy', 'http://sub.example.com', 'http://proxy:8080'],
    ['example.net matches *.example.net', 'https://example.net', undef],
    ['sub.example.net matches *.example.net', 'https://sub.example.net', undef],
    ['badexample.net does not match *.example.net', 'https://badexample.net', 'http://sproxy:8080'],
    ['example.org matches .example.org', 'https://example.org', undef],
    ['sub.example.org matches .example.org', 'https://sub.example.org', undef],
    ['badexample.org does not match .example.org', 'https://badexample.org', 'http://sproxy:8080'],
) {
    my ($heading, $url, $expected_proxy) = @$case;
    my ($scheme) = $url =~ m{^(https?)://};
    diag "---- $heading ----";
    my $agent = useragent(for_url => $url);
    if (defined $expected_proxy) {
        ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
        is_deeply([sort @{$agent->protocols_allowed}], [$scheme],
            "$url: only $scheme is allowed");
        is($agent->proxy($scheme), $expected_proxy, 'should use proxy');
    }
    else {
        SKIP: {
            skip 'paranoid agent not available', 1 unless $have_paranoid_agent;
            ok($agent->isa('LWPx::ParanoidAgent'), 'uses ParanoidAgent if possible');
        }
        is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
            "$url: http and https are both allowed");
        is($agent->proxy('http'), undef, "$url: no http proxy");
        is($agent->proxy('https'), undef, "$url: no https proxy");
    }
}
diag "==== Selective proxy (many variables) ====";
$ENV{http_proxy} = 'http://proxy:8080';
$ENV{https_proxy} = 'http://sproxy:8080';
# This one should be ignored in favour of https_proxy
$ENV{HTTPS_PROXY} = 'http://not.preferred.proxy:3128';
# These two should be merged
$ENV{no_proxy} = '*.example.net,example.com';
$ENV{NO_PROXY} = '.example.org';

diag "---- Unspecified URL ----";
{
    # no_proxy/NO_PROXY are not consulted when the URL is unknown: both
    # schemes are expected to go through their proxies.
    my $agent = useragent(for_url => undef);
    ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
    is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
        'unspecified URL: http and https are both allowed');
    is($agent->proxy('http'), 'http://proxy:8080', 'should use proxy');
    is($agent->proxy('https'), 'http://sproxy:8080', 'should use CONNECT proxy');
}

# Each row: [diag heading, URL to fetch, expected proxy for that URL].
# An expected proxy of undef means the host is covered by no_proxy or
# NO_PROXY, so the request is made directly and no proxy is configured.
# The example.org rows exercise the entry that is only present in
# NO_PROXY, which is what shows that the two variables are merged.
foreach my $case (
    ['Exact match for no_proxy', 'http://example.com', undef],
    ['Subdomain of exact domain in no_proxy', 'http://sub.example.com', 'http://proxy:8080'],
    ['example.net matches *.example.net', 'https://example.net', undef],
    ['sub.example.net matches *.example.net', 'https://sub.example.net', undef],
    ['badexample.net does not match *.example.net', 'https://badexample.net', 'http://sproxy:8080'],
    ['example.org matches .example.org', 'https://example.org', undef],
    ['sub.example.org matches .example.org', 'https://sub.example.org', undef],
    ['badexample.org does not match .example.org', 'https://badexample.org', 'http://sproxy:8080'],
) {
    my ($heading, $url, $expected_proxy) = @$case;
    my ($scheme) = $url =~ m{^(https?)://};
    diag "---- $heading ----";
    my $agent = useragent(for_url => $url);
    if (defined $expected_proxy) {
        ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
        is_deeply([sort @{$agent->protocols_allowed}], [$scheme],
            "$url: only $scheme is allowed");
        is($agent->proxy($scheme), $expected_proxy, 'should use proxy');
    }
    else {
        SKIP: {
            skip 'paranoid agent not available', 1 unless $have_paranoid_agent;
            ok($agent->isa('LWPx::ParanoidAgent'), 'uses ParanoidAgent if possible');
        }
        is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
            "$url: http and https are both allowed");
        is($agent->proxy('http'), undef, "$url: no http proxy");
        is($agent->proxy('https'), undef, "$url: no https proxy");
    }
}
diag "==== One but not the other ====";
# When only one of the two proxies is configured and the URL is not
# known, that proxy is expected to be used for both schemes, so that no
# scheme is left without any filtering.
# Each row: [environment variable that is set, its value]
foreach my $case (
    ['http_proxy', 'http://proxy:8080'],
    ['https_proxy', 'http://sproxy:8080'],
) {
    my ($variable, $only_proxy) = @$case;
    delete @ENV{qw(http_proxy https_proxy HTTPS_PROXY no_proxy NO_PROXY)};
    $ENV{$variable} = $only_proxy;

    my $agent = useragent(for_url => undef);
    ok(! $agent->isa('LWPx::ParanoidAgent'), 'should use proxy instead of ParanoidAgent');
    is_deeply([sort @{$agent->protocols_allowed}], [qw(http https)],
        "only $variable set: http and https are both allowed");
    is($agent->proxy('http'), $only_proxy, 'should use proxy');
    is($agent->proxy('https'), $only_proxy, 'should use proxy');
}
done_testing;