2006-07-30 06:31:08 +02:00
|
|
|
#!/usr/bin/perl
|
2008-05-06 02:20:45 +02:00
|
|
|
# Feed aggregation plugin.
|
2006-07-30 06:31:08 +02:00
|
|
|
package IkiWiki::Plugin::aggregate;
|
|
|
|
|
|
|
|
use warnings;
|
|
|
|
use strict;
|
2007-04-27 04:55:52 +02:00
|
|
|
use IkiWiki 2.00;
|
2006-08-03 23:50:47 +02:00
|
|
|
use HTML::Parser;
|
|
|
|
use HTML::Tagset;
|
2008-03-14 23:43:54 +01:00
|
|
|
use HTML::Entities;
|
2006-08-03 23:50:47 +02:00
|
|
|
use URI;
|
2006-11-08 21:13:59 +01:00
|
|
|
use open qw{:utf8 :std};
|
2006-07-30 06:31:08 +02:00
|
|
|
|
|
|
|
my %feeds;
|
|
|
|
my %guids;
|
|
|
|
|
|
|
|
sub import { #{{{
	# Register this plugin's hooks with ikiwiki. The pairs are kept
	# in an ordered list (not a hash) so registration order is stable.
	my @registrations=(
		getopt      => \&getopt,
		checkconfig => \&checkconfig,
		needsbuild  => \&needsbuild,
		preprocess  => \&preprocess,
		delete      => \&delete,
		savestate   => \&savestate,
	);
	while (@registrations) {
		my ($type, $call)=splice(@registrations, 0, 2);
		hook(type => $type, id => "aggregate", call => $call);
	}
	# The CGI hook is only needed when the web trigger is enabled.
	if (exists $config{aggregate_webtrigger} && $config{aggregate_webtrigger}) {
		hook(type => "cgi", id => "aggregate", call => \&cgi);
	}
} # }}}
|
|
|
|
|
|
|
|
sub getopt () { #{{{
	# Command-line option handling: adds an --aggregate switch that
	# sets $config{aggregate}, requesting aggregation at startup.
	eval q{use Getopt::Long};
	error($@) if $@;
	# pass_through leaves unrecognised options on @ARGV for other
	# plugins and for ikiwiki's own option parsing.
	Getopt::Long::Configure('pass_through');
	GetOptions("aggregate" => \$config{aggregate});
} #}}}
|
|
|
|
|
|
|
|
sub checkconfig () { #{{{
	# Kick off aggregation at startup when --aggregate was given.
	return unless $config{aggregate};
	# Skip it when running from an enabled post-commit hook; the
	# commit should not block on feed fetching.
	return if $config{post_commit} && IkiWiki::commit_hook_enabled();
	launchaggregation();
} #}}}
|
|
|
|
|
|
|
|
sub cgi ($) { #{{{
	# CGI hook implementing the aggregate_webtrigger: lets an
	# external request kick off aggregation over the web.
	my $cgi=shift;

	my $do=$cgi->param('do');
	return unless defined $do && $do eq "aggregate_webtrigger";

	$|=1;
	print "Content-Type: text/plain\n\n";
	# Behave like a console aggregation run: plain-text progress
	# messages, no CGI output handling, no syslog.
	$config{cgi}=0;
	$config{verbose}=1;
	$config{syslog}=0;
	print gettext("Aggregation triggered via web.")."\n\n";
	if (launchaggregation()) {
		# Something was aggregated; rebuild the wiki so the new
		# content shows up.
		IkiWiki::lockwiki();
		IkiWiki::loadindex();
		require IkiWiki::Render;
		IkiWiki::refresh();
		IkiWiki::saveindex();
	}
	else {
		print gettext("Nothing to do right now, all feeds are up-to-date!")."\n";
	}
	exit 0;
} #}}}
|
|
|
|
|
|
|
|
sub launchaggregation () { #{{{
	# Run aggregation for any feeds that are due, in a forked child.
	# Returns true if aggregation ran, false/undef if there was
	# nothing to do or another aggregator already holds the lock.

	# See if any feeds need aggregation.
	loadstate();
	my @feeds=needsaggregate();
	return unless @feeds;
	if (! lockaggregate()) {
		debug("an aggregation process is already running");
		return;
	}
	# force a later rebuild of source pages
	$IkiWiki::forcerebuild{$_->{sourcepage}}=1
		foreach @feeds;

	# Fork a child process to handle the aggregation.
	# The parent process will then handle building the
	# result. This avoids messy code to clear state
	# accumulated while aggregating.
	defined(my $pid = fork) or error("Can't fork: $!");
	if (! $pid) {
		IkiWiki::loadindex();
		# Aggregation happens without the main wiki lock
		# being held. This allows editing pages etc while
		# aggregation is running.
		aggregate(@feeds);

		IkiWiki::lockwiki;
		# Merge changes, since aggregation state may have
		# changed on disk while the aggregation was happening.
		mergestate();
		expire();
		savestate();
		IkiWiki::unlockwiki;
		exit 0;
	}
	waitpid($pid,0);
	if ($?) {
		error "aggregation failed with code $?";
	}

	# The child already saved the merged state; drop the parent's
	# stale in-memory copy so it gets re-read from disk when needed.
	clearstate();
	unlockaggregate();

	return 1;
} #}}}
|
|
|
|
|
* meta: Drop support for "meta link", since supporting this for internal
links required meta to be run during scan, which complicated its data
storage, since it had to clear data stored during the scan pass to avoid
duplicating it during the normal preprocessing pass.
* If you used "meta link", you should switch to either "meta openid" (for
openid delegations), or tags (for internal, invisible links). I assume
that nobody really used "meta link" for external, non-openid links, since
the htmlscrubber ate those. (Tell me differently and I'll consider bringing
back that support.)
* meta: Improved data storage.
* meta: Drop the hackish filter hook that was used to clear
stored data before preprocessing, this hack was ugly, and broken (cf:
liw's disappearing openids).
* aggregate: Convert filter hook to a needsbuild hook.
2007-12-16 21:56:09 +01:00
|
|
|
sub needsbuild (@) { #{{{
	# Hook: called with the list of source files about to be rebuilt.
	my $pages=shift;

	loadstate();

	foreach my $feed (values %feeds) {
		my $src=$feed->{sourcepage};
		next unless exists $pagesources{$src};
		my $srcfile=$pagesources{$src};
		if (grep { $_ eq $srcfile } @$pages) {
			# Mark all feeds originating on this page as not yet
			# seen; preprocess will unmark those that still exist.
			markunseen($src);
		}
	}
} # }}}
|
|
|
|
|
|
|
|
sub preprocess (@) { #{{{
	# Handle an [[!aggregate ...]] directive: record/refresh the feed
	# definition in %feeds and return a status line of html for the
	# page. Required parameters: name, url.
	my %params=@_;

	foreach my $required (qw{name url}) {
		if (! exists $params{$required}) {
			return "[[aggregate ".sprintf(gettext("missing %s parameter"), $required)."]]";
		}
	}

	# Reuse existing feed state (preserving counters etc), or start
	# a fresh record for a new feed name.
	my $feed={};
	my $name=$params{name};
	if (exists $feeds{$name}) {
		$feed=$feeds{$name};
	}
	else {
		$feeds{$name}=$feed;
	}
	$feed->{name}=$name;
	$feed->{sourcepage}=$params{page};
	$feed->{url}=$params{url};
	# Aggregated items are stored under dir; defaults to a subpage
	# of the page containing the directive.
	my $dir=exists $params{dir} ? $params{dir} : $params{page}."/".IkiWiki::titlepage($params{name});
	$dir=~s/^\/+//;
	($dir)=$dir=~/$config{wiki_file_regexp}/;
	$feed->{dir}=$dir;
	$feed->{feedurl}=defined $params{feedurl} ? $params{feedurl} : "";
	# updateinterval is given in minutes; stored in seconds.
	$feed->{updateinterval}=defined $params{updateinterval} ? $params{updateinterval} * 60 : 15 * 60;
	$feed->{expireage}=defined $params{expireage} ? $params{expireage} : 0;
	$feed->{expirecount}=defined $params{expirecount} ? $params{expirecount} : 0;
	# The directive still exists, so the feed was "seen" this build.
	delete $feed->{unseen};
	$feed->{lastupdate}=0 unless defined $feed->{lastupdate};
	$feed->{numposts}=0 unless defined $feed->{numposts};
	$feed->{newposts}=0 unless defined $feed->{newposts};
	$feed->{message}=gettext("new feed") unless defined $feed->{message};
	$feed->{error}=0 unless defined $feed->{error};
	# Collect repeated tag=... parameters (order preserved from @_).
	$feed->{tags}=[];
	while (@_) {
		my $key=shift;
		my $value=shift;
		if ($key eq 'tag') {
			push @{$feed->{tags}}, $value;
		}
	}

	# Status line shown in place of the directive; the message is
	# wrapped in <em> when the last aggregation run failed.
	return "<a href=\"".$feed->{url}."\">".$feed->{name}."</a>: ".
		($feed->{error} ? "<em>" : "").$feed->{message}.
		($feed->{error} ? "</em>" : "").
		" (".$feed->{numposts}." ".gettext("posts").
		($feed->{newposts} ? "; ".$feed->{newposts}.
			" ".gettext("new") : "").
		")";
} # }}}
|
|
|
|
|
|
|
|
sub delete (@) { #{{{
	# Hook: remove feed data for pages that were deleted, by marking
	# every feed defined on them as unseen.
	foreach my $file (@_) {
		markunseen(pagename($file));
	}
} #}}}
|
|
|
|
|
|
|
|
sub markunseen ($) { #{{{
	# Flag every feed defined on the given source page as unseen;
	# garbage_collect() later drops feeds that stay unseen.
	my $page=shift;

	foreach my $feed (values %feeds) {
		$feed->{unseen}=1 if $feed->{sourcepage} eq $page;
	}
} #}}}
|
|
|
|
|
2007-05-21 04:52:51 +02:00
|
|
|
my $state_loaded=0;
|
2008-02-03 22:48:26 +01:00
|
|
|
|
2006-07-30 06:31:08 +02:00
|
|
|
sub loadstate () { #{{{
	# Load aggregation state (feed records and item guids) from the
	# on-disk state file into %feeds and %guids. Idempotent: only
	# the first call per process does any work.
	# Fix: use a lexical filehandle with three-arg open, instead of
	# the bareword IN with two-arg open.
	return if $state_loaded;
	$state_loaded=1;
	if (-e "$config{wikistatedir}/aggregate") {
		open(my $in, "<", "$config{wikistatedir}/aggregate") ||
			die "$config{wikistatedir}/aggregate: $!";
		while (<$in>) {
			$_=IkiWiki::possibly_foolish_untaint($_);
			chomp;
			# Each line is a space-separated list of field=value
			# pairs describing either a feed or a guid.
			my $data={};
			foreach my $i (split(/ /, $_)) {
				my ($field, $val)=split(/=/, $i, 2);
				if ($field eq "name" || $field eq "feed" ||
				    $field eq "guid" || $field eq "message") {
					# These fields may contain spaces/tabs,
					# stored entity-encoded by savestate().
					$data->{$field}=decode_entities($val, " \t\n");
				}
				elsif ($field eq "tag") {
					push @{$data->{tags}}, $val;
				}
				else {
					$data->{$field}=$val;
				}
			}

			# A "name" field marks a feed record; "guid" marks
			# an aggregated item.
			if (exists $data->{name}) {
				$feeds{$data->{name}}=$data;
			}
			elsif (exists $data->{guid}) {
				$guids{$data->{guid}}=$data;
			}
		}

		close $in;
	}
} #}}}
|
|
|
|
|
|
|
|
sub savestate () { #{{{
	# Write %feeds and %guids out to the state file, atomically via
	# a temp file + rename. No-op if state was never loaded.
	# Fixes: the original `print OUT LIST || error(...)` parsed the
	# `||` inside print's argument list (string is always true, so
	# write errors were never detected); use low-precedence `or`.
	# Also use a lexical filehandle with three-arg open.
	return unless $state_loaded;
	garbage_collect();
	my $newfile="$config{wikistatedir}/aggregate.new";
	my $cleanup = sub { unlink($newfile) };
	open (my $out, ">", $newfile) || error("open $newfile: $!", $cleanup);
	foreach my $data (values %feeds, values %guids) {
		my @line;
		foreach my $field (keys %$data) {
			if ($field eq "name" || $field eq "feed" ||
			    $field eq "guid" || $field eq "message") {
				# Entity-encode whitespace so the value
				# survives the space-separated format.
				push @line, "$field=".encode_entities($data->{$field}, " \t\n");
			}
			elsif ($field eq "tags") {
				push @line, "tag=$_" foreach @{$data->{tags}};
			}
			else {
				push @line, "$field=".$data->{$field};
			}
		}
		print $out join(" ", @line)."\n" or error("write $newfile: $!", $cleanup);
	}
	# close() flushes buffered output, so write errors can surface here.
	close($out) || error("save $newfile: $!", $cleanup);
	rename($newfile, "$config{wikistatedir}/aggregate") ||
		error("rename $newfile: $!", $cleanup);
} #}}}
|
|
|
|
|
2008-02-03 22:48:26 +01:00
|
|
|
sub garbage_collect () { #{{{
	# Prune feeds whose directives vanished from their source pages,
	# and clean up guid state for orphaned or expired items.

	# Feeds not seen while rebuilding the pages that used to contain
	# them are gone; drop them.
	delete $feeds{$_} foreach grep { $feeds{$_}->{unseen} } keys %feeds;

	foreach my $item (values %guids) {
		if (! exists $feeds{$item->{feed}}) {
			# Orphaned item: its feed is gone, so remove its
			# aggregated page (if any) and forget the guid.
			unlink pagefile($item->{page})
				if exists $item->{page};
			delete $guids{$item->{guid}};
		}
		elsif ($item->{expired} && exists $item->{page}) {
			# Expired item: keep the guid record (so the item
			# is not re-added later), but remove its page and
			# content digest.
			unlink pagefile($item->{page});
			delete $item->{page};
			delete $item->{md5};
		}
	}
} #}}}
|
|
|
|
|
|
|
|
sub mergestate () { #{{{
	# Load the current state in from disk, and merge into it
	# values from the state in memory that might have changed
	# during aggregation.
	my %myfeeds=%feeds;
	my %myguids=%guids;
	clearstate();
	loadstate();

	# All that can change in feed state during aggregation is a few
	# fields.
	foreach my $name (keys %myfeeds) {
		if (exists $feeds{$name}) {
			foreach my $field (qw{message lastupdate numposts
			                      newposts error}) {
				$feeds{$name}->{$field}=$myfeeds{$name}->{$field};
			}
		}
	}

	# New guids can be created during aggregation.
	# It's also possible that guids were removed from the on-disk state
	# while the aggregation was in process. That would only happen if
	# their feed was also removed, so any removed guids added back here
	# will be garbage collected later.
	foreach my $guid (keys %myguids) {
		if (! exists $guids{$guid}) {
			$guids{$guid}=$myguids{$guid};
		}
	}
} #}}}
|
|
|
|
|
2008-02-03 09:04:19 +01:00
|
|
|
sub clearstate () { #{{{
	# Reset all in-memory aggregation state, so the next loadstate()
	# call re-reads it from disk.
	undef %feeds;
	undef %guids;
	$state_loaded=0;
} #}}}
|
|
|
|
|
2006-11-01 06:41:37 +01:00
|
|
|
sub expire () { #{{{
	# Mark aggregated items as expired, per-feed, based on the feed's
	# expireage (days) and/or expirecount (max items) settings.
	foreach my $feed (values %feeds) {
		next unless $feed->{expireage} || $feed->{expirecount};
		my $count=0;
		my %seen;
		# Walk this feed's items newest-first (by page creation
		# time), considering only items that still have a page and
		# a recorded ctime.
		foreach my $item (sort { $IkiWiki::pagectime{$b->{page}} <=> $IkiWiki::pagectime{$a->{page}} }
		                  grep { exists $_->{page} && $_->{feed} eq $feed->{name} && $IkiWiki::pagectime{$_->{page}} }
		                  values %guids) {
			if ($feed->{expireage}) {
				my $days_old = (time - $IkiWiki::pagectime{$item->{page}}) / 60 / 60 / 24;
				if ($days_old > $feed->{expireage}) {
					debug(sprintf(gettext("expiring %s (%s days old)"),
						$item->{page}, int($days_old)));
					$item->{expired}=1;
				}
			}
			elsif ($feed->{expirecount} &&
			       $count >= $feed->{expirecount}) {
				debug(sprintf(gettext("expiring %s"), $item->{page}));
				$item->{expired}=1;
			}
			else {
				# Not expired; count each distinct page once
				# toward the expirecount limit.
				if (! $seen{$item->{page}}) {
					$seen{$item->{page}}=1;
					$count++;
				}
			}
		}
	}
} #}}}
|
|
|
|
|
2008-02-03 22:48:26 +01:00
|
|
|
sub needsaggregate () { #{{{
	# Return the feeds that are due for (re)aggregation.
	# A full rebuild refetches everything.
	return values %feeds if $config{rebuild};
	# Otherwise, only feeds whose update interval has elapsed.
	my @due;
	foreach my $feed (values %feeds) {
		push @due, $feed
			if time - $feed->{lastupdate} >= $feed->{updateinterval};
	}
	return @due;
} #}}}
|
|
|
|
|
|
|
|
sub aggregate (@) { #{{{
	# Fetch and parse each given feed, adding/updating a page per
	# entry via add_page(). Per-feed fetch/parse failures are
	# recorded in the feed's message/error fields and skipped.
	# Fix: the entity-escaping fallback's replacement text had been
	# reduced to a no-op (`&$1;`); restored to `&amp;$1;` so badly
	# escaped entities are actually escaped before the reparse, as
	# the "(feed entities escaped)" message promises.
	eval q{use XML::Feed};
	error($@) if $@;
	eval q{use URI::Fetch};
	error($@) if $@;

	foreach my $feed (@_) {
		$feed->{lastupdate}=time;
		$feed->{newposts}=0;
		# Assume success; overwritten below on any failure.
		$feed->{message}=sprintf(gettext("processed ok at %s"),
			displaytime($feed->{lastupdate}));
		$feed->{error}=0;

		debug(sprintf(gettext("checking feed %s ..."), $feed->{name}));

		# If no explicit feed url was given, try to autodiscover
		# one from the page at the feed's url.
		if (! length $feed->{feedurl}) {
			my @urls=XML::Feed->find_feeds($feed->{url});
			if (! @urls) {
				$feed->{message}=sprintf(gettext("could not find feed at %s"), $feed->{url});
				$feed->{error}=1;
				debug($feed->{message});
				next;
			}
			$feed->{feedurl}=pop @urls;
		}
		my $res=URI::Fetch->fetch($feed->{feedurl});
		if (! $res) {
			$feed->{message}=URI::Fetch->errstr;
			$feed->{error}=1;
			debug($feed->{message});
			next;
		}
		if ($res->status == URI::Fetch::URI_GONE()) {
			$feed->{message}=gettext("feed not found");
			$feed->{error}=1;
			debug($feed->{message});
			next;
		}
		my $content=$res->content;
		my $f=eval{XML::Feed->parse(\$content)};
		if ($@) {
			# One common cause of XML::Feed crashing is a feed
			# that contains invalid UTF-8 sequences. Convert
			# feed to ascii to try to work around.
			$feed->{message}.=" ".sprintf(gettext("(invalid UTF-8 stripped from feed)"));
			$content=Encode::decode_utf8($content);
			$f=eval{XML::Feed->parse(\$content)};
		}
		if ($@) {
			# Another possibility is badly escaped entities.
			$feed->{message}.=" ".sprintf(gettext("(feed entities escaped)"));
			$content=~s/\&(?!amp)(\w+);/&amp;$1;/g;
			$content=Encode::decode_utf8($content);
			$f=eval{XML::Feed->parse(\$content)};
		}
		if ($@) {
			$feed->{message}=gettext("feed crashed XML::Feed!")." ($@)";
			$feed->{error}=1;
			debug($feed->{message});
			next;
		}
		if (! $f) {
			$feed->{message}=XML::Feed->errstr;
			$feed->{error}=1;
			debug($feed->{message});
			next;
		}

		foreach my $entry ($f->entries) {
			add_page(
				feed => $feed,
				copyright => $f->copyright,
				title => defined $entry->title ? decode_entities($entry->title) : "untitled",
				link => $entry->link,
				content => defined $entry->content->body ? $entry->content->body : "",
				# Entries without an id get a synthetic,
				# time-based guid.
				guid => defined $entry->id ? $entry->id : time."_".$feed->{name},
				ctime => $entry->issued ? ($entry->issued->epoch || time) : time,
			);
		}
	}
} #}}}
|
|
|
|
|
|
|
|
sub add_page (@) { #{{{
	# Create or update the wiki page for one aggregated feed entry.
	# Named params: feed (feed hashref), guid, title, link, content,
	# copyright, ctime.
	my %params=@_;

	my $feed=$params{feed};
	my $guid={};
	my $mtime;
	if (exists $guids{$params{guid}}) {
		# updating an existing post
		$guid=$guids{$params{guid}};
		# Expired posts are never resurrected.
		return if $guid->{expired};
	}
	else {
		# new post
		$guid->{guid}=$params{guid};
		$guids{$params{guid}}=$guid;
		$mtime=$params{ctime};
		$feed->{numposts}++;
		$feed->{newposts}++;

		# assign it an unused page
		my $page=IkiWiki::titlepage($params{title});
		# escape slashes and periods in title so it doesn't specify
		# directory name or trigger ".." disallowing code.
		$page=~s!([/.])!"__".ord($1)."__"!eg;
		$page=$feed->{dir}."/".$page;
		($page)=$page=~/$config{wiki_file_regexp}/;
		if (! defined $page || ! length $page) {
			$page=$feed->{dir}."/item";
		}
		# Disambiguate with a numeric suffix until the page name is
		# free (checked case-insensitively against known pages and
		# against files on disk).
		my $c="";
		while (exists $IkiWiki::pagecase{lc $page.$c} ||
		       -e pagefile($page.$c)) {
			$c++
		}

		# Make sure that the file name isn't too long.
		# NB: This doesn't check for path length limits.
		my $max=POSIX::pathconf($config{srcdir}, &POSIX::_PC_NAME_MAX);
		if (defined $max && length(htmlfn($page)) >= $max) {
			# Too long; fall back to the generic "item" name and
			# disambiguate again.
			$c="";
			$page=$feed->{dir}."/item";
			while (exists $IkiWiki::pagecase{lc $page.$c} ||
			       -e pagefile($page.$c)) {
				$c++
			}
		}

		$guid->{page}=$page;
		debug(sprintf(gettext("creating new page %s"), $page));
	}
	$guid->{feed}=$feed->{name};

	# To write or not to write? Need to avoid writing unchanged pages
	# to avoid unneccessary rebuilding. The mtime from rss cannot be
	# trusted; let's use a digest.
	eval q{use Digest::MD5 'md5_hex'};
	error($@) if $@;
	require Encode;
	my $digest=md5_hex(Encode::encode_utf8($params{content}));
	return unless ! exists $guid->{md5} || $guid->{md5} ne $digest || $config{rebuild};
	$guid->{md5}=$digest;

	# Create the page.
	my $template=template("aggregatepost.tmpl", blind_cache => 1);
	$template->param(title => $params{title})
		if defined $params{title} && length($params{title});
	# Content links are absolutized against the feed url, then
	# escaped so it can't contain active wikilinks/directives.
	$template->param(content => htmlescape(htmlabs($params{content}, $feed->{feedurl})));
	$template->param(name => $feed->{name});
	$template->param(url => $feed->{url});
	$template->param(copyright => $params{copyright})
		if defined $params{copyright} && length $params{copyright};
	$template->param(permalink => urlabs($params{link}, $feed->{feedurl}))
		if defined $params{link};
	if (ref $feed->{tags}) {
		$template->param(tags => [map { tag => $_ }, @{$feed->{tags}}]);
	}
	writefile(htmlfn($guid->{page}), $config{srcdir},
		$template->output);

	# Set the mtime, this lets the build process get the right creation
	# time on record for the new page.
	utime $mtime, $mtime, pagefile($guid->{page})
		if defined $mtime && $mtime <= time;
} #}}}
|
|
|
|
|
2006-08-03 23:50:47 +02:00
|
|
|
sub htmlescape ($) { #{{{
	# Escape accidental wikilinks and preprocessor directives in
	# aggregated html, so "[[" that is not already backslashed is
	# rendered literally rather than interpreted by ikiwiki.
	my ($escaped)=@_;
	$escaped=~s/(?<!\\)\[\[/\\\[\[/g;
	return $escaped;
} #}}}
|
|
|
|
|
|
|
|
sub urlabs ($$) { #{{{
	# Resolve a (possibly relative) url against a base url, returning
	# the absolute url as a string.
	my ($url, $base)=@_;
	return URI->new_abs($url, $base)->as_string;
} #}}}
|
|
|
|
|
|
|
|
sub htmlabs ($$) { #{{{
	# Convert links in html from relative to absolute.
	# Note that this is a heuristic, which is not specified by the rss
	# spec and may not be right for all feeds. Also, see Debian
	# bug #381359.
	# Fix: the quote-escaping substitution had been reduced to the
	# no-op `s/\"/"/g`; restored to `s/\"/&quot;/g`, which the
	# adjacent comment requires since the rewritten attribute value
	# is wrapped in double quotes.
	my $html=shift;
	my $urlbase=shift;

	my $ret="";
	my $p = HTML::Parser->new(api_version => 3);
	# Pass all non-start-tag tokens through unchanged.
	$p->handler(default => sub { $ret.=join("", @_) }, "text");
	$p->handler(start => sub {
		my ($tagname, $pos, $text) = @_;
		if (ref $HTML::Tagset::linkElements{$tagname}) {
			while (4 <= @$pos) {
				# use attribute sets from right to left
				# to avoid invalidating the offsets
				# when replacing the values
				my($k_offset, $k_len, $v_offset, $v_len) =
					splice(@$pos, -4);
				my $attrname = lc(substr($text, $k_offset, $k_len));
				# Only rewrite attributes known to hold urls
				# for this tag.
				next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}};
				next unless $v_offset; # 0 v_offset means no value
				my $v = substr($text, $v_offset, $v_len);
				# Strip surrounding quotes before resolving.
				$v =~ s/^([\'\"])(.*)\1$/$2/;
				my $new_v=urlabs($v, $urlbase);
				$new_v =~ s/\"/&quot;/g; # since we quote with ""
				substr($text, $v_offset, $v_len) = qq("$new_v");
			}
		}
		$ret.=$text;
	}, "tagname, tokenpos, text");
	$p->parse($html);
	$p->eof;

	return $ret;
} #}}}
|
|
|
|
|
2006-07-30 06:31:08 +02:00
|
|
|
sub pagefile ($) { #{{{
	# Full path under the wiki source dir of the html file holding an
	# aggregated page.
	my ($page)=@_;
	return join("/", $config{srcdir}, htmlfn($page));
} #}}}
|
|
|
|
|
2007-04-01 21:59:42 +02:00
|
|
|
sub htmlfn ($) { #{{{
	# File name for a page: the page name plus the configured html
	# extension.
	my $page=shift;
	return $page.".".$config{htmlext};
} #}}}
|
|
|
|
|
2008-02-03 21:17:15 +01:00
|
|
|
# Filehandle holding the aggregation lock for the life of the process.
my $aggregatelock;

sub lockaggregate () { #{{{
	# Take an exclusive lock to prevent multiple concurrent aggregators.
	# Returns true if the lock was aquired.
	mkdir($config{wikistatedir}) unless -d $config{wikistatedir};
	open($aggregatelock, '>', "$config{wikistatedir}/aggregatelock") ||
		error ("cannot open to $config{wikistatedir}/aggregatelock: $!");
	unless (flock($aggregatelock, 2 | 4)) { # LOCK_EX | LOCK_NB
		# Another aggregator holds the lock; give up immediately.
		close($aggregatelock) || error("failed closing aggregatelock: $!");
		return 0;
	}
	return 1;
} #}}}
|
|
|
|
|
|
|
|
sub unlockaggregate () { #{{{
	# Release the aggregation lock by closing its filehandle; no-op
	# (returns false) if the lock was never taken.
	return unless $aggregatelock;
	return close($aggregatelock);
} #}}}
|
|
|
|
|
2006-07-30 06:31:08 +02:00
|
|
|
1
|