move indexing to sanitize hook

I think this will give better results overall.

I made %IkiWiki::preprocessing accessible and used it to avoid indexing
at unnecessary points.
master
Joey Hess 2008-06-04 00:58:46 -04:00
parent 46f7dcb362
commit 1546b48b97
4 changed files with 61 additions and 74 deletions
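
Why the guard works: sanitize hooks do not fire only on a page's final rendering; they also run while IkiWiki::preprocess() is expanding a directive that htmlizes other pages' content (inline, for example). IkiWiki records the pages it is currently preprocessing in %IkiWiki::preprocessing (the hash exists for loop detection), so a non-empty hash marks one of those intermediate calls. A simplified, self-contained sketch of the mechanism, not the actual IkiWiki.pm code (sanitize_hook stands in for the plugin's hook):

#!/usr/bin/perl
# Simplified sketch: preprocess() marks the page it is working on, so
# any hook that runs beneath it can detect the nesting.
use warnings;
use strict;

package IkiWiki;
our %preprocessing;

sub preprocess {
    my ($page, $content) = @_;
    $preprocessing{$page}++;
    # ... directive expansion would happen here, and may htmlize
    # (and therefore sanitize) other pages' content ...
    my $ret = sanitize_hook(content => $content);
    delete $preprocessing{$page};
    return $ret;
}

sub sanitize_hook {
    my %params = @_;
    # The guard from this commit: skip the work during nested calls.
    return $params{content} if %IkiWiki::preprocessing;
    return "indexed: $params{content}";
}

package main;
print IkiWiki::preprocess("somepage", "hello"), "\n";   # guard fires: "hello"
print IkiWiki::sanitize_hook(content => "hello"), "\n"; # top level: "indexed: hello"
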

IkiWiki.pm

@@ -692,7 +692,7 @@ sub linkify ($$$) { #{{{
     return $content;
 } #}}}
 
-my %preprocessing;
+our %preprocessing;
 our $preprocess_preview=0;
 sub preprocess ($$$;$$) { #{{{
     my $page=shift; # the page the data comes from
IkiWiki/Plugin/search.pm

@@ -9,10 +9,7 @@ use IkiWiki 2.00;
 sub import { #{{{
     hook(type => "checkconfig", id => "search", call => \&checkconfig);
     hook(type => "pagetemplate", id => "search", call => \&pagetemplate);
-    # run last so other needsbuild hooks can modify the list
-    hook(type => "needsbuild", id => "search", call => \&needsbuild,
-        last => 1);
-    hook(type => "filter", id => "search", call => \&filter);
+    hook(type => "sanitize", id => "search", call => \&index);
     hook(type => "delete", id => "search", call => \&delete);
     hook(type => "cgi", id => "search", call => \&cgi);
 } # }}}
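
The removed filter hook and the new sanitize hook share a calling convention (page, destpage, content; return the content), so the move mostly changes when the code runs: filter ran on raw source before preprocessing, and per the changelog below also during the scan phase, while sanitize runs once on the htmlized output of each page actually being rendered, which is why the needsbuild/%toindex bookkeeping in the next hunk can go away. The general shape of such a hook, with an illustrative plugin name, not part of this commit:

#!/usr/bin/perl
# Illustrative skeleton only.
package IkiWiki::Plugin::myplugin;

use warnings;
use strict;
use IkiWiki 2.00;

sub import { #{{{
    hook(type => "sanitize", id => "myplugin", call => \&sanitize);
} #}}}

sub sanitize (@) { #{{{
    my %params=@_;    # page, destpage, content (the htmlized page)

    # Observe or rewrite $params{content}; the return value becomes
    # the page's HTML, so the content must always be returned.
    return $params{content};
} #}}}

1
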
@@ -56,73 +53,65 @@ sub pagetemplate (@) { #{{{
     }
 } #}}}
 
-my %toindex;
-sub needsbuild ($) { #{{{
-    %toindex = map { pagename($_) => 1 } @{shift()};
-} #}}}
-
 my $scrubber;
-sub filter (@) { #{{{
+sub index (@) { #{{{
     my %params=@_;
 
-    if ($params{page} eq $params{destpage} && $toindex{$params{page}}) {
-        # index page
-        my $db=xapiandb();
-        my $doc=Search::Xapian::Document->new();
-        my $title;
-        if (exists $pagestate{$params{page}}{meta} &&
-            exists $pagestate{$params{page}}{meta}{title}) {
-            $title=$pagestate{$params{page}}{meta}{title};
-        }
-        else {
-            $title=IkiWiki::pagetitle($params{page});
-        }
-
-        # Remove any html from text to be indexed.
-        # TODO: This removes html that is in eg, a markdown pre,
-        # which should not be removed, really.
-        if (! defined $scrubber) {
-            eval q{use HTML::Scrubber};
-            if (! $@) {
-                $scrubber=HTML::Scrubber->new(allow => []);
-            }
-        }
-        my $toindex = defined $scrubber ? $scrubber->scrub($params{content}) : $params{content};
-
-        # Take 512 characters for a sample, then extend it out
-        # if it stopped in the middle of a word.
-        my $size=512;
-        my ($sample)=substr($toindex, 0, $size);
-        if (length($sample) == $size) {
-            my $max=length($toindex);
-            my $next;
-            while ($size < $max &&
-                   ($next=substr($toindex, $size++, 1)) !~ /\s/) {
-                $sample.=$next;
-            }
-        }
-        $sample=~s/\n/ /g;
-
-        # data used by omega
-        $doc->set_data(
-            "url=".urlto($params{page}, "")."\n".
-            "sample=$sample\n".
-            "caption=$title\n".
-            "modtime=$IkiWiki::pagemtime{$params{page}}\n".
-            "size=".length($params{content})."\n"
-        );
-
-        my $tg = Search::Xapian::TermGenerator->new();
-        $tg->set_stemmer(new Search::Xapian::Stem("english"));
-        $tg->set_document($doc);
-        $tg->index_text($params{page}, 2);
-        $tg->index_text($title, 2);
-        $tg->index_text($toindex);
-
-        my $pageterm=pageterm($params{page});
-        $doc->add_term($pageterm);
-        $db->replace_document_by_term($pageterm, $doc);
-    }
+    return $params{content} if %IkiWiki::preprocessing;
+
+    my $db=xapiandb();
+    my $doc=Search::Xapian::Document->new();
+    my $title;
+    if (exists $pagestate{$params{page}}{meta} &&
+        exists $pagestate{$params{page}}{meta}{title}) {
+        $title=$pagestate{$params{page}}{meta}{title};
+    }
+    else {
+        $title=IkiWiki::pagetitle($params{page});
+    }
+
+    # Remove any html from text to be indexed.
+    if (! defined $scrubber) {
+        eval q{use HTML::Scrubber};
+        if (! $@) {
+            $scrubber=HTML::Scrubber->new(allow => []);
+        }
+    }
+    my $toindex = defined $scrubber ? $scrubber->scrub($params{content}) : $params{content};
+
+    # Take 512 characters for a sample, then extend it out
+    # if it stopped in the middle of a word.
+    my $size=512;
+    my ($sample)=substr($toindex, 0, $size);
+    if (length($sample) == $size) {
+        my $max=length($toindex);
+        my $next;
+        while ($size < $max &&
+               ($next=substr($toindex, $size++, 1)) !~ /\s/) {
+            $sample.=$next;
+        }
+    }
+    $sample=~s/\n/ /g;
+
+    # data used by omega
+    $doc->set_data(
+        "url=".urlto($params{page}, "")."\n".
+        "sample=$sample\n".
+        "caption=$title\n".
+        "modtime=$IkiWiki::pagemtime{$params{page}}\n".
+        "size=".length($params{content})."\n"
+    );
+
+    my $tg = Search::Xapian::TermGenerator->new();
+    $tg->set_stemmer(new Search::Xapian::Stem("english"));
+    $tg->set_document($doc);
+    $tg->index_text($params{page}, 2);
+    $tg->index_text($title, 2);
+    $tg->index_text($toindex);
+
+    my $pageterm=pageterm($params{page});
+    $doc->add_term($pageterm);
+    $db->replace_document_by_term($pageterm, $doc);
 
     return $params{content};
 } #}}}
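
The set_data() call stores the url/sample/caption/modtime/size lines in the key=value form that the omega CGI reads when formatting search results, so ikiwiki never queries the index from Perl itself. Purely as an illustration of what the hook produces, the database could be opened read-only with Search::Xapian along these lines (the path is an assumption; xapiandb() keeps the index under the wiki's state directory):

#!/usr/bin/perl
# Illustrative only: the database path and query are assumed, not
# taken from this commit.
use warnings;
use strict;
use Search::Xapian;

my $db = Search::Xapian::Database->new(".ikiwiki/xapian/default");

my $qp = Search::Xapian::QueryParser->new();
$qp->set_stemmer(Search::Xapian::Stem->new("english"));
$qp->set_database($db);

my $enq = $db->enquire($qp->parse_query("sanitize hook"));
foreach my $match ($enq->matches(0, 10)) {
    # get_data() returns the url=/sample=/caption= block stored above.
    print $match->get_percent(), "% match:\n",
          $match->get_document()->get_data(), "\n";
}

Because each page is stored under a unique pageterm and written with replace_document_by_term(), re-indexing a changed page overwrites its old entry instead of duplicating it.
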

debian/changelog

@@ -7,9 +7,7 @@ ikiwiki (2.49) UNRELEASED; urgency=low
   * search: Converted to use xapian-omega.
   * Filter hooks are no longer called during the scan phase. This will
     prevent wikilinks added by filters from being scanned properly. But
-    no known filter hook does that, and calling filters unncessarily during
-    scan slowed down complex filters such as the one used to update the xapian
-    index.
+    no known filter hook does that, so let's not waste time on it.
 
  -- Joey Hess <joeyh@debian.org>  Fri, 30 May 2008 19:08:54 -0400

po/ikiwiki.pot

@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: PACKAGE VERSION\n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2008-06-04 00:33-0400\n"
+"POT-Creation-Date: 2008-06-04 00:52-0400\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"

@@ -476,12 +476,12 @@ msgstr ""
 msgid "(Diff truncated)"
 msgstr ""
 
-#: ../IkiWiki/Plugin/search.pm:23
+#: ../IkiWiki/Plugin/search.pm:20
 #, perl-format
 msgid "Must specify %s when using the search plugin"
 msgstr ""
 
-#: ../IkiWiki/Plugin/search.pm:36
+#: ../IkiWiki/Plugin/search.pm:33
 msgid "search"
 msgstr ""