move indexing to sanitize hook
I think this will give better results overall. I made %IkiWiki::preprocessing accessible and used it to avoid indexing at unnecessary points.
master
parent
46f7dcb362
commit
1546b48b97
|
@ -692,7 +692,7 @@ sub linkify ($$$) { #{{{
|
|||
return $content;
|
||||
} #}}}
|
||||
|
||||
my %preprocessing;
|
||||
our %preprocessing;
|
||||
our $preprocess_preview=0;
|
||||
sub preprocess ($$$;$$) { #{{{
|
||||
my $page=shift; # the page the data comes from
|
||||
|
|
|
@ -9,10 +9,7 @@ use IkiWiki 2.00;
|
|||
sub import { #{{{
|
||||
hook(type => "checkconfig", id => "search", call => \&checkconfig);
|
||||
hook(type => "pagetemplate", id => "search", call => \&pagetemplate);
|
||||
# run last so other needsbuild hooks can modify the list
|
||||
hook(type => "needsbuild", id => "search", call => \&needsbuild,
|
||||
last => 1);
|
||||
hook(type => "filter", id => "search", call => \&filter);
|
||||
hook(type => "sanitize", id => "search", call => \&index);
|
||||
hook(type => "delete", id => "search", call => \&delete);
|
||||
hook(type => "cgi", id => "search", call => \&cgi);
|
||||
} # }}}
|
||||
|
@ -56,73 +53,65 @@ sub pagetemplate (@) { #{{{
|
|||
}
|
||||
} #}}}
|
||||
|
||||
my %toindex;
|
||||
sub needsbuild ($) { #{{{
|
||||
%toindex = map { pagename($_) => 1 } @{shift()};
|
||||
} #}}}
|
||||
|
||||
my $scrubber;
|
||||
sub filter (@) { #{{{
|
||||
sub index (@) { #{{{
|
||||
my %params=@_;
|
||||
|
||||
if ($params{page} eq $params{destpage} && $toindex{$params{page}}) {
|
||||
# index page
|
||||
my $db=xapiandb();
|
||||
my $doc=Search::Xapian::Document->new();
|
||||
my $title;
|
||||
if (exists $pagestate{$params{page}}{meta} &&
|
||||
exists $pagestate{$params{page}}{meta}{title}) {
|
||||
$title=$pagestate{$params{page}}{meta}{title};
|
||||
}
|
||||
else {
|
||||
$title=IkiWiki::pagetitle($params{page});
|
||||
}
|
||||
|
||||
# Remove any html from text to be indexed.
|
||||
# TODO: This removes html that is in eg, a markdown pre,
|
||||
# which should not be removed, really.
|
||||
if (! defined $scrubber) {
|
||||
eval q{use HTML::Scrubber};
|
||||
if (! $@) {
|
||||
$scrubber=HTML::Scrubber->new(allow => []);
|
||||
}
|
||||
}
|
||||
my $toindex = defined $scrubber ? $scrubber->scrub($params{content}) : $params{content};
|
||||
|
||||
# Take 512 characters for a sample, then extend it out
|
||||
# if it stopped in the middle of a word.
|
||||
my $size=512;
|
||||
my ($sample)=substr($toindex, 0, $size);
|
||||
if (length($sample) == $size) {
|
||||
my $max=length($toindex);
|
||||
my $next;
|
||||
while ($size < $max &&
|
||||
($next=substr($toindex, $size++, 1)) !~ /\s/) {
|
||||
$sample.=$next;
|
||||
}
|
||||
}
|
||||
$sample=~s/\n/ /g;
|
||||
|
||||
# data used by omega
|
||||
$doc->set_data(
|
||||
"url=".urlto($params{page}, "")."\n".
|
||||
"sample=$sample\n".
|
||||
"caption=$title\n".
|
||||
"modtime=$IkiWiki::pagemtime{$params{page}}\n".
|
||||
"size=".length($params{content})."\n"
|
||||
);
|
||||
|
||||
my $tg = Search::Xapian::TermGenerator->new();
|
||||
$tg->set_stemmer(new Search::Xapian::Stem("english"));
|
||||
$tg->set_document($doc);
|
||||
$tg->index_text($params{page}, 2);
|
||||
$tg->index_text($title, 2);
|
||||
$tg->index_text($toindex);
|
||||
|
||||
my $pageterm=pageterm($params{page});
|
||||
$doc->add_term($pageterm);
|
||||
$db->replace_document_by_term($pageterm, $doc);
|
||||
return $params{content} if %IkiWiki::preprocessing;
|
||||
|
||||
my $db=xapiandb();
|
||||
my $doc=Search::Xapian::Document->new();
|
||||
my $title;
|
||||
if (exists $pagestate{$params{page}}{meta} &&
|
||||
exists $pagestate{$params{page}}{meta}{title}) {
|
||||
$title=$pagestate{$params{page}}{meta}{title};
|
||||
}
|
||||
else {
|
||||
$title=IkiWiki::pagetitle($params{page});
|
||||
}
|
||||
|
||||
# Remove any html from text to be indexed.
|
||||
if (! defined $scrubber) {
|
||||
eval q{use HTML::Scrubber};
|
||||
if (! $@) {
|
||||
$scrubber=HTML::Scrubber->new(allow => []);
|
||||
}
|
||||
}
|
||||
my $toindex = defined $scrubber ? $scrubber->scrub($params{content}) : $params{content};
|
||||
|
||||
# Take 512 characters for a sample, then extend it out
|
||||
# if it stopped in the middle of a word.
|
||||
my $size=512;
|
||||
my ($sample)=substr($toindex, 0, $size);
|
||||
if (length($sample) == $size) {
|
||||
my $max=length($toindex);
|
||||
my $next;
|
||||
while ($size < $max &&
|
||||
($next=substr($toindex, $size++, 1)) !~ /\s/) {
|
||||
$sample.=$next;
|
||||
}
|
||||
}
|
||||
$sample=~s/\n/ /g;
|
||||
|
||||
# data used by omega
|
||||
$doc->set_data(
|
||||
"url=".urlto($params{page}, "")."\n".
|
||||
"sample=$sample\n".
|
||||
"caption=$title\n".
|
||||
"modtime=$IkiWiki::pagemtime{$params{page}}\n".
|
||||
"size=".length($params{content})."\n"
|
||||
);
|
||||
|
||||
my $tg = Search::Xapian::TermGenerator->new();
|
||||
$tg->set_stemmer(new Search::Xapian::Stem("english"));
|
||||
$tg->set_document($doc);
|
||||
$tg->index_text($params{page}, 2);
|
||||
$tg->index_text($title, 2);
|
||||
$tg->index_text($toindex);
|
||||
|
||||
my $pageterm=pageterm($params{page});
|
||||
$doc->add_term($pageterm);
|
||||
$db->replace_document_by_term($pageterm, $doc);
|
||||
|
||||
return $params{content};
|
||||
} #}}}
|
||||
|
|
|
@ -7,9 +7,7 @@ ikiwiki (2.49) UNRELEASED; urgency=low
|
|||
* search: Converted to use xapian-omega.
|
||||
* Filter hooks are no longer called during the scan phase. This will
|
||||
prevent wikilinks added by filters from being scanned properly. But
|
||||
no known filter hook does that, and calling filters unnecessarily during
|
||||
scan slowed down complex filters such as the one used to update the xapian
|
||||
index.
|
||||
no known filter hook does that, so let's not waste time on it.
|
||||
|
||||
-- Joey Hess <joeyh@debian.org> Fri, 30 May 2008 19:08:54 -0400
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ msgid ""
|
|||
msgstr ""
|
||||
"Project-Id-Version: PACKAGE VERSION\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2008-06-04 00:33-0400\n"
|
||||
"POT-Creation-Date: 2008-06-04 00:52-0400\n"
|
||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||
"Language-Team: LANGUAGE <LL@li.org>\n"
|
||||
|
@ -476,12 +476,12 @@ msgstr ""
|
|||
msgid "(Diff truncated)"
|
||||
msgstr ""
|
||||
|
||||
#: ../IkiWiki/Plugin/search.pm:23
|
||||
#: ../IkiWiki/Plugin/search.pm:20
|
||||
#, perl-format
|
||||
msgid "Must specify %s when using the search plugin"
|
||||
msgstr ""
|
||||
|
||||
#: ../IkiWiki/Plugin/search.pm:36
|
||||
#: ../IkiWiki/Plugin/search.pm:33
|
||||
msgid "search"
|
||||
msgstr ""
|
||||
|
||||
|
|
Loading…
Reference in New Issue