move indexing to sanitize hook
I think this will give better results overall. I made %IkiWiki::preprocessing accessible and used it to avoid indexing at unnecessary points.
master
parent
46f7dcb362
commit
1546b48b97
|
@ -692,7 +692,7 @@ sub linkify ($$$) { #{{{
|
||||||
return $content;
|
return $content;
|
||||||
} #}}}
|
} #}}}
|
||||||
|
|
||||||
my %preprocessing;
|
our %preprocessing;
|
||||||
our $preprocess_preview=0;
|
our $preprocess_preview=0;
|
||||||
sub preprocess ($$$;$$) { #{{{
|
sub preprocess ($$$;$$) { #{{{
|
||||||
my $page=shift; # the page the data comes from
|
my $page=shift; # the page the data comes from
|
||||||
|
|
|
@ -9,10 +9,7 @@ use IkiWiki 2.00;
|
||||||
sub import { #{{{
|
sub import { #{{{
|
||||||
hook(type => "checkconfig", id => "search", call => \&checkconfig);
|
hook(type => "checkconfig", id => "search", call => \&checkconfig);
|
||||||
hook(type => "pagetemplate", id => "search", call => \&pagetemplate);
|
hook(type => "pagetemplate", id => "search", call => \&pagetemplate);
|
||||||
# run last so other needsbuild hooks can modify the list
|
hook(type => "sanitize", id => "search", call => \&index);
|
||||||
hook(type => "needsbuild", id => "search", call => \&needsbuild,
|
|
||||||
last => 1);
|
|
||||||
hook(type => "filter", id => "search", call => \&filter);
|
|
||||||
hook(type => "delete", id => "search", call => \&delete);
|
hook(type => "delete", id => "search", call => \&delete);
|
||||||
hook(type => "cgi", id => "search", call => \&cgi);
|
hook(type => "cgi", id => "search", call => \&cgi);
|
||||||
} # }}}
|
} # }}}
|
||||||
|
@ -56,73 +53,65 @@ sub pagetemplate (@) { #{{{
|
||||||
}
|
}
|
||||||
} #}}}
|
} #}}}
|
||||||
|
|
||||||
my %toindex;
|
|
||||||
sub needsbuild ($) { #{{{
|
|
||||||
%toindex = map { pagename($_) => 1 } @{shift()};
|
|
||||||
} #}}}
|
|
||||||
|
|
||||||
my $scrubber;
|
my $scrubber;
|
||||||
sub filter (@) { #{{{
|
sub index (@) { #{{{
|
||||||
my %params=@_;
|
my %params=@_;
|
||||||
|
|
||||||
if ($params{page} eq $params{destpage} && $toindex{$params{page}}) {
|
return $params{content} if %IkiWiki::preprocessing;
|
||||||
# index page
|
|
||||||
my $db=xapiandb();
|
|
||||||
my $doc=Search::Xapian::Document->new();
|
|
||||||
my $title;
|
|
||||||
if (exists $pagestate{$params{page}}{meta} &&
|
|
||||||
exists $pagestate{$params{page}}{meta}{title}) {
|
|
||||||
$title=$pagestate{$params{page}}{meta}{title};
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
$title=IkiWiki::pagetitle($params{page});
|
|
||||||
}
|
|
||||||
|
|
||||||
# Remove any html from text to be indexed.
|
my $db=xapiandb();
|
||||||
# TODO: This removes html that is in eg, a markdown pre,
|
my $doc=Search::Xapian::Document->new();
|
||||||
# which should not be removed, really.
|
my $title;
|
||||||
if (! defined $scrubber) {
|
if (exists $pagestate{$params{page}}{meta} &&
|
||||||
eval q{use HTML::Scrubber};
|
exists $pagestate{$params{page}}{meta}{title}) {
|
||||||
if (! $@) {
|
$title=$pagestate{$params{page}}{meta}{title};
|
||||||
$scrubber=HTML::Scrubber->new(allow => []);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
my $toindex = defined $scrubber ? $scrubber->scrub($params{content}) : $params{content};
|
|
||||||
|
|
||||||
# Take 512 characters for a sample, then extend it out
|
|
||||||
# if it stopped in the middle of a word.
|
|
||||||
my $size=512;
|
|
||||||
my ($sample)=substr($toindex, 0, $size);
|
|
||||||
if (length($sample) == $size) {
|
|
||||||
my $max=length($toindex);
|
|
||||||
my $next;
|
|
||||||
while ($size < $max &&
|
|
||||||
($next=substr($toindex, $size++, 1)) !~ /\s/) {
|
|
||||||
$sample.=$next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$sample=~s/\n/ /g;
|
|
||||||
|
|
||||||
# data used by omega
|
|
||||||
$doc->set_data(
|
|
||||||
"url=".urlto($params{page}, "")."\n".
|
|
||||||
"sample=$sample\n".
|
|
||||||
"caption=$title\n".
|
|
||||||
"modtime=$IkiWiki::pagemtime{$params{page}}\n".
|
|
||||||
"size=".length($params{content})."\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
my $tg = Search::Xapian::TermGenerator->new();
|
|
||||||
$tg->set_stemmer(new Search::Xapian::Stem("english"));
|
|
||||||
$tg->set_document($doc);
|
|
||||||
$tg->index_text($params{page}, 2);
|
|
||||||
$tg->index_text($title, 2);
|
|
||||||
$tg->index_text($toindex);
|
|
||||||
|
|
||||||
my $pageterm=pageterm($params{page});
|
|
||||||
$doc->add_term($pageterm);
|
|
||||||
$db->replace_document_by_term($pageterm, $doc);
|
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
$title=IkiWiki::pagetitle($params{page});
|
||||||
|
}
|
||||||
|
|
||||||
|
# Remove any html from text to be indexed.
|
||||||
|
if (! defined $scrubber) {
|
||||||
|
eval q{use HTML::Scrubber};
|
||||||
|
if (! $@) {
|
||||||
|
$scrubber=HTML::Scrubber->new(allow => []);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
my $toindex = defined $scrubber ? $scrubber->scrub($params{content}) : $params{content};
|
||||||
|
|
||||||
|
# Take 512 characters for a sample, then extend it out
|
||||||
|
# if it stopped in the middle of a word.
|
||||||
|
my $size=512;
|
||||||
|
my ($sample)=substr($toindex, 0, $size);
|
||||||
|
if (length($sample) == $size) {
|
||||||
|
my $max=length($toindex);
|
||||||
|
my $next;
|
||||||
|
while ($size < $max &&
|
||||||
|
($next=substr($toindex, $size++, 1)) !~ /\s/) {
|
||||||
|
$sample.=$next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$sample=~s/\n/ /g;
|
||||||
|
|
||||||
|
# data used by omega
|
||||||
|
$doc->set_data(
|
||||||
|
"url=".urlto($params{page}, "")."\n".
|
||||||
|
"sample=$sample\n".
|
||||||
|
"caption=$title\n".
|
||||||
|
"modtime=$IkiWiki::pagemtime{$params{page}}\n".
|
||||||
|
"size=".length($params{content})."\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
my $tg = Search::Xapian::TermGenerator->new();
|
||||||
|
$tg->set_stemmer(new Search::Xapian::Stem("english"));
|
||||||
|
$tg->set_document($doc);
|
||||||
|
$tg->index_text($params{page}, 2);
|
||||||
|
$tg->index_text($title, 2);
|
||||||
|
$tg->index_text($toindex);
|
||||||
|
|
||||||
|
my $pageterm=pageterm($params{page});
|
||||||
|
$doc->add_term($pageterm);
|
||||||
|
$db->replace_document_by_term($pageterm, $doc);
|
||||||
|
|
||||||
return $params{content};
|
return $params{content};
|
||||||
} #}}}
|
} #}}}
|
||||||
|
|
|
@ -7,9 +7,7 @@ ikiwiki (2.49) UNRELEASED; urgency=low
|
||||||
* search: Converted to use xapian-omega.
|
* search: Converted to use xapian-omega.
|
||||||
* Filter hooks are no longer called during the scan phase. This will
|
* Filter hooks are no longer called during the scan phase. This will
|
||||||
prevent wikilinks added by filters from being scanned properly. But
|
prevent wikilinks added by filters from being scanned properly. But
|
||||||
no known filter hook does that, and calling filters unncessarily during
|
no known filter hook does that, so let's not waste time on it.
|
||||||
scan slowed down complex filters such as the one used to update the xapian
|
|
||||||
index.
|
|
||||||
|
|
||||||
-- Joey Hess <joeyh@debian.org> Fri, 30 May 2008 19:08:54 -0400
|
-- Joey Hess <joeyh@debian.org> Fri, 30 May 2008 19:08:54 -0400
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ msgid ""
|
||||||
msgstr ""
|
msgstr ""
|
||||||
"Project-Id-Version: PACKAGE VERSION\n"
|
"Project-Id-Version: PACKAGE VERSION\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2008-06-04 00:33-0400\n"
|
"POT-Creation-Date: 2008-06-04 00:52-0400\n"
|
||||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
"Language-Team: LANGUAGE <LL@li.org>\n"
|
"Language-Team: LANGUAGE <LL@li.org>\n"
|
||||||
|
@ -476,12 +476,12 @@ msgstr ""
|
||||||
msgid "(Diff truncated)"
|
msgid "(Diff truncated)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: ../IkiWiki/Plugin/search.pm:23
|
#: ../IkiWiki/Plugin/search.pm:20
|
||||||
#, perl-format
|
#, perl-format
|
||||||
msgid "Must specify %s when using the search plugin"
|
msgid "Must specify %s when using the search plugin"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: ../IkiWiki/Plugin/search.pm:36
|
#: ../IkiWiki/Plugin/search.pm:33
|
||||||
msgid "search"
|
msgid "search"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue