parent 8a9be51d61
commit 44fde6cbff

@@ -9,8 +9,11 @@ use IkiWiki 2.00;
sub import { #{{{
	hook(type => "checkconfig", id => "search", call => \&checkconfig);
	hook(type => "pagetemplate", id => "search", call => \&pagetemplate);
	# run last so other needsbuild hooks can modify the list
	hook(type => "needsbuild", id => "search", call => \&needsbuild,
		last => 1);
	hook(type => "filter", id => "search", call => \&filter);
	hook(type => "delete", id => "search", call => \&delete);
	hook(type => "change", id => "search", call => \&change);
	hook(type => "cgi", id => "search", call => \&cgi);
} # }}}

@@ -53,12 +56,53 @@ sub pagetemplate (@) { #{{{
	}
} #}}}

sub delete (@) { #{{{
	debug(gettext("cleaning xapian search index"));
my %toindex;
sub needsbuild ($) { #{{{
	%toindex = map { $_ => 1 } @{shift()};
} #}}}

sub change (@) { #{{{
	debug(gettext("updating xapian search index"));
sub filter (@) { #{{{
	my %params=@_;

	if ($params{page} eq $params{destpage} && $toindex{$params{page}}) {
		# index page
		my $db=xapiandb();
		my $doc=Search::Xapian::Document->new();
		my $title=$params{page};
		if (exists $pagestate{$params{page}}{meta} &&
		    exists $pagestate{$params{page}}{meta}{title}) {
			$title=$pagestate{$params{page}}{meta}{title};
		}

		# data used by omega
		$doc->set_data(
			"url=".urlto($params{page}, "")."\n".
			"sample=\n". # TODO
			"caption=$title\n".
			"modtime=$IkiWiki::pagemtime{$params{page}}\n".
			"size=".length($params{content})."\n"
		);

		my $tg = Search::Xapian::TermGenerator->new();
		$tg->set_stemmer(new Search::Xapian::Stem("english"));
		$tg->set_document($doc);
		$tg->index_text($params{page}, 2);
		$tg->index_text($title, 2);
		$tg->index_text($params{content}); # TODO html strip; preprocessor too

		my $pageterm=pageterm($params{page});
		$doc->add_term($pageterm);
		$db->replace_document_by_term($pageterm, $doc);
	}

	return $params{content};
} #}}}

sub delete (@) { #{{{
	my $db=xapiandb();
	foreach my $page (@_) {
		$db->delete_document_by_term(pageterm($page));
	}
} #}}}

sub cgi ($) { #{{{
@@ -73,4 +117,26 @@ sub cgi ($) { #{{{
	}
} #}}}

sub pageterm ($) { #{{{
	my $page=shift;

	# TODO: check if > 255 char page names overflow term
	# length; use sha1 if so?
	return "P".$page;
} #}}}

my $db;
sub xapiandb () { #{{{
	if (! defined $db) {
		eval q{
			use Search::Xapian;
			use Search::Xapian::WritableDatabase;
		};
		error($@) if $@;
		$db=Search::Xapian::WritableDatabase->new($config{wikistatedir}."/xapian/default",
			Search::Xapian::DB_CREATE_OR_OPEN());
	}
	return $db;
} #}}}

1

@@ -11,3 +11,5 @@ point to `foo/bar/` instead.
> This bug affects the [[plugins/amazon_s3]] plugin -- when using that
> plugin plus the search plugin, you need to enable `amazon_s3_dupindex`.
> So this definitely should be fixed. --[[Joey]]

> [[done]], the new xapian search uses nice urls

@@ -33,35 +33,25 @@ Possibilities:
  written on the page would be indexed. Not text generated by directives,
  pulled in by inlining, etc. There's something to be said for that. And
  something to be said against it. It would also get markdown formatted
  content, mostly, though it would still need to strip html.
  content, mostly, though it would still need to strip html, and also
  probably strip preprocessor directives too.
* `sanitize` - Would get the htmlized content, so would need to strip html.
  Preprocessor directive output would be indexed.
  Preprocessor directive output would be indexed. Doesn't get a destpage
  parameter, making optimisation hard.
* `format` - Would get the entire html page, including the page template.
  Probably not a good choice as indexing the same template for each page
  is unnecessary.

Currently, a filter hook seems the best option.
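
Roughly, wiring that up would look something like the sketch below. The
`hook()` registration and the filter parameters follow the plugin code
above; the body is only a placeholder for the html stripping and the
actual xapian indexing.

    hook(type => "filter", id => "search", call => \&filter);

    sub filter (@) {
    	my %params=@_;
    	if ($params{page} eq $params{destpage}) {
    		# strip html from $params{content} and feed it
    		# to the xapian indexer here
    	}
    	return $params{content};
    }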

The hook would remove any html from the content, and index it.
It would need to add the same document data that omindex would, as well as
adding the same special terms (see
http://xapian.org/docs/omega/overview.html "Boolean terms").

(Note that the U term is a bit tricky because I'll have to replicate
omindex's hash_string() to hash terms > 240 chars.)
It would need to add the same document data that omindex would.
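
That document data is just a block of `key=value` lines that omega reads
back when displaying results. A minimal sketch, mirroring the `set_data()`
call in the plugin hunk above (`$page`, `$title` and `$content` stand in
for the filter hook's parameters):

    my $doc=Search::Xapian::Document->new();
    $doc->set_data(
    	"url=".urlto($page, "")."\n".	# link shown in search results
    	"sample=\n".			# TODO: text excerpt
    	"caption=$title\n".		# result title
    	"modtime=$IkiWiki::pagemtime{$page}\n".
    	"size=".length($content)."\n"
    );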

The indexer (and deleter) will need a way to figure out the ids in xapian
of the documents to delete. One way is storing the id of each page in the
ikiwiki index.

The other way would be adding a special term to the xapian db that can be
used with replace_document_by_term/delete_document_by_term. omindex uses
U<url> as a term, and I guess I could just use that, and then map page
names to urls when deleting a page ... only real problem being the
hashing; a collision would be bad.

At the moment, storing xapian ids in the ikiwiki index file seems like the
best approach.
used with replace_document_by_term/delete_document_by_term.
Hmm, let's use a term named "P<pagename>".
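
Then both the indexer and the deleter can key off that one term; a sketch
using the same calls as the plugin code above (`$page`, `$doc` and `$db`
are placeholders):

    # when indexing: add or update the page's document in place
    my $pageterm="P".$page;
    $doc->add_term($pageterm);
    $db->replace_document_by_term($pageterm, $doc);

    # when a page is removed: drop its document
    $db->delete_document_by_term($pageterm);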

The hook should try to avoid re-indexing pages that have not changed since
they were last indexed. One problem is that, if a page with an inline is