parent 8a9be51d61
commit 44fde6cbff
@@ -9,8 +9,11 @@ use IkiWiki 2.00;
 sub import { #{{{
 	hook(type => "checkconfig", id => "search", call => \&checkconfig);
 	hook(type => "pagetemplate", id => "search", call => \&pagetemplate);
+	# run last so other needsbuild hooks can modify the list
+	hook(type => "needsbuild", id => "search", call => \&needsbuild,
+		last => 1);
+	hook(type => "filter", id => "search", call => \&filter);
 	hook(type => "delete", id => "search", call => \&delete);
-	hook(type => "change", id => "search", call => \&change);
 	hook(type => "cgi", id => "search", call => \&cgi);
 } # }}}
@@ -53,12 +56,53 @@ sub pagetemplate (@) { #{{{
 	}
 } #}}}

-sub delete (@) { #{{{
-	debug(gettext("cleaning xapian search index"));
+my %toindex;
+sub needsbuild ($) { #{{{
+	%toindex = map { $_ => 1 } @{shift()};
 } #}}}

-sub change (@) { #{{{
-	debug(gettext("updating xapian search index"));
+sub filter (@) { #{{{
+	my %params=@_;
+
+	if ($params{page} eq $params{destpage} && $toindex{$params{page}}) {
+		# index page
+		my $db=xapiandb();
+		my $doc=Search::Xapian::Document->new();
+		my $title=$params{page};
+		if (exists $pagestate{$params{page}}{meta} &&
+		    exists $pagestate{$params{page}}{meta}{title}) {
+			$title=$pagestate{$params{page}}{meta}{title};
+		}
+
+		# data used by omega
+		$doc->set_data(
+			"url=".urlto($params{page}, "")."\n".
+			"sample=\n". # TODO
+			"caption=$title\n".
+			"modtime=$IkiWiki::pagemtime{$params{page}}\n".
+			"size=".length($params{content})."\n"
+		);
+
+		my $tg = Search::Xapian::TermGenerator->new();
+		$tg->set_stemmer(new Search::Xapian::Stem("english"));
+		$tg->set_document($doc);
+		$tg->index_text($params{page}, 2);
+		$tg->index_text($title, 2);
+		$tg->index_text($params{content}); # TODO html strip; preprocessor too
+
+		my $pageterm=pageterm($params{page});
+		$doc->add_term($pageterm);
+		$db->replace_document_by_term($pageterm, $doc);
+	}
+
+	return $params{content};
+} #}}}
+
+sub delete (@) { #{{{
+	my $db=xapiandb();
+	foreach my $page (@_) {
+		$db->delete_document_by_term(pageterm($page));
+	}
 } #}}}

 sub cgi ($) { #{{{
@@ -73,4 +117,26 @@ sub cgi ($) { #{{{
 	}
 } #}}}

+sub pageterm ($) { #{{{
+	my $page=shift;
+
+	# TODO: check if > 255 char page names overflow term
+	# length; use sha1 if so?
+	return "P".$page;
+} #}}}
+
+my $db;
+sub xapiandb () { #{{{
+	if (! defined $db) {
+		eval q{
+			use Search::Xapian;
+			use Search::Xapian::WritableDatabase;
+		};
+		error($@) if $@;
+		$db=Search::Xapian::WritableDatabase->new($config{wikistatedir}."/xapian/default",
+			Search::Xapian::DB_CREATE_OR_OPEN());
+	}
+	return $db;
+} #}}}
+
 1
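
The plugin diff above only writes the index; searching is left to omega,
which reads the `url=`/`caption=` document data the filter hook stores.
Purely for illustration, reading the same database back with
Search::Xapian directly could look like the sketch below (the
`.ikiwiki/xapian/default` path assumes the default `wikistatedir`, and
the query term is made up):

	#!/usr/bin/perl
	# Sketch only: read back the index built by the filter hook above.
	use strict;
	use warnings;
	use Search::Xapian;

	my $db=Search::Xapian::Database->new(".ikiwiki/xapian/default");
	my $enq=$db->enquire("wiki"); # illustrative single-term query
	foreach my $match ($enq->matches(0, 10)) {
		# get_data() returns the omega-style "url=...\ncaption=..."
		# block that set_data() stored at index time.
		printf "%d%% %s\n", $match->get_percent(),
			$match->get_document()->get_data();
	}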

@@ -11,3 +11,5 @@ point to `foo/bar/` instead.
 > This bug affects the [[plugins/amazon_s3]] plugin -- when using that
 > plugin plus the search plugin, you need to enable `amazon_s3_dupindex`.
 > So this definitely should be fixed. --[[Joey]]
+
+> [[done]], the new xapian search uses nice urls

@@ -33,35 +33,25 @@ Possibilities:
   written on the page would be indexed. Not text generated by directives,
   pulled in by inlining, etc. There's something to be said for that. And
   something to be said against it. It would also get markdown formatted
-  content, mostly, though it would still need to strip html.
+  content, mostly, though it would still need to strip html, and also
+  probably strip preprocessor directives too.
 * `sanitize` - Would get the htmlized content, so would need to strip html.
-  Preprocessor directive output would be indexed.
+  Preprocessor directive output would be indexed. Doesn't get a destpage
+  parameter, making optimisation hard.
 * `format` - Would get the entire html page, including the page template.
   Probably not a good choice as indexing the same template for each page
   is unnecessary.

-Currently, a filter hook seems the best option.
-
 The hook would remove any html from the content, and index it.
-It would need to add the same document data that omindex would, as well as
-adding the same special terms (see
-http://xapian.org/docs/omega/overview.html "Boolean terms").
-
-(Note that the U term is a bit tricky because I'll have to replicate
-omindex's hash_string() to hash terms > 240 chars.)
+It would need to add the same document data that omindex would.

 The indexer (and deleter) will need a way to figure out the ids in xapian
 of the documents to delete. One way is storing the id of each page in the
 ikiwiki index.

 The other way would be adding a special term to the xapian db that can be
-used with replace_document_by_term/delete_document_by_term. omindex uses
-U<url> as a term, and I guess I could just use that, and then map page
-names to urls when deleting a page ... only real problem being the
-hashing; a collision would be bad.
-
-At the moment, storing xapian ids in the ikiwiki index file seems like the
-best approach.
+used with replace_document_by_term/delete_document_by_term.
+Hmm, let's use a term named "P<pagename>".

 The hook should try to avoid re-indexing pages that have not changed since
 they were last indexed. One problem is that, if a page with an inline is
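
A note on the "remove any html from the content" step discussed above:
in the commit it is still a TODO on the index_text() call. A naive
sketch of such a helper is below; `strip_for_index` is hypothetical, and
a real version would want an actual html parser rather than regexes:

	# Hypothetical helper, not part of the plugin: crudely drop
	# html tags and [[...]] directives/wikilinks before indexing.
	sub strip_for_index ($) {
		my $content=shift;
		$content=~s/\[\[[^\]]*\]\]//g; # naive: no nested brackets
		$content=~s/<[^>]+>//g;        # naive: chokes on bare "<"
		return $content;
	}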
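
Similarly, for the pageterm() TODO about over-long page names, a
hypothetical variant that hashes names past the "terms > 240 chars"
limit mentioned above (`Digest::SHA1` and the exact cutoff are
assumptions here):

	use Digest::SHA1 qw(sha1_hex);

	# Hypothetical pageterm() variant: keep short terms readable,
	# hash over-long page names so the term cannot overflow.
	sub pageterm ($) { #{{{
		my $page=shift;
		return length($page) > 240
			? "P".sha1_hex($page)
			: "P".$page;
	} #}}}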