vrg-archive/src/full-text-search.md
hiina c5e7bf307a implement the thing
seems to work pretty okay.
2025-04-11 17:13:08 -06:00

77 lines
4.1 KiB
Markdown

---
title: Full Text Search
---
# Full Text Search
This uses [DuckDB's full text search extension](https://duckdb.org/docs/stable/extensions/full_text_search.html)
to search through all the post comments. It's freakishly fast for running entirely in your browser. The index
is built offline and loaded as parquet files.
It is a bit limited: it only searches stems of common words, and there are no search operators (I think).
But still, very fast for ~1.3 million posts.
```js
// SQL script that rebuilds the DuckDB full-text-search state inside the page:
//   1. loads the `fts` extension and creates the `fts_main_posts` schema;
//   2. creates the index tables (dict, docs, fields, stats, stopwords, terms)
//      from the parquet attachments under data/vrgindex/;
//   3. creates the `posts` table from data/vrgarchive.parquet;
//   4. defines two macros in `fts_main_posts`:
//        - tokenize(s): lowercases, strips accents, replaces runs of digits and
//          punctuation with a space, then splits on whitespace;
//        - match_bm25(docname, query_string, b, conjunctive, k, fields): stems the
//          tokenized query with the 'porter' stemmer, looks the terms up in the
//          index tables, and returns the summed per-term score for the document
//          whose docs."name" equals `docname` (no row when nothing matches).
// NOTE(review): the macro bodies look like what the fts extension itself generates
// when it builds an index — confirm before hand-editing them.
// The backslashes and the backtick inside the tokenize regex are escaped for the
// JavaScript template literal, not for SQL; keep them exactly as written.
const schema_sql = `
LOAD fts;
CREATE SCHEMA fts_main_posts;
create table fts_main_posts.dict AS FROM '${FileAttachment("data/vrgindex/fts_main_posts_dict.parquet").href}';
create table fts_main_posts.docs AS FROM '${FileAttachment("data/vrgindex/fts_main_posts_docs.parquet").href}';
create table fts_main_posts.fields AS FROM '${FileAttachment("data/vrgindex/fts_main_posts_fields.parquet").href}';
create table fts_main_posts.stats AS FROM '${FileAttachment("data/vrgindex/fts_main_posts_stats.parquet").href}';
create table fts_main_posts.stopwords AS FROM '${FileAttachment("data/vrgindex/fts_main_posts_stopwords.parquet").href}';
create table fts_main_posts.terms AS FROM '${FileAttachment("data/vrgindex/fts_main_posts_terms.parquet").href}';
create table posts as from '${FileAttachment("data/vrgarchive.parquet").href}';
CREATE MACRO if not exists fts_main_posts.tokenize (s) AS (string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\/\\|''''"\`-]+', ' ', 'g'), '\\s+'));;
CREATE MACRO if not exists fts_main_posts.match_bm25 (docname, query_string, b := 0.75, conjunctive := false, k := 1.2, fields := NULL) AS ((WITH tokens AS (SELECT DISTINCT stem(unnest(fts_main_posts.tokenize(query_string)), 'porter') AS t), fieldids AS (SELECT fieldid FROM fts_main_posts.fields WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END), qtermids AS (SELECT termid FROM fts_main_posts.dict AS dict , tokens WHERE (dict.term = tokens.t)), qterms AS (SELECT termid, docid FROM fts_main_posts.terms AS terms WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))), term_tf AS (SELECT termid, docid, count_star() AS tf FROM qterms GROUP BY docid, termid), cdocs AS (SELECT docid FROM qterms GROUP BY docid HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END), subscores AS (SELECT docs.docid, len, term_tf.termid, tf, df, (log((((((SELECT num_docs FROM fts_main_posts.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_posts.stats)))))))) AS subscore FROM term_tf , cdocs , fts_main_posts.docs AS docs , fts_main_posts.dict AS dict WHERE ((term_tf.docid = cdocs.docid) AND (term_tf.docid = docs.docid) AND (term_tf.termid = dict.termid))), scores AS (SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid)SELECT score FROM scores , fts_main_posts.docs AS docs WHERE ((scores.docid = docs.docid) AND (docs."name" = docname))));;
`;
// Create the in-browser DuckDB client and run the whole script above once.
const db = await DuckDBClient.of();
// db.sql is called as a plain function with a one-element strings array, so the
// script is passed as a single literal with no interpolated values.
// NOTE(review): presumably this is how a multi-statement script gets through the
// tagged-template API — confirm against the DuckDBClient documentation.
await db.sql([schema_sql]);
// Expose a bound `sql` tag so other cells can write sql`...` against this database.
const sql = db.sql.bind(db);
```
```js
import DOMPurify from 'npm:dompurify';
```
```js
// Search box for the page. `view(...)` renders the input and yields its current
// value as `search_term`, which the query cell interpolates into its SQL.
const search_term = view(
  Inputs.text({
    label: "Search",
    placeholder: "search for something",
    value: "sneed", // initial query shown when the page loads
  })
);
```
```js
// Run the full-text query for the current search term.
// The inner select scores every row of `posts` with fts_main_posts.match_bm25,
// keyed on post_num; the outer select keeps only rows with a non-null score and
// orders them best-first. post_time is derived with epoch_ms(timestamp * 1000).
// NOTE(review): that conversion assumes `timestamp` is stored in seconds — confirm.
// NOTE(review): ${search_term} goes through the `sql` tagged template, so it is
// presumably bound as a query parameter rather than spliced into the SQL text —
// confirm against the DuckDBClient documentation.
const search_results = await sql`
select post_num, epoch_ms(timestamp * 1000) as post_time, comment from (select *, fts_main_posts.match_bm25(post_num, ${search_term}) as score from posts) sq where score is not null order by score desc`;
```
${search_results.numRows} results.
```js
// Bar chart of how many matching posts fall in each month of post_time.
// NOTE(review): intentionally a bare expression with no trailing semicolon —
// this appears to be an Observable Framework cell, which displays expression
// cells implicitly; confirm before adding a terminator.
Plot.plot({
  width,
  x: {interval: "month"},
  marks: [
    Plot.barY(
      search_results,
      Plot.binX({y: "count"}, {x: "post_time", interval: "month", tip: true})
    )
  ]
})
```
```js
// Table of the matching posts. The `comment` column holds HTML, so each value is
// passed through DOMPurify and rendered as real markup inside a <span> instead
// of being shown as escaped text.
// NOTE(review): intentionally a bare expression with no trailing semicolon —
// this appears to be an Observable Framework cell, which displays expression
// cells implicitly; confirm before adding a terminator.
Inputs.table(search_results, {
  width: {comment: 500},
  format: {
    comment: (rawHtml) => {
      const cell = document.createElement("span");
      // Only sanitized markup is ever assigned to innerHTML.
      cell.innerHTML = DOMPurify.sanitize(rawHtml);
      return cell;
    }
  }
})
```