From d4e61d51abf26e185ec2324ca89be10e1507a38c Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Mon, 22 Jun 2026 17:07:03 -0500 Subject: [PATCH 1/7] Index-resident summary storage: page I/O foundation (Generic WAL) First step toward BRIN-style, in-index incremental summaries (no side table, no stale flag, no REINDEX). Adds index_storage.rs: write/read a length-prefixed byte blob in the index's own metapage (block 0), updated in place and WAL-logged via the Generic WAL API. Because a table_range summary only needs to be over-inclusive, these page updates need no MVCC/transactionality. Round-trip is proven by a pg_test (caught and fixed the Generic-WAL page-hole zeroing by setting pd_lower = pd_upper). Existing 26 tests unaffected. Co-Authored-By: Claude Opus 4.8 --- src/index_am_tests.rs | 16 +++++++ src/index_storage.rs | 102 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 3 files changed, 119 insertions(+) create mode 100644 src/index_storage.rs diff --git a/src/index_am_tests.rs b/src/index_am_tests.rs index 23cb8cd..f9b9fb4 100644 --- a/src/index_am_tests.rs +++ b/src/index_am_tests.rs @@ -153,3 +153,19 @@ fn am_drop_index_cleans_summaries_and_stays_correct() { 51 ); } + +#[pg_test] +fn storage_page_roundtrip() { + Spi::run( + "DROP TABLE IF EXISTS pr CASCADE; CREATE TABLE pr (val bigint); + INSERT INTO pr VALUES (1); + CREATE INDEX pr_tr ON pr USING table_range (val);", + ) + .unwrap(); + let out = Spi::get_one::( + "SELECT table_range_test_page_roundtrip('pr_tr'::regclass::oid, 'hello-page-42')", + ) + .unwrap() + .unwrap(); + assert_eq!(out, "hello-page-42", "blob must round-trip through the index metapage"); +} diff --git a/src/index_storage.rs b/src/index_storage.rs new file mode 100644 index 0000000..68feac2 --- /dev/null +++ b/src/index_storage.rs @@ -0,0 +1,102 @@ +use pgrx::pg_sys; +use pgrx::prelude::*; + +// Low-level storage for a per-index summary, kept in the index's own metapage (block 0) +// and updated in place with the Generic WAL 
API — the same model BRIN uses. Because a +// table_range summary only ever needs to be *over-inclusive*, these in-place page +// updates need no MVCC/transactionality: a rolled-back or concurrent widening that +// over-covers is always safe. +// +// This module currently provides the raw page round-trip (length-prefixed byte blob in +// the metapage). Typed summary (de)serialization and the ambuild/aminsert/planner wiring +// build on top of it. + +// Usable bytes in the metapage content area (after the page header + length prefix). A +// generous margin below BLCKSZ keeps us clear of the page header regardless of alignment. +fn max_blob_len() -> usize { + pg_sys::BLCKSZ as usize - 64 +} + +/// Write a byte blob into the index's metapage (block 0), creating the block if needed. +/// WAL-logged via Generic WAL. Caller must hold a lock on the index relation appropriate +/// for the calling context (ambuild owns it; aminsert holds the row's locks). +pub unsafe fn write_blob(index: pg_sys::Relation, data: &[u8]) -> Result<(), &'static str> { + if data.len() > max_blob_len() { + return Err("table_range: summary blob exceeds one page"); + } + let nblocks = pg_sys::RelationGetNumberOfBlocksInFork(index, pg_sys::ForkNumber::MAIN_FORKNUM); + let is_new = nblocks == 0; + let buffer = if is_new { + pg_sys::ReadBuffer(index, pg_sys::InvalidBlockNumber) // P_NEW -> extend + } else { + pg_sys::ReadBuffer(index, 0) + }; + pg_sys::LockBuffer(buffer, pg_sys::BUFFER_LOCK_EXCLUSIVE as i32); + + let state = pg_sys::GenericXLogStart(index); + let page = + pg_sys::GenericXLogRegisterBuffer(state, buffer, pg_sys::GENERIC_XLOG_FULL_IMAGE as i32); + if is_new || pg_sys::PageIsNew(page) { + pg_sys::PageInit(page, pg_sys::BLCKSZ as usize, 0); + } + + let contents = pg_sys::PageGetContents(page) as *mut u8; + let len = data.len() as u32; + std::ptr::copy_nonoverlapping(len.to_ne_bytes().as_ptr(), contents, 4); + std::ptr::copy_nonoverlapping(data.as_ptr(), contents.add(4), data.len()); + + 
// Eliminate the page "hole": Generic WAL (and standard page logging) treats the + // region between pd_lower and pd_upper as empty and zeroes it, which would clobber + // our content. Setting pd_lower = pd_upper means the whole page is logged verbatim. + let header = page as *mut pg_sys::PageHeaderData; + (*header).pd_lower = (*header).pd_upper; + + pg_sys::GenericXLogFinish(state); + pg_sys::UnlockReleaseBuffer(buffer); + Ok(()) +} + +/// Read the byte blob from the index's metapage, or `None` if the index has no metapage. +pub unsafe fn read_blob(index: pg_sys::Relation) -> Option> { + let nblocks = pg_sys::RelationGetNumberOfBlocksInFork(index, pg_sys::ForkNumber::MAIN_FORKNUM); + if nblocks == 0 { + return None; + } + let buffer = pg_sys::ReadBuffer(index, 0); + pg_sys::LockBuffer(buffer, pg_sys::BUFFER_LOCK_SHARE as i32); + let page = pg_sys::BufferGetPage(buffer); + let result = if pg_sys::PageIsNew(page) { + None + } else { + let contents = pg_sys::PageGetContents(page) as *const u8; + let mut len_bytes = [0u8; 4]; + std::ptr::copy_nonoverlapping(contents, len_bytes.as_mut_ptr(), 4); + let len = u32::from_ne_bytes(len_bytes) as usize; + if len == 0 || len > max_blob_len() { + None + } else { + let mut data = vec![0u8; len]; + std::ptr::copy_nonoverlapping(contents.add(4), data.as_mut_ptr(), len); + Some(data) + } + }; + pg_sys::UnlockReleaseBuffer(buffer); + result +} + +// ---- test-only round-trip harness ------------------------------------------------- + +#[cfg(any(test, feature = "pg_test"))] +#[pg_extern] +fn table_range_test_page_roundtrip(index: pg_sys::Oid, payload: String) -> String { + unsafe { + let rel = pg_sys::index_open(index, pg_sys::AccessExclusiveLock as i32); + write_blob(rel, payload.as_bytes()).unwrap_or_else(|e| error!("write_blob: {e}")); + let back = read_blob(rel); + pg_sys::index_close(rel, pg_sys::AccessExclusiveLock as i32); + match back { + Some(b) => String::from_utf8_lossy(&b).into_owned(), + None => error!("no blob read 
// ---- typed summary (de)serialization ----------------------------------------------

/// One column's summary, as stored in the index metapage and used by the planner.
#[derive(Clone, Debug, PartialEq)]
pub struct ColSummary {
    /// Heap attnum the summary is for (matched against `Var.varattno` at plan time).
    pub attnum: i16,
    /// `true` -> `min` holds a covering extent for `&&` pruning (range/geometry);
    /// `false` -> `min`/`max` hold the column's btree min/max.
    pub overlap: bool,
    /// SQL type name (for casting in overlap evaluation).
    pub type_name: String,
    pub min: Option<String>,
    pub max: Option<String>,
    pub has_nulls: bool,
    pub all_nulls: bool,
}

/// The whole index's summary: one entry per indexed column.
#[derive(Clone, Debug, PartialEq, Default)]
pub struct IndexSummary {
    pub cols: Vec<ColSummary>,
}

/// First byte of every serialized summary; `deserialize` rejects any other value.
const SUMMARY_VERSION: u8 = 1;

/// Append `s` as a little-endian `u16` byte length followed by its UTF-8 bytes.
fn put_str(out: &mut Vec<u8>, s: &str) {
    // NOTE(review): the length is narrowed to u16; a string of 65536+ bytes would get a
    // wrong prefix. `write_blob` rejects blobs larger than one page — confirm no other
    // consumer of `serialize` can see such a blob.
    out.extend_from_slice(&(s.len() as u16).to_le_bytes());
    out.extend_from_slice(s.as_bytes());
}

/// Append an optional string: a presence byte (1/0), then the string when present.
fn put_opt_str(out: &mut Vec<u8>, s: &Option<String>) {
    match s {
        Some(s) => {
            out.push(1);
            put_str(out, s);
        }
        None => out.push(0),
    }
}

/// Bounds-checked cursor over a serialized summary. Every accessor returns `None`
/// instead of panicking when the buffer is too short.
struct Reader<'a> {
    buf: &'a [u8],
    pos: usize,
}

impl Reader<'_> {
    fn u8(&mut self) -> Option<u8> {
        let b = *self.buf.get(self.pos)?;
        self.pos += 1;
        Some(b)
    }
    fn u16(&mut self) -> Option<u16> {
        let bytes = self.buf.get(self.pos..self.pos + 2)?;
        self.pos += 2;
        Some(u16::from_le_bytes([bytes[0], bytes[1]]))
    }
    fn i16(&mut self) -> Option<i16> {
        self.u16().map(|v| v as i16)
    }
    fn str(&mut self) -> Option<String> {
        let len = self.u16()? as usize;
        let bytes = self.buf.get(self.pos..self.pos + len)?;
        self.pos += len;
        String::from_utf8(bytes.to_vec()).ok()
    }
    fn opt_str(&mut self) -> Option<Option<String>> {
        match self.u8()? {
            0 => Some(None),
            _ => Some(Some(self.str()?)),
        }
    }
}

/// Encode a summary as: version byte, little-endian `u16` column count, then per column
/// attnum (`i16` LE), overlap byte, flags byte (bit 0 = has_nulls, bit 1 = all_nulls),
/// type name, optional min, optional max.
pub fn serialize(summary: &IndexSummary) -> Vec<u8> {
    let mut out = Vec::new();
    out.push(SUMMARY_VERSION);
    out.extend_from_slice(&(summary.cols.len() as u16).to_le_bytes());
    for c in &summary.cols {
        out.extend_from_slice(&c.attnum.to_le_bytes());
        out.push(c.overlap as u8);
        out.push((c.has_nulls as u8) | ((c.all_nulls as u8) << 1));
        put_str(&mut out, &c.type_name);
        put_opt_str(&mut out, &c.min);
        put_opt_str(&mut out, &c.max);
    }
    out
}

/// Decode a summary produced by [`serialize`]. Returns `None` for an empty buffer, an
/// unknown version byte, truncated input, or non-UTF-8 string data. Bytes after the last
/// declared column are ignored.
pub fn deserialize(buf: &[u8]) -> Option<IndexSummary> {
    let mut r = Reader { buf, pos: 0 };
    if r.u8()? != SUMMARY_VERSION {
        return None;
    }
    let ncols = r.u16()? as usize;
    // The count comes from on-disk bytes: never preallocate for more columns than the
    // buffer could possibly hold, so a corrupt count cannot force a large allocation.
    let mut cols = Vec::with_capacity(ncols.min(buf.len()));
    for _ in 0..ncols {
        let attnum = r.i16()?;
        let overlap = r.u8()? != 0;
        let flags = r.u8()?;
        let type_name = r.str()?;
        let min = r.opt_str()?;
        let max = r.opt_str()?;
        cols.push(ColSummary {
            attnum,
            overlap,
            type_name,
            min,
            max,
            has_nulls: flags & 1 != 0,
            all_nulls: flags & 2 != 0,
        });
    }
    Some(IndexSummary { cols })
}
as usize; + let mut cols = Vec::with_capacity(ncols); + for _ in 0..ncols { + let attnum = r.i16()?; + let overlap = r.u8()? != 0; + let flags = r.u8()?; + let type_name = r.str()?; + let min = r.opt_str()?; + let max = r.opt_str()?; + cols.push(ColSummary { + attnum, + overlap, + type_name, + min, + max, + has_nulls: flags & 1 != 0, + all_nulls: flags & 2 != 0, + }); + } + Some(IndexSummary { cols }) +} + +/// Persist the typed summary into the index metapage. +pub unsafe fn write_summary( + index: pg_sys::Relation, + summary: &IndexSummary, +) -> Result<(), &'static str> { + write_blob(index, &serialize(summary)) +} + +/// Read the typed summary from the index metapage, if present. +pub unsafe fn read_summary(index: pg_sys::Relation) -> Option { + deserialize(&read_blob(index)?) +} + +#[cfg(test)] +mod serde_tests { + use super::*; + + #[test] + fn summary_roundtrips() { + let s = IndexSummary { + cols: vec![ + ColSummary { + attnum: 2, + overlap: false, + type_name: "bigint".into(), + min: Some("0".into()), + max: Some("99".into()), + has_nulls: true, + all_nulls: false, + }, + ColSummary { + attnum: 3, + overlap: true, + type_name: "int8range".into(), + min: Some("[0,100)".into()), + max: None, + has_nulls: false, + all_nulls: false, + }, + ], + }; + assert_eq!(deserialize(&serialize(&s)), Some(s)); + } + + #[test] + fn rejects_garbage_and_wrong_version() { + assert_eq!(deserialize(&[]), None); + assert_eq!(deserialize(&[99, 0, 0]), None); + } +} + // ---- test-only round-trip harness ------------------------------------------------- #[cfg(any(test, feature = "pg_test"))] From 80a66f610a9ccf83fd9f23809308dea55b385c74 Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Mon, 22 Jun 2026 17:29:37 -0500 Subject: [PATCH 3/7] Store summaries in the index metapage and maintain them incrementally (BRIN-style) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the side table + mark-stale + REINDEX model with 
self-maintaining, index-resident summaries: - ambuild now writes the per-partition summary to the index's own metapage (index_storage.rs), keyed by nothing but the index itself. - The planner reads each partition's summary from its index page (cached per plan) instead of an SPI load of a side table. - aminsert widens the metapage summary in place as rows arrive — no MVCC needed because the summary only has to be over-inclusive. An insert within range writes nothing; one that extends it grows min/max (scalars, in memory) or the extent (range/geometry, via the type's union). Pruning stays correct AND active across inserts with no REINDEX. - Remove the table_range_summary side table, the stale flag + per-txn memo, and the sql_drop cleanup event trigger (DROP INDEX frees the summary with the index). Deletes leave the summary conservatively wide (safe); VACUUM/REINDEX re-tighten. 29 tests pass on pg18; production build, clippy -D warnings, and fmt all clean. README and module docs updated. Co-Authored-By: Claude Opus 4.8 --- README.md | 59 ++++++----- src/e2e_tests.rs | 31 ------ src/index_am.rs | 230 +++++++++++++++++++++++++++++------------- src/index_am_tests.rs | 55 ++++++---- src/index_storage.rs | 25 +++++ src/lib.rs | 60 ----------- src/prune_hook.rs | 166 ++++++++++++++---------------- src/summary_build.rs | 201 +++++++++--------------------------- 8 files changed, 376 insertions(+), 451 deletions(-) diff --git a/README.md b/README.md index fe1a6b1..4e106be 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,10 @@ CREATE INDEX events_tr ON events USING table_range (val, created_at); -- Verify with EXPLAIN: non-matching partitions disappear from the plan. EXPLAIN (COSTS OFF) SELECT * FROM events WHERE val >= 250; --- Recompute after heavy churn; or drop the summaries entirely. +-- Inserts maintain the summary automatically; REINDEX only re-tightens after many +-- deletes. DROP INDEX removes the summary with the index. 
REINDEX INDEX events_tr; -DROP INDEX events_tr; -- removes the summaries it built +DROP INDEX events_tr; ``` The index is never used for scans — it exists only to build and own the summaries — so @@ -49,17 +50,16 @@ EXPLAIN (COSTS OFF) SELECT * FROM places WHERE geom && ST_MakeEnvelope(0,0,10,10 ## How it works -- **Summaries.** For each leaf partition and indexed column, one row in - `table_range_summary` records the `has_nulls` / `all_nulls` flags plus either the - column's btree `min`/`max` (scalar columns) or a single covering **extent** — a covering - range for range types (`range_merge(range_agg(col))`) or the bounding box for PostGIS - geometry (`ST_Extent(col)`). -- **Planning.** A `planner_hook` loads all non-stale summaries once per top-level plan - (a single query, cached for the duration of planning). A `set_rel_pathlist_hook` then - evaluates each partition's restriction clauses against its cached summary and calls - `mark_dummy_rel` on any partition that provably cannot match — eliminating it before - child paths are generated. Wide partition trees therefore do not pay a per-partition - lookup. +- **Summaries live in the index.** Like BRIN, each leaf partition's summary is stored in + that partition's index — one record per indexed column on the index's **metapage**, not + in any side table. It holds the `has_nulls` / `all_nulls` flags plus either the column's + btree `min`/`max` (scalar columns) or a single covering **extent** — a covering range + for range types (`range_merge(range_agg(col))`) or the bounding box for PostGIS geometry + (`ST_Extent(col)`). +- **Planning.** For each partition the planner builds, a `set_rel_pathlist_hook` reads the + summary from that partition's index (cached for the plan) and evaluates the partition's + restriction clauses against it, calling `mark_dummy_rel` on any partition that provably + cannot match — eliminating it before child paths are generated. - **Typed comparisons.** Min/max vs. 
constant comparisons use each column type's own btree compare function, so **any btree-comparable type works**: `bigint` / `int` / `smallint`, `numeric`, `real` / `double precision`, `text` / `varchar`, `date`, @@ -69,12 +69,14 @@ EXPLAIN (COSTS OFF) SELECT * FROM places WHERE geom && ST_MakeEnvelope(0,0,10,10 is pruned by testing the constant against the partition's stored extent with PostgreSQL's own `&&` operator — so a partition is eliminated when its extent cannot overlap the query. -- **Automatic correctness.** An insert that extends a partition marks its summary - *stale* (via the index's `aminsert`), and stale summaries are never used for pruning — - so a change can never cause a missing row. Deletes only shrink a partition's true - range, so the summary stays conservatively wide and remains safe. `REINDEX` recomputes - and re-enables pruning after churn, and a `sql_drop` event trigger removes a dropped - index's (or table's) summaries. +- **Incremental maintenance (no REINDEX).** `aminsert` widens the summary in place as + rows are inserted — the same way BRIN maintains its ranges. Because the summary only + ever needs to be over-inclusive, these updates need no MVCC: an insert within the + existing range writes nothing; one that extends it grows the min/max/extent. Pruning + therefore stays correct **and** active across inserts without any rebuild. Deletes only + shrink a partition's true range, leaving the summary conservatively wide (still safe); + `VACUUM`/`REINDEX` can re-tighten it for selectivity. `DROP INDEX` removes the summary + with the index's storage — there is no side table to clean up. ## Performance @@ -119,20 +121,20 @@ Everything not listed is conservatively **kept** (never mispruned): - `table_range.enable_pruning` (default `on`) — master switch. - `table_range.log_pruning_debug` (default `off`) — log each prune decision. 
-## Catalog +## Storage -- `table_range_summary` — one summary row per (index, leaf partition, column): - `index_oid`, `relid`, `attnum`, `kind` (`minmax` or `overlap`), `type_name`, - `min_summary`, `max_summary`, `has_nulls`, `all_nulls`, `stale`, `tuple_version`. +There is no catalog table — each partition's summary lives on its `table_range` index's +metapage (block 0), written by `ambuild` and updated in place by `aminsert`, like BRIN. ## Project layout | File | Responsibility | |------|----------------| -| `src/lib.rs` | GUCs, `_PG_init`, catalog/bootstrap SQL, test wiring | -| `src/summary_build.rs` | SPI summary build (scalar min/max + range/geometry extent) | +| `src/lib.rs` | GUCs, `_PG_init`, test wiring | +| `src/index_storage.rs` | per-index summary on the metapage: page I/O (Generic WAL) + (de)serialization | +| `src/summary_build.rs` | build a leaf's summary by scanning its data (used by `ambuild`) | | `src/prune_hook.rs` | planner + pathlist hooks, per-plan cache, typed in-memory evaluation | -| `src/index_am.rs` | `table_range` index access method + automatic operator-class provisioning | +| `src/index_am.rs` | `table_range` index AM: build, incremental `aminsert` widening, opclass provisioning | | `src/e2e_tests.rs`, `src/index_am_tests.rs` | end-to-end tests | ## Building and testing @@ -154,5 +156,6 @@ range-type tests, which exercise the same code path. - `NOT IN` / `<> ALL`, `NOT (...)`, expression predicates, and parameterized prepared-statement plans are kept rather than pruned. -- Summaries are exact at build time; an insert that extends a partition marks it stale - (not pruned, but still correct) until the next `REINDEX`. +- Inserts keep summaries current incrementally, but deletes only relax them (the summary + can stay wider than the live data until a `VACUUM`/`REINDEX` re-tightens it) — always + correct, just potentially less selective. 
diff --git a/src/e2e_tests.rs b/src/e2e_tests.rs index ad2bf1c..999b183 100644 --- a/src/e2e_tests.rs +++ b/src/e2e_tests.rs @@ -237,37 +237,6 @@ fn e2e_large_tree_prunes_to_single_partition() { assert_eq!(count_big("val >= 3150"), 50); // p31 (3100..3199): 3150..3199 } -#[pg_test] -fn e2e_per_plan_cache_loads_once_regardless_of_partitions() { - // 64 range partitions; planning a query must load summaries exactly once, not once - // per partition — the observable signature of the per-plan cache. - Spi::run( - "DROP TABLE IF EXISTS cache_t CASCADE; - CREATE TABLE cache_t (val bigint) PARTITION BY RANGE (val);", - ) - .unwrap(); - for i in 0..64 { - let lo = i * 100; - let hi = lo + 100; - Spi::run(&format!( - "CREATE TABLE cache_t_p{i} PARTITION OF cache_t FOR VALUES FROM ({lo}) TO ({hi}); - INSERT INTO cache_t SELECT g FROM generate_series({lo}, {hi} - 1) g;" - )) - .unwrap(); - } - e2e_build("cache_t", "val"); - e2e_set_pruning(true); - - Spi::run("SELECT table_range_reset_cache_load_count()").unwrap(); - let found = Spi::get_one::("SELECT count(*)::bigint FROM cache_t WHERE val = 3333") - .unwrap() - .unwrap(); - assert_eq!(found, 1); - let loads = Spi::get_one::("SELECT table_range_cache_load_count()") - .unwrap() - .unwrap(); - assert_eq!(loads, 1, "expected exactly one summary load for the plan, got {loads}"); -} /// True if PostGIS can be created in this environment. Checked via the catalog so a /// missing extension does not abort the test transaction. diff --git a/src/index_am.rs b/src/index_am.rs index 8a92a52..71f7d13 100644 --- a/src/index_am.rs +++ b/src/index_am.rs @@ -1,44 +1,17 @@ +use crate::index_storage::{self, ColSummary}; +use crate::prune_hook::{btree_cmp_proc, datum_cmp, datum_to_text, text_to_datum}; use pgrx::pg_sys; use pgrx::prelude::*; -use std::cell::RefCell; -use std::collections::HashSet; - -thread_local! 
{ - /// Leaf relids whose summaries we've already marked stale in the current - /// transaction, so a bulk insert marks each partition at most once instead of - /// once per row. Cleared at transaction end (so a later transaction re-marks after - /// a concurrent REINDEX could have refreshed the summary). - static STALE_MARKED: RefCell> = RefCell::new(HashSet::new()); -} - -/// Register the transaction callback that resets the per-transaction stale memo. -pub fn install() { - unsafe { - pg_sys::RegisterXactCallback(Some(xact_callback), std::ptr::null_mut()); - } -} - -unsafe extern "C-unwind" fn xact_callback( - event: pg_sys::XactEvent::Type, - _arg: *mut core::ffi::c_void, -) { - if event == pg_sys::XactEvent::XACT_EVENT_COMMIT - || event == pg_sys::XactEvent::XACT_EVENT_ABORT - || event == pg_sys::XactEvent::XACT_EVENT_PREPARE - { - STALE_MARKED.with(|s| s.borrow_mut().clear()); - } -} // Custom index access method `table_range`, providing the ergonomic // `CREATE INDEX ... USING table_range (cols)` front-end over the same summary engine. // -// This is not a conventional scannable index: it stores nothing in index pages. -// Instead, `ambuild` scans the (leaf) relation and writes one min/max/null summary per -// indexed column into `table_range_summary` (keyed by the index OID), and installs -// the staleness trigger that keeps the summary conservative on data changes. The planner -// hook then prunes partitions using those summaries. The index is never chosen for scans -// (no `amgettuple`/`amgetbitmap`, prohibitive cost estimate). +// This is not a conventional scannable index: instead of indexing rows, it stores one +// min/max/null (or extent) summary per indexed column in its own metapage. `ambuild` +// scans the leaf relation to build it; `aminsert` widens it in place as rows arrive (like +// BRIN, and needing no MVCC because the summary only has to be over-inclusive). The +// planner hook prunes partitions using those summaries. 
The index is never chosen for +// scans (no `amgettuple`/`amgetbitmap`, prohibitive cost estimate). /// V1 function-info record so PostgreSQL can call `table_range_amhandler` as a /// `LANGUAGE c` function declared in the access-method SQL below. @@ -86,27 +59,30 @@ unsafe extern "C-unwind" fn am_build( index_info: *mut pg_sys::IndexInfo, ) -> *mut pg_sys::IndexBuildResult { let heap_relid = (*heap).rd_id; - let index_relid = (*index).rd_id; let nattrs = (*index_info).ii_NumIndexAttrs.max(0) as usize; let attnums: Vec = (0..nattrs) .map(|i| (*index_info).ii_IndexAttrNumbers[i]) .collect(); - // Build summaries via SPI under a freshly pushed snapshot so the snapshot the - // CREATE INDEX portal relies on is restored afterwards. We skip trigger - // installation here (DDL is unsafe inside the build portal); staleness for the AM - // path is handled by `aminsert`. Any failure degrades to "no summary" (KEEP). + // Compute the summary via SPI under a freshly pushed snapshot so the snapshot the + // CREATE INDEX portal relies on is restored afterwards. Any failure degrades to + // "no summary" (KEEP at planning time). pg_sys::PushActiveSnapshot(pg_sys::GetTransactionSnapshot()); - pgrx::PgTryBuilder::new(|| { - if let Ok(names) = crate::summary_build::column_names_for_attnums(heap_relid, &attnums) { - let _ = crate::summary_build::build_one_leaf(index_relid, heap_relid, &names); - } + let summary = pgrx::PgTryBuilder::new(|| { + let names = crate::summary_build::column_names_for_attnums(heap_relid, &attnums).ok()?; + crate::summary_build::build_one_leaf(heap_relid, &names).ok() }) - .catch_others(|_| ()) + .catch_others(|_| None) .execute(); pg_sys::PopActiveSnapshot(); + // Persist the summary into the index's own metapage. Page writes use no snapshot, + // so this happens after the SPI section. 
+ if let Some(summary) = summary { + let _ = crate::index_storage::write_summary(index, &summary); + } + let result = pg_sys::palloc0(std::mem::size_of::()) as *mut pg_sys::IndexBuildResult; (*result).heap_tuples = 0.0; @@ -117,38 +93,153 @@ unsafe extern "C-unwind" fn am_build( #[pg_guard] unsafe extern "C-unwind" fn am_buildempty(_index: pg_sys::Relation) {} -/// Conservatively mark this partition's summaries stale on insert so the planner stops -/// pruning it until a rebuild (REINDEX) recomputes the range. The `AND NOT stale` guard -/// makes the steady state a cheap no-op once a partition is already marked. +/// Incrementally widen the partition's metapage summary to include the inserted row — +/// the BRIN-style maintenance path. Because the summary only needs to be over-inclusive, +/// this update is in place and needs no MVCC; an insert that stays within the existing +/// range writes nothing. This keeps pruning correct and active without any REINDEX. #[pg_guard] #[allow(clippy::too_many_arguments)] // signature is fixed by PostgreSQL's aminsert_function unsafe extern "C-unwind" fn am_insert( - _index: pg_sys::Relation, - _values: *mut pg_sys::Datum, - _isnull: *mut bool, + index: pg_sys::Relation, + values: *mut pg_sys::Datum, + isnull: *mut bool, _heap_tid: pg_sys::ItemPointer, - heap: pg_sys::Relation, + _heap: pg_sys::Relation, _check_unique: pg_sys::IndexUniqueCheck::Type, _index_unchanged: bool, - _index_info: *mut pg_sys::IndexInfo, + index_info: *mut pg_sys::IndexInfo, +) -> bool { + pgrx::PgTryBuilder::new(|| widen_on_insert(index, index_info, values, isnull)) + .catch_others(|_| ()) + .execute(); + false +} + +/// Read the metapage summary, widen each indexed column to cover the new value, and +/// write it back only if something changed. 
+unsafe fn widen_on_insert( + index: pg_sys::Relation, + index_info: *mut pg_sys::IndexInfo, + values: *mut pg_sys::Datum, + isnull: *mut bool, +) { + let mut summary = match index_storage::read_summary(index) { + Some(s) => s, + None => return, + }; + let nattrs = (*index_info).ii_NumIndexAttrs.max(0) as usize; + let mut changed = false; + for i in 0..nattrs { + let heap_attnum = (*index_info).ii_IndexAttrNumbers[i]; + let col = match summary.cols.iter_mut().find(|c| c.attnum == heap_attnum) { + Some(c) => c, + None => continue, + }; + let typoid = (*pg_sys::TupleDescAttr((*index).rd_att, i as i32)).atttypid; + let collation = if (*index).rd_indcollation.is_null() { + pg_sys::Oid::INVALID + } else { + *(*index).rd_indcollation.add(i) + }; + changed |= widen_column(col, typoid, collation, *values.add(i), *isnull.add(i)); + } + if changed { + let _ = index_storage::write_summary(index, &summary); + } +} + +/// Widen one column's summary for a single inserted value. Returns whether it changed. +unsafe fn widen_column( + col: &mut ColSummary, + typoid: pg_sys::Oid, + collation: pg_sys::Oid, + value: pg_sys::Datum, + isnull: bool, ) -> bool { - let heap_relid: u32 = (*heap).rd_id.into(); - // Already marked stale in this transaction? Then nothing more to do this statement. - if STALE_MARKED.with(|s| s.borrow().contains(&heap_relid)) { + if isnull { + if !col.has_nulls { + col.has_nulls = true; + return true; + } return false; } - let marked = pgrx::PgTryBuilder::new(|| { - Spi::run(&format!( - "UPDATE {tbl} SET stale = true WHERE relid = {heap_relid}::oid AND NOT stale", - tbl = crate::summary_build::summary_table() - )) - .is_ok() - }) - .catch_others(|_| false) - .execute(); - // Only memo on success, so a failed mark is retried on the next row. 
- if marked { - STALE_MARKED.with(|s| s.borrow_mut().insert(heap_relid)); + + let mut changed = false; + if col.all_nulls { + col.all_nulls = false; + changed = true; + } + + if col.overlap { + return changed | widen_overlap(col, typoid, value); + } + + // btree min/max: widen using the column type's compare support function. + let new_text = match datum_to_text(typoid, value) { + Some(t) => t, + None => return changed, + }; + match (&col.min, &col.max) { + (Some(min), Some(max)) => { + let cmp = match btree_cmp_proc(typoid) { + Some(c) => c, + None => return changed, + }; + if let (Some(min_d), Some(max_d)) = + (text_to_datum(typoid, min), text_to_datum(typoid, max)) + { + if datum_cmp(cmp, collation, value, min_d) < 0 { + col.min = Some(new_text.clone()); + changed = true; + } + if datum_cmp(cmp, collation, value, max_d) > 0 { + col.max = Some(new_text); + changed = true; + } + } + } + // No range yet (was all-null/empty): seed it with this value. + _ => { + col.min = Some(new_text.clone()); + col.max = Some(new_text); + changed = true; + } + } + changed +} + +/// Widen an overlap (range/geometry) extent to cover a new value, via the type's own +/// union operator (delegated to SQL). Writes nothing if the value is already covered. 
+unsafe fn widen_overlap(col: &mut ColSummary, typoid: pg_sys::Oid, value: pg_sys::Datum) -> bool { + let new_text = match datum_to_text(typoid, value) { + Some(t) => t, + None => return false, + }; + let tn = &col.type_name; + let is_geometry = tn == "geometry" || tn.ends_with(".geometry"); + let lit = |s: &str| format!("'{}'", s.replace('\'', "''")); + let new_lit = lit(&new_text); + + let sql = match &col.min { + None if is_geometry => format!( + "SELECT ST_Extent(g)::geometry::text FROM (VALUES (CAST({new_lit} AS {tn}))) v(g)" + ), + None => format!("SELECT CAST({new_lit} AS {tn})::text"), + Some(ext) if is_geometry => format!( + "SELECT ST_Extent(g)::geometry::text FROM \ + (VALUES (CAST({} AS {tn})), (CAST({new_lit} AS {tn}))) v(g)", + lit(ext) + ), + Some(ext) => format!( + "SELECT range_merge(CAST({} AS {tn}), CAST({new_lit} AS {tn}))::text", + lit(ext) + ), + }; + + let widened = Spi::get_one::(&sql).ok().flatten(); + if widened.is_some() && widened != col.min { + col.min = widened; + return true; } false } @@ -276,6 +367,5 @@ extension_sql!( ON ddl_command_end WHEN TAG IN ('CREATE EXTENSION') EXECUTE FUNCTION table_range_opclass_sync_evt(); "#, - name = "table_range_access_method", - requires = ["table_range_bootstrap_sql"] + name = "table_range_access_method" ); diff --git a/src/index_am_tests.rs b/src/index_am_tests.rs index f9b9fb4..f42d4ec 100644 --- a/src/index_am_tests.rs +++ b/src/index_am_tests.rs @@ -37,13 +37,17 @@ fn am_create_index_builds_and_prunes() { am_setup(); Spi::run("CREATE INDEX amt_tr ON amt USING table_range (val)").expect("create index"); - // Summaries should exist for the three leaves. - let n = Spi::get_one::( - "SELECT count(DISTINCT relid)::bigint FROM table_range_summary", + // Each leaf index holds its summary on its own metapage. Check one leaf's page. 
+ let leaf_summary = Spi::get_one::( + "SELECT table_range_test_read_summary(i.inhrelid) + FROM pg_inherits i WHERE i.inhparent = 'amt_tr'::regclass LIMIT 1", ) .unwrap() .unwrap(); - assert!(n >= 3, "expected >=3 summarized leaves, got {n}"); + assert!( + leaf_summary.contains("min=Some") && leaf_summary.contains("max=Some"), + "leaf index must carry a summary: {leaf_summary}" + ); let plan = am_explain("amt", "val >= 250"); assert!(plan.contains("amt_3"), "r3 kept:\n{plan}"); @@ -121,24 +125,17 @@ fn am_bulk_insert_stays_correct_under_stale_memo() { } #[pg_test] -fn am_drop_index_cleans_summaries_and_stays_correct() { +fn am_drop_index_stops_pruning_and_stays_correct() { am_setup(); Spi::run("CREATE INDEX amt_tr ON amt USING table_range (val)").unwrap(); - let before = Spi::get_one::("SELECT count(*)::bigint FROM table_range_summary") - .unwrap() - .unwrap(); - assert!(before >= 3, "summaries built before drop: {before}"); + Spi::run("SET table_range.enable_pruning = on").unwrap(); + // With the summary owned by the index, DROP INDEX removes it (the index's storage is + // gone) — no side table to clean up. Pruning simply stops; results stay correct. Spi::run("DROP INDEX amt_tr").unwrap(); + let plan = am_explain("amt", "val >= 250"); + assert!(plan.contains("amt_1"), "no index -> no pruning:\n{plan}"); - // The sql_drop event trigger must remove the index's summaries, so a later insert - // (no longer tracked by any index/trigger) cannot cause a stale-prune false negative. 
- let after = Spi::get_one::("SELECT count(*)::bigint FROM table_range_summary") - .unwrap() - .unwrap(); - assert_eq!(after, 0, "summaries must be cleaned on DROP INDEX, found {after}"); - - Spi::run("SET table_range.enable_pruning = on").unwrap(); Spi::run("INSERT INTO amt VALUES (1, 5000)").unwrap(); assert_eq!( Spi::get_one::("SELECT count(*)::bigint FROM amt WHERE val = 5000") @@ -169,3 +166,27 @@ fn storage_page_roundtrip() { .unwrap(); assert_eq!(out, "hello-page-42", "blob must round-trip through the index metapage"); } + +#[pg_test] +fn ambuild_writes_summary_to_page() { + Spi::run( + "DROP TABLE IF EXISTS ap CASCADE; + CREATE TABLE ap (val bigint) PARTITION BY RANGE (val); + CREATE TABLE ap_1 PARTITION OF ap FOR VALUES FROM (0) TO (100); + INSERT INTO ap SELECT g FROM generate_series(0, 99) g; + CREATE INDEX ap_tr ON ap USING table_range (val);", + ) + .unwrap(); + // The per-partition summary lives on the leaf child index (on ap_1), not the + // partitioned parent. Find that child index and read its metapage summary. + let out = Spi::get_one::( + "SELECT table_range_test_read_summary(i.inhrelid) + FROM pg_inherits i WHERE i.inhparent = 'ap_tr'::regclass", + ) + .unwrap() + .unwrap(); + assert!( + out.contains("min=Some(\"0\")") && out.contains("max=Some(\"99\")"), + "ambuild must persist the summary to the index page: {out}" + ); +} diff --git a/src/index_storage.rs b/src/index_storage.rs index f32aa97..a10a44a 100644 --- a/src/index_storage.rs +++ b/src/index_storage.rs @@ -253,6 +253,31 @@ mod serde_tests { // ---- test-only round-trip harness ------------------------------------------------- +/// Read an index's metapage summary and render it for tests. 
+#[cfg(any(test, feature = "pg_test"))] +#[pg_extern] +fn table_range_test_read_summary(index: pg_sys::Oid) -> String { + unsafe { + let rel = pg_sys::index_open(index, pg_sys::AccessShareLock as i32); + let s = read_summary(rel); + pg_sys::index_close(rel, pg_sys::AccessShareLock as i32); + match s { + None => "none".to_string(), + Some(s) => s + .cols + .iter() + .map(|c| { + format!( + "attnum={} overlap={} min={:?} max={:?} has_nulls={} all_nulls={}", + c.attnum, c.overlap, c.min, c.max, c.has_nulls, c.all_nulls + ) + }) + .collect::>() + .join("; "), + } + } +} + #[cfg(any(test, feature = "pg_test"))] #[pg_extern] fn table_range_test_page_roundtrip(index: pg_sys::Oid, payload: String) -> String { diff --git a/src/lib.rs b/src/lib.rs index 76e3afb..fc95806 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,66 +38,6 @@ pub extern "C-unwind" fn _PG_init() { // Install the real planner-time partition pruning hooks. prune_hook::install(); - // Register the transaction callback used by the index AM's staleness memo. - index_am::install(); -} - -extension_sql!( - r#" - -- One summary per (index, leaf partition, column), built by the index AM. - -- index_oid: the (leaf) index relation OID. - -- relid: the leaf partition (heap) OID the planner sees. - -- attnum: the leaf partition's attnum for the column (resolved by name). - -- kind: 'minmax' -> min_summary/max_summary hold the column's btree min/max; - -- 'overlap' -> min_summary holds the covering extent for && pruning - -- (range types and PostGIS geometry). 
- CREATE TABLE IF NOT EXISTS table_range_summary ( - index_oid oid NOT NULL, - relid oid NOT NULL, - attnum int2 NOT NULL, - kind text NOT NULL DEFAULT 'minmax', - type_name text, - min_summary text, - max_summary text, - has_nulls boolean NOT NULL DEFAULT false, - all_nulls boolean NOT NULL DEFAULT false, - stale boolean NOT NULL DEFAULT false, - tuple_version int4 NOT NULL DEFAULT 1, - PRIMARY KEY (index_oid, relid, attnum) - ); - - -- Fast per-partition lookups during planning (the hot path). - CREATE INDEX IF NOT EXISTS table_range_summary_relid_attnum_idx - ON table_range_summary (relid, attnum); - - -- Fast maintenance scans for stale summaries only. - CREATE INDEX IF NOT EXISTS table_range_summary_stale_idx - ON table_range_summary (relid) - WHERE stale; - - -- Drop summaries for any relation that is dropped, so a dropped table_range index - -- (or its table) can never leave behind a summary that nothing keeps stale. - CREATE FUNCTION table_range_drop_cleanup() RETURNS event_trigger - LANGUAGE c AS 'MODULE_PATHNAME', 'table_range_drop_cleanup'; - CREATE EVENT TRIGGER table_range_drop_trg ON sql_drop - EXECUTE FUNCTION table_range_drop_cleanup(); - "#, - name = "table_range_bootstrap_sql" -); - -/// Diagnostic accessors (test/benchmark only): how many times the planner loaded -/// summaries from the catalog. One load per top-level plan demonstrates the per-plan -/// cache — the count does not grow with the number of partitions. 
-#[cfg(any(test, feature = "pg_test"))] -#[pg_extern] -fn table_range_cache_load_count() -> i64 { - prune_hook::cache_load_count() as i64 -} - -#[cfg(any(test, feature = "pg_test"))] -#[pg_extern] -fn table_range_reset_cache_load_count() { - prune_hook::reset_cache_load_count(); } #[cfg(any(test, feature = "pg_test"))] diff --git a/src/prune_hook.rs b/src/prune_hook.rs index 5da946b..c2e4ed1 100644 --- a/src/prune_hook.rs +++ b/src/prune_hook.rs @@ -3,24 +3,10 @@ use pgrx::prelude::*; use std::cell::{Cell, RefCell}; use std::collections::HashMap; use std::ffi::{CStr, CString}; -use std::sync::atomic::{AtomicU64, Ordering}; +use crate::index_storage::ColSummary; use crate::{TABLE_RANGE_ENABLE_PRUNING, TABLE_RANGE_LOG_PRUNING_DEBUG}; -/// Diagnostic: number of times summaries were actually loaded from the catalog. -/// One load per top-level plan (regardless of partition count) proves the per-plan cache. -static CACHE_LOADS: AtomicU64 = AtomicU64::new(0); - -#[cfg(any(test, feature = "pg_test"))] -pub fn cache_load_count() -> u64 { - CACHE_LOADS.load(Ordering::Relaxed) -} - -#[cfg(any(test, feature = "pg_test"))] -pub fn reset_cache_load_count() { - CACHE_LOADS.store(0, Ordering::Relaxed); -} - // Real planner-time partition pruning via `set_rel_pathlist_hook`, with a per-plan // summary cache driven by `planner_hook`. // @@ -32,9 +18,8 @@ pub fn reset_cache_load_count() { // partition cannot contain a matching row, we call `mark_dummy_rel` so the planner // eliminates it before generating child paths. // -// The cache is loaded once per top-level planner invocation (one SPI query for all -// non-stale summaries) and discarded when that invocation finishes, so very wide -// partition trees do not issue a summary lookup per partition. +// Each partition's summary is read from its table_range index's metapage and cached for +// the duration of one top-level planner invocation. 
extern "C" { fn mark_dummy_rel(rel: *mut pg_sys::RelOptInfo); @@ -46,29 +31,19 @@ const BTORDER_PROC: u16 = 1; static mut PREV_PATHLIST_HOOK: pg_sys::set_rel_pathlist_hook_type = None; static mut PREV_PLANNER_HOOK: pg_sys::planner_hook_type = None; -#[derive(Clone)] -struct SummaryRow { - attnum: i16, - /// `true` when `min` holds a covering extent for `&&` pruning (range/geometry); - /// `false` when `min`/`max` hold the column's btree min/max. - is_overlap: bool, - /// SQL type name for the stored extent (overlap rows only). - type_name: Option, - min: Option, - max: Option, - has_nulls: bool, - all_nulls: bool, -} - -type SummaryMap = HashMap>; +/// Per-partition summaries read from each partition's index, cached for one planner +/// invocation (keyed by partition relid). A relid present with an empty vec means +/// "checked, no table_range index / no summary". +type SummaryMap = HashMap>; thread_local! { - /// Cached non-stale summaries for the current planner invocation, keyed by relid. - /// `None` means "not loaded yet this plan". - static CACHE: RefCell> = const { RefCell::new(None) }; + /// Summaries read during the current planner invocation. Cleared per top-level plan. + static CACHE: RefCell = RefCell::new(HashMap::new()); + /// The table_range access-method OID, resolved once per plan. + static AM_OID: Cell> = const { Cell::new(None) }; /// Nesting depth of planner invocations (SPI during planning re-enters). static PLAN_DEPTH: Cell = const { Cell::new(0) }; - /// Guards against re-entering pruning logic from the SPI we issue to load the cache. + /// Guards against re-entering pruning logic from the SPI overlap evaluation issues. 
static IN_HOOK: Cell = const { Cell::new(false) }; } @@ -160,49 +135,62 @@ unsafe extern "C-unwind" fn table_range_pathlist_hook( } fn clear_cache() { - CACHE.with(|c| *c.borrow_mut() = None); + CACHE.with(|c| c.borrow_mut().clear()); + AM_OID.with(|c| c.set(None)); } -/// Load all non-stale summaries once for this planner invocation. -fn ensure_cache_loaded() { - if CACHE.with(|c| c.borrow().is_some()) { +/// The table_range access-method OID, resolved once per planner invocation. +unsafe fn table_range_am_oid() -> pg_sys::Oid { + if let Some(oid) = AM_OID.with(|c| c.get()) { + return oid; + } + let oid = pg_sys::get_am_oid(c"table_range".as_ptr(), true); + AM_OID.with(|c| c.set(Some(oid))); + oid +} + +/// Read the partition's summary from its table_range index's metapage (the index is +/// found in the relation's index list and is already locked by the planner). The result +/// is cached for this plan. An empty vec means "no table_range index / no summary". +unsafe fn load_summary(rel: *mut pg_sys::RelOptInfo, relid_u32: u32) { + if CACHE.with(|c| c.borrow().contains_key(&relid_u32)) { return; } - CACHE_LOADS.fetch_add(1, Ordering::Relaxed); - let mut map: SummaryMap = HashMap::new(); - let sql = format!( - "SELECT relid, attnum, kind, type_name, min_summary, max_summary, has_nulls, all_nulls \ - FROM {} WHERE NOT stale", - crate::summary_build::summary_table() - ); - let _ = Spi::connect(|client| { - let table = client.select(&sql, None, &[])?; - for row in table { - let relid = row.get::(1).ok().flatten(); - let attnum = row.get::(2).ok().flatten(); - if let (Some(relid), Some(attnum)) = (relid, attnum) { - let kind = row.get::(3).ok().flatten(); - let entry = SummaryRow { - attnum, - is_overlap: kind.as_deref() == Some("overlap"), - type_name: row.get::(4).ok().flatten(), - min: row.get::(5).ok().flatten(), - max: row.get::(6).ok().flatten(), - has_nulls: row.get::(7).ok().flatten().unwrap_or(false), - all_nulls: row.get::(8).ok().flatten().unwrap_or(false), 
- }; - map.entry(relid.into()).or_default().push(entry); - } - } - Ok::<(), pgrx::spi::SpiError>(()) + let cols = read_index_summary(rel); + CACHE.with(|c| { + c.borrow_mut().insert(relid_u32, cols); }); - CACHE.with(|c| *c.borrow_mut() = Some(map)); +} + +unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Vec { + let am = table_range_am_oid(); + if am == pg_sys::Oid::INVALID || (*rel).indexlist.is_null() { + return Vec::new(); + } + let indexes = pgrx::PgList::::from_pg((*rel).indexlist); + for idx in indexes.iter_ptr() { + if idx.is_null() || (*idx).relam != am { + continue; + } + let irel = pg_sys::index_open((*idx).indexoid, pg_sys::AccessShareLock as i32); + // A partitioned (parent) index has no storage; only leaf indexes hold summaries. + let has_storage = + (*(*irel).rd_rel).relkind != pg_sys::RELKIND_PARTITIONED_INDEX as std::ffi::c_char; + let summary = if has_storage { + crate::index_storage::read_summary(irel) + } else { + None + }; + pg_sys::index_close(irel, pg_sys::AccessShareLock as i32); + return summary.map(|s| s.cols).unwrap_or_default(); + } + Vec::new() } /// Returns true iff some restriction clause proves the partition cannot match. 
unsafe fn evaluate_relation(rel: *mut pg_sys::RelOptInfo, relid: pg_sys::Oid) -> bool { - ensure_cache_loaded(); let relid_u32: u32 = relid.into(); + load_summary(rel, relid_u32); let restrictlist = (*rel).baserestrictinfo; if restrictlist.is_null() { @@ -212,9 +200,9 @@ unsafe fn evaluate_relation(rel: *mut pg_sys::RelOptInfo, relid: pg_sys::Oid) -> CACHE.with(|c| { let borrow = c.borrow(); - let rows = match borrow.as_ref().and_then(|m| m.get(&relid_u32)) { - Some(rows) => rows, - None => return false, // unregistered / no summaries -> never prune + let rows = match borrow.get(&relid_u32) { + Some(rows) if !rows.is_empty() => rows, + _ => return false, // no summary -> never prune }; // Top-level restriction clauses are implicitly AND-ed: if any one clause proves @@ -237,7 +225,7 @@ unsafe fn evaluate_relation(rel: *mut pg_sys::RelOptInfo, relid: pg_sys::Oid) -> /// - `AND(xs)` prunes if **any** child prunes, /// - `OR(xs)` prunes only if **every** child prunes, /// - `NOT(..)` and unknown shapes are conservative (do not prune). -unsafe fn clause_proves_prune(node: *mut pg_sys::Node, rows: &[SummaryRow], depth: u32) -> bool { +unsafe fn clause_proves_prune(node: *mut pg_sys::Node, rows: &[ColSummary], depth: u32) -> bool { if node.is_null() || depth > 32 { return false; } @@ -308,7 +296,7 @@ impl QualSpec { } /// Evaluate whether this clause proves the partition cannot match. - unsafe fn proves_prune(&self, row: &SummaryRow) -> bool { + unsafe fn proves_prune(&self, row: &ColSummary) -> bool { match self { // Null flags are valid for both minmax and overlap summaries. QualSpec::Null { is_null, .. } => { @@ -325,19 +313,19 @@ impl QualSpec { collation, const_text, .. - } => !row.is_overlap && eval_compare(row, *strategy, *typ, *collation, const_text), + } => !row.overlap && eval_compare(row, *strategy, *typ, *collation, const_text), QualSpec::InList { typ, collation, elems, .. 
- } => !row.is_overlap && eval_in_list(row, *typ, *collation, elems), + } => !row.overlap && eval_in_list(row, *typ, *collation, elems), // Overlap only applies to extent summaries. QualSpec::Overlap { const_text, const_type_name, .. - } => row.is_overlap && eval_overlap(row, const_text, const_type_name), + } => row.overlap && eval_overlap(row, const_text, const_type_name), } } } @@ -464,7 +452,7 @@ unsafe fn extract_saop(saop: *mut pg_sys::ScalarArrayOpExpr) -> Option // --------------------------------------------------------------------------------- unsafe fn eval_compare( - row: &SummaryRow, + row: &ColSummary, strategy: i16, typ: pg_sys::Oid, collation: pg_sys::Oid, @@ -502,7 +490,7 @@ unsafe fn eval_compare( } unsafe fn eval_in_list( - row: &SummaryRow, + row: &ColSummary, typ: pg_sys::Oid, collation: pg_sys::Oid, elems: &[Option], @@ -547,15 +535,15 @@ unsafe fn eval_in_list( /// constant. The overlap test is delegated to PostgreSQL's own `&&` operator on the /// column's type (range types, PostGIS geometry), so it works wherever that operator /// is defined. A missing extent or any error is conservative (KEEP). -unsafe fn eval_overlap(row: &SummaryRow, const_text: &str, const_type_name: &str) -> bool { +unsafe fn eval_overlap(row: &ColSummary, const_text: &str, const_type_name: &str) -> bool { let extent = match &row.min { Some(e) => e, None => return false, }; - let type_name = match &row.type_name { - Some(t) => t, - None => return false, - }; + let type_name = &row.type_name; + if type_name.is_empty() { + return false; + } let sql = format!( "SELECT NOT (CAST({ext} AS {ext_t}) && CAST({k} AS {k_t}))", ext = sql_literal(extent), @@ -586,7 +574,7 @@ unsafe fn operator_name(opno: pg_sys::Oid) -> Option { name } -unsafe fn datum_cmp( +pub(crate) unsafe fn datum_cmp( cmpproc: pg_sys::Oid, collation: pg_sys::Oid, a: pg_sys::Datum, @@ -596,7 +584,7 @@ unsafe fn datum_cmp( } /// Default btree "compare" support proc for a type, or `None` if unavailable. 
-unsafe fn btree_cmp_proc(typ: pg_sys::Oid) -> Option { +pub(crate) unsafe fn btree_cmp_proc(typ: pg_sys::Oid) -> Option { let opclass = pg_sys::GetDefaultOpClass(typ, pg_sys::BTREE_AM_OID); if opclass == pg_sys::Oid::INVALID { return None; @@ -614,7 +602,7 @@ unsafe fn btree_cmp_proc(typ: pg_sys::Oid) -> Option { } /// Convert text to a Datum of `typ` via the type's input function. -unsafe fn text_to_datum(typ: pg_sys::Oid, s: &str) -> Option { +pub(crate) unsafe fn text_to_datum(typ: pg_sys::Oid, s: &str) -> Option { let mut infunc = pg_sys::Oid::INVALID; let mut typioparam = pg_sys::Oid::INVALID; pg_sys::getTypeInputInfo(typ, &mut infunc, &mut typioparam); @@ -627,7 +615,7 @@ unsafe fn text_to_datum(typ: pg_sys::Oid, s: &str) -> Option { } /// Render a Datum of `typ` to its text representation via the type's output function. -unsafe fn datum_to_text(typ: pg_sys::Oid, datum: pg_sys::Datum) -> Option { +pub(crate) unsafe fn datum_to_text(typ: pg_sys::Oid, datum: pg_sys::Datum) -> Option { let mut outfunc = pg_sys::Oid::INVALID; let mut is_varlena = false; pg_sys::getTypeOutputInfo(typ, &mut outfunc, &mut is_varlena); diff --git a/src/summary_build.rs b/src/summary_build.rs index 5731ec5..b485d7b 100644 --- a/src/summary_build.rs +++ b/src/summary_build.rs @@ -1,85 +1,28 @@ +use crate::index_storage::{ColSummary, IndexSummary}; use pgrx::prelude::*; use pgrx::spi::SpiError; -use std::sync::OnceLock; -// SPI-driven summary maintenance for the table_range pruning extension. -// -// Summaries are built by the index access method's `ambuild` (see `index_am.rs`): for -// each leaf partition it scans the column's real data and persists one summary row into -// `table_range_summary`, keyed by: -// - `index_oid` = the (leaf) index relation OID, -// - `relid` = the leaf partition (heap) OID the planner sees, -// - `attnum` = the leaf partition's attnum for the column. -// -// Correctness: a missing or `stale` summary means "do not prune". 
We never persist a -// summary that could cause a false negative; on any failure we leave the partition -// unsummarized (KEEP behavior at planning time). +// Computes a leaf partition's summary by scanning its data. Called by the index access +// method's `ambuild` (see `index_am.rs`); the result is persisted into the index's own +// metapage (see `index_storage.rs`) and maintained incrementally by `aminsert`. There is +// no side table — the summary lives in the index, like BRIN. fn oid_u32(oid: pg_sys::Oid) -> u32 { oid.into() } -/// The extension's schema (where its tables live). PostgreSQL restricts `search_path` -/// during index builds (`ambuild`), so unqualified references fail there; resolving and -/// caching the schema (via pg_catalog, always reachable) keeps every code path working. -pub(crate) fn schema() -> &'static str { - static SCHEMA: OnceLock = OnceLock::new(); - SCHEMA.get_or_init(|| { - Spi::get_one::( - "SELECT relnamespace::regnamespace::text FROM pg_class \ - WHERE relname = 'table_range_summary' AND relkind = 'r' LIMIT 1", - ) - .ok() - .flatten() - .unwrap_or_else(|| "public".to_string()) - }) -} - -/// Schema-qualified name of the summary table, e.g. `public.table_range_summary`. -pub(crate) fn summary_table() -> String { - format!("{}.table_range_summary", schema()) -} - -/// V1 record for the `sql_drop` event-trigger cleanup function. -#[no_mangle] -pub extern "C" fn pg_finfo_table_range_drop_cleanup() -> &'static pg_sys::Pg_finfo_record { - const V1_API: pg_sys::Pg_finfo_record = pg_sys::Pg_finfo_record { api_version: 1 }; - &V1_API -} - -/// Event-trigger handler: when any relation is dropped, remove the summaries that -/// referenced it (by index OID or leaf OID). This closes the gap where a dropped -/// `table_range` index would leave summaries behind that nothing keeps stale anymore. 
-#[no_mangle] -#[pg_guard] -pub unsafe extern "C-unwind" fn table_range_drop_cleanup( - _fcinfo: pg_sys::FunctionCallInfo, -) -> pg_sys::Datum { - pgrx::PgTryBuilder::new(|| { - let _ = Spi::run(&format!( - "DELETE FROM {tbl} s USING pg_event_trigger_dropped_objects() d \ - WHERE s.index_oid = d.objid OR s.relid = d.objid", - tbl = summary_table() - )); - }) - .catch_others(|_| ()) - .execute(); - pg_sys::Datum::from(0) -} - -/// Build summaries for a single leaf relation's named columns. Called by `ambuild` -/// (keyed by the index OID). Returns the number of summary rows written. +/// Build the summary for a single leaf relation's named columns by scanning its data. +/// Returns the typed summary that `ambuild` persists into the index's own metapage. pub(crate) fn build_one_leaf( - index_oid: pg_sys::Oid, leaf: pg_sys::Oid, columns: &[String], -) -> Result { +) -> Result { let leaf_name = match relation_name(leaf)? { Some(n) => n, - None => return Ok(0), + None => return Ok(IndexSummary::default()), }; - let mut written = 0i64; + let mut summary = IndexSummary::default(); for col in columns { // Resolve this leaf's attnum for the column by name. let attnum = match leaf_attnum(leaf, col)? { @@ -87,22 +30,9 @@ pub(crate) fn build_one_leaf( None => continue, // column absent on this leaf; KEEP behavior }; - let kind = column_kind(leaf, col)?; + let (kind, type_name) = column_kind(leaf, col)?; let qcol = quote_ident(col); - - // Per-kind summary expressions. `minmax` stores the btree min/max; `overlap` - // stores a single covering extent for `&&` pruning (range types / geometry). - let (e1, e2) = match &kind { - ColumnKind::MinMax => (format!("min({qcol})::text"), format!("max({qcol})::text")), - ColumnKind::Range { .. } => ( - format!("range_merge(range_agg({qcol}))::text"), - "NULL::text".to_string(), - ), - ColumnKind::Geometry { .. 
} => ( - format!("ST_Extent({qcol})::geometry::text"), - "NULL::text".to_string(), - ), - }; + let (e1, e2) = kind.exprs(&qcol); let stats_sql = format!("SELECT count(*)::bigint, count({qcol})::bigint, {e1}, {e2} FROM {leaf_name}"); @@ -122,21 +52,19 @@ pub(crate) fn build_one_leaf( let has_nulls = total > nonnull; let all_nulls = total > 0 && nonnull == 0; + let overlap = kind.overlap(); - upsert_summary( - index_oid, - leaf, + summary.cols.push(ColSummary { attnum, - kind.tag(), - kind.type_name(), - s1.as_deref(), - s2.as_deref(), + overlap, + type_name, + min: s1, + max: s2, has_nulls, all_nulls, - )?; - written += 1; + }); } - Ok(written) + Ok(summary) } /// How a column is summarized. @@ -144,31 +72,34 @@ enum ColumnKind { /// Default: btree min/max, used for scalar comparison pruning. MinMax, /// Range type: a covering range is stored for `&&` overlap pruning. - Range { type_name: String }, + Range, /// PostGIS geometry: the bounding extent is stored for `&&` overlap pruning. - Geometry { type_name: String }, + Geometry, } impl ColumnKind { - fn tag(&self) -> &'static str { - match self { - ColumnKind::MinMax => "minmax", - ColumnKind::Range { .. } | ColumnKind::Geometry { .. } => "overlap", - } + fn overlap(&self) -> bool { + !matches!(self, ColumnKind::MinMax) } - fn type_name(&self) -> Option<&str> { + /// SQL expressions for (min/extent, max) text given the quoted column name. 
+ fn exprs(&self, qcol: &str) -> (String, String) { match self { - ColumnKind::MinMax => None, - ColumnKind::Range { type_name } | ColumnKind::Geometry { type_name } => { - Some(type_name.as_str()) - } + ColumnKind::MinMax => (format!("min({qcol})::text"), format!("max({qcol})::text")), + ColumnKind::Range => ( + format!("range_merge(range_agg({qcol}))::text"), + "NULL::text".to_string(), + ), + ColumnKind::Geometry => ( + format!("ST_Extent({qcol})::geometry::text"), + "NULL::text".to_string(), + ), } } } -/// Classify a leaf column: range types and PostGIS geometry get extent/overlap -/// summaries; everything else gets btree min/max. -fn column_kind(leaf: pg_sys::Oid, col: &str) -> Result { +/// Classify a leaf column and resolve its SQL type name: range types and PostGIS +/// geometry get extent/overlap summaries; everything else gets btree min/max. +fn column_kind(leaf: pg_sys::Oid, col: &str) -> Result<(ColumnKind, String), SpiError> { let row = Spi::connect(|client| { let table = client.select( &format!( @@ -192,16 +123,17 @@ fn column_kind(leaf: pg_sys::Oid, col: &str) -> Result { let (type_name, typtype) = match row { Some((Some(tn), tt)) => (tn, tt.unwrap_or_default()), - _ => return Ok(ColumnKind::MinMax), + _ => return Ok((ColumnKind::MinMax, String::new())), }; - if typtype == "r" { - Ok(ColumnKind::Range { type_name }) + let kind = if typtype == "r" { + ColumnKind::Range } else if type_name == "geometry" || type_name.ends_with(".geometry") { - Ok(ColumnKind::Geometry { type_name }) + ColumnKind::Geometry } else { - Ok(ColumnKind::MinMax) - } + ColumnKind::MinMax + }; + Ok((kind, type_name)) } /// Resolve a relation's column names for the given heap attnums (skips dropped/missing). 
@@ -227,49 +159,6 @@ pub(crate) fn column_names_for_attnums( Ok(names) } -#[allow(clippy::too_many_arguments)] -fn upsert_summary( - index_oid: pg_sys::Oid, - leaf: pg_sys::Oid, - attnum: i16, - kind: &str, - type_name: Option<&str>, - min_text: Option<&str>, - max_text: Option<&str>, - has_nulls: bool, - all_nulls: bool, -) -> Result<(), SpiError> { - let lit = |v: Option<&str>| match v { - Some(s) => quote_literal(s), - None => "NULL".to_string(), - }; - let q = format!( - "INSERT INTO {tbl} AS s \ - (index_oid, relid, attnum, kind, type_name, min_summary, max_summary, has_nulls, all_nulls, stale, tuple_version) \ - VALUES ({p}::oid, {r}::oid, {a}, {kind}, {tn}, {min}, {max}, {hn}, {an}, false, 1) \ - ON CONFLICT (index_oid, relid, attnum) DO UPDATE SET \ - kind = EXCLUDED.kind, \ - type_name = EXCLUDED.type_name, \ - min_summary = EXCLUDED.min_summary, \ - max_summary = EXCLUDED.max_summary, \ - has_nulls = EXCLUDED.has_nulls, \ - all_nulls = EXCLUDED.all_nulls, \ - stale = false, \ - tuple_version = s.tuple_version + 1", - tbl = summary_table(), - p = oid_u32(index_oid), - r = oid_u32(leaf), - a = attnum, - kind = quote_literal(kind), - tn = lit(type_name), - min = lit(min_text), - max = lit(max_text), - hn = if has_nulls { "true" } else { "false" }, - an = if all_nulls { "true" } else { "false" }, - ); - Spi::run(&q) -} - fn relation_name(relid: pg_sys::Oid) -> Result, SpiError> { Spi::get_one::(&format!("SELECT {}::oid::regclass::text", oid_u32(relid))) } From 65a1b012413f702de0da4d727bfd80d5e2c6b69c Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Mon, 22 Jun 2026 17:30:20 -0500 Subject: [PATCH 4/7] Gate test-only prelude import in index_storage (warning-free production build) Co-Authored-By: Claude Opus 4.8 --- src/index_storage.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index_storage.rs b/src/index_storage.rs index a10a44a..5a852f7 100644 --- a/src/index_storage.rs +++ b/src/index_storage.rs @@ -1,4 +1,5 @@ use pgrx::pg_sys; 
+#[cfg(any(test, feature = "pg_test"))] use pgrx::prelude::*; // Low-level storage for a per-index summary, kept in the index's own metapage (block 0) From ec5f9bc8dea7fa53599e29a284cb7a3d594b1544 Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Tue, 23 Jun 2026 08:02:40 -0500 Subject: [PATCH 5/7] Document that pruning depends on the table_range index being indisvalid The planner only puts valid indexes into rel->indexlist, which is where the pathlist hook reads the per-partition summary from. Note this load-bearing dependency so a future change (or an external DDL hook) that invalidates the index doesn't silently disable pruning. Co-Authored-By: Claude Opus 4.8 --- src/prune_hook.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/prune_hook.rs b/src/prune_hook.rs index c2e4ed1..4f20269 100644 --- a/src/prune_hook.rs +++ b/src/prune_hook.rs @@ -162,6 +162,10 @@ unsafe fn load_summary(rel: *mut pg_sys::RelOptInfo, relid_u32: u32) { }); } +// We rely on the planner having put the table_range index into `rel->indexlist`. The +// planner only lists indexes with `indisvalid = true`, so a table_range index must be +// valid for pruning to engage — if anything marks it invalid (e.g. an external +// "hide indexes" DDL hook), `indexlist` omits it and we silently fall back to KEEP. unsafe fn read_index_summary(rel: *mut pg_sys::RelOptInfo) -> Vec { let am = table_range_am_oid(); if am == pg_sys::Oid::INVALID || (*rel).indexlist.is_null() { From 35099a535c9878bd66fc104c3021e14c456175e2 Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Tue, 23 Jun 2026 09:05:57 -0500 Subject: [PATCH 6/7] Memoize per-plan catalog lookups and constant parsing in the prune hook The pathlist hook runs once per partition, and for a column predicate it was re-resolving the btree compare proc (three syscache lookups) and re-parsing the query constant for every partition. Both are identical across all partitions of a column, so memoize them per top-level plan (cleared in clear_cache). 
Cuts our per-partition planning overhead roughly in half: at 2000 partitions, warm planning for a non-key-column predicate drops from ~139ms to ~80ms. This does not change the O(partitions) scaling (PG still expands every partition for a non-key predicate) but materially widens the range of partition counts where pruning's execution win outweighs its planning cost. Co-Authored-By: Claude Opus 4.8 --- src/prune_hook.rs | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/src/prune_hook.rs b/src/prune_hook.rs index 4f20269..a701cca 100644 --- a/src/prune_hook.rs +++ b/src/prune_hook.rs @@ -45,6 +45,40 @@ thread_local! { static PLAN_DEPTH: Cell = const { Cell::new(0) }; /// Guards against re-entering pruning logic from the SPI overlap evaluation issues. static IN_HOOK: Cell = const { Cell::new(false) }; + /// Per-plan memo of each column type's btree compare proc OID (the lookup is three + /// syscache hits, identical for every partition of a column, so we do it once). + static CMP_PROC_MEMO: RefCell>> = RefCell::new(HashMap::new()); + /// Per-plan memo of parsed query constants, keyed by (type oid, text). The same + /// constant is otherwise re-rendered and re-parsed once per partition. + static CONST_MEMO: RefCell> = RefCell::new(HashMap::new()); +} + +/// Compare proc for `typ`, memoized for the current plan. See [`btree_cmp_proc`]. +unsafe fn cmp_proc_cached(typ: pg_sys::Oid) -> Option { + let key: u32 = typ.into(); + if let Some(v) = CMP_PROC_MEMO.with(|m| m.borrow().get(&key).copied()) { + return v; + } + let v = btree_cmp_proc(typ); + CMP_PROC_MEMO.with(|m| { + m.borrow_mut().insert(key, v); + }); + v +} + +/// Parse `text` to a Datum of `typ`, memoized for the current plan. The cached Datum is +/// allocated in the planner's memory context (which outlives the plan) and the memo is +/// cleared per top-level plan, so the pointer stays valid for its lifetime. 
+unsafe fn const_datum_cached(typ: pg_sys::Oid, text: &str) -> Option { + let key = (typ.into(), text.to_string()); + if let Some(d) = CONST_MEMO.with(|m| m.borrow().get(&key).copied()) { + return Some(d); + } + let d = text_to_datum(typ, text)?; + CONST_MEMO.with(|m| { + m.borrow_mut().insert(key, d); + }); + Some(d) } /// Install our planner and pathlist hooks, preserving any previously-registered hooks. @@ -137,6 +171,8 @@ unsafe extern "C-unwind" fn table_range_pathlist_hook( fn clear_cache() { CACHE.with(|c| c.borrow_mut().clear()); AM_OID.with(|c| c.set(None)); + CMP_PROC_MEMO.with(|c| c.borrow_mut().clear()); + CONST_MEMO.with(|c| c.borrow_mut().clear()); } /// The table_range access-method OID, resolved once per planner invocation. @@ -466,11 +502,11 @@ unsafe fn eval_compare( (Some(min), Some(max)) => (min, max), _ => return false, // no usable range -> conservative KEEP }; - let cmpproc = match btree_cmp_proc(typ) { + let cmpproc = match cmp_proc_cached(typ) { Some(p) => p, None => return false, }; - let k = match text_to_datum(typ, const_text) { + let k = match const_datum_cached(typ, const_text) { Some(d) => d, None => return false, }; @@ -503,7 +539,7 @@ unsafe fn eval_in_list( (Some(min), Some(max)) => (min, max), _ => return false, }; - let cmpproc = match btree_cmp_proc(typ) { + let cmpproc = match cmp_proc_cached(typ) { Some(p) => p, None => return false, }; @@ -522,7 +558,7 @@ unsafe fn eval_in_list( Some(t) => t, None => continue, // NULL element never matches a value }; - let d = match text_to_datum(typ, text) { + let d = match const_datum_cached(typ, text) { Some(d) => d, None => return false, }; From aa8d29f6f8401bb85b65829e681e68dbc75936fa Mon Sep 17 00:00:00 2001 From: David W Bitner Date: Tue, 23 Jun 2026 09:46:22 -0500 Subject: [PATCH 7/7] Fix build on PG13-17: portable tuple-descriptor attribute access pg_sys::TupleDescAttr is only bound on PG18 (where it became an inline C function); on PG13-17 it is a macro bindgen does not surface, and 
PG18 also moved attributes to compact_attrs. Add a version-gated att_typid() helper: TupleDescAttr on pg18, direct .attrs access on earlier versions. Fixes the pg16/pg17/postgis CI build failures (only pg18 was exercised locally). Co-Authored-By: Claude Opus 4.8 --- src/index_am.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/index_am.rs b/src/index_am.rs index 71f7d13..21c9d3b 100644 --- a/src/index_am.rs +++ b/src/index_am.rs @@ -135,7 +135,7 @@ unsafe fn widen_on_insert( Some(c) => c, None => continue, }; - let typoid = (*pg_sys::TupleDescAttr((*index).rd_att, i as i32)).atttypid; + let typoid = att_typid((*index).rd_att, i); let collation = if (*index).rd_indcollation.is_null() { pg_sys::Oid::INVALID } else { @@ -148,6 +148,20 @@ unsafe fn widen_on_insert( } } +/// The type OID of a tuple descriptor's `i`-th attribute, portable across PG versions. +/// PG18 made `TupleDescAttr` an inline function (bound by pgrx) and moved attributes to +/// `compact_attrs` (which has no `atttypid`); PG13–17 expose `attrs` directly and only a +/// `TupleDescAttr` macro (which bindgen does not surface as `pg_sys::TupleDescAttr`). +#[cfg(feature = "pg18")] +unsafe fn att_typid(tupdesc: pg_sys::TupleDesc, i: usize) -> pg_sys::Oid { + (*pg_sys::TupleDescAttr(tupdesc, i as i32)).atttypid +} +#[cfg(not(feature = "pg18"))] +unsafe fn att_typid(tupdesc: pg_sys::TupleDesc, i: usize) -> pg_sys::Oid { + let natts = (*tupdesc).natts as usize; + (*tupdesc).attrs.as_slice(natts)[i].atttypid +} + /// Widen one column's summary for a single inserted value. Returns whether it changed. unsafe fn widen_column( col: &mut ColSummary,