From 88401746ab1cf5a8ebe018d7203b4e8970b88cd7 Mon Sep 17 00:00:00 2001 From: Minh Vu Date: Thu, 18 Jun 2026 18:39:39 +0200 Subject: [PATCH] Make schema select fail on missing columns --- src/iceberg/manifest/manifest_reader.cc | 14 +++++++++++++- src/iceberg/schema.cc | 5 +++-- src/iceberg/schema.h | 3 ++- src/iceberg/table_scan.cc | 5 +++++ src/iceberg/test/schema_test.cc | 14 ++++++++++++-- src/iceberg/test/table_scan_test.cc | 7 +++++++ 6 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/iceberg/manifest/manifest_reader.cc b/src/iceberg/manifest/manifest_reader.cc index bc7572591..2d1e32d77 100644 --- a/src/iceberg/manifest/manifest_reader.cc +++ b/src/iceberg/manifest/manifest_reader.cc @@ -681,7 +681,19 @@ Result> ProjectSchema(std::shared_ptr schema, const std::vector& columns, bool case_sensitive) { if (!columns.empty()) { - return schema->Select(columns, case_sensitive); + if (std::ranges::contains(columns, Schema::kAllColumns)) { + return schema->Select(columns, case_sensitive); + } + + std::vector existing_columns; + existing_columns.reserve(columns.size()); + for (const auto& column : columns) { + ICEBERG_ASSIGN_OR_RAISE(auto field, schema->FindFieldByName(column, case_sensitive)); + if (field.has_value()) { + existing_columns.push_back(column); + } + } + return schema->Select(existing_columns, case_sensitive); } return schema; } diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index fcac43c78..ce2f70296 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -366,9 +366,10 @@ Result> Schema::Select(std::span name std::unordered_set selected_ids; for (const auto& name : names) { ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name, case_sensitive)); - if (result.has_value()) { - selected_ids.insert(result.value().get().field_id()); + if (!result.has_value()) { + return InvalidArgument("Cannot find selected column: {}", name); } + selected_ids.insert(result.value().get().field_id()); } PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true); diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 9245be02e..2d3beb8de 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -160,7 +160,8 @@ class ICEBERG_EXPORT Schema : public StructType { /// /// \param names Selected field names and nested names are dot-concatenated. /// \param case_sensitive Whether name matching is case-sensitive (default: true). - /// \return Projected schema containing only selected fields. + /// \return Projected schema containing only selected fields, or an error if any + /// requested field cannot be found. /// \note If the field name of a nested type has been selected, all of its /// sub-fields will be selected. Result> Select(std::span names, diff --git a/src/iceberg/table_scan.cc b/src/iceberg/table_scan.cc index 6881d34fb..b0fefc761 100644 --- a/src/iceberg/table_scan.cc +++ b/src/iceberg/table_scan.cc @@ -413,6 +413,11 @@ Result> TableScanBuilder::Build() { ICEBERG_RETURN_UNEXPECTED(context_.Validate()); ICEBERG_ASSIGN_OR_RAISE(auto schema, ResolveSnapshotSchema()); + if (!context_.selected_columns.empty()) { + ICEBERG_ASSIGN_OR_RAISE( + std::ignore, + schema.get()->Select(context_.selected_columns, context_.case_sensitive)); + } return ScanType::Make(metadata_, schema.get(), io_, std::move(context_)); } diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 8f1b20035..9a24c5d00 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -816,13 +816,15 @@ INSTANTIATE_TEST_SUITE_P( .create_schema = []() { return BasicSchema(); }, .select_fields = {"nonexistent"}, .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true}, + .should_succeed = false, + .expected_error_message = "Cannot find selected column: nonexistent"}, SelectTestParam{.test_name = "SelectCaseSensitive", .create_schema = []() { return BasicSchema(); }, .select_fields = {"Name"}, // case-sensitive .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true}, + .should_succeed = false, + .expected_error_message = "Cannot find selected column: Name"}, SelectTestParam{.test_name = "SelectCaseInsensitive", .create_schema = []() { return BasicSchema(); }, @@ -840,6 +842,14 @@ INSTANTIATE_TEST_SUITE_P( .expected_schema = []() { return MakeSchema(Id(), Name()); }, .should_succeed = true}, + SelectTestParam{ + .test_name = "SelectPartialMissingField", + .create_schema = []() { return AddressSchema(); }, + .select_fields = {"id", "typo"}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = false, + .expected_error_message = "Cannot find selected column: typo"}, + SelectTestParam{.test_name = "SelectNestedField", .create_schema = []() { return AddressSchema(); }, .select_fields = {"address.street"}, diff --git a/src/iceberg/test/table_scan_test.cc b/src/iceberg/test/table_scan_test.cc index 11905a870..2f55da048 100644 --- a/src/iceberg/test/table_scan_test.cc +++ b/src/iceberg/test/table_scan_test.cc @@ -677,6 +677,13 @@ TEST_P(TableScanTest, SchemaWithSelectedColumnsAndFilter) { } } +TEST_P(TableScanTest, SelectMissingColumnFails) { + ICEBERG_UNWRAP_OR_FAIL(auto builder, + DataTableScanBuilder::Make(table_metadata_, file_io_)); + builder->Select({"id", "typo"}); + EXPECT_THAT(builder->Build(), IsError(ErrorKind::kInvalidArgument)); +} + INSTANTIATE_TEST_SUITE_P(TableScanVersions, TableScanTest, testing::Values(1, 2, 3)); } // namespace iceberg