From 2b09f55aeb467a8a6194d8d8d8239a46cae3e7cf Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 16 Jun 2026 19:56:52 +0800 Subject: [PATCH 1/4] feat: add Iceberg v3 type definitions Introduce the Iceberg v3 types (variant, geometry, geography), including their schema/JSON serialization and type-system integration (visitors, schema projection, etc.). Reading and writing data of these types is not implemented yet: conversion to/from Arrow, Avro, and Parquet returns an error, as do identity transform binding and scalar validation for them. --- src/iceberg/avro/avro_schema_util.cc | 17 ++ src/iceberg/avro/avro_schema_util_internal.h | 3 + src/iceberg/delete_file_index.cc | 4 +- src/iceberg/json_serde.cc | 89 +++++++--- src/iceberg/metrics_config.cc | 5 +- src/iceberg/parquet/parquet_metrics.cc | 4 + src/iceberg/parquet/parquet_schema_util.cc | 5 + src/iceberg/parquet/parquet_writer.cc | 4 + src/iceberg/schema_internal.cc | 165 ++++++++++++------- src/iceberg/table_metadata.h | 5 +- src/iceberg/test/arrow_test.cc | 14 ++ src/iceberg/test/rest_json_serde_test.cc | 2 +- src/iceberg/test/schema_json_test.cc | 51 ++++++ src/iceberg/test/schema_test.cc | 57 ++++--- src/iceberg/test/transform_test.cc | 13 ++ src/iceberg/test/type_test.cc | 44 ++++- src/iceberg/test/visit_type_test.cc | 36 +++- src/iceberg/transform.cc | 3 +- src/iceberg/transform_function.cc | 4 +- src/iceberg/type.cc | 129 ++++++++++++++- src/iceberg/type.h | 98 ++++++++++- src/iceberg/type_fwd.h | 15 ++ src/iceberg/update/update_schema.cc | 6 + src/iceberg/util/struct_like_set.cc | 5 + src/iceberg/util/type_util.cc | 30 +++- src/iceberg/util/type_util.h | 4 + src/iceberg/util/visit_type.h | 9 +- src/iceberg/util/visitor_generate.h | 10 ++ 28 files changed, 696 insertions(+), 135 deletions(-) diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index c26e2bbc9..14b464cee 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -248,6 +248,18 @@ Status ToAvroNodeVisitor::Visit(const UnknownType&, ::avro::NodePtr* node) { return {}; } +Status ToAvroNodeVisitor::Visit(const VariantType&, ::avro::NodePtr*) { + return NotSupported("Writing Iceberg variant type to Avro is not supported"); +} + +Status ToAvroNodeVisitor::Visit(const GeometryType&, ::avro::NodePtr*) { + return NotSupported("Writing Iceberg geometry type to Avro is not supported"); +} + +Status ToAvroNodeVisitor::Visit(const GeographyType&, ::avro::NodePtr*) { + return NotSupported("Writing Iceberg geography type to Avro is not supported"); +} + Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) { *node = std::make_shared<::avro::NodeRecord>(); @@ -631,6 +643,11 @@ Status ValidateAvroSchemaEvolution(const Type& expected_type, break; case TypeId::kUnknown: return {}; + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Reading Iceberg type {} from Avro is not supported", + expected_type); default: break; } diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index a5bfb989e..342b119a5 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -59,6 +59,9 @@ class ToAvroNodeVisitor { Status Visit(const FixedType& type, ::avro::NodePtr* node); Status Visit(const BinaryType& type, ::avro::NodePtr* node); Status Visit(const UnknownType&, ::avro::NodePtr*); + Status Visit(const VariantType&, ::avro::NodePtr*); + Status Visit(const GeometryType&, ::avro::NodePtr*); + Status Visit(const GeographyType&, ::avro::NodePtr*); Status Visit(const StructType& type, ::avro::NodePtr* node); Status Visit(const ListType& type, ::avro::NodePtr* node); Status Visit(const MapType& type, ::avro::NodePtr* node); diff --git a/src/iceberg/delete_file_index.cc b/src/iceberg/delete_file_index.cc index 7c8c35032..8c58e861b 100644 --- a/src/iceberg/delete_file_index.cc +++ b/src/iceberg/delete_file_index.cc @@ -56,7 +56,7 @@ Status EqualityDeleteFile::ConvertBoundsIfNeeded() const { } const auto& schema_field = field.value().get(); - if (schema_field.type()->is_nested()) { + if (!schema_field.type()->is_primitive()) { continue; } @@ -103,7 +103,7 @@ Result CanContainEqDeletesForFile(const DataFile& data_file, } const auto& field = found_field.value().get(); - if (field.type()->is_nested()) { + if (!field.type()->is_primitive()) { continue; } diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 1f2b8f45c..83486c46f 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -389,6 +389,12 @@ nlohmann::json ToJson(const Type& type) { return "uuid"; case TypeId::kUnknown: return "unknown"; + case TypeId::kVariant: + return "variant"; + case TypeId::kGeometry: + return type.ToString(); + case TypeId::kGeography: + return type.ToString(); } std::unreachable(); } @@ -486,49 +492,52 @@ Result> MapTypeFromJson(const nlohmann::json& json) { Result> TypeFromJson(const nlohmann::json& json) { if (json.is_string()) { std::string type_str = json.get(); - if (type_str == "boolean") { + std::string lower_type_str = StringUtils::ToLower(type_str); + if (lower_type_str == "boolean") { return std::make_unique(); - } else if (type_str == "int") { + } else if (lower_type_str == "int") { return std::make_unique(); - } else if (type_str == "long") { + } else if (lower_type_str == "long") { return std::make_unique(); - } else if (type_str == "float") { + } else if (lower_type_str == "float") { return std::make_unique(); - } else if (type_str == "double") { + } else if (lower_type_str == "double") { return std::make_unique(); - } else if (type_str == "date") { + } else if (lower_type_str == "date") { return std::make_unique(); - } else if (type_str == "time") { + } else if (lower_type_str == "time") { return std::make_unique(); - } else if (type_str == "timestamp") { + } else if (lower_type_str == "timestamp") { return std::make_unique(); - } else if (type_str == "timestamptz") { + } else if (lower_type_str == "timestamptz") { return std::make_unique(); - } else if (type_str == "timestamp_ns") { + } else if (lower_type_str == "timestamp_ns") { return std::make_unique(); - } else if (type_str == "timestamptz_ns") { + } else if (lower_type_str == "timestamptz_ns") { return std::make_unique(); - } else if (type_str == "string") { + } else if (lower_type_str == "string") { return std::make_unique(); - } else if (type_str == "binary") { + } else if (lower_type_str == "binary") { return std::make_unique(); - } else if (type_str == "uuid") { + } else if (lower_type_str == "uuid") { return std::make_unique(); - } else if (type_str == "unknown") { + } else if (lower_type_str == "unknown") { return std::make_unique(); - } else if (type_str.starts_with("fixed")) { - std::regex fixed_regex(R"(fixed\[\s*(\d+)\s*\])"); + } else if (lower_type_str == "variant") { + return std::make_unique(); + } else if (lower_type_str.starts_with("fixed")) { + static const std::regex fixed_regex(R"(fixed\[\s*(\d+)\s*\])"); std::smatch match; - if (std::regex_match(type_str, match, fixed_regex)) { + if (std::regex_match(lower_type_str, match, fixed_regex)) { ICEBERG_ASSIGN_OR_RAISE(auto length, StringUtils::ParseNumber(match[1].str())); return std::make_unique(length); } return JsonParseError("Invalid fixed type: {}", type_str); - } else if (type_str.starts_with("decimal")) { - std::regex decimal_regex(R"(decimal\(\s*(\d+)\s*,\s*(\d+)\s*\))"); + } else if (lower_type_str.starts_with("decimal")) { + static const std::regex decimal_regex(R"(decimal\(\s*(\d+)\s*,\s*(\d+)\s*\))"); std::smatch match; - if (std::regex_match(type_str, match, decimal_regex)) { + if (std::regex_match(lower_type_str, match, decimal_regex)) { ICEBERG_ASSIGN_OR_RAISE(auto precision, StringUtils::ParseNumber(match[1].str())); ICEBERG_ASSIGN_OR_RAISE(auto scale, @@ -536,8 +545,44 @@ Result> TypeFromJson(const nlohmann::json& json) { return std::make_unique(precision, scale); } return JsonParseError("Invalid decimal type: {}", type_str); + } else if (lower_type_str.starts_with("geometry")) { + static const std::regex geometry_regex(R"(\s*(?:\(\s*([^)]*?)\s*\))?)"); + std::smatch match; + const auto type_params = type_str.substr(std::string_view("geometry").size()); + if (std::regex_match(type_params, match, geometry_regex)) { + if (match[1].matched) { + auto crs = match[1].str(); + if (crs.empty()) { + return JsonParseError("Invalid geometry type: {}", type_str); + } + return std::make_unique(std::move(crs)); + } + return std::make_unique(); + } + return JsonParseError("Invalid geometry type: {}", type_str); + } else if (lower_type_str.starts_with("geography")) { + static const std::regex geography_regex( + R"(\s*(?:\(\s*([^,]*?)\s*(?:,\s*(\w*)\s*)?\))?)"); + std::smatch match; + const auto type_params = type_str.substr(std::string_view("geography").size()); + if (std::regex_match(type_params, match, geography_regex)) { + auto crs = match[1].str(); + if (match[1].matched && crs.empty()) { + return JsonParseError("Invalid geography type: {}", type_str); + } + if (match[2].matched) { + ICEBERG_ASSIGN_OR_RAISE(auto algorithm, + EdgeAlgorithmFromString(match[2].str())); + return std::make_unique(std::move(crs), algorithm); + } + if (match[1].matched) { + return std::make_unique(std::move(crs)); + } + return std::make_unique(); + } + return JsonParseError("Invalid geography type: {}", type_str); } else { - return JsonParseError("Unknown primitive type: {}", type_str); + return JsonParseError("Cannot parse type string: {}", type_str); } } diff --git a/src/iceberg/metrics_config.cc b/src/iceberg/metrics_config.cc index 95d1a1fe1..3573c40ba 100644 --- a/src/iceberg/metrics_config.cc +++ b/src/iceberg/metrics_config.cc @@ -197,6 +197,8 @@ Result> MetricsConfig::LimitFieldIds(const Schema& s Status Visit(const Type& type) { if (type.is_nested()) { return VisitNested(internal::checked_cast(type)); + } else if (type.type_id() == TypeId::kVariant) { + return {}; } else { return VisitPrimitive(internal::checked_cast(type)); } @@ -207,8 +209,7 @@ Result> MetricsConfig::LimitFieldIds(const Schema& s if (!ShouldContinue()) { break; } - // TODO(zhuo.wang): variant type should also be handled here - if (field.type()->is_primitive()) { + if (!field.type()->is_nested()) { ids_.insert(field.field_id()); } } diff --git a/src/iceberg/parquet/parquet_metrics.cc b/src/iceberg/parquet/parquet_metrics.cc index 32dc9fec8..dac9ea6a8 100644 --- a/src/iceberg/parquet/parquet_metrics.cc +++ b/src/iceberg/parquet/parquet_metrics.cc @@ -423,6 +423,10 @@ class CollectMetricsVisitor { Status VisitMap(const MapType& /*type*/, const std::string& /*prefix*/) { return {}; } + Status VisitVariant(const VariantType& /*type*/, const std::string& /*prefix*/) { + return {}; + } + Status VisitPrimitive(const PrimitiveType& /*type*/, const std::string& /*prefix*/) { return {}; } diff --git a/src/iceberg/parquet/parquet_schema_util.cc b/src/iceberg/parquet/parquet_schema_util.cc index 39e321d9f..a5629198f 100644 --- a/src/iceberg/parquet/parquet_schema_util.cc +++ b/src/iceberg/parquet/parquet_schema_util.cc @@ -239,6 +239,11 @@ Status ValidateParquetSchemaEvolution( break; case TypeId::kUnknown: return {}; + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Reading Iceberg type {} from Parquet is not supported", + expected_type); case TypeId::kStruct: if (arrow_type->id() == ::arrow::Type::STRUCT) { return {}; diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc index c50fb26b1..fea7cd834 100644 --- a/src/iceberg/parquet/parquet_writer.cc +++ b/src/iceberg/parquet/parquet_writer.cc @@ -178,6 +178,10 @@ class FieldMetricsCollector { Status VisitMap(const MapType& /*type*/, const ::arrow::Array& /*array*/) { return {}; } + Status VisitVariant(const VariantType& /*type*/, const ::arrow::Array& /*array*/) { + return {}; + } + Status VisitPrimitive(const PrimitiveType& type, const ::arrow::Array& array) { switch (type.type_id()) { case TypeId::kFloat: diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index c32ceb2a6..d54790cb4 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -21,9 +21,11 @@ #include #include +#include #include #include +#include "iceberg/arrow_c_data_guard_internal.h" #include "iceberg/constants.h" #include "iceberg/schema.h" #include "iceberg/type.h" @@ -39,129 +41,165 @@ constexpr const char* kArrowExtensionMetadata = "ARROW:extension:metadata"; constexpr const char* kArrowUuidExtensionName = "arrow.uuid"; constexpr int32_t kUnknownFieldId = -1; -// Convert an Iceberg type to Arrow schema. Return value is Nanoarrow error code. -ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view name, - std::optional field_id, ArrowSchema* schema) { +Status CheckArrowSchemaConversion(ArrowErrorCode error_code) { + if (error_code != NANOARROW_OK) { + return InvalidSchema( + "Failed to convert Iceberg schema to Arrow schema, error code: {}", error_code); + } + return {}; +} + +Status ToArrowSchemaInternal(const Type& type, bool optional, std::string_view name, + std::string_view path, std::optional field_id, + ArrowSchema* schema) { ArrowBuffer metadata_buffer; - NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&metadata_buffer, nullptr)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowMetadataBuilderInit(&metadata_buffer, nullptr))); + internal::ArrowArrayBufferGuard metadata_guard(&metadata_buffer); if (field_id.has_value()) { - NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderAppend( + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowMetadataBuilderAppend( &metadata_buffer, ArrowCharView(std::string(kParquetFieldIdKey).c_str()), - ArrowCharView(std::to_string(field_id.value()).c_str()))); + ArrowCharView(std::to_string(field_id.value()).c_str())))); } switch (type.type_id()) { case TypeId::kStruct: { const auto& struct_type = static_cast(type); const auto& fields = struct_type.fields(); - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(schema, fields.size())); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetTypeStruct(schema, fields.size()))); for (size_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; - NANOARROW_RETURN_NOT_OK(ToArrowSchema(*field.type(), field.optional(), - field.name(), field.field_id(), - schema->children[i])); + auto field_path = path.empty() ? std::string(field.name()) + : std::format("{}.{}", path, field.name()); + ICEBERG_RETURN_UNEXPECTED( + ToArrowSchemaInternal(*field.type(), field.optional(), field.name(), + field_path, field.field_id(), schema->children[i])); } } break; case TypeId::kList: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_LIST)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_LIST))); const auto& list_type = static_cast(type); const auto& elem_field = list_type.fields()[0]; - NANOARROW_RETURN_NOT_OK(ToArrowSchema(*elem_field.type(), elem_field.optional(), - elem_field.name(), elem_field.field_id(), - schema->children[0])); + auto element_path = path.empty() + ? std::string(ListType::kElementName) + : std::format("{}.{}", path, ListType::kElementName); + ICEBERG_RETURN_UNEXPECTED(ToArrowSchemaInternal( + *elem_field.type(), elem_field.optional(), elem_field.name(), element_path, + elem_field.field_id(), schema->children[0])); } break; case TypeId::kMap: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_MAP)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_MAP))); const auto& map_type = static_cast(type); const auto& key_field = map_type.key(); const auto& value_field = map_type.value(); - NANOARROW_RETURN_NOT_OK(ToArrowSchema(*key_field.type(), key_field.optional(), - key_field.name(), key_field.field_id(), - schema->children[0]->children[0])); - NANOARROW_RETURN_NOT_OK(ToArrowSchema(*value_field.type(), value_field.optional(), - value_field.name(), value_field.field_id(), - schema->children[0]->children[1])); + auto key_path = path.empty() ? std::string(MapType::kKeyName) + : std::format("{}.{}", path, MapType::kKeyName); + ICEBERG_RETURN_UNEXPECTED(ToArrowSchemaInternal( + *key_field.type(), key_field.optional(), key_field.name(), key_path, + key_field.field_id(), schema->children[0]->children[0])); + auto value_path = path.empty() ? std::string(MapType::kValueName) + : std::format("{}.{}", path, MapType::kValueName); + ICEBERG_RETURN_UNEXPECTED(ToArrowSchemaInternal( + *value_field.type(), value_field.optional(), value_field.name(), value_path, + value_field.field_id(), schema->children[0]->children[1])); } break; case TypeId::kBoolean: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_BOOL)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_BOOL))); break; case TypeId::kInt: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_INT32)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_INT32))); break; case TypeId::kLong: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_INT64)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_INT64))); break; case TypeId::kFloat: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_FLOAT)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_FLOAT))); break; case TypeId::kDouble: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_DOUBLE)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_DOUBLE))); break; case TypeId::kDecimal: { const auto& decimal_type = static_cast(type); - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDecimal(schema, NANOARROW_TYPE_DECIMAL128, - decimal_type.precision(), - decimal_type.scale())); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion( + ArrowSchemaSetTypeDecimal(schema, NANOARROW_TYPE_DECIMAL128, + decimal_type.precision(), decimal_type.scale()))); } break; case TypeId::kDate: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_DATE32)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_DATE32))); break; case TypeId::kTime: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIME64, - NANOARROW_TIME_UNIT_MICRO, - /*timezone=*/nullptr)); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetTypeDateTime( + schema, NANOARROW_TYPE_TIME64, NANOARROW_TIME_UNIT_MICRO, + /*timezone=*/nullptr))); } break; case TypeId::kTimestamp: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIMESTAMP, - NANOARROW_TIME_UNIT_MICRO, - /*timezone=*/nullptr)); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetTypeDateTime( + schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, + /*timezone=*/nullptr))); } break; case TypeId::kTimestampTz: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime( - schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, "UTC")); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetTypeDateTime( + schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, "UTC"))); } break; case TypeId::kTimestampNs: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIMESTAMP, - NANOARROW_TIME_UNIT_NANO, - /*timezone=*/nullptr)); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetTypeDateTime( + schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, + /*timezone=*/nullptr))); } break; case TypeId::kTimestampTzNs: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeDateTime( - schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, "UTC")); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetTypeDateTime( + schema, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, "UTC"))); } break; case TypeId::kString: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRING)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRING))); break; case TypeId::kBinary: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_BINARY)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_BINARY))); break; case TypeId::kFixed: { const auto& fixed_type = static_cast(type); - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeFixedSize( - schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, fixed_type.length())); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetTypeFixedSize( + schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, fixed_type.length()))); } break; case TypeId::kUuid: { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeFixedSize( - schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, /*fixed_size=*/16)); - NANOARROW_RETURN_NOT_OK( + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetTypeFixedSize( + schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, /*fixed_size=*/16))); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion( ArrowMetadataBuilderAppend(&metadata_buffer, ArrowCharView(kArrowExtensionName), - ArrowCharView(kArrowUuidExtensionName))); + ArrowCharView(kArrowUuidExtensionName)))); } break; case TypeId::kUnknown: - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_NA)); + ICEBERG_RETURN_UNEXPECTED( + CheckArrowSchemaConversion(ArrowSchemaSetType(schema, NANOARROW_TYPE_NA))); break; + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Iceberg type {} at '{}' is not supported by Arrow conversion", + type.ToString(), path); } if (!name.empty()) { - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema, std::string(name).c_str())); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion( + ArrowSchemaSetName(schema, std::string(name).c_str()))); } - NANOARROW_RETURN_NOT_OK(ArrowSchemaSetMetadata( - schema, reinterpret_cast(metadata_buffer.data))); - ArrowBufferReset(&metadata_buffer); + ICEBERG_RETURN_UNEXPECTED(CheckArrowSchemaConversion(ArrowSchemaSetMetadata( + schema, reinterpret_cast(metadata_buffer.data)))); if (optional) { schema->flags |= ARROW_FLAG_NULLABLE; @@ -169,7 +207,7 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n schema->flags &= ~ARROW_FLAG_NULLABLE; } - return NANOARROW_OK; + return {}; } } // namespace @@ -181,11 +219,14 @@ Status ToArrowSchema(const Schema& schema, ArrowSchema* out) { ArrowSchemaInit(out); - if (ArrowErrorCode errorCode = ToArrowSchema(schema, /*optional=*/false, /*name=*/"", - /*field_id=*/std::nullopt, out); - errorCode != NANOARROW_OK) { - return InvalidSchema( - "Failed to convert Iceberg schema to Arrow schema, error code: {}", errorCode); + auto status = + ToArrowSchemaInternal(schema, /*optional=*/false, /*name=*/"", /*path=*/"", + /*field_id=*/std::nullopt, out); + if (!status.has_value()) { + if (out->release != nullptr) { + ArrowSchemaRelease(out); + } + return status; } return {}; diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index 06d636ef5..fd2c27199 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -79,9 +79,8 @@ struct ICEBERG_EXPORT TableMetadata { static constexpr int64_t kInitialRowId = 0; static inline const std::unordered_map kMinFormatVersions = { - {TypeId::kTimestampNs, 3}, - {TypeId::kTimestampTzNs, 3}, - {TypeId::kUnknown, 3}, + {TypeId::kTimestampNs, 3}, {TypeId::kTimestampTzNs, 3}, {TypeId::kUnknown, 3}, + {TypeId::kVariant, 3}, {TypeId::kGeometry, 3}, {TypeId::kGeography, 3}, }; /// An integer version number for the format diff --git a/src/iceberg/test/arrow_test.cc b/src/iceberg/test/arrow_test.cc index 9f8ce86f5..2a7242e71 100644 --- a/src/iceberg/test/arrow_test.cc +++ b/src/iceberg/test/arrow_test.cc @@ -122,6 +122,20 @@ INSTANTIATE_TEST_SUITE_P( ToArrowSchemaParam{.iceberg_type = iceberg::unknown(), .arrow_type = ::arrow::null()})); +TEST(ToArrowSchemaTest, UnsupportedV3Types) { + const std::vector> unsupported_types = { + iceberg::variant(), iceberg::geometry(), iceberg::geography()}; + + for (const auto& unsupported_type : unsupported_types) { + Schema schema( + {SchemaField::MakeOptional(/*field_id=*/1, "unsupported", unsupported_type)}, + /*schema_id=*/0); + ArrowSchema arrow_schema; + ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), + HasErrorMessage("is not supported by Arrow conversion")); + } +} + namespace { void CheckArrowField(const ::arrow::Field& field, ::arrow::Type::type type_id, diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 7304831c6..50507dd3a 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -1078,7 +1078,7 @@ INSTANTIATE_TEST_SUITE_P( CreateTableRequestInvalidParam{ .test_name = "WrongSchemaType", .invalid_json_str = R"({"name":"my_table","schema":"invalid"})", - .expected_error_message = "Unknown primitive type: invalid"}), + .expected_error_message = "Cannot parse type string: invalid"}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; }); diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 08275a45c..efab2f55b 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -65,6 +65,15 @@ INSTANTIATE_TEST_SUITE_P( SchemaJsonParam{.json = "\"binary\"", .type = iceberg::binary()}, SchemaJsonParam{.json = "\"uuid\"", .type = iceberg::uuid()}, SchemaJsonParam{.json = "\"unknown\"", .type = iceberg::unknown()}, + SchemaJsonParam{.json = "\"variant\"", .type = iceberg::variant()}, + SchemaJsonParam{.json = "\"geometry\"", .type = iceberg::geometry()}, + SchemaJsonParam{.json = "\"geometry(srid:4326)\"", + .type = iceberg::geometry("srid:4326")}, + SchemaJsonParam{.json = "\"geography\"", .type = iceberg::geography()}, + SchemaJsonParam{.json = "\"geography(srid:4326)\"", + .type = iceberg::geography("srid:4326")}, + SchemaJsonParam{.json = "\"geography(srid:4326, karney)\"", + .type = iceberg::geography("srid:4326", EdgeAlgorithm::kKarney)}, SchemaJsonParam{.json = "\"fixed[8]\"", .type = iceberg::fixed(8)}, SchemaJsonParam{.json = "\"decimal(10,2)\"", .type = iceberg::decimal(10, 2)}, SchemaJsonParam{.json = "\"date\"", .type = iceberg::date()}, @@ -111,6 +120,48 @@ TEST(TypeJsonTest, FromJsonWithSpaces) { ASSERT_EQ(decimal->scale(), 2); } +TEST(TypeJsonTest, FromJsonV3TypesWithSpacesAndCase) { + auto variant_result = TypeFromJson(nlohmann::json::parse("\"Variant\"")); + ASSERT_TRUE(variant_result.has_value()); + ASSERT_EQ(*variant_result.value(), *iceberg::variant()); + + auto geometry_result = + TypeFromJson(nlohmann::json::parse("\"GEOMETRY( srid: 3857 )\"")); + ASSERT_TRUE(geometry_result.has_value()); + ASSERT_EQ(*geometry_result.value(), *iceberg::geometry("srid: 3857")); + + auto geography_result = + TypeFromJson(nlohmann::json::parse("\"geography(srid:4269,karney)\"")); + ASSERT_TRUE(geography_result.has_value()); + ASSERT_EQ(*geography_result.value(), + *iceberg::geography("srid:4269", EdgeAlgorithm::kKarney)); +} + +TEST(TypeJsonTest, InvalidV3Types) { + auto invalid_geometry = TypeFromJson(nlohmann::json::parse("\"geometry()\"")); + ASSERT_THAT(invalid_geometry, HasErrorMessage("Invalid geometry type")); + + auto invalid_geometry_with_spaces = + TypeFromJson(nlohmann::json::parse("\"geometry( )\"")); + ASSERT_THAT(invalid_geometry_with_spaces, HasErrorMessage("Invalid geometry type")); + + auto invalid_geography = TypeFromJson(nlohmann::json::parse("\"geography()\"")); + ASSERT_THAT(invalid_geography, HasErrorMessage("Invalid geography type")); + + auto invalid_geography_with_algorithm = + TypeFromJson(nlohmann::json::parse("\"geography( , spherical)\"")); + ASSERT_THAT(invalid_geography_with_algorithm, + HasErrorMessage("Invalid geography type")); + + auto invalid_geography_algorithm = + TypeFromJson(nlohmann::json::parse("\"geography(srid:4269, BadAlgorithm)\"")); + ASSERT_THAT(invalid_geography_algorithm, + HasErrorMessage("Invalid edge interpolation algorithm")); + + auto unknown_type = TypeFromJson(nlohmann::json::parse("\"nonsense\"")); + ASSERT_THAT(unknown_type, HasErrorMessage("Cannot parse type string")); +} + TEST(SchemaJsonTest, RoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"id","required":true,"type":"int"},{"id":2,"name":"name","required":false,"type":"string"}],"schema-id":1,"type":"struct"})"; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 8f1b20035..db99eb02b 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -671,6 +671,7 @@ iceberg::SchemaField Id() { return {1, "id", iceberg::int32(), true}; } iceberg::SchemaField Name() { return {2, "name", iceberg::string(), false}; } iceberg::SchemaField Age() { return {3, "age", iceberg::int32(), true}; } iceberg::SchemaField Email() { return {4, "email", iceberg::string(), true}; } +iceberg::SchemaField Payload() { return {5, "payload", iceberg::variant(), true}; } iceberg::SchemaField Street() { return {11, "street", iceberg::string(), true}; } iceberg::SchemaField City() { return {12, "city", iceberg::string(), true}; } iceberg::SchemaField Zip() { return {13, "zip", iceberg::int32(), true}; } @@ -683,6 +684,10 @@ static std::unique_ptr BasicSchema() { return MakeSchema(Id(), Name(), Age(), Email()); } +static std::unique_ptr VariantSchema() { + return MakeSchema(Id(), Payload()); +} + static std::unique_ptr AddressSchema() { auto address_type = MakeStructType(Street(), City(), Zip()); auto address_field = iceberg::SchemaField{14, "address", std::move(address_type), true}; @@ -932,30 +937,36 @@ TEST_P(ProjectParamTest, ProjectFields) { INSTANTIATE_TEST_SUITE_P( ProjectTestCases, ProjectParamTest, - ::testing::Values(ProjectTestParam{.test_name = "ProjectAllFields", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {1, 2, 3, 4}, - .expected_schema = []() { return BasicSchema(); }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectSingleField", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {2}, - .expected_schema = []() { return MakeSchema(Name()); }, - .should_succeed = true}, + ::testing::Values( + ProjectTestParam{.test_name = "ProjectAllFields", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {1, 2, 3, 4}, + .expected_schema = []() { return BasicSchema(); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectSingleField", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {2}, + .expected_schema = []() { return MakeSchema(Name()); }, + .should_succeed = true}, - ProjectTestParam{.test_name = "ProjectNonExistentFieldId", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {999}, - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true}, - - ProjectTestParam{.test_name = "ProjectEmptySelection", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {}, - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true})); + ProjectTestParam{.test_name = "ProjectVariantField", + .create_schema = []() { return VariantSchema(); }, + .selected_ids = {5}, + .expected_schema = []() { return MakeSchema(Payload()); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectNonExistentFieldId", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {999}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectEmptySelection", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true})); INSTANTIATE_TEST_SUITE_P(ProjectNestedTestCases, ProjectParamTest, ::testing::Values(ProjectTestParam{ diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index 8d7cd880d..d3eae5971 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,18 @@ TEST(TransformTest, Transform) { ASSERT_TRUE(identity_transform); } +TEST(TransformTest, IdentityDoesNotSupportV3Types) { + const auto transform = Transform::Identity(); + const std::vector> unsupported_types = { + iceberg::variant(), iceberg::geometry(), iceberg::geography()}; + + for (const auto& type : unsupported_types) { + EXPECT_FALSE(transform->CanTransform(*type)); + EXPECT_THAT(transform->Bind(type), + HasErrorMessage("is not a valid input type for identity transform")); + } +} + TEST(TransformFunctionTest, CreateBucketTransform) { constexpr int32_t bucket_count = 8; auto transform = Transform::Bucket(bucket_count); diff --git a/src/iceberg/test/type_test.cc b/src/iceberg/test/type_test.cc index d405cccc1..b0fb39f81 100644 --- a/src/iceberg/test/type_test.cc +++ b/src/iceberg/test/type_test.cc @@ -38,6 +38,7 @@ struct TypeTestCase { std::shared_ptr type; iceberg::TypeId type_id; bool primitive; + bool nested = false; std::string repr; }; @@ -61,17 +62,20 @@ TEST_P(TypeTest, IsPrimitive) { const auto* primitive = dynamic_cast(test_case.type.get()); ASSERT_NE(nullptr, primitive); + } else { + ASSERT_FALSE(test_case.type->is_primitive()); } } TEST_P(TypeTest, IsNested) { const auto& test_case = GetParam(); - if (!test_case.primitive) { - ASSERT_FALSE(test_case.type->is_primitive()); + if (test_case.nested) { ASSERT_TRUE(test_case.type->is_nested()); const auto* nested = dynamic_cast(test_case.type.get()); ASSERT_NE(nullptr, nested); + } else { + ASSERT_FALSE(test_case.type->is_nested()); } } @@ -90,7 +94,7 @@ TEST_P(TypeTest, StdFormat) { ASSERT_EQ(test_case.repr, std::format("{}", *test_case.type)); } -const static std::array kPrimitiveTypes = {{ +const static std::array kPrimitiveTypes = {{ { .name = "boolean", .type = iceberg::boolean(), @@ -224,6 +228,27 @@ const static std::array kPrimitiveTypes = {{ .primitive = true, .repr = "unknown", }, + { + .name = "variant", + .type = iceberg::variant(), + .type_id = iceberg::TypeId::kVariant, + .primitive = false, + .repr = "variant", + }, + { + .name = "geometry", + .type = iceberg::geometry(), + .type_id = iceberg::TypeId::kGeometry, + .primitive = true, + .repr = "geometry", + }, + { + .name = "geography", + .type = iceberg::geography(), + .type_id = iceberg::TypeId::kGeography, + .primitive = true, + .repr = "geography", + }, }}; const static std::array kNestedTypes = {{ @@ -232,6 +257,7 @@ const static std::array kNestedTypes = {{ .type = std::make_shared(1, iceberg::int32(), true), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list", }, { @@ -240,6 +266,7 @@ const static std::array kNestedTypes = {{ 1, std::make_shared(2, iceberg::int32(), true), false), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list (required)>", }, { @@ -249,6 +276,7 @@ const static std::array kNestedTypes = {{ iceberg::SchemaField::MakeRequired(2, "value", iceberg::string())), .type_id = iceberg::TypeId::kMap, .primitive = false, + .nested = true, .repr = "map", }, { @@ -259,6 +287,7 @@ const static std::array kNestedTypes = {{ }), .type_id = iceberg::TypeId::kStruct, .primitive = false, + .nested = true, .repr = R"(struct< foo (1): long (required) bar (2): string (optional) @@ -294,6 +323,15 @@ TEST(TypeTest, Equality) { } } +TEST(TypeTest, GeographyEffectiveDefaultEquality) { + ASSERT_EQ(*iceberg::geography("srid:4326"), + *iceberg::geography("srid:4326", iceberg::EdgeAlgorithm::kSpherical)); + ASSERT_EQ(*iceberg::geography(), + *iceberg::geography("OGC:CRS84", iceberg::EdgeAlgorithm::kSpherical)); + ASSERT_NE(*iceberg::geography("srid:4326"), + *iceberg::geography("srid:4326", iceberg::EdgeAlgorithm::kKarney)); +} + TEST(TypeTest, Decimal) { { iceberg::DecimalType decimal(38, 2); diff --git a/src/iceberg/test/visit_type_test.cc b/src/iceberg/test/visit_type_test.cc index f038f906f..35c307326 100644 --- a/src/iceberg/test/visit_type_test.cc +++ b/src/iceberg/test/visit_type_test.cc @@ -46,6 +46,7 @@ struct TypeTestCase { std::shared_ptr type; iceberg::TypeId type_id; bool primitive; + bool nested = false; std::string repr; }; @@ -53,7 +54,7 @@ std::string TypeTestCaseToString(const ::testing::TestParamInfo& i return info.param.name; } -const static std::array kPrimitiveTypes = {{ +const static std::array kPrimitiveTypes = {{ { .name = "boolean", .type = iceberg::boolean(), @@ -187,6 +188,27 @@ const static std::array kPrimitiveTypes = {{ .primitive = true, .repr = "unknown", }, + { + .name = "variant", + .type = iceberg::variant(), + .type_id = iceberg::TypeId::kVariant, + .primitive = false, + .repr = "variant", + }, + { + .name = "geometry", + .type = iceberg::geometry(), + .type_id = iceberg::TypeId::kGeometry, + .primitive = true, + .repr = "geometry", + }, + { + .name = "geography", + .type = iceberg::geography(), + .type_id = iceberg::TypeId::kGeography, + .primitive = true, + .repr = "geography", + }, }}; const static std::array kNestedTypes = {{ @@ -195,6 +217,7 @@ const static std::array kNestedTypes = {{ .type = std::make_shared(1, iceberg::int32(), true), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list", }, { @@ -203,6 +226,7 @@ const static std::array kNestedTypes = {{ 1, std::make_shared(2, iceberg::int32(), true), false), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list (required)>", }, { @@ -212,6 +236,7 @@ const static std::array kNestedTypes = {{ iceberg::SchemaField::MakeRequired(2, "value", iceberg::string())), .type_id = iceberg::TypeId::kMap, .primitive = false, + .nested = true, .repr = "map", }, { @@ -222,6 +247,7 @@ const static std::array kNestedTypes = {{ }), .type_id = iceberg::TypeId::kStruct, .primitive = false, + .nested = true, .repr = R"(struct< foo (1): long (required) bar (2): string (optional) @@ -261,12 +287,12 @@ TEST_P(VisitTypeTest, VisitTypeReturnNestedTypeId) { const auto& test_case = GetParam(); auto result = VisitType(*test_case.type, visitor); - if (test_case.primitive) { - ASSERT_THAT(result, IsError(ErrorKind::kNotImplemented)); - ASSERT_THAT(result, HasErrorMessage("Type is not a nested type")); - } else { + if (test_case.nested) { ASSERT_THAT(result, IsOk()); ASSERT_EQ(result.value(), test_case.type_id); + } else { + ASSERT_THAT(result, IsError(ErrorKind::kNotImplemented)); + ASSERT_THAT(result, HasErrorMessage("Type is not a nested type")); } } diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index 453941c95..079ebade9 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -158,7 +158,8 @@ std::shared_ptr Transform::ResultType( bool Transform::CanTransform(const Type& source_type) const { switch (transform_type_) { case TransformType::kIdentity: - if (!source_type.is_primitive()) [[unlikely]] { + if (!source_type.is_primitive() || source_type.type_id() == TypeId::kGeometry || + source_type.type_id() == TypeId::kGeography) [[unlikely]] { return false; } return true; diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 4325c53d1..1241a4c7d 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -40,7 +40,9 @@ std::shared_ptr IdentityTransform::ResultType() const { return source_type Result> IdentityTransform::Make( std::shared_ptr const& source_type) { - if (!source_type || !source_type->is_primitive()) { + if (!source_type || !source_type->is_primitive() || + source_type->type_id() == TypeId::kGeometry || + source_type->type_id() == TypeId::kGeography) { return NotSupported("{} is not a valid input type for identity transform", source_type ? source_type->ToString() : "null"); } diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 057dcf513..7e9d3dec5 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "iceberg/exception.h" @@ -264,6 +265,10 @@ bool MapType::Equals(const Type& other) const { return fields_ == map.fields_; } +TypeId VariantType::type_id() const { return kTypeId; } +std::string VariantType::ToString() const { return "variant"; } +bool VariantType::Equals(const Type& other) const { return other.type_id() == kTypeId; } + TypeId BooleanType::type_id() const { return kTypeId; } std::string BooleanType::ToString() const { return "boolean"; } bool BooleanType::Equals(const Type& other) const { return other.type_id() == kTypeId; } @@ -354,6 +359,70 @@ TypeId UnknownType::type_id() const { return kTypeId; } std::string UnknownType::ToString() const { return "unknown"; } bool UnknownType::Equals(const Type& other) const { return other.type_id() == kTypeId; } +GeometryType::GeometryType(std::string crs) { + ICEBERG_CHECK_OR_DIE(!crs.empty(), "GeometryType: CRS cannot be empty"); + if (StringUtils::ToLower(crs) != StringUtils::ToLower(kDefaultCrs)) { + crs_ = std::move(crs); + } +} + +std::string_view GeometryType::crs() const { + return crs_.has_value() ? std::string_view(*crs_) : kDefaultCrs; +} +TypeId GeometryType::type_id() const { return kTypeId; } +std::string GeometryType::ToString() const { + if (!crs_.has_value()) { + return "geometry"; + } + return std::format("geometry({})", *crs_); +} +bool GeometryType::Equals(const Type& other) const { + if (other.type_id() != kTypeId) { + return false; + } + const auto& geometry = static_cast(other); + return crs() == geometry.crs(); +} + +GeographyType::GeographyType(std::string crs) { + ICEBERG_CHECK_OR_DIE(!crs.empty(), "GeographyType: CRS cannot be empty"); + if (StringUtils::ToLower(crs) != StringUtils::ToLower(kDefaultCrs)) { + crs_ = std::move(crs); + } +} + +GeographyType::GeographyType(std::string crs, EdgeAlgorithm algorithm) + : algorithm_(algorithm) { + ICEBERG_CHECK_OR_DIE(!crs.empty(), "GeographyType: CRS cannot be empty"); + if (StringUtils::ToLower(crs) != StringUtils::ToLower(kDefaultCrs)) { + crs_ = std::move(crs); + } +} + +std::string_view GeographyType::crs() const { + return crs_.has_value() ? std::string_view(*crs_) : kDefaultCrs; +} +EdgeAlgorithm GeographyType::algorithm() const { + return algorithm_.value_or(kDefaultAlgorithm); +} +TypeId GeographyType::type_id() const { return kTypeId; } +std::string GeographyType::ToString() const { + if (algorithm_.has_value()) { + return std::format("geography({}, {})", crs(), iceberg::ToString(*algorithm_)); + } + if (crs_.has_value()) { + return std::format("geography({})", *crs_); + } + return "geography"; +} +bool GeographyType::Equals(const Type& other) const { + if (other.type_id() != kTypeId) { + return false; + } + const auto& geography = static_cast(other); + return crs() == geography.crs() && algorithm() == geography.algorithm(); +} + FixedType::FixedType(int32_t length) : length_(length) { ICEBERG_CHECK_OR_DIE(length >= 0, "FixedType: length must be >= 0, was {}", length); } @@ -374,7 +443,7 @@ std::string BinaryType::ToString() const { return "binary"; } bool BinaryType::Equals(const Type& other) const { return other.type_id() == kTypeId; } // ---------------------------------------------------------------------- -// Factory functions for creating primitive data types +// Factory functions for creating data types #define TYPE_FACTORY(NAME, KLASS) \ const std::shared_ptr& NAME() { \ @@ -397,6 +466,9 @@ TYPE_FACTORY(binary, BinaryType) TYPE_FACTORY(string, StringType) TYPE_FACTORY(uuid, UuidType) TYPE_FACTORY(unknown, UnknownType) +TYPE_FACTORY(variant, VariantType) +TYPE_FACTORY(geometry, GeometryType) +TYPE_FACTORY(geography, GeographyType) #undef TYPE_FACTORY @@ -408,6 +480,18 @@ std::shared_ptr fixed(int32_t length) { return std::make_shared(length); } +std::shared_ptr geometry(std::string crs) { + return std::make_shared(std::move(crs)); +} + +std::shared_ptr geography(std::string crs) { + return std::make_shared(std::move(crs)); +} + +std::shared_ptr geography(std::string crs, EdgeAlgorithm algorithm) { + return std::make_shared(std::move(crs), algorithm); +} + std::shared_ptr map(SchemaField key, SchemaField value) { return std::make_shared(key, value); } @@ -462,9 +546,52 @@ std::string_view ToString(TypeId id) { return "binary"; case TypeId::kUnknown: return "unknown"; + case TypeId::kVariant: + return "variant"; + case TypeId::kGeometry: + return "geometry"; + case TypeId::kGeography: + return "geography"; } std::unreachable(); } +std::string_view ToString(EdgeAlgorithm algorithm) { + switch (algorithm) { + case EdgeAlgorithm::kSpherical: + return "spherical"; + case EdgeAlgorithm::kVincenty: + return "vincenty"; + case EdgeAlgorithm::kThomas: + return "thomas"; + case EdgeAlgorithm::kAndoyer: + return "andoyer"; + case EdgeAlgorithm::kKarney: + return "karney"; + } + + std::unreachable(); +} + +Result EdgeAlgorithmFromString(std::string_view name) { + auto lower_name = StringUtils::ToLower(name); + if (lower_name == "spherical") { + return EdgeAlgorithm::kSpherical; + } + if (lower_name == "vincenty") { + return EdgeAlgorithm::kVincenty; + } + if (lower_name == "thomas") { + return EdgeAlgorithm::kThomas; + } + if (lower_name == "andoyer") { + return EdgeAlgorithm::kAndoyer; + } + if (lower_name == "karney") { + return EdgeAlgorithm::kKarney; + } + return InvalidArgument("Invalid edge interpolation algorithm: {}", name); +} + } // namespace iceberg diff --git a/src/iceberg/type.h b/src/iceberg/type.h index c0966759e..fd37196e6 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -208,6 +208,30 @@ class ICEBERG_EXPORT MapType : public NestedType { /// @} +/// \defgroup type-semi-structured Semi-structured Types +/// Semi-structured types may contain values whose structure varies across rows. +/// @{ + +/// \brief A semi-structured type whose structure may vary across rows. +class ICEBERG_EXPORT VariantType : public Type { + public: + constexpr static const TypeId kTypeId = TypeId::kVariant; + + VariantType() = default; + ~VariantType() override = default; + + bool is_primitive() const override { return false; } + bool is_nested() const override { return false; } + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// @} + /// \defgroup type-primitive Primitive Types /// Primitive types do not have nested fields. /// @{ @@ -518,11 +542,59 @@ class ICEBERG_EXPORT UnknownType : public PrimitiveType { bool Equals(const Type& other) const override; }; +/// \brief A data type representing OGC geometry in WKB format. +class ICEBERG_EXPORT GeometryType : public PrimitiveType { + public: + constexpr static const TypeId kTypeId = TypeId::kGeometry; + constexpr static std::string_view kDefaultCrs = "OGC:CRS84"; + + GeometryType() = default; + explicit GeometryType(std::string crs); + ~GeometryType() override = default; + + [[nodiscard]] std::string_view crs() const; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; + + private: + std::optional crs_; +}; + +/// \brief A data type representing OGC geography in WKB format. +class ICEBERG_EXPORT GeographyType : public PrimitiveType { + public: + constexpr static const TypeId kTypeId = TypeId::kGeography; + constexpr static std::string_view kDefaultCrs = "OGC:CRS84"; + constexpr static EdgeAlgorithm kDefaultAlgorithm = EdgeAlgorithm::kSpherical; + + GeographyType() = default; + explicit GeographyType(std::string crs); + GeographyType(std::string crs, EdgeAlgorithm algorithm); + ~GeographyType() override = default; + + [[nodiscard]] std::string_view crs() const; + [[nodiscard]] EdgeAlgorithm algorithm() const; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; + + private: + std::optional crs_; + std::optional algorithm_; +}; + /// @} -/// \defgroup type-factories Factory functions for creating primitive data types +/// \defgroup type-factories Factory functions for creating data types /// -/// Factory functions for creating primitive data types +/// Factory functions for creating data types /// @{ /// \brief Return a BooleanType instance. @@ -555,6 +627,12 @@ ICEBERG_EXPORT const std::shared_ptr& string(); ICEBERG_EXPORT const std::shared_ptr& uuid(); /// \brief Return an UnknownType instance. ICEBERG_EXPORT const std::shared_ptr& unknown(); +/// \brief Return a VariantType instance. +ICEBERG_EXPORT const std::shared_ptr& variant(); +/// \brief Return the default GeometryType instance. +ICEBERG_EXPORT const std::shared_ptr& geometry(); +/// \brief Return the default GeographyType instance. +ICEBERG_EXPORT const std::shared_ptr& geography(); /// \brief Create a DecimalType with the given precision and scale. /// \param precision The number of decimal digits (max 38). @@ -567,6 +645,16 @@ ICEBERG_EXPORT std::shared_ptr decimal(int32_t precision, int32_t s /// \return A shared pointer to the FixedType instance. ICEBERG_EXPORT std::shared_ptr fixed(int32_t length); +/// \brief Create a GeometryType with the given CRS. +ICEBERG_EXPORT std::shared_ptr geometry(std::string crs); + +/// \brief Create a GeographyType with the given CRS. +ICEBERG_EXPORT std::shared_ptr geography(std::string crs); + +/// \brief Create a GeographyType with the given CRS and edge algorithm. +ICEBERG_EXPORT std::shared_ptr geography(std::string crs, + EdgeAlgorithm algorithm); + /// \brief Create a StructType with the given fields. /// \param fields The fields of the struct. /// \return A shared pointer to the StructType instance. @@ -594,4 +682,10 @@ ICEBERG_EXPORT std::shared_ptr map(SchemaField key, SchemaField value); /// \return A string_view containing the lowercase type name ICEBERG_EXPORT std::string_view ToString(TypeId id); +/// \brief Get the lowercase string representation of an EdgeAlgorithm. +ICEBERG_EXPORT std::string_view ToString(EdgeAlgorithm algorithm); + +/// \brief Parse a lowercase edge algorithm name. +ICEBERG_EXPORT Result EdgeAlgorithmFromString(std::string_view name); + } // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index bb4da67e5..6f844e4ef 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -53,6 +53,9 @@ enum class TypeId { kFixed, kBinary, kUnknown, + kVariant, + kGeometry, + kGeography, }; /// \brief The time unit. In Iceberg V3 nanoseconds are also supported. @@ -61,6 +64,15 @@ enum class TimeUnit { kNanosecond, }; +/// \brief The algorithm used to interpolate geography edges. +enum class EdgeAlgorithm { + kSpherical, + kVincenty, + kThomas, + kAndoyer, + kKarney, +}; + /// \brief Data type family. class BinaryType; class BooleanType; @@ -86,6 +98,9 @@ class TimestampTzNsType; class Type; class UnknownType; class UuidType; +class VariantType; +class GeographyType; +class GeometryType; /// \brief Data values. class Decimal; diff --git a/src/iceberg/update/update_schema.cc b/src/iceberg/update/update_schema.cc index 0e7f147b0..5c50ee41d 100644 --- a/src/iceberg/update/update_schema.cc +++ b/src/iceberg/update/update_schema.cc @@ -181,6 +181,12 @@ class ApplyChangesVisitor { return base_type; } + Result> VisitVariant(const VariantType& variant_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + return base_type; + } + private: Result> ProcessField( const SchemaField& field, const std::shared_ptr& field_type_result) { diff --git a/src/iceberg/util/struct_like_set.cc b/src/iceberg/util/struct_like_set.cc index 12648ea5e..35a0f9e28 100644 --- a/src/iceberg/util/struct_like_set.cc +++ b/src/iceberg/util/struct_like_set.cc @@ -342,6 +342,11 @@ Status ValidateScalarAgainstType(const Scalar& scalar, const Type& type) { return ValidateMapLikeAgainstType(*map, internal::checked_cast(type)); } + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Scalar validation for type {} is not supported", + type.ToString()); } std::unreachable(); diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc index cb01be08f..9dfe47fef 100644 --- a/src/iceberg/util/type_util.cc +++ b/src/iceberg/util/type_util.cc @@ -37,6 +37,8 @@ IdToFieldVisitor::IdToFieldVisitor( Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; } +Status IdToFieldVisitor::Visit(const VariantType& type) { return {}; } + Status IdToFieldVisitor::Visit(const NestedType& type) { const auto& nested = internal::checked_cast(type); const auto& fields = nested.fields(); @@ -128,6 +130,11 @@ Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path return {}; } +Status NameToIdVisitor::Visit(const VariantType& type, const std::string& path, + const std::string& short_path) { + return {}; +} + std::string NameToIdVisitor::BuildPath(std::string_view prefix, std::string_view field_name, bool case_sensitive) { std::string quoted_name; @@ -168,6 +175,20 @@ Status PositionPathVisitor::Visit(const PrimitiveType& type) { return {}; } +Status PositionPathVisitor::Visit(const VariantType& type) { + if (current_field_id_ == kUnassignedFieldId) { + return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); + } + + if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); + !ret.second) { + return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", + current_field_id_, ret.first->second, current_path_); + } + + return {}; +} + Status PositionPathVisitor::Visit(const StructType& type) { for (size_t i = 0; i < type.fields().size(); ++i) { const auto& field = type.fields()[i]; @@ -208,8 +229,8 @@ Result> PruneColumnVisitor::Visit( Result> PruneColumnVisitor::Visit(const SchemaField& field) const { if (selected_ids_.contains(field.field_id())) { - return (select_full_types_ || field.type()->is_primitive()) ? field.type() - : Visit(field.type()); + return (select_full_types_ || !field.type()->is_nested()) ? field.type() + : Visit(field.type()); } return Visit(field.type()); } @@ -278,6 +299,8 @@ GetProjectedIdsVisitor::GetProjectedIdsVisitor(bool include_struct_ids) Status GetProjectedIdsVisitor::Visit(const Type& type) { if (type.is_nested()) { return VisitNested(internal::checked_cast(type)); + } else if (type.type_id() == TypeId::kVariant) { + return {}; } else { return VisitPrimitive(internal::checked_cast(type)); } @@ -288,9 +311,8 @@ Status GetProjectedIdsVisitor::VisitNested(const NestedType& type) { ICEBERG_RETURN_UNEXPECTED(Visit(*field.type())); } for (auto& field : type.fields()) { - // TODO(zhuo.wang) or is_variant if ((include_struct_ids_ && field.type()->type_id() == TypeId::kStruct) || - field.type()->is_primitive()) { + !field.type()->is_nested()) { ids_.insert(field.field_id()); } } diff --git a/src/iceberg/util/type_util.h b/src/iceberg/util/type_util.h index 8fd5ef19f..8623ad254 100644 --- a/src/iceberg/util/type_util.h +++ b/src/iceberg/util/type_util.h @@ -45,6 +45,7 @@ class IdToFieldVisitor { std::unordered_map>& id_to_field); Status Visit(const PrimitiveType& type); + Status Visit(const VariantType& type); Status Visit(const NestedType& type); private: @@ -67,6 +68,8 @@ class NameToIdVisitor { const std::string& short_path); Status Visit(const PrimitiveType& type, const std::string& path, const std::string& short_path); + Status Visit(const VariantType& type, const std::string& path, + const std::string& short_path); void Finish(); private: @@ -85,6 +88,7 @@ class NameToIdVisitor { class PositionPathVisitor { public: Status Visit(const PrimitiveType& type); + Status Visit(const VariantType& type); Status Visit(const StructType& type); Status Visit(const ListType& type); Status Visit(const MapType& type); diff --git a/src/iceberg/util/visit_type.h b/src/iceberg/util/visit_type.h index bf52d2e9a..73fbbb5fd 100644 --- a/src/iceberg/util/visit_type.h +++ b/src/iceberg/util/visit_type.h @@ -127,21 +127,24 @@ inline Status VisitTypeIdInline(TypeId id, VISITOR* visitor, ARGS&&... args) { /// \brief Visit a type using a categorical visitor pattern /// /// This function provides a simplified visitor interface that groups Iceberg types into -/// four categories based on their structural properties: +/// five categories based on their structural properties: /// /// - **Struct types**: Complex types with named fields (StructType) /// - **List types**: Sequential container types (ListType) /// - **Map types**: Key-value container types (MapType) -/// - **Primitive types**: All leaf types without nested structure (14 primitive types) +/// - **Variant type**: Semi-structured type that is neither nested nor primitive +/// (VariantType) +/// - **Primitive types**: All leaf types without nested structure (primitive types) /// /// This grouping is useful for algorithms that need to distinguish between container /// types and leaf types, but don't require separate handling for each primitive type /// variant (e.g., Int vs Long vs String). /// -/// \tparam VISITOR Visitor class that must implement four Visit methods: +/// \tparam VISITOR Visitor class that must implement five Visit methods: /// - `VisitStruct(const StructType&, ARGS...)` for struct types /// - `VisitList(const ListType&, ARGS...)` for list types /// - `VisitMap(const MapType&, ARGS...)` for map types +/// - `VisitVariant(const VariantType&, ARGS...)` for the variant type /// - `VisitPrimitive(const PrimitiveType&, ARGS...)` for all primitive types /// \tparam ARGS Additional argument types forwarded to Visit methods /// \param type The type to visit diff --git a/src/iceberg/util/visitor_generate.h b/src/iceberg/util/visitor_generate.h index a5b0c2ced..ad6f5eb2d 100644 --- a/src/iceberg/util/visitor_generate.h +++ b/src/iceberg/util/visitor_generate.h @@ -39,6 +39,9 @@ namespace iceberg { ACTION(Fixed); \ ACTION(Binary); \ ACTION(Unknown); \ + ACTION(Variant); \ + ACTION(Geometry); \ + ACTION(Geography); \ ACTION(Struct); \ ACTION(List); \ ACTION(Map); @@ -49,7 +52,12 @@ namespace iceberg { /// - Struct types -> calls ACTION with Struct /// - List types -> calls ACTION with List /// - Map types -> calls ACTION with Map +/// - Variant type -> calls ACTION with Variant /// - All primitive types (default) -> calls ACTION with Primitive +/// +/// Variant is dispatched explicitly because it is neither a nested nor a primitive +/// type, so it must not be routed into the primitive default (which would cast it to +/// PrimitiveType). #define ICEBERG_TYPE_SWITCH_WITH_PRIMITIVE_DEFAULT(ACTION) \ case ::iceberg::TypeId::kStruct: \ ACTION(Struct) \ @@ -57,6 +65,8 @@ namespace iceberg { ACTION(List) \ case ::iceberg::TypeId::kMap: \ ACTION(Map) \ + case ::iceberg::TypeId::kVariant: \ + ACTION(Variant) \ default: \ ACTION(Primitive) From 570060e2c7ee3c293b5706a7b6396bcb87ac6421 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 17 Jun 2026 22:59:14 +0800 Subject: [PATCH 2/4] resolve review comments --- src/iceberg/json_serde.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 83486c46f..1f95892cd 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -546,10 +546,10 @@ Result> TypeFromJson(const nlohmann::json& json) { } return JsonParseError("Invalid decimal type: {}", type_str); } else if (lower_type_str.starts_with("geometry")) { - static const std::regex geometry_regex(R"(\s*(?:\(\s*([^)]*?)\s*\))?)"); + static const std::regex geometry_regex(R"(geometry\s*(?:\(\s*([^)]*?)\s*\))?)", + std::regex_constants::icase); std::smatch match; - const auto type_params = type_str.substr(std::string_view("geometry").size()); - if (std::regex_match(type_params, match, geometry_regex)) { + if (std::regex_match(type_str, match, geometry_regex)) { if (match[1].matched) { auto crs = match[1].str(); if (crs.empty()) { @@ -562,10 +562,10 @@ Result> TypeFromJson(const nlohmann::json& json) { return JsonParseError("Invalid geometry type: {}", type_str); } else if (lower_type_str.starts_with("geography")) { static const std::regex geography_regex( - R"(\s*(?:\(\s*([^,]*?)\s*(?:,\s*(\w*)\s*)?\))?)"); + R"(geography\s*(?:\(\s*([^,]*?)\s*(?:,\s*(\w*)\s*)?\))?)", + std::regex_constants::icase); std::smatch match; - const auto type_params = type_str.substr(std::string_view("geography").size()); - if (std::regex_match(type_params, match, geography_regex)) { + if (std::regex_match(type_str, match, geography_regex)) { auto crs = match[1].str(); if (match[1].matched && crs.empty()) { return JsonParseError("Invalid geography type: {}", type_str); From f1b07112666871c03074b431ca216639e7cbe584 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 18 Jun 2026 17:06:52 +0800 Subject: [PATCH 3/4] fix: more review comments --- src/iceberg/type.cc | 24 +++++++++++------------- src/iceberg/type.h | 41 ++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 7e9d3dec5..d175dd34a 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -367,21 +367,21 @@ GeometryType::GeometryType(std::string crs) { } std::string_view GeometryType::crs() const { - return crs_.has_value() ? std::string_view(*crs_) : kDefaultCrs; + return crs_.empty() ? kDefaultCrs : std::string_view(crs_); } TypeId GeometryType::type_id() const { return kTypeId; } std::string GeometryType::ToString() const { - if (!crs_.has_value()) { + if (crs_.empty()) { return "geometry"; } - return std::format("geometry({})", *crs_); + return std::format("geometry({})", crs_); } bool GeometryType::Equals(const Type& other) const { if (other.type_id() != kTypeId) { return false; } const auto& geometry = static_cast(other); - return crs() == geometry.crs(); + return crs_ == geometry.crs_; } GeographyType::GeographyType(std::string crs) { @@ -400,18 +400,16 @@ GeographyType::GeographyType(std::string crs, EdgeAlgorithm algorithm) } std::string_view GeographyType::crs() const { - return crs_.has_value() ? std::string_view(*crs_) : kDefaultCrs; -} -EdgeAlgorithm GeographyType::algorithm() const { - return algorithm_.value_or(kDefaultAlgorithm); + return crs_.empty() ? kDefaultCrs : std::string_view(crs_); } +EdgeAlgorithm GeographyType::algorithm() const { return algorithm_; } TypeId GeographyType::type_id() const { return kTypeId; } std::string GeographyType::ToString() const { - if (algorithm_.has_value()) { - return std::format("geography({}, {})", crs(), iceberg::ToString(*algorithm_)); + if (algorithm_ != kDefaultAlgorithm) { + return std::format("geography({}, {})", crs(), iceberg::ToString(algorithm_)); } - if (crs_.has_value()) { - return std::format("geography({})", *crs_); + if (!crs_.empty()) { + return std::format("geography({})", crs_); } return "geography"; } @@ -420,7 +418,7 @@ bool GeographyType::Equals(const Type& other) const { return false; } const auto& geography = static_cast(other); - return crs() == geography.crs() && algorithm() == geography.algorithm(); + return crs_ == geography.crs_ && algorithm_ == geography.algorithm_; } FixedType::FixedType(int32_t length) : length_(length) { diff --git a/src/iceberg/type.h b/src/iceberg/type.h index fd37196e6..acaa6a211 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -46,20 +46,20 @@ class ICEBERG_EXPORT Type : public iceberg::util::Formattable { ~Type() override = default; /// \brief Get the type ID. - [[nodiscard]] virtual TypeId type_id() const = 0; + virtual TypeId type_id() const = 0; /// \brief Is this a primitive type (may not have child fields)? - [[nodiscard]] virtual bool is_primitive() const = 0; + virtual bool is_primitive() const = 0; /// \brief Is this a nested type (may have child fields)? - [[nodiscard]] virtual bool is_nested() const = 0; + virtual bool is_nested() const = 0; /// \brief Compare two types for equality. friend bool operator==(const Type& lhs, const Type& rhs) { return lhs.Equals(rhs); } protected: /// \brief Compare two types for equality. - [[nodiscard]] virtual bool Equals(const Type& other) const = 0; + virtual bool Equals(const Type& other) const = 0; }; /// \brief A data type that does not have child fields. @@ -76,28 +76,27 @@ class ICEBERG_EXPORT NestedType : public Type { bool is_nested() const override { return true; } /// \brief Get a view of the child fields. - [[nodiscard]] virtual std::span fields() const = 0; + virtual std::span fields() const = 0; using SchemaFieldConstRef = std::reference_wrapper; /// \brief Get a field by field ID. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual Result> GetFieldById( + virtual Result> GetFieldById( int32_t field_id) const = 0; /// \brief Get a field by index. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual Result> GetFieldByIndex( + virtual Result> GetFieldByIndex( int32_t index) const = 0; /// \brief Get a field by name. Return an error Status if /// the field name is not unique; prefer GetFieldById or GetFieldByIndex /// when possible. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual Result> GetFieldByName( + virtual Result> GetFieldByName( std::string_view name, bool case_sensitive) const = 0; /// \brief Get a field by name (case-sensitive). - [[nodiscard]] Result> GetFieldByName( - std::string_view name) const; + Result> GetFieldByName(std::string_view name) const; }; /// \defgroup type-nested Nested Types @@ -324,10 +323,10 @@ class ICEBERG_EXPORT DecimalType : public PrimitiveType { ~DecimalType() override = default; /// \brief Get the precision (the number of decimal digits). - [[nodiscard]] int32_t precision() const; + int32_t precision() const; /// \brief Get the scale (essentially, the number of decimal digits after /// the decimal point; precisely, the value is scaled by $$10^{-s}$$.). - [[nodiscard]] int32_t scale() const; + int32_t scale() const; TypeId type_id() const override; std::string ToString() const override; @@ -377,9 +376,9 @@ class ICEBERG_EXPORT TimeType : public PrimitiveType { class ICEBERG_EXPORT TimestampBase : public PrimitiveType { public: /// \brief Is this type zoned or naive? - [[nodiscard]] virtual bool is_zoned() const = 0; + virtual bool is_zoned() const = 0; /// \brief The time resolution. - [[nodiscard]] virtual TimeUnit time_unit() const = 0; + virtual TimeUnit time_unit() const = 0; }; /// \brief A data type representing a timestamp in microseconds without @@ -499,7 +498,7 @@ class ICEBERG_EXPORT FixedType : public PrimitiveType { ~FixedType() override = default; /// \brief The length (the number of bytes to store). - [[nodiscard]] int32_t length() const; + int32_t length() const; TypeId type_id() const override; std::string ToString() const override; @@ -552,7 +551,7 @@ class ICEBERG_EXPORT GeometryType : public PrimitiveType { explicit GeometryType(std::string crs); ~GeometryType() override = default; - [[nodiscard]] std::string_view crs() const; + std::string_view crs() const; TypeId type_id() const override; std::string ToString() const override; @@ -561,7 +560,7 @@ class ICEBERG_EXPORT GeometryType : public PrimitiveType { bool Equals(const Type& other) const override; private: - std::optional crs_; + std::string crs_; }; /// \brief A data type representing OGC geography in WKB format. @@ -576,8 +575,8 @@ class ICEBERG_EXPORT GeographyType : public PrimitiveType { GeographyType(std::string crs, EdgeAlgorithm algorithm); ~GeographyType() override = default; - [[nodiscard]] std::string_view crs() const; - [[nodiscard]] EdgeAlgorithm algorithm() const; + std::string_view crs() const; + EdgeAlgorithm algorithm() const; TypeId type_id() const override; std::string ToString() const override; @@ -586,8 +585,8 @@ class ICEBERG_EXPORT GeographyType : public PrimitiveType { bool Equals(const Type& other) const override; private: - std::optional crs_; - std::optional algorithm_; + std::string crs_; + EdgeAlgorithm algorithm_ = kDefaultAlgorithm; }; /// @} From de22931b1f61e5e1c33f6a4e22aa75af3afe7888 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 19 Jun 2026 12:44:07 +0800 Subject: [PATCH 4/4] variant is not primitive, change the tests accordingly --- src/iceberg/test/type_test.cc | 21 +++++++++++++-------- src/iceberg/test/visit_type_test.cc | 20 ++++++++++++-------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/iceberg/test/type_test.cc b/src/iceberg/test/type_test.cc index b0fb39f81..7be1239f9 100644 --- a/src/iceberg/test/type_test.cc +++ b/src/iceberg/test/type_test.cc @@ -94,7 +94,7 @@ TEST_P(TypeTest, StdFormat) { ASSERT_EQ(test_case.repr, std::format("{}", *test_case.type)); } -const static std::array kPrimitiveTypes = {{ +const static std::array kPrimitiveTypes = {{ { .name = "boolean", .type = iceberg::boolean(), @@ -228,13 +228,6 @@ const static std::array kPrimitiveTypes = {{ .primitive = true, .repr = "unknown", }, - { - .name = "variant", - .type = iceberg::variant(), - .type_id = iceberg::TypeId::kVariant, - .primitive = false, - .repr = "variant", - }, { .name = "geometry", .type = iceberg::geometry(), @@ -251,6 +244,14 @@ const static std::array kPrimitiveTypes = {{ }, }}; +const static TypeTestCase kVariantType = { + .name = "variant", + .type = iceberg::variant(), + .type_id = iceberg::TypeId::kVariant, + .primitive = false, + .repr = "variant", +}; + const static std::array kNestedTypes = {{ { .name = "list_int", @@ -298,6 +299,9 @@ const static std::array kNestedTypes = {{ INSTANTIATE_TEST_SUITE_P(Primitive, TypeTest, ::testing::ValuesIn(kPrimitiveTypes), TypeTestCaseToString); +INSTANTIATE_TEST_SUITE_P(Variant, TypeTest, ::testing::Values(kVariantType), + TypeTestCaseToString); + INSTANTIATE_TEST_SUITE_P(Nested, TypeTest, ::testing::ValuesIn(kNestedTypes), TypeTestCaseToString); @@ -306,6 +310,7 @@ TEST(TypeTest, Equality) { for (const auto& test_case : kPrimitiveTypes) { alltypes.push_back(test_case.type); } + alltypes.push_back(kVariantType.type); for (const auto& test_case : kNestedTypes) { alltypes.push_back(test_case.type); } diff --git a/src/iceberg/test/visit_type_test.cc b/src/iceberg/test/visit_type_test.cc index 35c307326..a6bd9f8c6 100644 --- a/src/iceberg/test/visit_type_test.cc +++ b/src/iceberg/test/visit_type_test.cc @@ -54,7 +54,7 @@ std::string TypeTestCaseToString(const ::testing::TestParamInfo& i return info.param.name; } -const static std::array kPrimitiveTypes = {{ +const static std::array kPrimitiveTypes = {{ { .name = "boolean", .type = iceberg::boolean(), @@ -188,13 +188,6 @@ const static std::array kPrimitiveTypes = {{ .primitive = true, .repr = "unknown", }, - { - .name = "variant", - .type = iceberg::variant(), - .type_id = iceberg::TypeId::kVariant, - .primitive = false, - .repr = "variant", - }, { .name = "geometry", .type = iceberg::geometry(), @@ -211,6 +204,14 @@ const static std::array kPrimitiveTypes = {{ }, }}; +const static TypeTestCase kVariantType = { + .name = "variant", + .type = iceberg::variant(), + .type_id = iceberg::TypeId::kVariant, + .primitive = false, + .repr = "variant", +}; + const static std::array kNestedTypes = {{ { .name = "list_int", @@ -262,6 +263,9 @@ class VisitTypeTest : public ::testing::TestWithParam {}; INSTANTIATE_TEST_SUITE_P(Primitive, VisitTypeTest, ::testing::ValuesIn(kPrimitiveTypes), TypeTestCaseToString); +INSTANTIATE_TEST_SUITE_P(Variant, VisitTypeTest, ::testing::Values(kVariantType), + TypeTestCaseToString); + INSTANTIATE_TEST_SUITE_P(Nested, VisitTypeTest, ::testing::ValuesIn(kNestedTypes), TypeTestCaseToString);