From 0fbf71ddc377d47fd38f3dd99b0373e9f256323a Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 10:45:45 +0100 Subject: [PATCH 01/39] Add draft for basic extension type support --- datafusion/core/src/dataframe/mod.rs | 1 + .../core/src/execution/session_state.rs | 31 ++++++++++++ .../array_formatter_factory.rs | 50 +++++++++++++++++++ datafusion/core/src/extension_types/mod.rs | 5 ++ datafusion/core/src/lib.rs | 1 + datafusion/ffi/src/session/mod.rs | 2 +- 6 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 datafusion/core/src/extension_types/array_formatter_factory.rs create mode 100644 datafusion/core/src/extension_types/mod.rs diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 0f38988c69405..08a3f18992b3c 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -68,6 +68,7 @@ use datafusion_functions_aggregate::expr_fn::{ avg, count, max, median, min, stddev, sum, }; +use crate::extension_types::DFArrayFormatterFactory; use async_trait::async_trait; use datafusion_catalog::Session; use datafusion_expr::extension_types::DFArrayFormatterFactory; diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index de5e6b97c1af9..88c3ba2cb0f58 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -30,6 +30,7 @@ use crate::datasource::provider_as_source; use crate::execution::SessionStateDefaults; use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner}; use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}; +use arrow_schema::extension::ExtensionType; use arrow_schema::{DataType, FieldRef}; use datafusion_catalog::MemoryCatalogProviderList; use datafusion_catalog::information_schema::{ @@ -84,6 +85,7 @@ use datafusion_sql::{ use async_trait::async_trait; use chrono::{DateTime, Utc}; +use 
datafusion_common::types::UuidDFExtensionType; use itertools::Itertools; use log::{debug, info}; use object_store::ObjectStore; @@ -2266,6 +2268,35 @@ impl datafusion_execution::TaskContextProvider for SessionState { } } +impl ExtensionTypeRegistry for SessionState { + fn extension_type_registration( + &self, + name: &str, + ) -> datafusion_common::Result { + self.extension_types.extension_type_registration(name) + } + + fn extension_type_registrations(&self) -> Vec> { + self.extension_types.extension_type_registrations() + } + + fn add_extension_type_registration( + &self, + extension_type: ExtensionTypeRegistrationRef, + ) -> datafusion_common::Result> { + self.extension_types + .add_extension_type_registration(extension_type) + } + + fn remove_extension_type_registration( + &self, + name: &str, + ) -> datafusion_common::Result> { + self.extension_types + .remove_extension_type_registration(name) + } +} + impl OptimizerConfig for SessionState { fn query_execution_start_time(&self) -> Option> { self.execution_props.query_execution_start_time diff --git a/datafusion/core/src/extension_types/array_formatter_factory.rs b/datafusion/core/src/extension_types/array_formatter_factory.rs new file mode 100644 index 0000000000000..2c0ba5e4e3b9d --- /dev/null +++ b/datafusion/core/src/extension_types/array_formatter_factory.rs @@ -0,0 +1,50 @@ +use arrow::array::Array; +use arrow::util::display::{ArrayFormatter, ArrayFormatterFactory, FormatOptions}; +use arrow_schema::{ArrowError, Field}; +use datafusion_expr::registry::ExtensionTypeRegistryRef; + +/// A factory for creating [`ArrayFormatter`]s that checks whether a registered extension type can +/// format a given array based on its metadata. +#[derive(Debug)] +pub struct DFArrayFormatterFactory { + /// The extension type registry + registry: ExtensionTypeRegistryRef, +} + +impl DFArrayFormatterFactory { + /// Creates a new [`DFArrayFormatterFactory`]. 
+ pub fn new(registry: ExtensionTypeRegistryRef) -> Self { + Self { registry } + } +} + +impl ArrayFormatterFactory for DFArrayFormatterFactory { + fn create_array_formatter<'formatter>( + &self, + array: &'formatter dyn Array, + options: &FormatOptions<'formatter>, + field: Option<&'formatter Field>, + ) -> Result>, ArrowError> { + let Some(field) = field else { + return Ok(None); + }; + + let Some(extension_type_name) = field.extension_type_name() else { + return Ok(None); + }; + + let Some(registration) = self + .registry + .extension_type_registration(extension_type_name) + .ok() + else { + // If the extension type is not registered, we fall back to the default formatter + return Ok(None); + }; + + registration + .create_df_extension_type(field.extension_type_metadata())? + .create_array_formatter(array, options) + .map_err(ArrowError::from) + } +} diff --git a/datafusion/core/src/extension_types/mod.rs b/datafusion/core/src/extension_types/mod.rs new file mode 100644 index 0000000000000..da5e54099e685 --- /dev/null +++ b/datafusion/core/src/extension_types/mod.rs @@ -0,0 +1,5 @@ +//! This module contains code that enables DataFusion's extension type capabilities. 
+ +mod array_formatter_factory; + +pub use array_formatter_factory::*; diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 1d8368f54ba20..1f2ef15a356dc 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -894,6 +894,7 @@ pub mod variable { #[cfg(not(target_arch = "wasm32"))] pub mod test; +mod extension_types; mod schema_equivalence; pub mod test_util; diff --git a/datafusion/ffi/src/session/mod.rs b/datafusion/ffi/src/session/mod.rs index dfc9d1c7dfebd..3363edd73610e 100644 --- a/datafusion/ffi/src/session/mod.rs +++ b/datafusion/ffi/src/session/mod.rs @@ -49,7 +49,7 @@ use stabby::str::Str as SStr; use stabby::string::String as SString; use stabby::vec::Vec as SVec; use tokio::runtime::Handle; - +use datafusion_expr::registry::{ExtensionTypeRegistry, ExtensionTypeRegistryRef}; use crate::arrow_wrappers::WrappedSchema; use crate::execution::FFI_TaskContext; use crate::execution_plan::FFI_ExecutionPlan; From a1a3ca6d9a436bc23ab27143ff0f9ddbf2a8e47f Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 11:34:12 +0100 Subject: [PATCH 02/39] Add an example for custom extension types --- .../examples/extension_types/event_id.rs | 328 ++++++++++++++++++ .../core/src/execution/session_state.rs | 8 +- 2 files changed, 331 insertions(+), 5 deletions(-) create mode 100644 datafusion-examples/examples/extension_types/event_id.rs diff --git a/datafusion-examples/examples/extension_types/event_id.rs b/datafusion-examples/examples/extension_types/event_id.rs new file mode 100644 index 0000000000000..bcdb048d237d4 --- /dev/null +++ b/datafusion-examples/examples/extension_types/event_id.rs @@ -0,0 +1,328 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Array, RecordBatch, StringArray, UInt32Array}; +use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; +use arrow_schema::extension::ExtensionType; +use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; +use datafusion::dataframe::DataFrame; +use datafusion::error::Result; +use datafusion::execution::SessionStateBuilder; +use datafusion::prelude::SessionContext; +use datafusion_common::internal_err; +use datafusion_common::types::{DFExtensionType, DFExtensionTypeRef}; +use datafusion_expr::registry::{ + ExtensionTypeRegistration, ExtensionTypeRegistry, MemoryExtensionTypeRegistry, +}; +use std::fmt::Write; +use std::sync::Arc; + +/// This example demonstrates using DataFusion's extension type API to create a custom identifier +/// type [`EventIdExtensionType`]. +/// +/// The following use cases are demonstrated: +/// - Use a custom implementation for pretty-printing data frames. +pub async fn event_id_example() -> Result<()> { + let ctx = create_session_context()?; + register_events_table(&ctx).await?; + + // Print the example table with the custom pretty-printer. + ctx.table("example").await?.show().await +} + +/// Creates the DataFusion session context with the custom extension type implementation. 
+fn create_session_context() -> Result { + // Create a registry with a reference to the custom extension type implementation. + let registry = MemoryExtensionTypeRegistry::new(); + let event_id_registration = Arc::new(EventIdExtensionTypeRegistration {}); + registry.add_extension_type_registration(event_id_registration)?; + + // Set the extension type registry in the session state so that DataFusion can use it. + let state = SessionStateBuilder::default() + .with_extension_type_registry(Arc::new(registry)) + .build(); + Ok(SessionContext::new_with_state(state)) +} + +/// Registers the example table and returns the data frame. +async fn register_events_table(ctx: &SessionContext) -> Result { + let schema = example_schema(); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt32Array::from(vec![ + 20_01_000000, + 20_01_000001, + 21_03_000000, + 21_03_000001, + 21_03_000002, + ])), + Arc::new(UInt32Array::from(vec![ + 2020_01_0000, + 2020_01_0001, + 2021_03_0000, + 2021_03_0001, + 2021_03_0002, + ])), + Arc::new(StringArray::from(vec![ + "First Event Jan 2020", + "Second Event Jan 2020", + "First Event Mar 2021", + "Second Event Mar 2021", + "Third Event Mar 2021", + ])), + ], + )?; + + // Register the table and return the data frame. + ctx.register_batch("example", batch)?; + ctx.table("example").await +} + +/// The schema of the example table. +fn example_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("event_id_short", DataType::UInt32, false) + .with_extension_type(EventIdExtensionType(IdYearMode::Short)), + Field::new("event_id_long", DataType::UInt32, false) + .with_extension_type(EventIdExtensionType(IdYearMode::Long)), + Field::new("name", DataType::Utf8, false), + ])) +} + +/// Represents a 32-bit custom identifier that represents a single event. Using this format is not +/// a good idea in practice, but it is useful for demonstrating the API usage. 
+/// +/// An event is constructed of three parts: +/// - The year +/// - The month +/// - An auto-incrementing counter within the month +/// +/// For example, the event id `2024-01-0000` represents the first event in 2024. +/// +/// # Year Mode +/// +/// In addition, each event id can be represented in two modes. A short year mode `24-01-000000` and +/// a long year mode `2024-01-0000`. This showcases how extension types can be parameterized using +/// metadata. +#[derive(Debug)] +pub struct EventIdExtensionType(IdYearMode); + +/// Represents whether the id uses the short or long format. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum IdYearMode { + /// The short year format (e.g., `24-01-000000`). Allows for more events per month. + Short, + /// The long year format (e.g., `2024-01-0000`). Allows distinguishing between centuries. + Long, +} + +/// Implementation of [`ExtensionType`] for [`EventIdExtensionType`]. +/// +/// This is for the arrow-rs side of the API usage. The [`ExtensionType::Metadata`] type provides +/// static guarantees on the deserialized metadata for the extension type. We will use this +/// implementation to read and write the type metadata to arrow [`Field`]s. +/// +/// This trait does allow users to customize the behavior of DataFusion for this extension type. +/// This is done in [`DFExtensionType`]. +impl ExtensionType for EventIdExtensionType { + const NAME: &'static str = "custom.event_id"; + type Metadata = IdYearMode; + + fn metadata(&self) -> &Self::Metadata { + &self.0 + } + + fn serialize_metadata(&self) -> Option { + // Arrow extension type metadata is encoded as a string. We simply use the lowercase name. + // In a more involved scenario, more complex serialization formats such as JSON are + // appropriate. 
+ Some(format!("{:?}", self.0).to_lowercase()) + } + + fn deserialize_metadata( + metadata: Option<&str>, + ) -> std::result::Result { + match metadata { + None => Err(ArrowError::InvalidArgumentError( + "Event id type requires metadata".to_owned(), + )), + Some(metadata) => match metadata { + "short" => Ok(IdYearMode::Short), + "long" => Ok(IdYearMode::Long), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Invalid metadata for event id type: {}", + metadata + ))), + }, + } + } + + fn supports_data_type( + &self, + data_type: &DataType, + ) -> std::result::Result<(), ArrowError> { + match data_type { + DataType::UInt32 => Ok(()), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Invalid data type: {data_type} for event id type", + ))), + } + } + + fn try_new( + data_type: &DataType, + metadata: Self::Metadata, + ) -> std::result::Result { + let instance = Self(metadata); + instance.supports_data_type(data_type)?; // Check that the data type is supported. + Ok(instance) + } +} + +/// Implementation of [`ExtensionType`] for [`EventIdExtensionType`]. +/// +/// This is for the DataFusion side of the API usage. Here users can override the default behavior +/// of DataFusion for supported extension points. +impl DFExtensionType for EventIdExtensionType { + fn create_array_formatter<'fmt>( + &self, + array: &'fmt dyn Array, + options: &FormatOptions<'fmt>, + ) -> Result>> { + if array.data_type() != &DataType::UInt32 { + return internal_err!("Wrong array type for Event Id"); + } + + // Create the formatter and pass in the year formatting mode of the type + let display_index = EventIdDisplayIndex { + array: array.as_any().downcast_ref().unwrap(), + null_str: options.null(), + mode: self.0, + }; + Ok(Some(ArrayFormatter::new( + Box::new(display_index), + options.safe(), + ))) + } +} + +/// Pretty printer for event ids. 
+#[derive(Debug)] +struct EventIdDisplayIndex<'a> { + array: &'a UInt32Array, + null_str: &'a str, + mode: IdYearMode, +} + +/// This implements the arrow-rs API for printing individual values of a column. DataFusion will +/// automatically pass in the reference to this implementation if a column is annotated with the +/// extension type metadata. +impl DisplayIndex for EventIdDisplayIndex<'_> { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + // Handle nulls first + if self.array.is_null(idx) { + write!(f, "{}", self.null_str)?; + return Ok(()); + } + + let value = self.array.value(idx); + + match self.mode { + IdYearMode::Short => { + // Format: YY-MM-CCCCCC + // Logic: + // - The last 6 digits are the counter. + // - The next 2 digits are the month. + // - The remaining digits are the year. + let counter = value % 1_000_000; + let rest = value / 1_000_000; + let month = rest % 100; + let year = rest / 100; + + write!(f, "{:02}-{:02}-{:06}", year, month, counter)?; + } + IdYearMode::Long => { + // Format: YYYY-MM-CCCC + // Logic: + // - The last 4 digits are the counter. + // - The next 2 digits are the month. + // - The remaining digits are the year. + let counter = value % 10_000; + let rest = value / 10_000; + let month = rest % 100; + let year = rest / 100; + + write!(f, "{:04}-{:02}-{:04}", year, month, counter)?; + } + } + Ok(()) + } +} + +/// The registration is the last piece missing for the extension type implementation. It contains +/// the logic for deserializing the metadata from the arrow [`Field`]s and creating the extension +/// type instance. We cannot use the trait from arrow-rs as it's not dyn-compatible (the Metadata +/// type must be known at compile time). +/// +/// If an extension type does not have any parameters, the [`SimpleExtensionTypeRegistration`] +/// provides an easier way of registering it. 
+#[derive(Debug)] +pub struct EventIdExtensionTypeRegistration(); + +impl ExtensionTypeRegistration for EventIdExtensionTypeRegistration { + fn type_name(&self) -> &str { + EventIdExtensionType::NAME + } + + fn create_df_extension_type( + &self, + metadata: Option<&str>, + ) -> Result { + let metadata = EventIdExtensionType::deserialize_metadata(metadata)?; + Ok(Arc::new(EventIdExtensionType(metadata))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use insta::assert_snapshot; + + #[tokio::test] + async fn test_print_example_table() -> Result<()> { + let ctx = create_session_context()?; + let table = register_events_table(&ctx).await?; + + assert_snapshot!( + table.to_string().await?, + @r" + +----------------+---------------+-----------------------+ + | event_id_short | event_id_long | name | + +----------------+---------------+-----------------------+ + | 20-01-000000 | 2020-01-0000 | First Event Jan 2020 | + | 20-01-000001 | 2020-01-0001 | Second Event Jan 2020 | + | 21-03-000000 | 2021-03-0000 | First Event Mar 2021 | + | 21-03-000001 | 2021-03-0001 | Second Event Mar 2021 | + | 21-03-000002 | 2021-03-0002 | Third Event Mar 2021 | + +----------------+---------------+-----------------------+ + " + ); + + Ok(()) + } +} diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 88c3ba2cb0f58..eae93630e4129 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -30,7 +30,6 @@ use crate::datasource::provider_as_source; use crate::execution::SessionStateDefaults; use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner}; use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}; -use arrow_schema::extension::ExtensionType; use arrow_schema::{DataType, FieldRef}; use datafusion_catalog::MemoryCatalogProviderList; use datafusion_catalog::information_schema::{ @@ -58,8 +57,8 @@ use 
datafusion_expr::planner::ExprPlanner; #[cfg(feature = "sql")] use datafusion_expr::planner::{RelationPlanner, TypePlanner}; use datafusion_expr::registry::{ - ExtensionTypeRegistryRef, FunctionRegistry, MemoryExtensionTypeRegistry, - SerializerRegistry, + ExtensionTypeRegistrationRef, ExtensionTypeRegistry, ExtensionTypeRegistryRef, + FunctionRegistry, MemoryExtensionTypeRegistry, SerializerRegistry, }; use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ @@ -85,7 +84,6 @@ use datafusion_sql::{ use async_trait::async_trait; use chrono::{DateTime, Utc}; -use datafusion_common::types::UuidDFExtensionType; use itertools::Itertools; use log::{debug, info}; use object_store::ObjectStore; @@ -2276,7 +2274,7 @@ impl ExtensionTypeRegistry for SessionState { self.extension_types.extension_type_registration(name) } - fn extension_type_registrations(&self) -> Vec> { + fn extension_type_registrations(&self) -> Vec { self.extension_types.extension_type_registrations() } From 5f3781aaa6c64b7448619188d251fae4874aaf1d Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 12:52:40 +0100 Subject: [PATCH 03/39] Further improvements of the extension type API proposal --- .../examples/extension_types/event_id.rs | 36 ++++--------------- datafusion/common/Cargo.toml | 1 + datafusion/common/src/types/mod.rs | 1 - datafusion/ffi/src/session/mod.rs | 5 +-- 4 files changed, 11 insertions(+), 32 deletions(-) diff --git a/datafusion-examples/examples/extension_types/event_id.rs b/datafusion-examples/examples/extension_types/event_id.rs index bcdb048d237d4..6ee11754b6cbe 100644 --- a/datafusion-examples/examples/extension_types/event_id.rs +++ b/datafusion-examples/examples/extension_types/event_id.rs @@ -24,9 +24,9 @@ use datafusion::error::Result; use datafusion::execution::SessionStateBuilder; use datafusion::prelude::SessionContext; use datafusion_common::internal_err; -use datafusion_common::types::{DFExtensionType, DFExtensionTypeRef}; +use 
datafusion_common::types::DFExtensionType; use datafusion_expr::registry::{ - ExtensionTypeRegistration, ExtensionTypeRegistry, MemoryExtensionTypeRegistry, + DefaultExtensionTypeRegistration, ExtensionTypeRegistry, MemoryExtensionTypeRegistry, }; use std::fmt::Write; use std::sync::Arc; @@ -48,7 +48,9 @@ pub async fn event_id_example() -> Result<()> { fn create_session_context() -> Result { // Create a registry with a reference to the custom extension type implementation. let registry = MemoryExtensionTypeRegistry::new(); - let event_id_registration = Arc::new(EventIdExtensionTypeRegistration {}); + let event_id_registration = DefaultExtensionTypeRegistration::new_arc(|metadata| { + Ok(EventIdExtensionType(metadata)) + }); registry.add_extension_type_registration(event_id_registration)?; // Set the extension type registry in the session state so that DataFusion can use it. @@ -104,8 +106,8 @@ fn example_schema() -> SchemaRef { ])) } -/// Represents a 32-bit custom identifier that represents a single event. Using this format is not -/// a good idea in practice, but it is useful for demonstrating the API usage. +/// Represents a 32-bit custom identifier that represents a single event. Using this format is +/// probably not a good idea in practice, but it is useful for demonstrating the API usage. /// /// An event is constructed of three parts: /// - The year @@ -274,30 +276,6 @@ impl DisplayIndex for EventIdDisplayIndex<'_> { } } -/// The registration is the last piece missing for the extension type implementation. It contains -/// the logic for deserializing the metadata from the arrow [`Field`]s and creating the extension -/// type instance. We cannot use the trait from arrow-rs as it's not dyn-compatible (the Metadata -/// type must be known at compile time). -/// -/// If an extension type does not have any parameters, the [`SimpleExtensionTypeRegistration`] -/// provides an easier way of registering it. 
-#[derive(Debug)] -pub struct EventIdExtensionTypeRegistration(); - -impl ExtensionTypeRegistration for EventIdExtensionTypeRegistration { - fn type_name(&self) -> &str { - EventIdExtensionType::NAME - } - - fn create_df_extension_type( - &self, - metadata: Option<&str>, - ) -> Result { - let metadata = EventIdExtensionType::deserialize_metadata(metadata)?; - Ok(Arc::new(EventIdExtensionType(metadata))) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 740d4e45b8d05..75ccef056959a 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -66,6 +66,7 @@ name = "stats_merge" [dependencies] arrow = { workspace = true } +arrow-schema = { workspace = true, features = ["canonical_extension_types"] } arrow-ipc = { workspace = true } arrow-schema = { workspace = true, features = ["canonical_extension_types"] } chrono = { workspace = true } diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs index 57bf921a6d564..82455063bc6ce 100644 --- a/datafusion/common/src/types/mod.rs +++ b/datafusion/common/src/types/mod.rs @@ -23,7 +23,6 @@ mod logical; mod native; pub use builtin::*; -pub use canonical_extensions::*; pub use extension::*; pub use field::*; pub use logical::*; diff --git a/datafusion/ffi/src/session/mod.rs b/datafusion/ffi/src/session/mod.rs index 3363edd73610e..eea83625296e6 100644 --- a/datafusion/ffi/src/session/mod.rs +++ b/datafusion/ffi/src/session/mod.rs @@ -22,13 +22,14 @@ use std::sync::Arc; use arrow_schema::SchemaRef; use arrow_schema::ffi::FFI_ArrowSchema; +use arrow_schema::SchemaRef; use async_ffi::{FfiFuture, FutureExt}; use async_trait::async_trait; use datafusion_common::config::{ConfigFileType, ConfigOptions, TableOptions}; use datafusion_common::{DFSchema, DataFusionError}; -use datafusion_execution::TaskContext; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; +use 
datafusion_execution::TaskContext; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::registry::{ExtensionTypeRegistryRef, MemoryExtensionTypeRegistry}; use datafusion_expr::{ @@ -38,9 +39,9 @@ use datafusion_expr::{ use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes}; -use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::logical_plan::from_proto::parse_expr; use datafusion_proto::logical_plan::to_proto::serialize_expr; +use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::protobuf::LogicalExprNode; use datafusion_session::Session; use prost::Message; From c0968b5d2f05cd2f70a23b3c5fae65373fe89cf0 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 12:54:52 +0100 Subject: [PATCH 04/39] Formatting --- datafusion/ffi/src/session/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/ffi/src/session/mod.rs b/datafusion/ffi/src/session/mod.rs index eea83625296e6..c8562f5ae8696 100644 --- a/datafusion/ffi/src/session/mod.rs +++ b/datafusion/ffi/src/session/mod.rs @@ -23,13 +23,14 @@ use std::sync::Arc; use arrow_schema::SchemaRef; use arrow_schema::ffi::FFI_ArrowSchema; use arrow_schema::SchemaRef; +use arrow_schema::ffi::FFI_ArrowSchema; use async_ffi::{FfiFuture, FutureExt}; use async_trait::async_trait; use datafusion_common::config::{ConfigFileType, ConfigOptions, TableOptions}; use datafusion_common::{DFSchema, DataFusionError}; +use datafusion_execution::TaskContext; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; -use datafusion_execution::TaskContext; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::registry::{ExtensionTypeRegistryRef, MemoryExtensionTypeRegistry}; use datafusion_expr::{ @@ -39,9 +40,9 @@ use datafusion_expr::{ use 
datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes}; +use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::logical_plan::from_proto::parse_expr; use datafusion_proto::logical_plan::to_proto::serialize_expr; -use datafusion_proto::logical_plan::LogicalExtensionCodec; use datafusion_proto::protobuf::LogicalExprNode; use datafusion_session::Session; use prost::Message; From 8b74d6bc85c6199f19c67c9f051e0c3add5dbcc4 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 12:57:21 +0100 Subject: [PATCH 05/39] Docs --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 7df2b70baf4f0..0d60ccae40d0e 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 7df2b70baf4f081ebf8e0c6bd22745cf3cbfd824 +Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 From d2df1bbe90a05144353fcbcc7b5f9c95916f6224 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 13:10:33 +0100 Subject: [PATCH 06/39] License headers and formatting --- .../extension_types/array_formatter_factory.rs | 17 +++++++++++++++++ datafusion/core/src/extension_types/mod.rs | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/datafusion/core/src/extension_types/array_formatter_factory.rs b/datafusion/core/src/extension_types/array_formatter_factory.rs index 2c0ba5e4e3b9d..f10576e816bda 100644 --- a/datafusion/core/src/extension_types/array_formatter_factory.rs +++ b/datafusion/core/src/extension_types/array_formatter_factory.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use arrow::array::Array; use arrow::util::display::{ArrayFormatter, ArrayFormatterFactory, FormatOptions}; use arrow_schema::{ArrowError, Field}; diff --git a/datafusion/core/src/extension_types/mod.rs b/datafusion/core/src/extension_types/mod.rs index da5e54099e685..55ec1ad95b5a1 100644 --- a/datafusion/core/src/extension_types/mod.rs +++ b/datafusion/core/src/extension_types/mod.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! This module contains code that enables DataFusion's extension type capabilities. 
mod array_formatter_factory; From e7b865c16ecd1dc6894a3233c797808b726e1d79 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 13:11:42 +0100 Subject: [PATCH 07/39] Add extension type registry implementation for mock sessions --- datafusion/datasource-arrow/src/file_format.rs | 14 +++++++------- datafusion/datasource/src/url.rs | 7 +++---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs index 9297486ad66e7..3c7412d37cdb4 100644 --- a/datafusion/datasource-arrow/src/file_format.rs +++ b/datafusion/datasource-arrow/src/file_format.rs @@ -29,22 +29,22 @@ use arrow::error::ArrowError; use arrow::ipc::convert::fb_to_schema; use arrow::ipc::reader::{FileReader, StreamReader}; use arrow::ipc::writer::IpcWriteOptions; -use arrow::ipc::{CompressionType, root_as_message}; +use arrow::ipc::{root_as_message, CompressionType}; use datafusion_common::error::Result; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ - DEFAULT_ARROW_EXTENSION, DataFusionError, GetExt, Statistics, - internal_datafusion_err, not_impl_err, + internal_datafusion_err, not_impl_err, DataFusionError, GetExt, + Statistics, DEFAULT_ARROW_EXTENSION, }; use datafusion_common_runtime::{JoinSet, SpawnedTask}; -use datafusion_datasource::TableSchema; use datafusion_datasource::display::FileGroupDisplay; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_datasource::sink::{DataSink, DataSinkExec}; use datafusion_datasource::write::{ - ObjectWriterBuilder, SharedBuffer, get_writer_schema, + get_writer_schema, ObjectWriterBuilder, SharedBuffer, }; +use datafusion_datasource::TableSchema; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr_common::sort_expr::LexRequirement; @@ 
-59,8 +59,8 @@ use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::write::demux::DemuxedStreamReceiver; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use datafusion_session::Session; -use futures::StreamExt; use futures::stream::BoxStream; +use futures::StreamExt; use object_store::{ GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt, path::Path, @@ -539,8 +539,8 @@ mod tests { use std::any::Any; use chrono::DateTime; - use datafusion_common::DFSchema; use datafusion_common::config::TableOptions; + use datafusion_common::DFSchema; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_expr::execution_props::ExecutionProps; diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index 4d7f5bf14c697..217469701bb18 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -18,8 +18,8 @@ use std::sync::Arc; use datafusion_common::{DataFusionError, Result, TableReference}; -use datafusion_execution::cache::TableScopedPath; use datafusion_execution::cache::cache_manager::CachedFileList; +use datafusion_execution::cache::TableScopedPath; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_session::Session; @@ -28,7 +28,6 @@ use futures::{StreamExt, TryStreamExt}; use glob::Pattern; use itertools::Itertools; use log::debug; -use object_store::path::DELIMITER; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; use url::Url; @@ -511,11 +510,11 @@ mod tests { use super::*; use async_trait::async_trait; use bytes::Bytes; - use datafusion_common::DFSchema; use datafusion_common::config::TableOptions; - use datafusion_execution::TaskContext; + use datafusion_common::DFSchema; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; + use datafusion_execution::TaskContext; use 
datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::registry::ExtensionTypeRegistryRef; use datafusion_expr::{ From dcb4cabc71ed9c9e0d326103a77123511b64b93a Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 14:33:39 +0100 Subject: [PATCH 08/39] Fix error in listing_table_factory.rs, Formatting --- datafusion/datasource-arrow/src/file_format.rs | 14 +++++++------- datafusion/datasource/src/url.rs | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs index 3c7412d37cdb4..9297486ad66e7 100644 --- a/datafusion/datasource-arrow/src/file_format.rs +++ b/datafusion/datasource-arrow/src/file_format.rs @@ -29,22 +29,22 @@ use arrow::error::ArrowError; use arrow::ipc::convert::fb_to_schema; use arrow::ipc::reader::{FileReader, StreamReader}; use arrow::ipc::writer::IpcWriteOptions; -use arrow::ipc::{root_as_message, CompressionType}; +use arrow::ipc::{CompressionType, root_as_message}; use datafusion_common::error::Result; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ - internal_datafusion_err, not_impl_err, DataFusionError, GetExt, - Statistics, DEFAULT_ARROW_EXTENSION, + DEFAULT_ARROW_EXTENSION, DataFusionError, GetExt, Statistics, + internal_datafusion_err, not_impl_err, }; use datafusion_common_runtime::{JoinSet, SpawnedTask}; +use datafusion_datasource::TableSchema; use datafusion_datasource::display::FileGroupDisplay; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_datasource::sink::{DataSink, DataSinkExec}; use datafusion_datasource::write::{ - get_writer_schema, ObjectWriterBuilder, SharedBuffer, + ObjectWriterBuilder, SharedBuffer, get_writer_schema, }; -use datafusion_datasource::TableSchema; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use 
datafusion_expr::dml::InsertOp; use datafusion_physical_expr_common::sort_expr::LexRequirement; @@ -59,8 +59,8 @@ use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::write::demux::DemuxedStreamReceiver; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use datafusion_session::Session; -use futures::stream::BoxStream; use futures::StreamExt; +use futures::stream::BoxStream; use object_store::{ GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt, path::Path, @@ -539,8 +539,8 @@ mod tests { use std::any::Any; use chrono::DateTime; - use datafusion_common::config::TableOptions; use datafusion_common::DFSchema; + use datafusion_common::config::TableOptions; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_expr::execution_props::ExecutionProps; diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index 217469701bb18..3fc9daa8b3f0c 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -18,8 +18,8 @@ use std::sync::Arc; use datafusion_common::{DataFusionError, Result, TableReference}; -use datafusion_execution::cache::cache_manager::CachedFileList; use datafusion_execution::cache::TableScopedPath; +use datafusion_execution::cache::cache_manager::CachedFileList; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_session::Session; @@ -510,11 +510,11 @@ mod tests { use super::*; use async_trait::async_trait; use bytes::Bytes; - use datafusion_common::config::TableOptions; use datafusion_common::DFSchema; + use datafusion_common::config::TableOptions; + use datafusion_execution::TaskContext; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; - use datafusion_execution::TaskContext; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::registry::ExtensionTypeRegistryRef; use 
datafusion_expr::{ From 3453c1e6a0c6810f38501e7035ddd59894d1baad Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Thu, 12 Feb 2026 15:24:13 +0100 Subject: [PATCH 09/39] Lints and formatting --- .../examples/extension_types/event_id.rs | 19 +++++-------------- datafusion/common/Cargo.toml | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/datafusion-examples/examples/extension_types/event_id.rs b/datafusion-examples/examples/extension_types/event_id.rs index 6ee11754b6cbe..11a27c0742f62 100644 --- a/datafusion-examples/examples/extension_types/event_id.rs +++ b/datafusion-examples/examples/extension_types/event_id.rs @@ -67,18 +67,10 @@ async fn register_events_table(ctx: &SessionContext) -> Result { schema, vec![ Arc::new(UInt32Array::from(vec![ - 20_01_000000, - 20_01_000001, - 21_03_000000, - 21_03_000001, - 21_03_000002, + 2001000000, 2001000001, 2103000000, 2103000001, 2103000002, ])), Arc::new(UInt32Array::from(vec![ - 2020_01_0000, - 2020_01_0001, - 2021_03_0000, - 2021_03_0001, - 2021_03_0002, + 2020010000, 2020010001, 2021030000, 2021030001, 2021030002, ])), Arc::new(StringArray::from(vec![ "First Event Jan 2020", @@ -167,8 +159,7 @@ impl ExtensionType for EventIdExtensionType { "short" => Ok(IdYearMode::Short), "long" => Ok(IdYearMode::Long), _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid metadata for event id type: {}", - metadata + "Invalid metadata for event id type: {metadata}" ))), }, } @@ -256,7 +247,7 @@ impl DisplayIndex for EventIdDisplayIndex<'_> { let month = rest % 100; let year = rest / 100; - write!(f, "{:02}-{:02}-{:06}", year, month, counter)?; + write!(f, "{year:02}-{month:02}-{counter:06}")?; } IdYearMode::Long => { // Format: YYYY-MM-CCCC @@ -269,7 +260,7 @@ impl DisplayIndex for EventIdDisplayIndex<'_> { let month = rest % 100; let year = rest / 100; - write!(f, "{:04}-{:02}-{:04}", year, month, counter)?; + write!(f, "{year:04}-{month:02}-{counter:04}")?; } } Ok(()) diff --git 
a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 75ccef056959a..740d4e45b8d05 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -66,7 +66,6 @@ name = "stats_merge" [dependencies] arrow = { workspace = true } -arrow-schema = { workspace = true, features = ["canonical_extension_types"] } arrow-ipc = { workspace = true } arrow-schema = { workspace = true, features = ["canonical_extension_types"] } chrono = { workspace = true } From f07434f57922e15262ab99b1ec787665519999e1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Mar 2026 18:02:37 -0500 Subject: [PATCH 10/39] first pass --- .../src/types/canonical_extensions/uuid.rs | 55 ++++++++++++++++++- datafusion/common/src/types/extension.rs | 37 ++++++++++++- .../physical-expr/src/expressions/cast.rs | 1 + datafusion/physical-expr/src/planner.rs | 32 +++++++++-- 4 files changed, 116 insertions(+), 9 deletions(-) diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index 8cbcf3f58a80e..234499eae63c7 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -17,8 +17,10 @@ use crate::Result; use crate::error::_internal_err; +use crate::types::CastExtension; use crate::types::extension::DFExtensionType; -use arrow::array::{Array, FixedSizeBinaryArray}; +use arrow::array::{Array, ArrayRef, FixedSizeBinaryArray}; +use arrow::compute::CastOptions; use arrow::datatypes::DataType; use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; use arrow_schema::extension::{ExtensionType, Uuid}; @@ -74,6 +76,22 @@ impl DFExtensionType for DFUuid { options.safe(), ))) } + + fn create_cast_extension( + &self, + other: &Field, + ) -> crate::Result>> { + if other.extension_type_name().is_some() { + return Ok(None); + } + + match other.data_type() { + DataType::Utf8 | DataType::Utf8View | 
DataType::LargeUtf8 => { + Ok(Some(Arc::new(UuidCastExtension {}))) + } + _ => Ok(None), + } + } } /// Pretty printer for binary UUID values. @@ -98,6 +116,41 @@ impl DisplayIndex for UuidValueDisplayIndex<'_> { } } +#[derive(Debug)] +struct UuidCastExtension {} + +impl CastExtension for UuidCastExtension { + fn can_cast(&self, to: &Field, options: CastOptions<'static>) -> crate::Result { + todo!() + } + + fn cast( + &self, + value: ArrayRef, + to: &Field, + options: CastOptions<'static>, + ) -> crate::Result { + todo!() + } + + fn can_cast_from( + &self, + from: &Field, + options: CastOptions<'static>, + ) -> crate::Result { + todo!() + } + + fn cast_from( + &self, + value: ArrayRef, + to: &Field, + options: CastOptions<'static>, + ) -> crate::Result { + todo!() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/common/src/types/extension.rs b/datafusion/common/src/types/extension.rs index 3bcb533dbf9e6..87bfe3f1211f4 100644 --- a/datafusion/common/src/types/extension.rs +++ b/datafusion/common/src/types/extension.rs @@ -16,9 +16,10 @@ // under the License. 
use crate::error::Result; -use arrow::array::Array; +use arrow::array::{Array, ArrayRef}; +use arrow::compute::CastOptions; use arrow::util::display::{ArrayFormatter, FormatOptions}; -use arrow_schema::DataType; +use arrow_schema::{DataType, Field}; use std::fmt::Debug; use std::sync::Arc; @@ -87,4 +88,36 @@ pub trait DFExtensionType: Debug + Send + Sync { ) -> Result>> { Ok(None) } + + // None for "not handled by this extension type" (could be handled by the other) + fn create_cast_extension( + &self, + _other: &Field, + ) -> Result>> { + Ok(None) + } +} + +pub trait CastExtension: Debug + Send + Sync { + fn can_cast(&self, to: &Field, options: CastOptions<'static>) + -> Result; + + // None for fallback + fn cast( + &self, + value: ArrayRef, + to: &Field, + options: CastOptions<'static>, + ) -> Result; + + fn can_cast_from(&self, from: &Field, options: CastOptions<'static>) + -> Result; + + // None for fallback + fn cast_from( + &self, + value: ArrayRef, + to: &Field, + options: CastOptions<'static>, + ) -> Result; } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index ad214a89ceb71..54e8212e07e74 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -63,6 +63,7 @@ pub struct CastExpr { target_field: FieldRef, /// Cast options cast_options: CastOptions<'static>, + // CastExtension might go here } // Manually derive PartialEq and Hash to work around https://github.com/rust-lang/rust/issues/78808 diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 9cb20de252aa0..95d4c5eebf36a 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -295,12 +295,32 @@ pub fn create_physical_expr( }; Ok(expressions::case(expr, when_then_expr, else_expr)?) 
} - Expr::Cast(Cast { expr, field }) => expressions::cast_with_target_field( - create_physical_expr(expr, input_dfschema, execution_props)?, - input_schema, - Arc::clone(field), - None, - ), + Expr::Cast(Cast { expr, field }) => { + + // This is where we cast + + // Need to figure out what happened here + + + if !field.metadata().is_empty() { + let (_, src_field) = expr.to_field(input_dfschema)?; + return plan_err!( + "Cast from {} to {} is not supported", + format_type_and_metadata( + src_field.data_type(), + Some(src_field.metadata()), + ), + format_type_and_metadata(field.data_type(), Some(field.metadata())) + ); + } + + expressions::cast_with_target_field( + create_physical_expr(expr, input_dfschema, execution_props)?, + input_schema, + Arc::clone(field), + None, + ) + } Expr::TryCast(TryCast { expr, field }) => { if !field.metadata().is_empty() { let (_, src_field) = expr.to_field(input_dfschema)?; From c988a09ef56bb99f16ec94b08874fc09860bde8c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Mar 2026 21:32:03 -0500 Subject: [PATCH 11/39] prototype cast compute --- .../src/types/canonical_extensions/uuid.rs | 112 +++++++++++++++--- 1 file changed, 95 insertions(+), 17 deletions(-) diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index 234499eae63c7..a3f34c90cd0ac 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -16,15 +16,20 @@ // under the License. 
use crate::Result; +use crate::cast::{as_fixed_size_binary_array, as_string_array}; use crate::error::_internal_err; use crate::types::CastExtension; use crate::types::extension::DFExtensionType; -use arrow::array::{Array, ArrayRef, FixedSizeBinaryArray}; -use arrow::compute::CastOptions; +use arrow::array::{ + Array, ArrayRef, FixedSizeBinaryArray, StringBuilder, builder::FixedSizeBinaryBuilder, +}; +use arrow::compute::{CastOptions, cast}; use arrow::datatypes::DataType; use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; +use arrow_schema::Field; use arrow_schema::extension::{ExtensionType, Uuid}; use std::fmt::Write; +use std::sync::Arc; use uuid::Bytes; /// Defines the extension type logic for the canonical `arrow.uuid` extension type. This extension @@ -80,7 +85,7 @@ impl DFExtensionType for DFUuid { fn create_cast_extension( &self, other: &Field, - ) -> crate::Result>> { + ) -> Result>> { if other.extension_type_name().is_some() { return Ok(None); } @@ -120,8 +125,29 @@ impl DisplayIndex for UuidValueDisplayIndex<'_> { struct UuidCastExtension {} impl CastExtension for UuidCastExtension { - fn can_cast(&self, to: &Field, options: CastOptions<'static>) -> crate::Result { - todo!() + fn can_cast(&self, to: &Field, options: CastOptions<'static>) -> Result { + if to.extension_type_name().is_some() { + return Ok(false); + } + + match to.data_type() { + // Only explicit casts to string + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { + if options.safe { + Ok(false) + } else { + Ok(true) + } + } + // Can implicitly cast to storage + DataType::FixedSizeBinary(16) => Ok(true), + _ => Ok(false), + } + } + + fn can_cast_from(&self, from: &Field, options: CastOptions<'static>) -> Result { + // Symmetric behaviour between cast from and cast to + self.can_cast(from, options) } fn cast( @@ -129,25 +155,77 @@ impl CastExtension for UuidCastExtension { value: ArrayRef, to: &Field, options: CastOptions<'static>, - ) -> 
crate::Result { - todo!() - } + ) -> Result { + if !self.can_cast(to, options)? { + return _internal_err!("Unhandled cast"); + } - fn can_cast_from( - &self, - from: &Field, - options: CastOptions<'static>, - ) -> crate::Result { - todo!() + let storage = as_fixed_size_binary_array(&value)?; + match to.data_type() { + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { + let mut builder = + StringBuilder::with_capacity(storage.len(), storage.len() * 36); + for bytes_opt in storage { + match bytes_opt { + Some(bytes) => { + let bytes16 = Bytes::try_from(bytes).map_err(|e| { + crate::DataFusionError::Execution(e.to_string()) + })?; + let uuid = uuid::Uuid::from_bytes(bytes16); + write!(builder, "{uuid}")?; + builder.append_value(""); + } + None => builder.append_null(), + } + } + + let string_array = Arc::new(builder.finish()) as ArrayRef; + return Ok(cast(&string_array, to.data_type())?); + } + DataType::FixedSizeBinary(16) => return Ok(value), + _ => {} + } + + _internal_err!("Unexpected difference between can_cast()") } fn cast_from( &self, value: ArrayRef, - to: &Field, + from: &Field, options: CastOptions<'static>, - ) -> crate::Result { - todo!() + ) -> Result { + if !self.can_cast_from(from, options)? 
{ + return _internal_err!("Unhandled cast"); + } + + match from.data_type() { + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { + let string_array_ref = cast(&value, &DataType::Utf8)?; + let string_array = as_string_array(&string_array_ref)?; + let mut builder = FixedSizeBinaryBuilder::new(16); + for string_opt in string_array { + match string_opt { + Some(string) => { + let uuid = uuid::Uuid::try_parse(string).map_err(|_| { + crate::DataFusionError::Execution(format!( + "Failed to parsed string '{string}' as UUID" + )) + })?; + builder.append_value(uuid.as_bytes())?; + } + None => { + builder.append_null(); + } + } + } + } + // Can implicitly cast from storage + DataType::FixedSizeBinary(16) => return Ok(value), + _ => {} + } + + _internal_err!("Unexpected difference between can_cast_from()") } } From c577b00137c5a2c569625045d2752314d4f3f1b3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Mar 2026 23:02:12 -0500 Subject: [PATCH 12/39] pipe it through --- .../src/types/canonical_extensions/uuid.rs | 62 +++++++++++-------- datafusion/common/src/types/extension.rs | 40 +++++++----- datafusion/expr/src/execution_props.rs | 3 + .../physical-expr/src/expressions/cast.rs | 59 +++++++++++++++++- datafusion/physical-expr/src/planner.rs | 53 +++++++++++++++- 5 files changed, 170 insertions(+), 47 deletions(-) diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index a3f34c90cd0ac..3ca6fad946b16 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -82,20 +82,16 @@ impl DFExtensionType for DFUuid { ))) } - fn create_cast_extension( + fn cast_from( &self, - other: &Field, - ) -> Result>> { - if other.extension_type_name().is_some() { - return Ok(None); - } + ) -> Result> { + Ok(Arc::new(CastToUuid {})) + } - match other.data_type() { - DataType::Utf8 | DataType::Utf8View | 
DataType::LargeUtf8 => { - Ok(Some(Arc::new(UuidCastExtension {}))) - } - _ => Ok(None), - } + fn cast_to( + &self, + ) -> Result> { + Ok(Arc::new(CastFromUuid {})) } } @@ -122,10 +118,15 @@ impl DisplayIndex for UuidValueDisplayIndex<'_> { } #[derive(Debug)] -struct UuidCastExtension {} +struct CastFromUuid {} -impl CastExtension for UuidCastExtension { - fn can_cast(&self, to: &Field, options: CastOptions<'static>) -> Result { +impl CastExtension for CastFromUuid { + fn can_cast( + &self, + _from: &Field, + to: &Field, + options: &CastOptions, + ) -> Result { if to.extension_type_name().is_some() { return Ok(false); } @@ -145,18 +146,14 @@ impl CastExtension for UuidCastExtension { } } - fn can_cast_from(&self, from: &Field, options: CastOptions<'static>) -> Result { - // Symmetric behaviour between cast from and cast to - self.can_cast(from, options) - } - fn cast( &self, value: ArrayRef, + from: &Field, to: &Field, - options: CastOptions<'static>, + options: &CastOptions, ) -> Result { - if !self.can_cast(to, options)? { + if !self.can_cast(from, to, options)? { return _internal_err!("Unhandled cast"); } @@ -188,14 +185,29 @@ impl CastExtension for UuidCastExtension { _internal_err!("Unexpected difference between can_cast()") } +} - fn cast_from( +#[derive(Debug)] +struct CastToUuid {} + +impl CastExtension for CastToUuid { + fn can_cast( + &self, + from: &Field, + to: &Field, + options: &CastOptions, + ) -> Result { + CastFromUuid {}.can_cast(to, from, options) + } + + fn cast( &self, value: ArrayRef, from: &Field, - options: CastOptions<'static>, + to: &Field, + options: &CastOptions, ) -> Result { - if !self.can_cast_from(from, options)? { + if !self.can_cast(from, to, options)? 
{ return _internal_err!("Unhandled cast"); } diff --git a/datafusion/common/src/types/extension.rs b/datafusion/common/src/types/extension.rs index 87bfe3f1211f4..8651b46db19a2 100644 --- a/datafusion/common/src/types/extension.rs +++ b/datafusion/common/src/types/extension.rs @@ -89,35 +89,43 @@ pub trait DFExtensionType: Debug + Send + Sync { Ok(None) } - // None for "not handled by this extension type" (could be handled by the other) - fn create_cast_extension( - &self, - _other: &Field, - ) -> Result>> { - Ok(None) + fn cast_from(&self) -> Result> { + Ok(Arc::new(DefaultExtensionCast {})) + } + + fn cast_to(&self) -> Result> { + Ok(Arc::new(DefaultExtensionCast {})) } } pub trait CastExtension: Debug + Send + Sync { - fn can_cast(&self, to: &Field, options: CastOptions<'static>) - -> Result; + fn can_cast(&self, from: &Field, to: &Field, options: &CastOptions) -> Result; // None for fallback fn cast( &self, value: ArrayRef, + from: &Field, to: &Field, - options: CastOptions<'static>, + options: &CastOptions, ) -> Result; +} - fn can_cast_from(&self, from: &Field, options: CastOptions<'static>) - -> Result; +#[derive(Debug)] +struct DefaultExtensionCast {} - // None for fallback - fn cast_from( +impl CastExtension for DefaultExtensionCast { + fn can_cast(&self, from: &Field, to: &Field, _options: &CastOptions) -> Result { + Ok(from.data_type() == to.data_type()) + } + + fn cast( &self, value: ArrayRef, - to: &Field, - options: CastOptions<'static>, - ) -> Result; + _from: &Field, + _to: &Field, + _options: &CastOptions, + ) -> Result { + Ok(value) + } } diff --git a/datafusion/expr/src/execution_props.rs b/datafusion/expr/src/execution_props.rs index 24d0f333a6e56..83e8d525c95fd 100644 --- a/datafusion/expr/src/execution_props.rs +++ b/datafusion/expr/src/execution_props.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::registry::ExtensionTypeRegistry; use crate::var_provider::{VarProvider, VarType}; use chrono::{DateTime, Utc}; use datafusion_common::HashMap; @@ -69,6 +70,7 @@ pub struct ExecutionProps { /// Shared results container for uncorrelated scalar subquery values. /// Populated at execution time by `ScalarSubqueryExec`. pub subquery_results: ScalarSubqueryResults, + pub extension_types: Option>, } impl Default for ExecutionProps { @@ -87,6 +89,7 @@ impl ExecutionProps { var_providers: None, subquery_indexes: HashMap::new(), subquery_results: ScalarSubqueryResults::default(), + extension_types: None, } } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 54e8212e07e74..b48243e8b52b0 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -55,7 +55,7 @@ fn can_cast_named_struct_types(source: &DataType, target: &DataType) -> bool { } /// CAST expression casts an expression to a specific data type and returns a runtime error on invalid cast -#[derive(Debug, Clone, Eq)] +#[derive(Debug, Clone)] pub struct CastExpr { /// The expression to cast pub expr: Arc, @@ -63,7 +63,8 @@ pub struct CastExpr { target_field: FieldRef, /// Cast options cast_options: CastOptions<'static>, - // CastExtension might go here + // CastExtension + cast_extension: Option>, } // Manually derive PartialEq and Hash to work around https://github.com/rust-lang/rust/issues/78808 @@ -83,6 +84,8 @@ impl Hash for CastExpr { } } +impl Eq for CastExpr {} + impl CastExpr { /// Create a new `CastExpr` using only a `DataType`. 
/// @@ -128,6 +131,17 @@ impl CastExpr { expr, target_field, cast_options: cast_options.unwrap_or(DEFAULT_CAST_OPTIONS), + cast_extension: None, + } + } + + pub fn with_cast_extension( + self, + cast_extension: Option>, + ) -> Self { + Self { + cast_extension, + ..self } } @@ -246,7 +260,35 @@ impl PhysicalExpr for CastExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let value = self.expr.evaluate(batch)?; - value.cast_to(self.cast_type(), Some(&self.cast_options)) + if let Some(cast_extension) = &self.cast_extension { + let from_field = self.expr.return_field(&batch.schema())?; + let to_field = self.return_field(&batch.schema())?; + match value { + ColumnarValue::Array(array) => { + Ok(ColumnarValue::Array(cast_extension.cast( + array, + &from_field, + &to_field, + &self.cast_options, + )?)) + } + ColumnarValue::Scalar(scalar_value) => { + let array = scalar_value.to_array()?; + let array_result = cast_extension.cast( + array, + &from_field, + &to_field, + &self.cast_options, + )?; + Ok(ColumnarValue::Scalar(ScalarValue::try_from_array( + &array_result, + 0, + )?)) + } + } + } else { + value.cast_to(self.cast_type(), Some(&self.cast_options)) + } } fn return_field(&self, input_schema: &Schema) -> Result { @@ -373,6 +415,17 @@ pub fn cast( cast_with_options(expr, input_schema, cast_type, None) } +pub fn cast_with_extension( + expr: Arc, + input_schema: &Schema, + cast_type: DataType, + cast_extension: Arc, +) -> Result> { + Ok(Arc::new( + CastExpr::new(expr, cast_type, None).with_cast_extension(Some(cast_extension)), + )) +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 95d4c5eebf36a..a277e8d44a6c2 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -25,6 +25,7 @@ use crate::{ expressions::{self, Column, Literal, binary, like, similar_to}, }; +use arrow::compute::CastOptions; use arrow::datatypes::Schema; use 
datafusion_common::config::ConfigOptions; use datafusion_common::datatype::FieldExt; @@ -296,14 +297,35 @@ pub fn create_physical_expr( Ok(expressions::case(expr, when_then_expr, else_expr)?) } Expr::Cast(Cast { expr, field }) => { - - // This is where we cast + let (_, src_field) = expr.to_field(input_dfschema)?; + const DEFAULT_CAST_OPTIONS: CastOptions<'static> = CastOptions { + safe: false, + format_options: DEFAULT_FORMAT_OPTIONS, + }; // Need to figure out what happened here if !field.metadata().is_empty() { - let (_, src_field) = expr.to_field(input_dfschema)?; + if let Some(registry) = &execution_props.extension_types + && let Some(extension_type) = + registry.create_extension_type_for_field(&field)? + { + let cast_extension = extension_type.cast_from()?; + if cast_extension.can_cast( + &src_field, + &field, + &DEFAULT_CAST_OPTIONS, + )? { + return expressions::cast_with_extension( + create_physical_expr(expr, input_dfschema, execution_props)?, + input_schema, + field.data_type().clone(), + cast_extension, + ); + } + } + return plan_err!( "Cast from {} to {} is not supported", format_type_and_metadata( @@ -312,6 +334,31 @@ pub fn create_physical_expr( ), format_type_and_metadata(field.data_type(), Some(field.metadata())) ); + } else if let Some(registry) = &execution_props.extension_types + && let Some(extension_type) = + registry.create_extension_type_for_field(&src_field)? + { + let cast_extension = extension_type.cast_to()?; + if cast_extension.can_cast(&src_field, &field, &DEFAULT_CAST_OPTIONS)? 
{ + return expressions::cast_with_extension( + create_physical_expr(expr, input_dfschema, execution_props)?, + input_schema, + field.data_type().clone(), + cast_extension, + ); + } else { + return plan_err!( + "Cast from {} to {} is not supported", + format_type_and_metadata( + src_field.data_type(), + Some(src_field.metadata()), + ), + format_type_and_metadata( + field.data_type(), + Some(field.metadata()) + ) + ); + } } expressions::cast_with_target_field( From e479fc1f95bba6d4bb3fc51478f40d4dd50c28a2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 19 Mar 2026 23:38:25 -0500 Subject: [PATCH 13/39] piped with failing test --- .../core/src/execution/session_state.rs | 4 ++ .../tests/extension_types/pretty_printing.rs | 56 ++++++++++++++++++- .../physical-expr/src/expressions/cast.rs | 5 +- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index eae93630e4129..917916ad3195e 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1728,6 +1728,10 @@ impl SessionStateBuilder { } } + // Temporary hack while we figure out how to get the extension types where they + // need to go + state.execution_props.extension_types = Some(state.extension_types.clone()); + state } diff --git a/datafusion/core/tests/extension_types/pretty_printing.rs b/datafusion/core/tests/extension_types/pretty_printing.rs index c0796887b8b6e..8bb337326a784 100644 --- a/datafusion/core/tests/extension_types/pretty_printing.rs +++ b/datafusion/core/tests/extension_types/pretty_printing.rs @@ -17,7 +17,8 @@ use arrow::array::{FixedSizeBinaryArray, RecordBatch}; use arrow_schema::extension::Uuid; -use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaRef}; +use datafusion::assert_batches_eq; use datafusion::dataframe::DataFrame; use datafusion::error::Result; use 
datafusion::execution::SessionStateBuilder; @@ -58,6 +59,8 @@ async fn create_test_table() -> Result { ctx.table("test").await } +// Test here + #[tokio::test] async fn test_pretty_print_extension_type_formatter() -> Result<()> { let result = create_test_table().await?.to_string().await?; @@ -76,3 +79,54 @@ async fn test_pretty_print_extension_type_formatter() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn create_cast_uuid_sql() -> Result<()> { + let schema = test_schema(); + + // define data. + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(FixedSizeBinaryArray::from(vec![ + &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 5, 6], + ]))], + )?; + + let state = SessionStateBuilder::default() + .with_canonical_extension_types()? + .with_type_planner(Arc::new(CustomTypePlanner {})) + .build(); + let ctx = SessionContext::new_with_state(state); + + ctx.register_batch("test", batch)?; + + let df = ctx.sql("SELECT my_uuids::VARCHAR FROM test").await?; + println!("{}", df.clone().explain(false, false)?.to_string().await?); + + let batches = df.collect().await?; + + assert_batches_eq!(vec![""], &batches); + + Ok(()) +} + +#[derive(Debug)] +pub struct CustomTypePlanner {} + +impl TypePlanner for CustomTypePlanner { + fn plan_type_field( + &self, + sql_type: &sqlparser::ast::DataType, + ) -> Result> { + match sql_type { + sqlparser::ast::DataType::Uuid => Ok(Some(Arc::new( + Field::new("", DataType::FixedSizeBinary(16), true).with_metadata( + [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())] + .into(), + ), + ))), + _ => Ok(None), + } + } +} diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index b48243e8b52b0..8ca5467d2ae90 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -417,12 +417,13 @@ pub fn cast( pub fn cast_with_extension( expr: Arc, - input_schema: 
&Schema, + _input_schema: &Schema, cast_type: DataType, cast_extension: Arc, ) -> Result> { Ok(Arc::new( - CastExpr::new(expr, cast_type, None).with_cast_extension(Some(cast_extension)), + CastExpr::new(expr, cast_type, Some(DEFAULT_CAST_OPTIONS)) + .with_cast_extension(Some(cast_extension)), )) } From bded4fc121be0fb8b642b68a0a2f567c7d26b222 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 20 Mar 2026 00:07:39 -0500 Subject: [PATCH 14/39] with passing test --- .../src/types/canonical_extensions/uuid.rs | 24 +++-------- .../core/src/execution/session_state.rs | 4 ++ .../tests/extension_types/pretty_printing.rs | 42 +++++++++++++++++-- datafusion/expr/src/simplify.rs | 15 +++++++ datafusion/optimizer/src/optimizer.rs | 6 ++- .../simplify_expressions/expr_simplifier.rs | 12 +++++- .../simplify_expressions/simplify_exprs.rs | 1 + 7 files changed, 79 insertions(+), 25 deletions(-) diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index 3ca6fad946b16..da532cae3b339 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -82,15 +82,11 @@ impl DFExtensionType for DFUuid { ))) } - fn cast_from( - &self, - ) -> Result> { + fn cast_from(&self) -> Result> { Ok(Arc::new(CastToUuid {})) } - fn cast_to( - &self, - ) -> Result> { + fn cast_to(&self) -> Result> { Ok(Arc::new(CastFromUuid {})) } } @@ -121,12 +117,7 @@ impl DisplayIndex for UuidValueDisplayIndex<'_> { struct CastFromUuid {} impl CastExtension for CastFromUuid { - fn can_cast( - &self, - _from: &Field, - to: &Field, - options: &CastOptions, - ) -> Result { + fn can_cast(&self, _from: &Field, to: &Field, options: &CastOptions) -> Result { if to.extension_type_name().is_some() { return Ok(false); } @@ -191,12 +182,7 @@ impl CastExtension for CastFromUuid { struct CastToUuid {} impl CastExtension for CastToUuid { - fn can_cast( - &self, - from: &Field, - 
to: &Field, - options: &CastOptions, - ) -> Result { + fn can_cast(&self, from: &Field, to: &Field, options: &CastOptions) -> Result { CastFromUuid {}.can_cast(to, from, options) } @@ -231,6 +217,8 @@ impl CastExtension for CastToUuid { } } } + + return Ok(Arc::new(builder.finish())); } // Can implicitly cast from storage DataType::FixedSizeBinary(16) => return Ok(value), diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 917916ad3195e..af70af5ea58ff 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -2315,6 +2315,10 @@ impl OptimizerConfig for SessionState { fn function_registry(&self) -> Option<&dyn FunctionRegistry> { Some(self) } + + fn extension_types(&self) -> Option> { + Some(self.extension_types.clone()) + } } /// Create a new task context instance from SessionState diff --git a/datafusion/core/tests/extension_types/pretty_printing.rs b/datafusion/core/tests/extension_types/pretty_printing.rs index 8bb337326a784..01b40bd516e32 100644 --- a/datafusion/core/tests/extension_types/pretty_printing.rs +++ b/datafusion/core/tests/extension_types/pretty_printing.rs @@ -81,7 +81,7 @@ async fn test_pretty_print_extension_type_formatter() -> Result<()> { } #[tokio::test] -async fn create_cast_uuid_sql() -> Result<()> { +async fn create_cast_uuid_to_char() -> Result<()> { let schema = test_schema(); // define data. 
@@ -102,11 +102,45 @@ async fn create_cast_uuid_sql() -> Result<()> { ctx.register_batch("test", batch)?; let df = ctx.sql("SELECT my_uuids::VARCHAR FROM test").await?; - println!("{}", df.clone().explain(false, false)?.to_string().await?); - let batches = df.collect().await?; - assert_batches_eq!(vec![""], &batches); + assert_batches_eq!( + vec![ + "+--------------------------------------+", + "| test.my_uuids |", + "+--------------------------------------+", + "| 00000000-0000-0000-0000-000000000000 |", + "| 00010203-0405-0607-0809-000102030506 |", + "+--------------------------------------+", + ], + &batches + ); + + Ok(()) +} + +#[tokio::test] +async fn create_cast_char_to_uuid() -> Result<()> { + let state = SessionStateBuilder::default() + .with_canonical_extension_types()? + .with_type_planner(Arc::new(CustomTypePlanner {})) + .build(); + let ctx = SessionContext::new_with_state(state); + + let df = ctx + .sql("SELECT '00010203-0405-0607-0809-000102030506'::UUID AS uuid") + .await?; + let batches = df.collect().await?; + assert_batches_eq!( + vec![ + "+----------------------------------+", + "| uuid |", + "+----------------------------------+", + "| 00010203040506070809000102030506 |", + "+----------------------------------+", + ], + &batches + ); Ok(()) } diff --git a/datafusion/expr/src/simplify.rs b/datafusion/expr/src/simplify.rs index 522cf122a273c..70b7bd4d71696 100644 --- a/datafusion/expr/src/simplify.rs +++ b/datafusion/expr/src/simplify.rs @@ -24,6 +24,7 @@ use chrono::{DateTime, Utc}; use datafusion_common::config::ConfigOptions; use datafusion_common::{DFSchema, DFSchemaRef, Result}; +use crate::registry::ExtensionTypeRegistry; use crate::{Expr, ExprSchemable}; /// Provides simplification information based on schema, query execution time, @@ -38,6 +39,7 @@ pub struct SimplifyContext { schema: DFSchemaRef, query_execution_start_time: Option>, config_options: Arc, + extension_types: Option>, } /// Builder for [`SimplifyContext`]. 
@@ -54,6 +56,7 @@ impl Default for SimplifyContext { schema: Arc::new(DFSchema::empty()), query_execution_start_time: None, config_options: Arc::new(ConfigOptions::default()), + extension_types: None, } } } @@ -107,6 +110,14 @@ impl SimplifyContext { self } + pub fn with_extension_types( + mut self, + extension_types: Option>, + ) -> Self { + self.extension_types = extension_types; + self + } + /// Returns the schema pub fn schema(&self) -> &DFSchemaRef { &self.schema @@ -137,6 +148,10 @@ impl SimplifyContext { pub fn config_options(&self) -> &Arc { &self.config_options } + + pub fn extension_types(&self) -> Option<&Arc> { + self.extension_types.as_ref() + } } impl SimplifyContextBuilder { diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index d0fbb31414dab..eef2db8b7b64e 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -21,7 +21,7 @@ use std::fmt::Debug; use std::sync::Arc; use chrono::{DateTime, Utc}; -use datafusion_expr::registry::FunctionRegistry; +use datafusion_expr::registry::{ExtensionTypeRegistry, FunctionRegistry}; use datafusion_expr::{InvariantLevel, assert_expected_schema}; use log::{debug, warn}; @@ -146,6 +146,10 @@ pub trait OptimizerConfig { fn function_registry(&self) -> Option<&dyn FunctionRegistry> { None } + + fn extension_types(&self) -> Option> { + None + } } /// A standalone [`OptimizerConfig`] that can be used independently diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 143d8eae695af..8be9b6b9a24da 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -43,6 +43,7 @@ use datafusion_expr::expr::HigherOrderFunction; use datafusion_expr::{ BinaryExpr, Case, ColumnarValue, Expr, ExprSchemable, Like, Operator, Volatility, and, binary::BinaryTypeCoercer, lit, 
or, preimage::PreimageResult, + registry::ExtensionTypeRegistry, }; use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; @@ -211,7 +212,10 @@ impl ExprSimplifier { ) -> Result<(Transformed, u32)> { let mut simplifier = Simplifier::new(&self.info); let config_options = Some(Arc::clone(self.info.config_options())); - let mut const_evaluator = ConstEvaluator::try_new(config_options)?; + let mut const_evaluator = ConstEvaluator::try_new( + config_options, + self.info.extension_types().cloned(), + )?; let mut shorten_in_list_simplifier = ShortenInListSimplifier::new(); let guarantees_map: HashMap<&Expr, &NullableInterval> = self.guarantees.iter().map(|(k, v)| (k, v)).collect(); @@ -597,12 +601,16 @@ impl ConstEvaluator { /// /// The `config_options` parameter is used to pass session configuration /// (like timezone) to scalar functions during constant evaluation. - pub fn try_new(config_options: Option>) -> Result { + pub fn try_new( + config_options: Option>, + extension_types: Option>, + ) -> Result { // The dummy column name is unused and doesn't matter as only // expressions without column references can be evaluated let mut execution_props = ExecutionProps::new(); execution_props.config_options = config_options; + execution_props.extension_types = extension_types; Ok(Self { can_evaluate: vec![], diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index 3e495f5355103..240d7a257fc14 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -102,6 +102,7 @@ impl SimplifyExpressions { .with_schema(schema) .with_config_options(config.options()) .with_query_execution_start_time(config.query_execution_start_time()) + .with_extension_types(config.extension_types().clone()) .build(); // Inputs 
have already been rewritten (due to bottom-up traversal handled by Optimizer) From 2f89e971eb7445bf1a42be751671794a1f1e7711 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 20 Mar 2026 10:05:08 -0500 Subject: [PATCH 15/39] fix test --- datafusion/expr/src/execution_props.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/execution_props.rs b/datafusion/expr/src/execution_props.rs index 83e8d525c95fd..e7d516738d33e 100644 --- a/datafusion/expr/src/execution_props.rs +++ b/datafusion/expr/src/execution_props.rs @@ -255,7 +255,7 @@ mod test { fn debug() { let props = ExecutionProps::new(); assert_eq!( - "ExecutionProps { query_execution_start_time: None, alias_generator: AliasGenerator { next_id: 1 }, config_options: None, var_providers: None, subquery_indexes: {}, subquery_results: [] }", + "ExecutionProps { query_execution_start_time: None, alias_generator: AliasGenerator { next_id: 1 }, config_options: None, var_providers: None, subquery_indexes: {}, subquery_results: [], extension_types: None }", format!("{props:?}") ); } From 397b2cfae10ceb8b340810ae1f08a45dd3a65d80 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 20 Mar 2026 10:09:44 -0500 Subject: [PATCH 16/39] fix clippy --- datafusion/core/src/execution/session_state.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index af70af5ea58ff..5a209793ec612 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1730,7 +1730,7 @@ impl SessionStateBuilder { // Temporary hack while we figure out how to get the extension types where they // need to go - state.execution_props.extension_types = Some(state.extension_types.clone()); + state.execution_props.extension_types = Some(Arc::clone(&state.extension_types)); state } @@ -2317,7 +2317,7 @@ impl OptimizerConfig for SessionState { } fn 
extension_types(&self) -> Option> { - Some(self.extension_types.clone()) + Some(Arc::clone(&self.extension_types)) } } From aee9136b9170a90262093ba74d8d36e3bd47161a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 20 Mar 2026 10:26:53 -0500 Subject: [PATCH 17/39] more clippy --- datafusion/physical-expr/src/planner.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index a277e8d44a6c2..13856ade49064 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -309,12 +309,12 @@ pub fn create_physical_expr( if !field.metadata().is_empty() { if let Some(registry) = &execution_props.extension_types && let Some(extension_type) = - registry.create_extension_type_for_field(&field)? + registry.create_extension_type_for_field(field)? { let cast_extension = extension_type.cast_from()?; if cast_extension.can_cast( &src_field, - &field, + field, &DEFAULT_CAST_OPTIONS, )? { return expressions::cast_with_extension( @@ -339,7 +339,7 @@ pub fn create_physical_expr( registry.create_extension_type_for_field(&src_field)? { let cast_extension = extension_type.cast_to()?; - if cast_extension.can_cast(&src_field, &field, &DEFAULT_CAST_OPTIONS)? { + if cast_extension.can_cast(&src_field, field, &DEFAULT_CAST_OPTIONS)? 
{ return expressions::cast_with_extension( create_physical_expr(expr, input_dfschema, execution_props)?, input_schema, From 567e947b9ff7c5e5c9ac96f5fc10ea241f574303 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 20 Mar 2026 10:45:12 -0500 Subject: [PATCH 18/39] more clippy --- datafusion/core/tests/extension_types/pretty_printing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/extension_types/pretty_printing.rs b/datafusion/core/tests/extension_types/pretty_printing.rs index 01b40bd516e32..117e9735b4ff9 100644 --- a/datafusion/core/tests/extension_types/pretty_printing.rs +++ b/datafusion/core/tests/extension_types/pretty_printing.rs @@ -105,7 +105,7 @@ async fn create_cast_uuid_to_char() -> Result<()> { let batches = df.collect().await?; assert_batches_eq!( - vec![ + [ "+--------------------------------------+", "| test.my_uuids |", "+--------------------------------------+", @@ -132,7 +132,7 @@ async fn create_cast_char_to_uuid() -> Result<()> { .await?; let batches = df.collect().await?; assert_batches_eq!( - vec![ + [ "+----------------------------------+", "| uuid |", "+----------------------------------+", From 4f821295be95c3f3651c8a5889c0ba2f451bb182 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Apr 2026 10:47:13 -0500 Subject: [PATCH 19/39] fix more merge stuff --- datafusion/core/src/dataframe/mod.rs | 1 - .../array_formatter_factory.rs | 2 +- .../tests/extension_types/pretty_printing.rs | 1 + datafusion/datasource/src/url.rs | 2 +- datafusion/expr/src/registry.rs | 135 +++++++++--------- datafusion/expr/src/simplify.rs | 18 +-- datafusion/ffi/src/session/mod.rs | 11 +- .../physical-expr/src/expressions/cast.rs | 3 +- .../physical-expr/src/expressions/mod.rs | 1 + datafusion/physical-expr/src/planner.rs | 2 +- 10 files changed, 86 insertions(+), 90 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 08a3f18992b3c..0f38988c69405 
100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -68,7 +68,6 @@ use datafusion_functions_aggregate::expr_fn::{ avg, count, max, median, min, stddev, sum, }; -use crate::extension_types::DFArrayFormatterFactory; use async_trait::async_trait; use datafusion_catalog::Session; use datafusion_expr::extension_types::DFArrayFormatterFactory; diff --git a/datafusion/core/src/extension_types/array_formatter_factory.rs b/datafusion/core/src/extension_types/array_formatter_factory.rs index f10576e816bda..1d970612f9078 100644 --- a/datafusion/core/src/extension_types/array_formatter_factory.rs +++ b/datafusion/core/src/extension_types/array_formatter_factory.rs @@ -60,7 +60,7 @@ impl ArrayFormatterFactory for DFArrayFormatterFactory { }; registration - .create_df_extension_type(field.extension_type_metadata())? + .create_df_extension_type(field.data_type(), field.extension_type_metadata())? .create_array_formatter(array, options) .map_err(ArrowError::from) } diff --git a/datafusion/core/tests/extension_types/pretty_printing.rs b/datafusion/core/tests/extension_types/pretty_printing.rs index 117e9735b4ff9..12123f650484d 100644 --- a/datafusion/core/tests/extension_types/pretty_printing.rs +++ b/datafusion/core/tests/extension_types/pretty_printing.rs @@ -23,6 +23,7 @@ use datafusion::dataframe::DataFrame; use datafusion::error::Result; use datafusion::execution::SessionStateBuilder; use datafusion::prelude::SessionContext; +use datafusion_expr::planner::TypePlanner; use datafusion_expr::registry::MemoryExtensionTypeRegistry; use insta::assert_snapshot; use std::sync::Arc; diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index 3fc9daa8b3f0c..60aa2d909437c 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -28,7 +28,7 @@ use futures::{StreamExt, TryStreamExt}; use glob::Pattern; use itertools::Itertools; use log::debug; -use object_store::path::Path; +use 
object_store::path::{DELIMITER, Path}; use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; use url::Url; diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index f03cc5936c6ed..ddf039e2daa05 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -23,14 +23,7 @@ use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; use arrow::datatypes::Field; use arrow_schema::DataType; -use arrow_schema::extension::{ - Bool8, ExtensionType, FixedShapeTensor, Json, Opaque, TimestampWithOffset, Uuid, - VariableShapeTensor, -}; -use datafusion_common::types::{ - DFBool8, DFExtensionTypeRef, DFFixedShapeTensor, DFJson, DFOpaque, - DFTimestampWithOffset, DFUuid, DFVariableShapeTensor, -}; +use datafusion_common::types::DFExtensionTypeRef; use datafusion_common::{HashMap, Result, not_impl_err, plan_datafusion_err}; use std::collections::HashSet; use std::fmt::{Debug, Formatter}; @@ -450,68 +443,70 @@ impl MemoryExtensionTypeRegistry { /// Pre-registers the [canonical extension types](https://arrow.apache.org/docs/format/CanonicalExtensions.html) /// in the extension type registry. 
pub fn new_with_canonical_extension_types() -> Self { - let mapping = [ - ExtensionTypeRegistration::new_arc( - FixedShapeTensor::NAME, - |storage_type, metadata| { - Ok(Arc::new(DFFixedShapeTensor::try_new( - storage_type, - FixedShapeTensor::deserialize_metadata(metadata)?, - )?)) - }, - ), - ExtensionTypeRegistration::new_arc( - VariableShapeTensor::NAME, - |storage_type, metadata| { - Ok(Arc::new(DFVariableShapeTensor::try_new( - storage_type, - VariableShapeTensor::deserialize_metadata(metadata)?, - )?)) - }, - ), - ExtensionTypeRegistration::new_arc(Json::NAME, |storage_type, metadata| { - Ok(Arc::new(DFJson::try_new( - storage_type, - Json::deserialize_metadata(metadata)?, - )?)) - }), - ExtensionTypeRegistration::new_arc(Uuid::NAME, |storage_type, metadata| { - Ok(Arc::new(DFUuid::try_new( - storage_type, - Uuid::deserialize_metadata(metadata)?, - )?)) - }), - ExtensionTypeRegistration::new_arc(Opaque::NAME, |storage_type, metadata| { - Ok(Arc::new(DFOpaque::try_new( - storage_type, - Opaque::deserialize_metadata(metadata)?, - )?)) - }), - ExtensionTypeRegistration::new_arc(Bool8::NAME, |storage_type, metadata| { - Ok(Arc::new(DFBool8::try_new( - storage_type, - Bool8::deserialize_metadata(metadata)?, - )?)) - }), - ExtensionTypeRegistration::new_arc( - TimestampWithOffset::NAME, - |storage_type, metadata| { - Ok(Arc::new(DFTimestampWithOffset::try_new( - storage_type, - TimestampWithOffset::deserialize_metadata(metadata)?, - )?)) - }, - ), - ]; - - let mut extension_types = HashMap::new(); - for registration in mapping.into_iter() { - extension_types.insert(registration.type_name().to_owned(), registration); - } - - Self { - extension_types: Arc::new(RwLock::new(HashMap::from(extension_types))), - } + // Figure out what happened here + // let mapping = [ + // ExtensionTypeRegistration::new_arc( + // FixedShapeTensor::NAME, + // |storage_type, metadata| { + // Ok(Arc::new(DFFixedShapeTensor::try_new( + // storage_type, + // 
FixedShapeTensor::deserialize_metadata(metadata)?, + // )?)) + // }, + // ), + // ExtensionTypeRegistration::new_arc( + // VariableShapeTensor::NAME, + // |storage_type, metadata| { + // Ok(Arc::new(DFVariableShapeTensor::try_new( + // storage_type, + // VariableShapeTensor::deserialize_metadata(metadata)?, + // )?)) + // }, + // ), + // ExtensionTypeRegistration::new_arc(Json::NAME, |storage_type, metadata| { + // Ok(Arc::new(DFJson::try_new( + // storage_type, + // Json::deserialize_metadata(metadata)?, + // )?)) + // }), + // ExtensionTypeRegistration::new_arc(Uuid::NAME, |storage_type, metadata| { + // Ok(Arc::new(DFUuid::try_new( + // storage_type, + // Uuid::deserialize_metadata(metadata)?, + // )?)) + // }), + // ExtensionTypeRegistration::new_arc(Opaque::NAME, |storage_type, metadata| { + // Ok(Arc::new(DFOpaque::try_new( + // storage_type, + // Opaque::deserialize_metadata(metadata)?, + // )?)) + // }), + // ExtensionTypeRegistration::new_arc(Bool8::NAME, |storage_type, metadata| { + // Ok(Arc::new(DFBool8::try_new( + // storage_type, + // Bool8::deserialize_metadata(metadata)?, + // )?)) + // }), + // ExtensionTypeRegistration::new_arc( + // TimestampWithOffset::NAME, + // |storage_type, metadata| { + // Ok(Arc::new(DFTimestampWithOffset::try_new( + // storage_type, + // TimestampWithOffset::deserialize_metadata(metadata)?, + // )?)) + // }, + // ), + // ]; + + // let mut extension_types = HashMap::new(); + // for registration in mapping.into_iter() { + // extension_types.insert(registration.type_name().to_owned(), registration); + // } + + // Self { + // extension_types: Arc::new(RwLock::new(HashMap::from(extension_types))), + // } + Self::new_empty() } /// Creates a new [MemoryExtensionTypeRegistry] with the provided `types`. 
diff --git a/datafusion/expr/src/simplify.rs b/datafusion/expr/src/simplify.rs index 70b7bd4d71696..d2fe2739de150 100644 --- a/datafusion/expr/src/simplify.rs +++ b/datafusion/expr/src/simplify.rs @@ -48,6 +48,7 @@ pub struct SimplifyContextBuilder { schema: Option, query_execution_start_time: Option>, config_options: Option>, + extension_types: Option>, } impl Default for SimplifyContext { @@ -110,14 +111,6 @@ impl SimplifyContext { self } - pub fn with_extension_types( - mut self, - extension_types: Option>, - ) -> Self { - self.extension_types = extension_types; - self - } - /// Returns the schema pub fn schema(&self) -> &DFSchemaRef { &self.schema @@ -182,6 +175,14 @@ impl SimplifyContextBuilder { self } + pub fn with_extension_types( + mut self, + extension_types: Option>, + ) -> Self { + self.extension_types = extension_types; + self + } + /// Build a [`SimplifyContext`], filling in any unspecified fields with defaults. pub fn build(self) -> SimplifyContext { SimplifyContext { @@ -190,6 +191,7 @@ impl SimplifyContextBuilder { config_options: self .config_options .unwrap_or_else(|| Arc::new(ConfigOptions::default())), + extension_types: self.extension_types, } } } diff --git a/datafusion/ffi/src/session/mod.rs b/datafusion/ffi/src/session/mod.rs index c8562f5ae8696..e7ccca1190251 100644 --- a/datafusion/ffi/src/session/mod.rs +++ b/datafusion/ffi/src/session/mod.rs @@ -20,8 +20,6 @@ use std::collections::HashMap; use std::ffi::c_void; use std::sync::Arc; -use arrow_schema::SchemaRef; -use arrow_schema::ffi::FFI_ArrowSchema; use arrow_schema::SchemaRef; use arrow_schema::ffi::FFI_ArrowSchema; use async_ffi::{FfiFuture, FutureExt}; @@ -47,11 +45,6 @@ use datafusion_proto::protobuf::LogicalExprNode; use datafusion_session::Session; use prost::Message; -use stabby::str::Str as SStr; -use stabby::string::String as SString; -use stabby::vec::Vec as SVec; -use tokio::runtime::Handle; -use datafusion_expr::registry::{ExtensionTypeRegistry, ExtensionTypeRegistryRef}; 
use crate::arrow_wrappers::WrappedSchema; use crate::execution::FFI_TaskContext; use crate::execution_plan::FFI_ExecutionPlan; @@ -63,6 +56,10 @@ use crate::udf::FFI_ScalarUDF; use crate::udwf::FFI_WindowUDF; use crate::util::FFI_Result; use crate::{df_result, sresult, sresult_return}; +use stabby::str::Str as SStr; +use stabby::string::String as SString; +use stabby::vec::Vec as SVec; +use tokio::runtime::Handle; pub mod config; diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 8ca5467d2ae90..90329fbb71090 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -29,7 +29,8 @@ use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; use datafusion_common::nested_struct::{ requires_nested_struct_cast, validate_data_type_compatibility, }; -use datafusion_common::{Result, not_impl_err}; +use datafusion_common::types::CastExtension; +use datafusion_common::{Result, ScalarValue, not_impl_err}; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; use datafusion_expr_common::sort_properties::ExprProperties; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index bb6d9ca9c9c78..c587ee4f3427f 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -59,4 +59,5 @@ pub use not::{NotExpr, not}; pub use try_cast::{TryCastExpr, try_cast}; pub use unknown_column::UnKnownColumn; +pub use cast::cast_with_extension; pub(crate) use cast::cast_with_target_field; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 13856ade49064..b05c3e60b0cda 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -29,6 +29,7 @@ use arrow::compute::CastOptions; use arrow::datatypes::Schema; use 
datafusion_common::config::ConfigOptions; use datafusion_common::datatype::FieldExt; +use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; use datafusion_common::metadata::{FieldMetadata, format_type_and_metadata}; use datafusion_common::{ DFSchema, Result, ScalarValue, ToDFSchema, exec_err, internal_datafusion_err, @@ -305,7 +306,6 @@ pub fn create_physical_expr( // Need to figure out what happened here - if !field.metadata().is_empty() { if let Some(registry) = &execution_props.extension_types && let Some(extension_type) = From 240adae33809b7e67c18b88dfed17d96302477ff Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Apr 2026 14:32:52 -0500 Subject: [PATCH 20/39] clean up the diff --- .../examples/extension_types/event_id.rs | 297 ------------------ datafusion/common/src/types/mod.rs | 1 + .../core/src/execution/session_state.rs | 33 +- .../array_formatter_factory.rs | 67 ---- datafusion/core/src/extension_types/mod.rs | 22 -- datafusion/core/src/lib.rs | 1 - .../tests/extension_types/pretty_printing.rs | 8 +- datafusion/expr/src/registry.rs | 134 ++++---- 8 files changed, 79 insertions(+), 484 deletions(-) delete mode 100644 datafusion-examples/examples/extension_types/event_id.rs delete mode 100644 datafusion/core/src/extension_types/array_formatter_factory.rs delete mode 100644 datafusion/core/src/extension_types/mod.rs diff --git a/datafusion-examples/examples/extension_types/event_id.rs b/datafusion-examples/examples/extension_types/event_id.rs deleted file mode 100644 index 11a27c0742f62..0000000000000 --- a/datafusion-examples/examples/extension_types/event_id.rs +++ /dev/null @@ -1,297 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow::array::{Array, RecordBatch, StringArray, UInt32Array}; -use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, FormatResult}; -use arrow_schema::extension::ExtensionType; -use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; -use datafusion::dataframe::DataFrame; -use datafusion::error::Result; -use datafusion::execution::SessionStateBuilder; -use datafusion::prelude::SessionContext; -use datafusion_common::internal_err; -use datafusion_common::types::DFExtensionType; -use datafusion_expr::registry::{ - DefaultExtensionTypeRegistration, ExtensionTypeRegistry, MemoryExtensionTypeRegistry, -}; -use std::fmt::Write; -use std::sync::Arc; - -/// This example demonstrates using DataFusion's extension type API to create a custom identifier -/// type [`EventIdExtensionType`]. -/// -/// The following use cases are demonstrated: -/// - Use a custom implementation for pretty-printing data frames. -pub async fn event_id_example() -> Result<()> { - let ctx = create_session_context()?; - register_events_table(&ctx).await?; - - // Print the example table with the custom pretty-printer. - ctx.table("example").await?.show().await -} - -/// Creates the DataFusion session context with the custom extension type implementation. -fn create_session_context() -> Result { - // Create a registry with a reference to the custom extension type implementation. 
- let registry = MemoryExtensionTypeRegistry::new(); - let event_id_registration = DefaultExtensionTypeRegistration::new_arc(|metadata| { - Ok(EventIdExtensionType(metadata)) - }); - registry.add_extension_type_registration(event_id_registration)?; - - // Set the extension type registry in the session state so that DataFusion can use it. - let state = SessionStateBuilder::default() - .with_extension_type_registry(Arc::new(registry)) - .build(); - Ok(SessionContext::new_with_state(state)) -} - -/// Registers the example table and returns the data frame. -async fn register_events_table(ctx: &SessionContext) -> Result { - let schema = example_schema(); - let batch = RecordBatch::try_new( - schema, - vec![ - Arc::new(UInt32Array::from(vec![ - 2001000000, 2001000001, 2103000000, 2103000001, 2103000002, - ])), - Arc::new(UInt32Array::from(vec![ - 2020010000, 2020010001, 2021030000, 2021030001, 2021030002, - ])), - Arc::new(StringArray::from(vec![ - "First Event Jan 2020", - "Second Event Jan 2020", - "First Event Mar 2021", - "Second Event Mar 2021", - "Third Event Mar 2021", - ])), - ], - )?; - - // Register the table and return the data frame. - ctx.register_batch("example", batch)?; - ctx.table("example").await -} - -/// The schema of the example table. -fn example_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("event_id_short", DataType::UInt32, false) - .with_extension_type(EventIdExtensionType(IdYearMode::Short)), - Field::new("event_id_long", DataType::UInt32, false) - .with_extension_type(EventIdExtensionType(IdYearMode::Long)), - Field::new("name", DataType::Utf8, false), - ])) -} - -/// Represents a 32-bit custom identifier that represents a single event. Using this format is -/// probably not a good idea in practice, but it is useful for demonstrating the API usage. 
-/// -/// An event is constructed of three parts: -/// - The year -/// - The month -/// - An auto-incrementing counter within the month -/// -/// For example, the event id `2024-01-0000` represents the first event in 2024. -/// -/// # Year Mode -/// -/// In addition, each event id can be represented in two modes. A short year mode `24-01-000000` and -/// a long year mode `2024-01-0000`. This showcases how extension types can be parameterized using -/// metadata. -#[derive(Debug)] -pub struct EventIdExtensionType(IdYearMode); - -/// Represents whether the id uses the short or long format. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum IdYearMode { - /// The short year format (e.g., `24-01-000000`). Allows for more events per month. - Short, - /// The long year format (e.g., `2024-01-0000`). Allows distinguishing between centuries. - Long, -} - -/// Implementation of [`ExtensionType`] for [`EventIdExtensionType`]. -/// -/// This is for the arrow-rs side of the API usage. The [`ExtensionType::Metadata`] type provides -/// static guarantees on the deserialized metadata for the extension type. We will use this -/// implementation to read and write the type metadata to arrow [`Field`]s. -/// -/// This trait does allow users to customize the behavior of DataFusion for this extension type. -/// This is done in [`DFExtensionType`]. -impl ExtensionType for EventIdExtensionType { - const NAME: &'static str = "custom.event_id"; - type Metadata = IdYearMode; - - fn metadata(&self) -> &Self::Metadata { - &self.0 - } - - fn serialize_metadata(&self) -> Option { - // Arrow extension type metadata is encoded as a string. We simply use the lowercase name. - // In a more involved scenario, more complex serialization formats such as JSON are - // appropriate. 
- Some(format!("{:?}", self.0).to_lowercase()) - } - - fn deserialize_metadata( - metadata: Option<&str>, - ) -> std::result::Result { - match metadata { - None => Err(ArrowError::InvalidArgumentError( - "Event id type requires metadata".to_owned(), - )), - Some(metadata) => match metadata { - "short" => Ok(IdYearMode::Short), - "long" => Ok(IdYearMode::Long), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid metadata for event id type: {metadata}" - ))), - }, - } - } - - fn supports_data_type( - &self, - data_type: &DataType, - ) -> std::result::Result<(), ArrowError> { - match data_type { - DataType::UInt32 => Ok(()), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid data type: {data_type} for event id type", - ))), - } - } - - fn try_new( - data_type: &DataType, - metadata: Self::Metadata, - ) -> std::result::Result { - let instance = Self(metadata); - instance.supports_data_type(data_type)?; // Check that the data type is supported. - Ok(instance) - } -} - -/// Implementation of [`ExtensionType`] for [`EventIdExtensionType`]. -/// -/// This is for the DataFusion side of the API usage. Here users can override the default behavior -/// of DataFusion for supported extension points. -impl DFExtensionType for EventIdExtensionType { - fn create_array_formatter<'fmt>( - &self, - array: &'fmt dyn Array, - options: &FormatOptions<'fmt>, - ) -> Result>> { - if array.data_type() != &DataType::UInt32 { - return internal_err!("Wrong array type for Event Id"); - } - - // Create the formatter and pass in the year formatting mode of the type - let display_index = EventIdDisplayIndex { - array: array.as_any().downcast_ref().unwrap(), - null_str: options.null(), - mode: self.0, - }; - Ok(Some(ArrayFormatter::new( - Box::new(display_index), - options.safe(), - ))) - } -} - -/// Pretty printer for event ids. 
-#[derive(Debug)] -struct EventIdDisplayIndex<'a> { - array: &'a UInt32Array, - null_str: &'a str, - mode: IdYearMode, -} - -/// This implements the arrow-rs API for printing individual values of a column. DataFusion will -/// automatically pass in the reference to this implementation if a column is annotated with the -/// extension type metadata. -impl DisplayIndex for EventIdDisplayIndex<'_> { - fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { - // Handle nulls first - if self.array.is_null(idx) { - write!(f, "{}", self.null_str)?; - return Ok(()); - } - - let value = self.array.value(idx); - - match self.mode { - IdYearMode::Short => { - // Format: YY-MM-CCCCCC - // Logic: - // - The last 6 digits are the counter. - // - The next 2 digits are the month. - // - The remaining digits are the year. - let counter = value % 1_000_000; - let rest = value / 1_000_000; - let month = rest % 100; - let year = rest / 100; - - write!(f, "{year:02}-{month:02}-{counter:06}")?; - } - IdYearMode::Long => { - // Format: YYYY-MM-CCCC - // Logic: - // - The last 4 digits are the counter. - // - The next 2 digits are the month. - // - The remaining digits are the year. 
- let counter = value % 10_000; - let rest = value / 10_000; - let month = rest % 100; - let year = rest / 100; - - write!(f, "{year:04}-{month:02}-{counter:04}")?; - } - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use insta::assert_snapshot; - - #[tokio::test] - async fn test_print_example_table() -> Result<()> { - let ctx = create_session_context()?; - let table = register_events_table(&ctx).await?; - - assert_snapshot!( - table.to_string().await?, - @r" - +----------------+---------------+-----------------------+ - | event_id_short | event_id_long | name | - +----------------+---------------+-----------------------+ - | 20-01-000000 | 2020-01-0000 | First Event Jan 2020 | - | 20-01-000001 | 2020-01-0001 | Second Event Jan 2020 | - | 21-03-000000 | 2021-03-0000 | First Event Mar 2021 | - | 21-03-000001 | 2021-03-0001 | Second Event Mar 2021 | - | 21-03-000002 | 2021-03-0002 | Third Event Mar 2021 | - +----------------+---------------+-----------------------+ - " - ); - - Ok(()) - } -} diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs index 82455063bc6ce..57bf921a6d564 100644 --- a/datafusion/common/src/types/mod.rs +++ b/datafusion/common/src/types/mod.rs @@ -23,6 +23,7 @@ mod logical; mod native; pub use builtin::*; +pub use canonical_extensions::*; pub use extension::*; pub use field::*; pub use logical::*; diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 5a209793ec612..9ba24bab30dcc 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -57,8 +57,8 @@ use datafusion_expr::planner::ExprPlanner; #[cfg(feature = "sql")] use datafusion_expr::planner::{RelationPlanner, TypePlanner}; use datafusion_expr::registry::{ - ExtensionTypeRegistrationRef, ExtensionTypeRegistry, ExtensionTypeRegistryRef, - FunctionRegistry, MemoryExtensionTypeRegistry, SerializerRegistry, + 
ExtensionTypeRegistry, ExtensionTypeRegistryRef, FunctionRegistry, + MemoryExtensionTypeRegistry, SerializerRegistry, }; use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ @@ -2270,35 +2270,6 @@ impl datafusion_execution::TaskContextProvider for SessionState { } } -impl ExtensionTypeRegistry for SessionState { - fn extension_type_registration( - &self, - name: &str, - ) -> datafusion_common::Result { - self.extension_types.extension_type_registration(name) - } - - fn extension_type_registrations(&self) -> Vec { - self.extension_types.extension_type_registrations() - } - - fn add_extension_type_registration( - &self, - extension_type: ExtensionTypeRegistrationRef, - ) -> datafusion_common::Result> { - self.extension_types - .add_extension_type_registration(extension_type) - } - - fn remove_extension_type_registration( - &self, - name: &str, - ) -> datafusion_common::Result> { - self.extension_types - .remove_extension_type_registration(name) - } -} - impl OptimizerConfig for SessionState { fn query_execution_start_time(&self) -> Option> { self.execution_props.query_execution_start_time diff --git a/datafusion/core/src/extension_types/array_formatter_factory.rs b/datafusion/core/src/extension_types/array_formatter_factory.rs deleted file mode 100644 index 1d970612f9078..0000000000000 --- a/datafusion/core/src/extension_types/array_formatter_factory.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow::array::Array; -use arrow::util::display::{ArrayFormatter, ArrayFormatterFactory, FormatOptions}; -use arrow_schema::{ArrowError, Field}; -use datafusion_expr::registry::ExtensionTypeRegistryRef; - -/// A factory for creating [`ArrayFormatter`]s that checks whether a registered extension type can -/// format a given array based on its metadata. -#[derive(Debug)] -pub struct DFArrayFormatterFactory { - /// The extension type registry - registry: ExtensionTypeRegistryRef, -} - -impl DFArrayFormatterFactory { - /// Creates a new [`DFArrayFormatterFactory`]. - pub fn new(registry: ExtensionTypeRegistryRef) -> Self { - Self { registry } - } -} - -impl ArrayFormatterFactory for DFArrayFormatterFactory { - fn create_array_formatter<'formatter>( - &self, - array: &'formatter dyn Array, - options: &FormatOptions<'formatter>, - field: Option<&'formatter Field>, - ) -> Result>, ArrowError> { - let Some(field) = field else { - return Ok(None); - }; - - let Some(extension_type_name) = field.extension_type_name() else { - return Ok(None); - }; - - let Some(registration) = self - .registry - .extension_type_registration(extension_type_name) - .ok() - else { - // If the extension type is not registered, we fall back to the default formatter - return Ok(None); - }; - - registration - .create_df_extension_type(field.data_type(), field.extension_type_metadata())? 
- .create_array_formatter(array, options) - .map_err(ArrowError::from) - } -} diff --git a/datafusion/core/src/extension_types/mod.rs b/datafusion/core/src/extension_types/mod.rs deleted file mode 100644 index 55ec1ad95b5a1..0000000000000 --- a/datafusion/core/src/extension_types/mod.rs +++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This module contains code that enables DataFusion's extension type capabilities. 
- -mod array_formatter_factory; - -pub use array_formatter_factory::*; diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 1f2ef15a356dc..1d8368f54ba20 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -894,7 +894,6 @@ pub mod variable { #[cfg(not(target_arch = "wasm32"))] pub mod test; -mod extension_types; mod schema_equivalence; pub mod test_util; diff --git a/datafusion/core/tests/extension_types/pretty_printing.rs b/datafusion/core/tests/extension_types/pretty_printing.rs index 12123f650484d..aa0eaec3916d2 100644 --- a/datafusion/core/tests/extension_types/pretty_printing.rs +++ b/datafusion/core/tests/extension_types/pretty_printing.rs @@ -95,8 +95,10 @@ async fn create_cast_uuid_to_char() -> Result<()> { )?; let state = SessionStateBuilder::default() - .with_canonical_extension_types()? .with_type_planner(Arc::new(CustomTypePlanner {})) + .with_extension_type_registry(Arc::new( + MemoryExtensionTypeRegistry::new_with_canonical_extension_types(), + )) .build(); let ctx = SessionContext::new_with_state(state); @@ -123,8 +125,10 @@ async fn create_cast_uuid_to_char() -> Result<()> { #[tokio::test] async fn create_cast_char_to_uuid() -> Result<()> { let state = SessionStateBuilder::default() - .with_canonical_extension_types()? 
.with_type_planner(Arc::new(CustomTypePlanner {})) + .with_extension_type_registry(Arc::new( + MemoryExtensionTypeRegistry::new_with_canonical_extension_types(), + )) .build(); let ctx = SessionContext::new_with_state(state); diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index ddf039e2daa05..ed8db80c4463b 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -23,7 +23,14 @@ use crate::planner::ExprPlanner; use crate::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF}; use arrow::datatypes::Field; use arrow_schema::DataType; -use datafusion_common::types::DFExtensionTypeRef; +use arrow_schema::extension::{ + Bool8, ExtensionType, FixedShapeTensor, Json, Opaque, TimestampWithOffset, Uuid, + VariableShapeTensor, +}; +use datafusion_common::types::{ + DFBool8, DFExtensionTypeRef, DFFixedShapeTensor, DFJson, DFOpaque, + DFTimestampWithOffset, DFUuid, DFVariableShapeTensor, +}; use datafusion_common::{HashMap, Result, not_impl_err, plan_datafusion_err}; use std::collections::HashSet; use std::fmt::{Debug, Formatter}; @@ -444,69 +451,68 @@ impl MemoryExtensionTypeRegistry { /// in the extension type registry. 
pub fn new_with_canonical_extension_types() -> Self { // Figure out what happened here - // let mapping = [ - // ExtensionTypeRegistration::new_arc( - // FixedShapeTensor::NAME, - // |storage_type, metadata| { - // Ok(Arc::new(DFFixedShapeTensor::try_new( - // storage_type, - // FixedShapeTensor::deserialize_metadata(metadata)?, - // )?)) - // }, - // ), - // ExtensionTypeRegistration::new_arc( - // VariableShapeTensor::NAME, - // |storage_type, metadata| { - // Ok(Arc::new(DFVariableShapeTensor::try_new( - // storage_type, - // VariableShapeTensor::deserialize_metadata(metadata)?, - // )?)) - // }, - // ), - // ExtensionTypeRegistration::new_arc(Json::NAME, |storage_type, metadata| { - // Ok(Arc::new(DFJson::try_new( - // storage_type, - // Json::deserialize_metadata(metadata)?, - // )?)) - // }), - // ExtensionTypeRegistration::new_arc(Uuid::NAME, |storage_type, metadata| { - // Ok(Arc::new(DFUuid::try_new( - // storage_type, - // Uuid::deserialize_metadata(metadata)?, - // )?)) - // }), - // ExtensionTypeRegistration::new_arc(Opaque::NAME, |storage_type, metadata| { - // Ok(Arc::new(DFOpaque::try_new( - // storage_type, - // Opaque::deserialize_metadata(metadata)?, - // )?)) - // }), - // ExtensionTypeRegistration::new_arc(Bool8::NAME, |storage_type, metadata| { - // Ok(Arc::new(DFBool8::try_new( - // storage_type, - // Bool8::deserialize_metadata(metadata)?, - // )?)) - // }), - // ExtensionTypeRegistration::new_arc( - // TimestampWithOffset::NAME, - // |storage_type, metadata| { - // Ok(Arc::new(DFTimestampWithOffset::try_new( - // storage_type, - // TimestampWithOffset::deserialize_metadata(metadata)?, - // )?)) - // }, - // ), - // ]; - - // let mut extension_types = HashMap::new(); - // for registration in mapping.into_iter() { - // extension_types.insert(registration.type_name().to_owned(), registration); - // } - - // Self { - // extension_types: Arc::new(RwLock::new(HashMap::from(extension_types))), - // } - Self::new_empty() + let mapping = [ + 
ExtensionTypeRegistration::new_arc( + FixedShapeTensor::NAME, + |storage_type, metadata| { + Ok(Arc::new(DFFixedShapeTensor::try_new( + storage_type, + FixedShapeTensor::deserialize_metadata(metadata)?, + )?)) + }, + ), + ExtensionTypeRegistration::new_arc( + VariableShapeTensor::NAME, + |storage_type, metadata| { + Ok(Arc::new(DFVariableShapeTensor::try_new( + storage_type, + VariableShapeTensor::deserialize_metadata(metadata)?, + )?)) + }, + ), + ExtensionTypeRegistration::new_arc(Json::NAME, |storage_type, metadata| { + Ok(Arc::new(DFJson::try_new( + storage_type, + Json::deserialize_metadata(metadata)?, + )?)) + }), + ExtensionTypeRegistration::new_arc(Uuid::NAME, |storage_type, metadata| { + Ok(Arc::new(DFUuid::try_new( + storage_type, + Uuid::deserialize_metadata(metadata)?, + )?)) + }), + ExtensionTypeRegistration::new_arc(Opaque::NAME, |storage_type, metadata| { + Ok(Arc::new(DFOpaque::try_new( + storage_type, + Opaque::deserialize_metadata(metadata)?, + )?)) + }), + ExtensionTypeRegistration::new_arc(Bool8::NAME, |storage_type, metadata| { + Ok(Arc::new(DFBool8::try_new( + storage_type, + Bool8::deserialize_metadata(metadata)?, + )?)) + }), + ExtensionTypeRegistration::new_arc( + TimestampWithOffset::NAME, + |storage_type, metadata| { + Ok(Arc::new(DFTimestampWithOffset::try_new( + storage_type, + TimestampWithOffset::deserialize_metadata(metadata)?, + )?)) + }, + ), + ]; + + let mut extension_types = HashMap::new(); + for registration in mapping.into_iter() { + extension_types.insert(registration.type_name().to_owned(), registration); + } + + Self { + extension_types: Arc::new(RwLock::new(HashMap::from(extension_types))), + } } /// Creates a new [MemoryExtensionTypeRegistry] with the provided `types`. 
From ae03e0e4dc01cf479c06df438a5cc7e5eeb1968a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Apr 2026 14:33:37 -0500 Subject: [PATCH 21/39] undo testing change --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 0d60ccae40d0e..7df2b70baf4f0 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 +Subproject commit 7df2b70baf4f081ebf8e0c6bd22745cf3cbfd824 From a1c1e02d3c64130b9851070f2296a1f041d8a29c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Apr 2026 15:18:55 -0500 Subject: [PATCH 22/39] tests passing maybe Co-authored-by: Copilot --- datafusion/physical-expr/src/expressions/cast.rs | 7 +++++++ datafusion/physical-expr/src/planner.rs | 2 -- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 90329fbb71090..c8379912ee3d7 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -167,7 +167,14 @@ impl CastExpr { } fn resolved_target_field(&self, input_schema: &Schema) -> Result { + // When using a cast_extension, return the explicit target_field to avoid + // propagating source metadata (e.g., extension type metadata) to the output. 
+ if self.cast_extension.is_some() { + return Ok(Arc::clone(&self.target_field)); + } + if is_default_target_field(&self.target_field) { + // TODO: not correct, metadata should not be propagated here self.expr.return_field(input_schema).map(|field| { Arc::new( field diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index b05c3e60b0cda..85c30891068d6 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -304,8 +304,6 @@ pub fn create_physical_expr( format_options: DEFAULT_FORMAT_OPTIONS, }; - // Need to figure out what happened here - if !field.metadata().is_empty() { if let Some(registry) = &execution_props.extension_types && let Some(extension_type) = From 359627305e418719623dff46c6b7c8bd9c989454 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 13:18:49 -0500 Subject: [PATCH 23/39] reintroduce the cast extension --- datafusion/common/src/nested_struct.rs | 111 ++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index cdd6215d08e2f..ec1298ce15af5 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -15,7 +15,11 @@ // specific language governing permissions and limitations // under the License. 
-use crate::error::{_plan_err, Result}; +use crate::{ + datatype::DataTypeExt, + error::{_internal_err, _plan_err, Result}, + metadata::format_type_and_metadata, +}; use arrow::{ array::{ Array, ArrayRef, DictionaryArray, GenericListArray, GenericListViewArray, @@ -173,7 +177,34 @@ pub fn cast_column( target_type: &DataType, cast_options: &CastOptions, ) -> Result { - match (source_col.data_type(), target_type) { + cast_column_fields( + source_col, + &source_col.data_type().clone().into_nullable_field(), + &target_type.clone().into_nullable_field(), + None, + cast_options, + ) +} + +pub fn cast_column_fields( + source_col: &ArrayRef, + source_field: &Field, + target_field: &Field, + cast_extension: Option<&dyn CastExtension>, + cast_options: &CastOptions, +) -> Result { + if let Some(cast_extension) = cast_extension + && cast_extension.can_cast_types(source_field, target_field, cast_options)? + { + return cast_extension.cast_array( + source_col, + source_field, + target_field, + cast_options, + ); + } + + match (source_field.data_type(), target_field.data_type()) { (_, Struct(target_fields)) => { cast_struct_column(source_col, target_fields, cast_options) } @@ -199,7 +230,11 @@ pub fn cast_column( target_value_type, cast_options, ), - _ => Ok(cast_with_options(source_col, target_type, cast_options)?), + _ => Ok(cast_with_options( + source_col, + target_field.data_type(), + cast_options, + )?), } } @@ -502,6 +537,76 @@ pub fn has_one_of_more_common_fields( .any(|field| source_names.contains(field.name().as_str())) } +pub trait CastExtension: std::fmt::Debug + Send + Sync { + fn can_cast_types( + &self, + source_field: &Field, + target_field: &Field, + cast_options: &CastOptions, + ) -> Result; + + fn cast_array( + &self, + array: &ArrayRef, + source_field: &Field, + target_field: &Field, + cast_options: &CastOptions, + ) -> Result; +} + +#[derive(Debug)] +pub struct VecCastExtension { + extensions: Vec>, +} + +impl CastExtension for VecCastExtension { + fn 
can_cast_types( + &self, + source_field: &Field, + target_field: &Field, + cast_options: &CastOptions, + ) -> Result { + for extension in &self.extensions { + if extension.can_cast_types(source_field, target_field, cast_options)? { + return Ok(true); + } + } + + Ok(false) + } + + fn cast_array( + &self, + array: &ArrayRef, + source_field: &Field, + target_field: &Field, + cast_options: &CastOptions, + ) -> Result { + for extension in &self.extensions { + if extension.can_cast_types(source_field, target_field, cast_options)? { + return extension.cast_array( + array, + source_field, + target_field, + cast_options, + ); + } + } + + let source_display = format_type_and_metadata( + source_field.data_type(), + Some(source_field.metadata()), + ); + let target_display = format_type_and_metadata( + target_field.data_type(), + Some(target_field.metadata()), + ); + _internal_err!( + "Can't resolve extension to cast from {source_display} to {target_display}" + ) + } +} + #[cfg(test)] mod tests { use super::*; From 1a33151a33064282724fe8b2b672b69ee0ec71d9 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 14:18:35 -0500 Subject: [PATCH 24/39] reimplement the cast extension --- datafusion/common/src/metadata.rs | 42 +++++- datafusion/common/src/nested_struct.rs | 140 ++++++++++++------ .../physical-expr/src/expressions/cast.rs | 2 +- 3 files changed, 138 insertions(+), 46 deletions(-) diff --git a/datafusion/common/src/metadata.rs b/datafusion/common/src/metadata.rs index d6d8fb7b0ed0c..cba52d35af84e 100644 --- a/datafusion/common/src/metadata.rs +++ b/datafusion/common/src/metadata.rs @@ -17,10 +17,16 @@ use std::{collections::BTreeMap, sync::Arc}; -use arrow::datatypes::{DataType, Field, FieldRef}; +use arrow::{ + compute::CastOptions, + datatypes::{DataType, Field, FieldRef}, +}; use hashbrown::HashMap; -use crate::{DataFusionError, ScalarValue, error::_plan_err}; +use crate::{ + DataFusionError, ScalarValue, datatype::DataTypeExt, error::_plan_err, + 
nested_struct::CastExtension, +}; /// A [`ScalarValue`] with optional [`FieldMetadata`] #[derive(Debug, Clone)] @@ -62,6 +68,38 @@ impl ScalarAndMetadata { let new_value = self.value().cast_to(target_type)?; Ok(Self::new(new_value, self.metadata.clone())) } + + /// Try to cast this value to a ScalarValue of type `target_field` with [`CastOptions`] + pub fn cast_to_with_options( + &self, + target_field: &Field, + cast_extension: Option<&dyn CastExtension>, + cast_options: &CastOptions, + ) -> Result { + let mut source_field = self.value.data_type().into_nullable_field(); + if let Some(metadata) = &self.metadata { + source_field = metadata.add_to_field(source_field); + } + + if let Some(cast_extension) = cast_extension + && cast_extension.can_cast_types(&source_field, target_field)? + { + let cast_arr = cast_extension.cast_array( + &self.value.to_array()?, + &source_field, + target_field, + cast_options, + )?; + let storage = ScalarValue::try_from_array(&cast_arr, 0)?; + let metadata = FieldMetadata::new_from_field(target_field); + return Ok(Self { + value: storage, + metadata: Some(metadata), + }); + } + + self.cast_storage_to(target_field.data_type()) + } } /// create a new ScalarAndMetadata from a ScalarValue without diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index ec1298ce15af5..12d6c09d80fbb 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -60,6 +60,7 @@ use std::{collections::HashSet, sync::Arc}; fn cast_struct_column( source_col: &ArrayRef, target_fields: &[Arc], + cast_extension: Option<&dyn CastExtension>, cast_options: &CastOptions, ) -> Result { if source_col.data_type() == &DataType::Null @@ -82,14 +83,20 @@ fn cast_struct_column( for target_child_field in target_fields.iter() { fields.push(Arc::clone(target_child_field)); - let source_child_opt = - source_struct.column_by_name(target_child_field.name()); - - match source_child_opt { - 
Some(source_child_col) => { - let adapted_child = cast_column( - source_child_col, - target_child_field.data_type(), + let source_child_index_opt = source_struct + .column_names() + .iter() + .position(|name| *name == target_child_field.name()); + // let source_child_opt = + // source_struct.column_by_name(target_child_field.name()); + + match source_child_index_opt { + Some(source_child_index) => { + let adapted_child = cast_column_fields( + source_struct.column(source_child_index), + source_struct.fields()[source_child_index].as_ref(), + target_child_field.as_ref(), + cast_extension, cast_options, ) .map_err(|e| { @@ -194,7 +201,7 @@ pub fn cast_column_fields( cast_options: &CastOptions, ) -> Result { if let Some(cast_extension) = cast_extension - && cast_extension.can_cast_types(source_field, target_field, cast_options)? + && cast_extension.can_cast_types(source_field, target_field)? { return cast_extension.cast_array( source_col, @@ -206,20 +213,45 @@ pub fn cast_column_fields( match (source_field.data_type(), target_field.data_type()) { (_, Struct(target_fields)) => { - cast_struct_column(source_col, target_fields, cast_options) + cast_struct_column(source_col, target_fields, cast_extension, cast_options) } - (DataType::List(_), DataType::List(target_inner)) => { - cast_list_column::(source_col, target_inner, cast_options) + (DataType::List(source_inner), DataType::List(target_inner)) => { + cast_list_column::( + source_col, + source_inner, + target_inner, + cast_extension, + cast_options, + ) } - (DataType::LargeList(_), DataType::LargeList(target_inner)) => { - cast_list_column::(source_col, target_inner, cast_options) + (DataType::LargeList(source_inner), DataType::LargeList(target_inner)) => { + cast_list_column::( + source_col, + source_inner, + target_inner, + cast_extension, + cast_options, + ) } - (DataType::ListView(_), DataType::ListView(target_inner)) => { - cast_list_view_column::(source_col, target_inner, cast_options) - } - 
(DataType::LargeListView(_), DataType::LargeListView(target_inner)) => { - cast_list_view_column::(source_col, target_inner, cast_options) + (DataType::ListView(source_inner), DataType::ListView(target_inner)) => { + cast_list_view_column::( + source_col, + source_inner, + target_inner, + cast_extension, + cast_options, + ) } + ( + DataType::LargeListView(source_inner), + DataType::LargeListView(target_inner), + ) => cast_list_view_column::( + source_col, + source_inner, + target_inner, + cast_extension, + cast_options, + ), ( DataType::Dictionary(source_key_type, _), DataType::Dictionary(target_key_type, target_value_type), @@ -228,6 +260,7 @@ pub fn cast_column_fields( source_key_type, target_key_type, target_value_type, + cast_extension, cast_options, ), _ => Ok(cast_with_options( @@ -240,7 +273,9 @@ pub fn cast_column_fields( fn cast_list_column( source_col: &ArrayRef, + source_inner_field: &FieldRef, target_inner_field: &FieldRef, + cast_extension: Option<&dyn CastExtension>, cast_options: &CastOptions, ) -> Result { let source_list = source_col @@ -253,9 +288,11 @@ fn cast_list_column( )) })?; - let cast_values = cast_column( + let cast_values = cast_column_fields( source_list.values(), - target_inner_field.data_type(), + source_inner_field.as_ref(), + target_inner_field.as_ref(), + cast_extension, cast_options, )?; @@ -270,7 +307,9 @@ fn cast_list_column( fn cast_list_view_column( source_col: &ArrayRef, + source_inner_field: &FieldRef, target_inner_field: &FieldRef, + cast_extension: Option<&dyn CastExtension>, cast_options: &CastOptions, ) -> Result { let source_list = source_col @@ -283,9 +322,11 @@ fn cast_list_view_column( )) })?; - let cast_values = cast_column( + let cast_values = cast_column_fields( source_list.values(), - target_inner_field.data_type(), + source_inner_field.as_ref(), + target_inner_field.as_ref(), + cast_extension, cast_options, )?; @@ -304,6 +345,7 @@ fn cast_dictionary_column( source_key_type: &DataType, target_key_type: &DataType, 
target_value_type: &DataType, + cast_extension: Option<&dyn CastExtension>, cast_options: &CastOptions, ) -> Result { // Dispatch on source key type to access keys/values, then recursively @@ -314,8 +356,13 @@ fn cast_dictionary_column( .as_any() .downcast_ref::>() .expect("downcast must succeed"); - let cast_values = - cast_column(source_dict.values(), target_value_type, cast_options)?; + let cast_values = cast_column_fields( + source_dict.values(), + &source_dict.data_type().clone().into_nullable_field(), + &target_value_type.clone().into_nullable_field(), + cast_extension, + cast_options, + )?; Ok(Arc::new(DictionaryArray::<$t>::new( source_dict.keys().clone(), cast_values, @@ -380,6 +427,14 @@ fn cast_dictionary_column( pub fn validate_struct_compatibility( source_fields: &[FieldRef], target_fields: &[FieldRef], +) -> Result<()> { + validate_struct_compatibility_with_extension(source_fields, target_fields, None) +} + +pub fn validate_struct_compatibility_with_extension( + source_fields: &[FieldRef], + target_fields: &[FieldRef], + cast_extension: Option<&dyn CastExtension>, ) -> Result<()> { let has_overlap = has_one_of_more_common_fields(source_fields, target_fields); if !has_overlap { @@ -397,7 +452,7 @@ pub fn validate_struct_compatibility( .iter() .find(|f| f.name() == target_field.name()) { - validate_field_compatibility(source_field, target_field)?; + validate_field_compatibility(source_field, target_field, cast_extension)?; } else { // Target field is missing from source // If it's non-nullable, we cannot fill it with NULL @@ -418,6 +473,7 @@ pub fn validate_struct_compatibility( fn validate_field_compatibility( source_field: &Field, target_field: &Field, + cast_extension: Option<&dyn CastExtension>, ) -> Result<()> { if source_field.data_type() == &DataType::Null { // Validate that target allows nulls before returning early. 
@@ -442,10 +498,17 @@ fn validate_field_compatibility( ); } + if let Some(cast_extension) = cast_extension + && cast_extension.can_cast_types(source_field, target_field)? + { + return Ok(()); + } + validate_data_type_compatibility( target_field.name(), source_field.data_type(), target_field.data_type(), + cast_extension, ) } @@ -455,6 +518,7 @@ pub fn validate_data_type_compatibility( field_name: &str, source_type: &DataType, target_type: &DataType, + cast_extension: Option<&dyn CastExtension>, ) -> Result<()> { match (source_type, target_type) { (Struct(source_nested), Struct(target_nested)) => { @@ -464,7 +528,7 @@ pub fn validate_data_type_compatibility( | (DataType::LargeList(s), DataType::LargeList(t)) | (DataType::ListView(s), DataType::ListView(t)) | (DataType::LargeListView(s), DataType::LargeListView(t)) => { - validate_field_compatibility(s, t)?; + validate_field_compatibility(s, t, cast_extension)?; } (DataType::Dictionary(s_key, s_val), DataType::Dictionary(t_key, t_val)) => { if !can_cast_types(s_key, t_key) { @@ -475,7 +539,7 @@ pub fn validate_data_type_compatibility( field_name ); } - validate_data_type_compatibility(field_name, s_val, t_val)?; + validate_data_type_compatibility(field_name, s_val, t_val, cast_extension)?; } _ => { if !can_cast_types(source_type, target_type) { @@ -538,12 +602,7 @@ pub fn has_one_of_more_common_fields( } pub trait CastExtension: std::fmt::Debug + Send + Sync { - fn can_cast_types( - &self, - source_field: &Field, - target_field: &Field, - cast_options: &CastOptions, - ) -> Result; + fn can_cast_types(&self, source_field: &Field, target_field: &Field) -> Result; fn cast_array( &self, @@ -560,14 +619,9 @@ pub struct VecCastExtension { } impl CastExtension for VecCastExtension { - fn can_cast_types( - &self, - source_field: &Field, - target_field: &Field, - cast_options: &CastOptions, - ) -> Result { + fn can_cast_types(&self, source_field: &Field, target_field: &Field) -> Result { for extension in &self.extensions { - 
if extension.can_cast_types(source_field, target_field, cast_options)? { + if extension.can_cast_types(source_field, target_field)? { return Ok(true); } } @@ -583,7 +637,7 @@ impl CastExtension for VecCastExtension { cast_options: &CastOptions, ) -> Result { for extension in &self.extensions { - if extension.can_cast_types(source_field, target_field, cast_options)? { + if extension.can_cast_types(source_field, target_field)? { return extension.cast_array( array, source_field, @@ -1319,7 +1373,7 @@ mod tests { DataType::Dictionary(Box::new(DataType::Int32), Box::new(source_inner)); let target = DataType::Dictionary(Box::new(DataType::Int32), Box::new(target_inner)); - assert!(validate_data_type_compatibility("col", &source, &target).is_ok()); + assert!(validate_data_type_compatibility("col", &source, &target, None).is_ok()); } #[test] diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index c8379912ee3d7..5c49f33540e04 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -52,7 +52,7 @@ const DEFAULT_SAFE_CAST_OPTIONS: CastOptions<'static> = CastOptions { /// instead of deferring errors to execution. Handles structs at any nesting level /// (e.g., `List`, `Dictionary<_, Struct>`). 
fn can_cast_named_struct_types(source: &DataType, target: &DataType) -> bool { - validate_data_type_compatibility("", source, target).is_ok() + validate_data_type_compatibility("", source, target, None).is_ok() } /// CAST expression casts an expression to a specific data type and returns a runtime error on invalid cast From 8bd685fff16995e755ac218c8a289ffe01c47038 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 14:31:37 -0500 Subject: [PATCH 25/39] try to align previous cast extension with the new one --- datafusion/common/src/metadata.rs | 4 +-- datafusion/common/src/nested_struct.rs | 25 +++++++++------ .../src/types/canonical_extensions/uuid.rs | 32 ++++++++++--------- datafusion/common/src/types/extension.rs | 8 ++--- .../src/schema_rewriter.rs | 2 ++ .../physical-expr/src/expressions/cast.rs | 5 +-- datafusion/physical-expr/src/planner.rs | 14 ++------ 7 files changed, 45 insertions(+), 45 deletions(-) diff --git a/datafusion/common/src/metadata.rs b/datafusion/common/src/metadata.rs index cba52d35af84e..a0078a56e7888 100644 --- a/datafusion/common/src/metadata.rs +++ b/datafusion/common/src/metadata.rs @@ -82,9 +82,9 @@ impl ScalarAndMetadata { } if let Some(cast_extension) = cast_extension - && cast_extension.can_cast_types(&source_field, target_field)? + && cast_extension.can_cast_fields(&source_field, target_field)? { - let cast_arr = cast_extension.cast_array( + let cast_arr = cast_extension.cast_array_fields( &self.value.to_array()?, &source_field, target_field, diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 12d6c09d80fbb..264fc87fee39a 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -201,9 +201,9 @@ pub fn cast_column_fields( cast_options: &CastOptions, ) -> Result { if let Some(cast_extension) = cast_extension - && cast_extension.can_cast_types(source_field, target_field)? 
+ && cast_extension.can_cast_fields(source_field, target_field)? { - return cast_extension.cast_array( + return cast_extension.cast_array_fields( source_col, source_field, target_field, @@ -499,7 +499,7 @@ fn validate_field_compatibility( } if let Some(cast_extension) = cast_extension - && cast_extension.can_cast_types(source_field, target_field)? + && cast_extension.can_cast_fields(source_field, target_field)? { return Ok(()); } @@ -602,9 +602,10 @@ pub fn has_one_of_more_common_fields( } pub trait CastExtension: std::fmt::Debug + Send + Sync { - fn can_cast_types(&self, source_field: &Field, target_field: &Field) -> Result; + fn can_cast_fields(&self, source_field: &Field, target_field: &Field) + -> Result; - fn cast_array( + fn cast_array_fields( &self, array: &ArrayRef, source_field: &Field, @@ -619,9 +620,13 @@ pub struct VecCastExtension { } impl CastExtension for VecCastExtension { - fn can_cast_types(&self, source_field: &Field, target_field: &Field) -> Result { + fn can_cast_fields( + &self, + source_field: &Field, + target_field: &Field, + ) -> Result { for extension in &self.extensions { - if extension.can_cast_types(source_field, target_field)? { + if extension.can_cast_fields(source_field, target_field)? { return Ok(true); } } @@ -629,7 +634,7 @@ impl CastExtension for VecCastExtension { Ok(false) } - fn cast_array( + fn cast_array_fields( &self, array: &ArrayRef, source_field: &Field, @@ -637,8 +642,8 @@ impl CastExtension for VecCastExtension { cast_options: &CastOptions, ) -> Result { for extension in &self.extensions { - if extension.can_cast_types(source_field, target_field)? { - return extension.cast_array( + if extension.can_cast_fields(source_field, target_field)? 
{ + return extension.cast_array_fields( array, source_field, target_field, diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index da532cae3b339..0561d7236522d 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -17,7 +17,7 @@ use crate::Result; use crate::cast::{as_fixed_size_binary_array, as_string_array}; -use crate::error::_internal_err; +use crate::error::{_exec_err, _internal_err}; use crate::types::CastExtension; use crate::types::extension::DFExtensionType; use arrow::array::{ @@ -117,40 +117,38 @@ impl DisplayIndex for UuidValueDisplayIndex<'_> { struct CastFromUuid {} impl CastExtension for CastFromUuid { - fn can_cast(&self, _from: &Field, to: &Field, options: &CastOptions) -> Result { + fn can_cast_fields(&self, _from: &Field, to: &Field) -> Result { if to.extension_type_name().is_some() { return Ok(false); } match to.data_type() { // Only explicit casts to string - DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { - if options.safe { - Ok(false) - } else { - Ok(true) - } - } + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => Ok(true), // Can implicitly cast to storage DataType::FixedSizeBinary(16) => Ok(true), _ => Ok(false), } } - fn cast( + fn cast_array_fields( &self, value: ArrayRef, from: &Field, to: &Field, options: &CastOptions, ) -> Result { - if !self.can_cast(from, to, options)? { + if !self.can_cast_fields(from, to)? 
{ return _internal_err!("Unhandled cast"); } let storage = as_fixed_size_binary_array(&value)?; match to.data_type() { DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { + if options.safe { + return _exec_err!("Cast from string to UUID must be explicit"); + } + let mut builder = StringBuilder::with_capacity(storage.len(), storage.len() * 36); for bytes_opt in storage { @@ -182,23 +180,27 @@ impl CastExtension for CastFromUuid { struct CastToUuid {} impl CastExtension for CastToUuid { - fn can_cast(&self, from: &Field, to: &Field, options: &CastOptions) -> Result { - CastFromUuid {}.can_cast(to, from, options) + fn can_cast_fields(&self, from: &Field, to: &Field) -> Result { + CastFromUuid {}.can_cast_fields(to, from) } - fn cast( + fn cast_array_fields( &self, value: ArrayRef, from: &Field, to: &Field, options: &CastOptions, ) -> Result { - if !self.can_cast(from, to, options)? { + if !self.can_cast_fields(from, to)? { return _internal_err!("Unhandled cast"); } match from.data_type() { DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { + if options.safe { + return _exec_err!("Cast from UUID to string must be explicit"); + } + let string_array_ref = cast(&value, &DataType::Utf8)?; let string_array = as_string_array(&string_array_ref)?; let mut builder = FixedSizeBinaryBuilder::new(16); diff --git a/datafusion/common/src/types/extension.rs b/datafusion/common/src/types/extension.rs index 8651b46db19a2..b97f378dccc07 100644 --- a/datafusion/common/src/types/extension.rs +++ b/datafusion/common/src/types/extension.rs @@ -99,10 +99,10 @@ pub trait DFExtensionType: Debug + Send + Sync { } pub trait CastExtension: Debug + Send + Sync { - fn can_cast(&self, from: &Field, to: &Field, options: &CastOptions) -> Result; + fn can_cast_fields(&self, from: &Field, to: &Field) -> Result; // None for fallback - fn cast( + fn cast_array_fields( &self, value: ArrayRef, from: &Field, @@ -115,11 +115,11 @@ pub trait CastExtension: Debug + Send + Sync { struct 
DefaultExtensionCast {} impl CastExtension for DefaultExtensionCast { - fn can_cast(&self, from: &Field, to: &Field, _options: &CastOptions) -> Result { + fn can_cast_fields(&self, from: &Field, to: &Field) -> Result { Ok(from.data_type() == to.data_type()) } - fn cast( + fn cast_array_fields( &self, value: ArrayRef, _from: &Field, diff --git a/datafusion/physical-expr-adapter/src/schema_rewriter.rs b/datafusion/physical-expr-adapter/src/schema_rewriter.rs index 9fb4950317ff8..f8a393c7912b7 100644 --- a/datafusion/physical-expr-adapter/src/schema_rewriter.rs +++ b/datafusion/physical-expr-adapter/src/schema_rewriter.rs @@ -434,10 +434,12 @@ impl DefaultPhysicalExprAdapterRewriter { // TODO: add optimization to move the cast from the column to literal expressions in the case of `col = 123` // since that's much cheaper to evalaute. // See https://github.com/apache/datafusion/issues/15780#issuecomment-2824716928 + // TODO don't pass None here validate_data_type_compatibility( resolved_column.name(), physical_field.data_type(), logical_field.data_type(), + None ) .map_err(|e| { DataFusionError::Execution(format!( diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 5c49f33540e04..dd6cede80e5f1 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -52,6 +52,7 @@ const DEFAULT_SAFE_CAST_OPTIONS: CastOptions<'static> = CastOptions { /// instead of deferring errors to execution. Handles structs at any nesting level /// (e.g., `List`, `Dictionary<_, Struct>`). 
fn can_cast_named_struct_types(source: &DataType, target: &DataType) -> bool { + // TODO: don't pass None here validate_data_type_compatibility("", source, target, None).is_ok() } @@ -273,7 +274,7 @@ impl PhysicalExpr for CastExpr { let to_field = self.return_field(&batch.schema())?; match value { ColumnarValue::Array(array) => { - Ok(ColumnarValue::Array(cast_extension.cast( + Ok(ColumnarValue::Array(cast_extension.cast_array_fields( array, &from_field, &to_field, @@ -282,7 +283,7 @@ impl PhysicalExpr for CastExpr { } ColumnarValue::Scalar(scalar_value) => { let array = scalar_value.to_array()?; - let array_result = cast_extension.cast( + let array_result = cast_extension.cast_array_fields( array, &from_field, &to_field, diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index a9cfc827234cd..e7bb21c30e5ce 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -24,11 +24,9 @@ use crate::{ expressions::{self, Column, Literal, binary, like, similar_to}, }; -use arrow::compute::CastOptions; use arrow::datatypes::Schema; use datafusion_common::config::ConfigOptions; use datafusion_common::datatype::FieldExt; -use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; use datafusion_common::metadata::{FieldMetadata, format_type_and_metadata}; use datafusion_common::{ DFSchema, Result, ScalarValue, TableReference, ToDFSchema, exec_err, @@ -298,10 +296,6 @@ pub fn create_physical_expr( } Expr::Cast(Cast { expr, field }) => { let (_, src_field) = expr.to_field(input_dfschema)?; - const DEFAULT_CAST_OPTIONS: CastOptions<'static> = CastOptions { - safe: false, - format_options: DEFAULT_FORMAT_OPTIONS, - }; if !field.metadata().is_empty() { if let Some(registry) = &execution_props.extension_types @@ -309,11 +303,7 @@ pub fn create_physical_expr( registry.create_extension_type_for_field(field)? 
{ let cast_extension = extension_type.cast_from()?; - if cast_extension.can_cast( - &src_field, - field, - &DEFAULT_CAST_OPTIONS, - )? { + if cast_extension.can_cast_fields(&src_field, field)? { return expressions::cast_with_extension( create_physical_expr(expr, input_dfschema, execution_props)?, input_schema, @@ -336,7 +326,7 @@ pub fn create_physical_expr( registry.create_extension_type_for_field(&src_field)? { let cast_extension = extension_type.cast_to()?; - if cast_extension.can_cast(&src_field, field, &DEFAULT_CAST_OPTIONS)? { + if cast_extension.can_cast_fields(&src_field, field)? { return expressions::cast_with_extension( create_physical_expr(expr, input_dfschema, execution_props)?, input_schema, From 57c4bb6d810e7eb40b47e665bc68b666fe3d467f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 14:35:12 -0500 Subject: [PATCH 26/39] use the new trait --- .../src/types/canonical_extensions/uuid.rs | 10 +++++----- datafusion/common/src/types/extension.rs | 18 +++--------------- .../physical-expr/src/expressions/cast.rs | 7 +++---- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index 0561d7236522d..db8ec7d0848dd 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -18,7 +18,7 @@ use crate::Result; use crate::cast::{as_fixed_size_binary_array, as_string_array}; use crate::error::{_exec_err, _internal_err}; -use crate::types::CastExtension; +use crate::nested_struct::CastExtension; use crate::types::extension::DFExtensionType; use arrow::array::{ Array, ArrayRef, FixedSizeBinaryArray, StringBuilder, builder::FixedSizeBinaryBuilder, @@ -133,7 +133,7 @@ impl CastExtension for CastFromUuid { fn cast_array_fields( &self, - value: ArrayRef, + value: &ArrayRef, from: &Field, to: &Field, options: &CastOptions, @@ -168,7 +168,7 @@ impl 
CastExtension for CastFromUuid { let string_array = Arc::new(builder.finish()) as ArrayRef; return Ok(cast(&string_array, to.data_type())?); } - DataType::FixedSizeBinary(16) => return Ok(value), + DataType::FixedSizeBinary(16) => return Ok(value.clone()), _ => {} } @@ -186,7 +186,7 @@ impl CastExtension for CastToUuid { fn cast_array_fields( &self, - value: ArrayRef, + value: &ArrayRef, from: &Field, to: &Field, options: &CastOptions, @@ -223,7 +223,7 @@ impl CastExtension for CastToUuid { return Ok(Arc::new(builder.finish())); } // Can implicitly cast from storage - DataType::FixedSizeBinary(16) => return Ok(value), + DataType::FixedSizeBinary(16) => return Ok(value.clone()), _ => {} } diff --git a/datafusion/common/src/types/extension.rs b/datafusion/common/src/types/extension.rs index b97f378dccc07..dc972973bd2f8 100644 --- a/datafusion/common/src/types/extension.rs +++ b/datafusion/common/src/types/extension.rs @@ -16,6 +16,7 @@ // under the License. use crate::error::Result; +use crate::nested_struct::CastExtension; use arrow::array::{Array, ArrayRef}; use arrow::compute::CastOptions; use arrow::util::display::{ArrayFormatter, FormatOptions}; @@ -98,19 +99,6 @@ pub trait DFExtensionType: Debug + Send + Sync { } } -pub trait CastExtension: Debug + Send + Sync { - fn can_cast_fields(&self, from: &Field, to: &Field) -> Result; - - // None for fallback - fn cast_array_fields( - &self, - value: ArrayRef, - from: &Field, - to: &Field, - options: &CastOptions, - ) -> Result; -} - #[derive(Debug)] struct DefaultExtensionCast {} @@ -121,11 +109,11 @@ impl CastExtension for DefaultExtensionCast { fn cast_array_fields( &self, - value: ArrayRef, + value: &ArrayRef, _from: &Field, _to: &Field, _options: &CastOptions, ) -> Result { - Ok(value) + Ok(value.clone()) } } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index dd6cede80e5f1..5f0a1cf87b3d6 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs 
+++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -27,9 +27,8 @@ use arrow::record_batch::RecordBatch; use datafusion_common::datatype::DataTypeExt; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; use datafusion_common::nested_struct::{ - requires_nested_struct_cast, validate_data_type_compatibility, + CastExtension, requires_nested_struct_cast, validate_data_type_compatibility, }; -use datafusion_common::types::CastExtension; use datafusion_common::{Result, ScalarValue, not_impl_err}; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; @@ -275,7 +274,7 @@ impl PhysicalExpr for CastExpr { match value { ColumnarValue::Array(array) => { Ok(ColumnarValue::Array(cast_extension.cast_array_fields( - array, + &array, &from_field, &to_field, &self.cast_options, @@ -284,7 +283,7 @@ impl PhysicalExpr for CastExpr { ColumnarValue::Scalar(scalar_value) => { let array = scalar_value.to_array()?; let array_result = cast_extension.cast_array_fields( - array, + &array, &from_field, &to_field, &self.cast_options, From e970a2c5552d198b118b28def2a59d3642a483e7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 15:20:45 -0500 Subject: [PATCH 27/39] cast ext integration --- datafusion/expr/src/registry.rs | 9 +++ .../src/schema_rewriter.rs | 5 +- .../src/equivalence/properties/dependency.rs | 1 + .../physical-expr/src/expressions/cast.rs | 41 +++++++++++--- datafusion/physical-expr/src/planner.rs | 56 ++----------------- datafusion/physical-plan/src/common.rs | 1 + datafusion/pruning/src/pruning_predicate.rs | 1 + 7 files changed, 53 insertions(+), 61 deletions(-) diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index ed8db80c4463b..cd2a8b58d86bd 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -27,6 +27,7 @@ use arrow_schema::extension::{ Bool8, ExtensionType, FixedShapeTensor, Json, Opaque, TimestampWithOffset, Uuid, 
VariableShapeTensor, }; +use datafusion_common::nested_struct::CastExtension; use datafusion_common::types::{ DFBool8, DFExtensionTypeRef, DFFixedShapeTensor, DFJson, DFOpaque, DFTimestampWithOffset, DFUuid, DFVariableShapeTensor, @@ -346,6 +347,14 @@ pub trait ExtensionTypeRegistry: Debug + Send + Sync { &self, name: &str, ) -> Result>; + + fn cast_extension( + &self, + _source_field: &Field, + _target_field: &Field, + ) -> Option> { + None + } } /// A factory that creates instances of extension types from a storage [`DataType`] and the diff --git a/datafusion/physical-expr-adapter/src/schema_rewriter.rs b/datafusion/physical-expr-adapter/src/schema_rewriter.rs index f8a393c7912b7..d1e04c54f5f63 100644 --- a/datafusion/physical-expr-adapter/src/schema_rewriter.rs +++ b/datafusion/physical-expr-adapter/src/schema_rewriter.rs @@ -434,12 +434,11 @@ impl DefaultPhysicalExprAdapterRewriter { // TODO: add optimization to move the cast from the column to literal expressions in the case of `col = 123` // since that's much cheaper to evalaute. // See https://github.com/apache/datafusion/issues/15780#issuecomment-2824716928 - // TODO don't pass None here validate_data_type_compatibility( resolved_column.name(), physical_field.data_type(), logical_field.data_type(), - None + None // TODO: can we get a cast extension here? ) .map_err(|e| { DataFusionError::Execution(format!( @@ -453,6 +452,7 @@ impl DefaultPhysicalExprAdapterRewriter { Ok(Transformed::yes(Arc::new(CastExpr::new_with_target_field( Arc::new(resolved_column), Arc::new(logical_field.clone()), + None, // TODO: can we get a cast extension here? 
None, )))) } @@ -856,6 +856,7 @@ mod tests { Arc::new(Column::new("data", 0)), logical_field, None, + None, )) as Arc; assert_eq!(result.to_string(), expected.to_string()); diff --git a/datafusion/physical-expr/src/equivalence/properties/dependency.rs b/datafusion/physical-expr/src/equivalence/properties/dependency.rs index 2ebc71559fcf4..d3fa5e0f4b8a0 100644 --- a/datafusion/physical-expr/src/equivalence/properties/dependency.rs +++ b/datafusion/physical-expr/src/equivalence/properties/dependency.rs @@ -942,6 +942,7 @@ mod tests { col_c, Arc::new(Field::new("c", DataType::Date32, true)), None, + None, )) as _; let required_sort = vec![PhysicalSortExpr::new_default(col("c", &schema)?)]; diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 5f0a1cf87b3d6..0f05c7e7a7d55 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -26,6 +26,7 @@ use arrow::datatypes::{DataType, DataType::*, FieldRef, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::datatype::DataTypeExt; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; +use datafusion_common::metadata::format_type_and_metadata; use datafusion_common::nested_struct::{ CastExtension, requires_nested_struct_cast, validate_data_type_compatibility, }; @@ -50,9 +51,12 @@ const DEFAULT_SAFE_CAST_OPTIONS: CastOptions<'static> = CastOptions { /// planning-time validation matches runtime validation, enabling fail-fast behavior /// instead of deferring errors to execution. Handles structs at any nesting level /// (e.g., `List`, `Dictionary<_, Struct>`). 
-fn can_cast_named_struct_types(source: &DataType, target: &DataType) -> bool { - // TODO: don't pass None here - validate_data_type_compatibility("", source, target, None).is_ok() +fn can_cast_named_struct_types( + source: &DataType, + target: &DataType, + cast_extension: Option<&dyn CastExtension>, +) -> bool { + validate_data_type_compatibility("", source, target, cast_extension).is_ok() } /// CAST expression casts an expression to a specific data type and returns a runtime error on invalid cast @@ -110,6 +114,7 @@ impl CastExpr { Self::new_with_target_field( expr, cast_type.into_nullable_field_ref(), + None, cast_options, ) } @@ -126,13 +131,14 @@ impl CastExpr { pub fn new_with_target_field( expr: Arc, target_field: FieldRef, + cast_extension: Option>, cast_options: Option>, ) -> Self { Self { expr, target_field, cast_options: cast_options.unwrap_or(DEFAULT_CAST_OPTIONS), - cast_extension: None, + cast_extension, } } @@ -314,6 +320,7 @@ impl PhysicalExpr for CastExpr { Ok(Arc::new(CastExpr::new_with_target_field( Arc::clone(&children[0]), Arc::clone(&self.target_field), + self.cast_extension.clone(), Some(self.cast_options.clone()), ))) } @@ -365,6 +372,7 @@ pub fn cast_with_options( expr, input_schema, cast_type.into_nullable_field_ref(), + None, cast_options, ) } @@ -381,11 +389,19 @@ pub fn cast_with_target_field( expr: Arc, input_schema: &Schema, target_field: FieldRef, + cast_extension: Option>, cast_options: Option>, ) -> Result> { - let expr_type = expr.data_type(input_schema)?; + let expr_field = expr.return_field(input_schema)?; + if let Some(cast_extension) = cast_extension.as_deref() + && cast_extension.can_cast_fields(&expr_field, target_field.as_ref())? 
+ { + todo!() + } + + let expr_type = expr_field.data_type(); let cast_type = target_field.data_type(); - if expr_type == *cast_type && is_default_target_field(&target_field) { + if expr_type == cast_type && is_default_target_field(&target_field) { return Ok(Arc::clone(&expr)); } @@ -395,18 +411,22 @@ pub fn cast_with_target_field( // applied at planning time (now) to fail fast, rather than deferring errors // to execution time. The name-based casting logic will be executed at runtime // via ColumnarValue::cast_to. - can_cast_named_struct_types(&expr_type, cast_type) + can_cast_named_struct_types(&expr_type, cast_type, cast_extension.as_deref()) } else { can_cast_types(&expr_type, cast_type) }; + let source_fmt = format_type_and_metadata(expr_type, Some(expr_field.metadata())); + let target_fmt = + format_type_and_metadata(target_field.data_type(), Some(expr_field.metadata())); if !can_build_cast { - return not_impl_err!("Unsupported CAST from {expr_type} to {cast_type}"); + return not_impl_err!("Unsupported CAST from {source_fmt} to {target_fmt}"); } Ok(Arc::new(CastExpr::new_with_target_field( expr, target_field, + None, cast_options, ))) } @@ -477,6 +497,7 @@ mod tests { col(column, schema.as_ref())?, Arc::new(target_field), None, + None, ); let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?; @@ -1002,6 +1023,7 @@ mod tests { .with_metadata(metadata.clone()), ), None, + None, ); let field = expr.return_field(&schema)?; @@ -1171,7 +1193,8 @@ mod tests { let literal = Arc::new(crate::expressions::Literal::new(ScalarValue::Struct( Arc::new(scalar_struct), ))); - let expr = CastExpr::new_with_target_field(literal, Arc::new(target_field), None); + let expr = + CastExpr::new_with_target_field(literal, Arc::new(target_field), None, None); let batch = RecordBatch::new_empty(schema); let result = expr.evaluate(&batch)?; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index e7bb21c30e5ce..c0ddf4cd693b2 100644 
--- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -296,62 +296,18 @@ pub fn create_physical_expr( } Expr::Cast(Cast { expr, field }) => { let (_, src_field) = expr.to_field(input_dfschema)?; - - if !field.metadata().is_empty() { - if let Some(registry) = &execution_props.extension_types - && let Some(extension_type) = - registry.create_extension_type_for_field(field)? - { - let cast_extension = extension_type.cast_from()?; - if cast_extension.can_cast_fields(&src_field, field)? { - return expressions::cast_with_extension( - create_physical_expr(expr, input_dfschema, execution_props)?, - input_schema, - field.data_type().clone(), - cast_extension, - ); - } - } - - return plan_err!( - "Cast from {} to {} is not supported", - format_type_and_metadata( - src_field.data_type(), - Some(src_field.metadata()), - ), - format_type_and_metadata(field.data_type(), Some(field.metadata())) - ); - } else if let Some(registry) = &execution_props.extension_types - && let Some(extension_type) = - registry.create_extension_type_for_field(&src_field)? - { - let cast_extension = extension_type.cast_to()?; - if cast_extension.can_cast_fields(&src_field, field)? 
{ - return expressions::cast_with_extension( - create_physical_expr(expr, input_dfschema, execution_props)?, - input_schema, - field.data_type().clone(), - cast_extension, - ); + let cast_extension = + if let Some(extension_types) = &execution_props.extension_types { + extension_types.cast_extension(&src_field, &field) } else { - return plan_err!( - "Cast from {} to {} is not supported", - format_type_and_metadata( - src_field.data_type(), - Some(src_field.metadata()), - ), - format_type_and_metadata( - field.data_type(), - Some(field.metadata()) - ) - ); - } - } + None + }; expressions::cast_with_target_field( create_physical_expr(expr, input_dfschema, execution_props)?, input_schema, Arc::clone(field), + cast_extension, None, ) } diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs index 0dafcf6bd3390..45a14f00c8735 100644 --- a/datafusion/physical-plan/src/common.rs +++ b/datafusion/physical-plan/src/common.rs @@ -163,6 +163,7 @@ pub fn project_plan_to_schema( Arc::new(CastExpr::new_with_target_field( column, Arc::clone(expected_field), + None, // TODO: can we get a cast extension here? None, )) as _ } else { diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index 76cf14be88f5a..4027b03c66c42 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -1126,6 +1126,7 @@ fn rewrite_expr_to_prunable( let left = Arc::new(phys_expr::CastExpr::new_with_target_field( left, Arc::clone(cast.target_field()), + None, // TODO: can we get a CastExtension here? None, )); // PruningPredicate does not support pruning on nested fields yet. 
From b33d2fea4a4de19162fdf5352a79fc9cbc7f8689 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 16:01:23 -0500 Subject: [PATCH 28/39] try to integrat into the cast expr --- datafusion/common/src/nested_struct.rs | 6 ++++ .../src/types/canonical_extensions/mod.rs | 2 +- .../src/types/canonical_extensions/uuid.rs | 4 +-- datafusion/expr/src/registry.rs | 33 ++++++++++++++++--- .../physical-expr/src/expressions/cast.rs | 16 ++++++--- 5 files changed, 49 insertions(+), 12 deletions(-) diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 264fc87fee39a..7d0371f18a62c 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -619,6 +619,12 @@ pub struct VecCastExtension { extensions: Vec>, } +impl VecCastExtension { + pub fn new(extensions: Vec>) -> Self { + Self { extensions } + } +} + impl CastExtension for VecCastExtension { fn can_cast_fields( &self, diff --git a/datafusion/common/src/types/canonical_extensions/mod.rs b/datafusion/common/src/types/canonical_extensions/mod.rs index 2d74d0669d213..38fc03f064eed 100644 --- a/datafusion/common/src/types/canonical_extensions/mod.rs +++ b/datafusion/common/src/types/canonical_extensions/mod.rs @@ -20,7 +20,7 @@ mod fixed_shape_tensor; mod json; mod opaque; mod timestamp_with_offset; -mod uuid; +pub mod uuid; mod variable_shape_tensor; pub use bool8::DFBool8; diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index db8ec7d0848dd..b033f22b9570c 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -114,7 +114,7 @@ impl DisplayIndex for UuidValueDisplayIndex<'_> { } #[derive(Debug)] -struct CastFromUuid {} +pub struct CastFromUuid {} impl CastExtension for CastFromUuid { fn can_cast_fields(&self, _from: &Field, to: &Field) -> Result { @@ -177,7 +177,7 @@ impl 
CastExtension for CastFromUuid { } #[derive(Debug)] -struct CastToUuid {} +pub struct CastToUuid {} impl CastExtension for CastToUuid { fn can_cast_fields(&self, from: &Field, to: &Field) -> Result { diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index cd2a8b58d86bd..9496c0ca4eda9 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -27,7 +27,8 @@ use arrow_schema::extension::{ Bool8, ExtensionType, FixedShapeTensor, Json, Opaque, TimestampWithOffset, Uuid, VariableShapeTensor, }; -use datafusion_common::nested_struct::CastExtension; +use datafusion_common::nested_struct::{CastExtension, VecCastExtension}; +use datafusion_common::types::uuid::{CastFromUuid, CastToUuid}; use datafusion_common::types::{ DFBool8, DFExtensionTypeRef, DFFixedShapeTensor, DFJson, DFOpaque, DFTimestampWithOffset, DFUuid, DFVariableShapeTensor, @@ -440,6 +441,7 @@ impl Debug for ExtensionTypeRegistration { pub struct MemoryExtensionTypeRegistry { /// Holds a mapping between the name of an extension type and its logical type. 
extension_types: Arc>>, + cast_extensions: Arc, } impl Default for MemoryExtensionTypeRegistry { @@ -453,6 +455,7 @@ impl MemoryExtensionTypeRegistry { pub fn new_empty() -> Self { Self { extension_types: Arc::new(RwLock::new(HashMap::new())), + cast_extensions: Arc::new(VecCastExtension::new(vec![])), } } @@ -514,6 +517,11 @@ impl MemoryExtensionTypeRegistry { ), ]; + let cast_extensions = vec![ + Arc::new(CastFromUuid {}) as Arc, + Arc::new(CastToUuid {}) as Arc, + ]; + let mut extension_types = HashMap::new(); for registration in mapping.into_iter() { extension_types.insert(registration.type_name().to_owned(), registration); @@ -521,14 +529,11 @@ impl MemoryExtensionTypeRegistry { Self { extension_types: Arc::new(RwLock::new(HashMap::from(extension_types))), + cast_extensions: Arc::new(VecCastExtension::new(cast_extensions)), } } /// Creates a new [MemoryExtensionTypeRegistry] with the provided `types`. - /// - /// # Errors - /// - /// Returns an error if one of the `types` is a native type. 
pub fn new_with_types( types: impl IntoIterator, ) -> Result { @@ -538,6 +543,7 @@ impl MemoryExtensionTypeRegistry { .collect::>(); Ok(Self { extension_types: Arc::new(RwLock::new(extension_types)), + cast_extensions: Arc::new(VecCastExtension::new(vec![])), }) } @@ -595,12 +601,29 @@ impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { .expect("Extension type registry lock poisoned") .remove(name)) } + + fn cast_extension( + &self, + source_field: &Field, + target_field: &Field, + ) -> Option> { + if self + .cast_extensions + .can_cast_fields(source_field, target_field) + .unwrap_or(false) + { + Some(self.cast_extensions.clone()) + } else { + None + } + } } impl From> for MemoryExtensionTypeRegistry { fn from(value: HashMap) -> Self { Self { extension_types: Arc::new(RwLock::new(value)), + cast_extensions: Arc::new(VecCastExtension::new(vec![])), } } } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 0f05c7e7a7d55..48699b1e09afb 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -393,10 +393,15 @@ pub fn cast_with_target_field( cast_options: Option>, ) -> Result> { let expr_field = expr.return_field(input_schema)?; - if let Some(cast_extension) = cast_extension.as_deref() - && cast_extension.can_cast_fields(&expr_field, target_field.as_ref())? + if let Some(cast_extension_ref) = cast_extension.as_deref() + && cast_extension_ref.can_cast_fields(&expr_field, target_field.as_ref())? { - todo!() + return Ok(Arc::new(CastExpr::new_with_target_field( + expr, + target_field, + cast_extension, + cast_options, + ))); } let expr_type = expr_field.data_type(); @@ -411,7 +416,9 @@ pub fn cast_with_target_field( // applied at planning time (now) to fail fast, rather than deferring errors // to execution time. The name-based casting logic will be executed at runtime // via ColumnarValue::cast_to. 
- can_cast_named_struct_types(&expr_type, cast_type, cast_extension.as_deref()) + // TODO: we can pass the cast extension here if we will end up using it for + // the nested casting + can_cast_named_struct_types(&expr_type, cast_type, None) } else { can_cast_types(&expr_type, cast_type) }; @@ -423,6 +430,7 @@ pub fn cast_with_target_field( return not_impl_err!("Unsupported CAST from {source_fmt} to {target_fmt}"); } + // TODO: pass the cast extension here anyway so that nested casts work Ok(Arc::new(CastExpr::new_with_target_field( expr, target_field, From 1b60f1bd2fb65369aaace6db486d2ce32fbe2a4f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 16:17:44 -0500 Subject: [PATCH 29/39] remove some previous work --- .../physical-expr/src/expressions/cast.rs | 49 ++++++------------- .../physical-expr/src/expressions/mod.rs | 1 - 2 files changed, 14 insertions(+), 36 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 48699b1e09afb..0d78b712facad 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -142,16 +142,6 @@ impl CastExpr { } } - pub fn with_cast_extension( - self, - cast_extension: Option>, - ) -> Self { - Self { - cast_extension, - ..self - } - } - /// The expression to cast pub fn expr(&self) -> &Arc { &self.expr @@ -173,21 +163,22 @@ impl CastExpr { } fn resolved_target_field(&self, input_schema: &Schema) -> Result { - // When using a cast_extension, return the explicit target_field to avoid - // propagating source metadata (e.g., extension type metadata) to the output. 
- if self.cast_extension.is_some() { - return Ok(Arc::clone(&self.target_field)); - } - if is_default_target_field(&self.target_field) { - // TODO: not correct, metadata should not be propagated here self.expr.return_field(input_schema).map(|field| { - Arc::new( - field - .as_ref() - .clone() - .with_data_type(self.cast_type().clone()), - ) + let cast_type = self.cast_type(); + let mut out_field = + field.as_ref().clone().with_data_type(cast_type.clone()); + + // If we modify the storage type we can't ensure that the metadata + // is valid on the target type (e.g., a cast from UUID with extension + // metadata to Utf8 should not result in extension metadata + // on a Utf8 type, which would be invalid and may be rejected by + // consumers). + if field.data_type() != cast_type { + out_field = out_field.with_metadata(Default::default()); + } + + Arc::new(out_field) }) } else { Ok(Arc::clone(&self.target_field)) @@ -451,18 +442,6 @@ pub fn cast( cast_with_options(expr, input_schema, cast_type, None) } -pub fn cast_with_extension( - expr: Arc, - _input_schema: &Schema, - cast_type: DataType, - cast_extension: Arc, -) -> Result> { - Ok(Arc::new( - CastExpr::new(expr, cast_type, Some(DEFAULT_CAST_OPTIONS)) - .with_cast_extension(Some(cast_extension)), - )) -} - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index e44ff3aec4901..7cf874c448ea0 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -59,5 +59,4 @@ pub use not::{NotExpr, not}; pub use try_cast::{TryCastExpr, try_cast}; pub use unknown_column::UnKnownColumn; -pub use cast::cast_with_extension; pub(crate) use cast::cast_with_target_field; From 88d20f82dc37ef316bbea338f271040b85ea790e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 20:02:27 -0500 Subject: [PATCH 30/39] almost with the default extension casting --- 
.../src/types/canonical_extensions/mod.rs | 2 +- .../src/types/canonical_extensions/uuid.rs | 96 ++++----------- datafusion/common/src/types/extension.rs | 115 ++++++++++++++++-- datafusion/expr/src/expr_schema.rs | 29 +++-- datafusion/expr/src/registry.rs | 6 +- .../physical-expr/src/expressions/cast.rs | 7 +- 6 files changed, 147 insertions(+), 108 deletions(-) diff --git a/datafusion/common/src/types/canonical_extensions/mod.rs b/datafusion/common/src/types/canonical_extensions/mod.rs index 38fc03f064eed..2d74d0669d213 100644 --- a/datafusion/common/src/types/canonical_extensions/mod.rs +++ b/datafusion/common/src/types/canonical_extensions/mod.rs @@ -20,7 +20,7 @@ mod fixed_shape_tensor; mod json; mod opaque; mod timestamp_with_offset; -pub mod uuid; +mod uuid; mod variable_shape_tensor; pub use bool8::DFBool8; diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index b033f22b9570c..972e3b36019d0 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -16,12 +16,13 @@ // under the License. 
use crate::Result; -use crate::cast::{as_fixed_size_binary_array, as_string_array}; +use crate::cast::as_string_array; use crate::error::{_exec_err, _internal_err}; use crate::nested_struct::CastExtension; +use crate::types::DefaultExtensionCast; use crate::types::extension::DFExtensionType; use arrow::array::{ - Array, ArrayRef, FixedSizeBinaryArray, StringBuilder, builder::FixedSizeBinaryBuilder, + Array, ArrayRef, FixedSizeBinaryArray, builder::FixedSizeBinaryBuilder, }; use arrow::compute::{CastOptions, cast}; use arrow::datatypes::DataType; @@ -52,6 +53,13 @@ impl DFUuid { ) -> Result { Ok(Self(::try_new(data_type, metadata)?)) } + + pub fn cast_extensions() -> Vec> { + vec![ + Arc::new(DefaultExtensionCast::new(Uuid::NAME)), + Arc::new(ParseUuid), + ] + } } impl DFExtensionType for DFUuid { @@ -81,14 +89,6 @@ impl DFExtensionType for DFUuid { options.safe(), ))) } - - fn cast_from(&self) -> Result> { - Ok(Arc::new(CastToUuid {})) - } - - fn cast_to(&self) -> Result> { - Ok(Arc::new(CastFromUuid {})) - } } /// Pretty printer for binary UUID values. @@ -114,74 +114,24 @@ impl DisplayIndex for UuidValueDisplayIndex<'_> { } #[derive(Debug)] -pub struct CastFromUuid {} +pub struct ParseUuid; -impl CastExtension for CastFromUuid { - fn can_cast_fields(&self, _from: &Field, to: &Field) -> Result { - if to.extension_type_name().is_some() { +impl CastExtension for ParseUuid { + fn can_cast_fields(&self, from: &Field, to: &Field) -> Result { + if from.extension_type_name().is_some() { return Ok(false); } - match to.data_type() { - // Only explicit casts to string - DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => Ok(true), - // Can implicitly cast to storage - DataType::FixedSizeBinary(16) => Ok(true), - _ => Ok(false), - } - } - - fn cast_array_fields( - &self, - value: &ArrayRef, - from: &Field, - to: &Field, - options: &CastOptions, - ) -> Result { - if !self.can_cast_fields(from, to)? 
{ - return _internal_err!("Unhandled cast"); - } - - let storage = as_fixed_size_binary_array(&value)?; - match to.data_type() { - DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { - if options.safe { - return _exec_err!("Cast from string to UUID must be explicit"); - } - - let mut builder = - StringBuilder::with_capacity(storage.len(), storage.len() * 36); - for bytes_opt in storage { - match bytes_opt { - Some(bytes) => { - let bytes16 = Bytes::try_from(bytes).map_err(|e| { - crate::DataFusionError::Execution(e.to_string()) - })?; - let uuid = uuid::Uuid::from_bytes(bytes16); - write!(builder, "{uuid}")?; - builder.append_value(""); - } - None => builder.append_null(), - } - } - - let string_array = Arc::new(builder.finish()) as ArrayRef; - return Ok(cast(&string_array, to.data_type())?); - } - DataType::FixedSizeBinary(16) => return Ok(value.clone()), - _ => {} + if let Some(to_extension_name) = to.extension_type_name() + && to_extension_name == Uuid::NAME + { + Ok(matches!( + from.data_type(), + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 + )) + } else { + Ok(false) } - - _internal_err!("Unexpected difference between can_cast()") - } -} - -#[derive(Debug)] -pub struct CastToUuid {} - -impl CastExtension for CastToUuid { - fn can_cast_fields(&self, from: &Field, to: &Field) -> Result { - CastFromUuid {}.can_cast_fields(to, from) } fn cast_array_fields( diff --git a/datafusion/common/src/types/extension.rs b/datafusion/common/src/types/extension.rs index dc972973bd2f8..a5f0a20966d6c 100644 --- a/datafusion/common/src/types/extension.rs +++ b/datafusion/common/src/types/extension.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::error::Result; +use crate::error::{_exec_err, _internal_err, Result}; +use crate::metadata::format_type_and_metadata; use crate::nested_struct::CastExtension; use arrow::array::{Array, ArrayRef}; use arrow::compute::CastOptions; @@ -89,31 +90,119 @@ pub trait DFExtensionType: Debug + Send + Sync { ) -> Result>> { Ok(None) } +} + +#[derive(Debug)] +pub struct DefaultExtensionCast { + extension_name: &'static str, + instance: Option>, + can_cast_to_storage: bool, + can_cast_from_storage: bool, + use_default_cast_to_string: bool, +} - fn cast_from(&self) -> Result> { - Ok(Arc::new(DefaultExtensionCast {})) +impl DefaultExtensionCast { + pub fn new(extension_name: &'static str) -> Self { + Self { + extension_name, + instance: None, + can_cast_to_storage: true, + can_cast_from_storage: true, + use_default_cast_to_string: false, + } } - fn cast_to(&self) -> Result> { - Ok(Arc::new(DefaultExtensionCast {})) + fn is_cast_to_storage(&self, from: &Field, to: &Field) -> bool { + self.is_this_extension(from) + && !Self::is_any_extension(to) + && to.data_type() == from.data_type() } -} -#[derive(Debug)] -struct DefaultExtensionCast {} + fn is_cast_from_storage(&self, from: &Field, to: &Field) -> bool { + self.is_this_extension(to) + && !Self::is_any_extension(from) + && from.data_type() == to.data_type() + } + + fn is_cast_to_string(&self, from: &Field, to: &Field) -> bool { + self.is_this_extension(from) + && !Self::is_any_extension(to) + && matches!( + to.data_type(), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) + } + + fn is_this_extension(&self, field: &Field) -> bool { + if let Some(from_extension_name) = field.extension_type_name() + && from_extension_name == self.extension_name + { + true + } else { + false + } + } + + fn is_any_extension(field: &Field) -> bool { + field.extension_type_name().is_some() + } + + fn default_cast_to_string( + &self, + _value: &ArrayRef, + _to: &DataType, + ) -> Result { + // Use the array formatter + todo!() + 
} +} impl CastExtension for DefaultExtensionCast { fn can_cast_fields(&self, from: &Field, to: &Field) -> Result { - Ok(from.data_type() == to.data_type()) + if self.can_cast_to_storage && self.is_cast_to_storage(from, to) { + return Ok(true); + } + + if self.can_cast_from_storage && self.is_cast_from_storage(from, to) { + return Ok(true); + } + + if self.use_default_cast_to_string && self.is_cast_to_string(from, to) { + return Ok(true); + } + + Ok(false) } fn cast_array_fields( &self, value: &ArrayRef, - _from: &Field, - _to: &Field, - _options: &CastOptions, + from: &Field, + to: &Field, + options: &CastOptions, ) -> Result { - Ok(value.clone()) + if options.safe { + let from_display = + format_type_and_metadata(from.data_type(), Some(from.metadata())); + let to_display = + format_type_and_metadata(to.data_type(), Some(to.metadata())); + return _exec_err!( + "Can't cast from {from_display} to {to_display} with safe = true" + ); + } + + if self.can_cast_to_storage && self.is_cast_to_storage(from, to) { + return Ok(value.clone()); + } + + if self.can_cast_from_storage && self.is_cast_from_storage(from, to) { + return Ok(value.clone()); + } + + if self.use_default_cast_to_string && self.is_cast_to_string(from, to) { + return self.default_cast_to_string(value, to.data_type()); + } + + _internal_err!("Unhandled cast from {from} to {to} in default extension cast") } } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index c989bab3048ad..5b72cafecaa25 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -73,17 +73,21 @@ pub trait ExprSchemable { /// For `TryCast`, `force_nullable` is `true` since a failed cast returns NULL. 
fn cast_output_field( source_field: &FieldRef, - target_type: &DataType, + target_field: &FieldRef, force_nullable: bool, ) -> Arc { - let mut f = source_field + // Do not propagate metadata through casts because extension metadata (1) + // should be derived from the target_field and (2) source extension metadata + // may become non-sensical if applied to an unrelated storage output type. + let mut f = target_field .as_ref() .clone() - .with_data_type(target_type.clone()) - .with_metadata(source_field.metadata().clone()); + .with_nullable(source_field.is_nullable()); + if force_nullable { f = f.with_nullable(true); } + Arc::new(f) } @@ -594,21 +598,16 @@ impl ExprSchemable for Expr { func.return_field_from_args(args) } - // _ => Ok((self.get_type(schema)?, self.nullable(schema)?)), - Expr::Cast(Cast { expr, field }) => { - expr.to_field(schema).map(|(_table_ref, src)| { - cast_output_field(&src, field.data_type(), false) - }) - } + Expr::Cast(Cast { expr, field }) => expr + .to_field(schema) + .map(|(_table_ref, src)| cast_output_field(&src, field, false)), Expr::Placeholder(Placeholder { id: _, field: Some(field), }) => Ok(Arc::clone(field).renamed(&schema_name)), - Expr::TryCast(TryCast { expr, field }) => { - expr.to_field(schema).map(|(_table_ref, src)| { - cast_output_field(&src, field.data_type(), true) - }) - } + Expr::TryCast(TryCast { expr, field }) => expr + .to_field(schema) + .map(|(_table_ref, src)| cast_output_field(&src, field, true)), Expr::LambdaVariable(LambdaVariable { field: Some(field), .. 
}) => Ok(Arc::clone(field).renamed(&schema_name)), diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index 9496c0ca4eda9..c6ed1f5b1f852 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -28,7 +28,6 @@ use arrow_schema::extension::{ VariableShapeTensor, }; use datafusion_common::nested_struct::{CastExtension, VecCastExtension}; -use datafusion_common::types::uuid::{CastFromUuid, CastToUuid}; use datafusion_common::types::{ DFBool8, DFExtensionTypeRef, DFFixedShapeTensor, DFJson, DFOpaque, DFTimestampWithOffset, DFUuid, DFVariableShapeTensor, @@ -517,10 +516,7 @@ impl MemoryExtensionTypeRegistry { ), ]; - let cast_extensions = vec![ - Arc::new(CastFromUuid {}) as Arc, - Arc::new(CastToUuid {}) as Arc, - ]; + let cast_extensions = DFUuid::cast_extensions(); let mut extension_types = HashMap::new(); for registration in mapping.into_iter() { diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 0d78b712facad..b53533a225e07 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -233,6 +233,7 @@ pub(crate) fn cast_expr_properties( child: &ExprProperties, target_type: &DataType, ) -> Result { + // TODO check the cast extension for this property let unbounded = Interval::make_unbounded(target_type)?; if is_order_preserving_cast_family(&child.range.data_type(), target_type) { Ok(child.clone().with_range(unbounded)) @@ -292,6 +293,8 @@ impl PhysicalExpr for CastExpr { } } } else { + // TODO: this should use the struct casting directly so we can pass on the + // cast extension value.cast_to(self.cast_type(), Some(&self.cast_options)) } } @@ -318,6 +321,7 @@ impl PhysicalExpr for CastExpr { fn evaluate_bounds(&self, children: &[&Interval]) -> Result { // Cast current node's interval to the right type: + // TODO: check the cast extension or cast the interval 
children[0].cast_to(self.cast_type(), &self.cast_options) } @@ -326,6 +330,7 @@ impl PhysicalExpr for CastExpr { interval: &Interval, children: &[&Interval], ) -> Result>> { + // Check cast extension for this let child_interval = children[0]; // Get child's datatype: let cast_type = child_interval.data_type(); @@ -416,7 +421,7 @@ pub fn cast_with_target_field( let source_fmt = format_type_and_metadata(expr_type, Some(expr_field.metadata())); let target_fmt = - format_type_and_metadata(target_field.data_type(), Some(expr_field.metadata())); + format_type_and_metadata(target_field.data_type(), Some(target_field.metadata())); if !can_build_cast { return not_impl_err!("Unsupported CAST from {source_fmt} to {target_fmt}"); } From 19bca5a384a99759a1a7518968a6b2f95628d9f0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 20:26:21 -0500 Subject: [PATCH 31/39] with the cast to --- .../src/types/canonical_extensions/uuid.rs | 7 +-- datafusion/common/src/types/extension.rs | 46 ++++++++++++++++--- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/datafusion/common/src/types/canonical_extensions/uuid.rs b/datafusion/common/src/types/canonical_extensions/uuid.rs index 972e3b36019d0..fe71cb5ff1183 100644 --- a/datafusion/common/src/types/canonical_extensions/uuid.rs +++ b/datafusion/common/src/types/canonical_extensions/uuid.rs @@ -56,7 +56,10 @@ impl DFUuid { pub fn cast_extensions() -> Vec> { vec![ - Arc::new(DefaultExtensionCast::new(Uuid::NAME)), + Arc::new( + DefaultExtensionCast::new(Uuid::NAME) + .with_default_cast_to_string(Some(Arc::new(DFUuid(Uuid)))), + ), Arc::new(ParseUuid), ] } @@ -172,8 +175,6 @@ impl CastExtension for ParseUuid { return Ok(Arc::new(builder.finish())); } - // Can implicitly cast from storage - DataType::FixedSizeBinary(16) => return Ok(value.clone()), _ => {} } diff --git a/datafusion/common/src/types/extension.rs b/datafusion/common/src/types/extension.rs index a5f0a20966d6c..3d575d9936036 100644 --- 
a/datafusion/common/src/types/extension.rs +++ b/datafusion/common/src/types/extension.rs @@ -18,7 +18,7 @@ use crate::error::{_exec_err, _internal_err, Result}; use crate::metadata::format_type_and_metadata; use crate::nested_struct::CastExtension; -use arrow::array::{Array, ArrayRef}; +use arrow::array::{Array, ArrayRef, StringBuilder}; use arrow::compute::CastOptions; use arrow::util::display::{ArrayFormatter, FormatOptions}; use arrow_schema::{DataType, Field}; @@ -112,6 +112,15 @@ impl DefaultExtensionCast { } } + pub fn with_default_cast_to_string( + mut self, + instance: Option>, + ) -> Self { + self.use_default_cast_to_string = true; + self.instance = instance; + self + } + fn is_cast_to_storage(&self, from: &Field, to: &Field) -> bool { self.is_this_extension(from) && !Self::is_any_extension(to) @@ -149,11 +158,34 @@ impl DefaultExtensionCast { fn default_cast_to_string( &self, - _value: &ArrayRef, - _to: &DataType, + value: &ArrayRef, + to: &DataType, ) -> Result { - // Use the array formatter - todo!() + let format_options = FormatOptions::default(); + + // Try to get a custom formatter from the extension type instance, + // otherwise fall back to the default formatter for the storage type + let formatter = if let Some(instance) = &self.instance { + match instance.create_array_formatter(value.as_ref(), &format_options)? { + Some(f) => f, + None => ArrayFormatter::try_new(value.as_ref(), &format_options)?, + } + } else { + ArrayFormatter::try_new(value.as_ref(), &format_options)? + }; + + // Format each value into a string type and cast to the target + let len = value.len(); + let mut builder = StringBuilder::with_capacity(len, len * 16); + for i in 0..len { + if value.is_null(i) { + builder.append_null(); + } else { + builder.append_value(formatter.value(i).to_string()); + } + } + + Ok(arrow::compute::cast(&builder.finish(), to)?) 
} } @@ -192,11 +224,11 @@ impl CastExtension for DefaultExtensionCast { } if self.can_cast_to_storage && self.is_cast_to_storage(from, to) { - return Ok(value.clone()); + return Ok(Arc::clone(value)); } if self.can_cast_from_storage && self.is_cast_from_storage(from, to) { - return Ok(value.clone()); + return Ok(Arc::clone(value)); } if self.use_default_cast_to_string && self.is_cast_to_string(from, to) { From afbc1834c365e415ee03053b66d1d787b1ea79fe Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 20:31:48 -0500 Subject: [PATCH 32/39] maybe fix some tests --- datafusion/core/src/execution/session_state.rs | 2 +- datafusion/sqllogictest/test_files/metadata.slt | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 9ba24bab30dcc..b18104246304a 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1395,7 +1395,7 @@ impl SessionStateBuilder { self } - /// Sets the [`ExtensionTypeRegistry`](datafusion_expr::registry::ExtensionTypeRegistry). 
+ /// Sets the [`ExtensionTypeRegistry`] pub fn with_extension_type_registry( mut self, registry: ExtensionTypeRegistryRef, diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index 3fea8df260f05..bc9bb7b7a28d2 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -218,7 +218,7 @@ FROM table_with_metadata; 2020-09-08 2020-09-08 -# Regression test: CAST should preserve source field metadata +# CAST should not preserve source field metadata query DT SELECT CAST(ts AS DATE) as casted, @@ -229,22 +229,22 @@ FROM table_with_metadata; 2020-09-08 ts non-nullable field 2020-09-08 ts non-nullable field -# Regression test: CAST preserves metadata on integer column +# CAST should not preserve metadata on integer column query IT SELECT CAST(id AS BIGINT) as casted, arrow_metadata(CAST(id AS BIGINT), 'metadata_key') FROM table_with_metadata; ---- -1 the id field -NULL the id field -3 the id field +1 NULL +NULL NULL +3 NULL -# Regression test: CAST with single-argument arrow_metadata (returns full map) +# CAST with single-argument arrow_metadata (returns full map) query ? 
select arrow_metadata(CAST(id AS BIGINT)) from table_with_metadata limit 1; ---- -{metadata_key: the id field} +NULL # Regression test: distinct with cast query D From 079f5056b58af7d1ecf20bf7b4134c878345fa45 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 20:42:21 -0500 Subject: [PATCH 33/39] diff cleanup --- datafusion/datasource/src/url.rs | 3 ++- datafusion/expr/src/registry.rs | 1 - datafusion/ffi/src/session/mod.rs | 9 +++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index 60aa2d909437c..4d7f5bf14c697 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -28,7 +28,8 @@ use futures::{StreamExt, TryStreamExt}; use glob::Pattern; use itertools::Itertools; use log::debug; -use object_store::path::{DELIMITER, Path}; +use object_store::path::DELIMITER; +use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; use url::Url; diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index c6ed1f5b1f852..db41146e36e6e 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -461,7 +461,6 @@ impl MemoryExtensionTypeRegistry { /// Pre-registers the [canonical extension types](https://arrow.apache.org/docs/format/CanonicalExtensions.html) /// in the extension type registry. 
pub fn new_with_canonical_extension_types() -> Self { - // Figure out what happened here let mapping = [ ExtensionTypeRegistration::new_arc( FixedShapeTensor::NAME, diff --git a/datafusion/ffi/src/session/mod.rs b/datafusion/ffi/src/session/mod.rs index e7ccca1190251..dfc9d1c7dfebd 100644 --- a/datafusion/ffi/src/session/mod.rs +++ b/datafusion/ffi/src/session/mod.rs @@ -45,6 +45,11 @@ use datafusion_proto::protobuf::LogicalExprNode; use datafusion_session::Session; use prost::Message; +use stabby::str::Str as SStr; +use stabby::string::String as SString; +use stabby::vec::Vec as SVec; +use tokio::runtime::Handle; + use crate::arrow_wrappers::WrappedSchema; use crate::execution::FFI_TaskContext; use crate::execution_plan::FFI_ExecutionPlan; @@ -56,10 +61,6 @@ use crate::udf::FFI_ScalarUDF; use crate::udwf::FFI_WindowUDF; use crate::util::FFI_Result; use crate::{df_result, sresult, sresult_return}; -use stabby::str::Str as SStr; -use stabby::string::String as SString; -use stabby::vec::Vec as SVec; -use tokio::runtime::Handle; pub mod config; From 0a0c7b83e0d01efb9be4a601cecc6584eba86323 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 20:46:44 -0500 Subject: [PATCH 34/39] maybe fix more errors --- datafusion/expr/src/registry.rs | 2 +- datafusion/sqllogictest/test_files/metadata.slt | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/expr/src/registry.rs b/datafusion/expr/src/registry.rs index db41146e36e6e..848ad0434ea02 100644 --- a/datafusion/expr/src/registry.rs +++ b/datafusion/expr/src/registry.rs @@ -607,7 +607,7 @@ impl ExtensionTypeRegistry for MemoryExtensionTypeRegistry { .can_cast_fields(source_field, target_field) .unwrap_or(false) { - Some(self.cast_extensions.clone()) + Some(Arc::clone(&self.cast_extensions) as Arc) } else { None } diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index bc9bb7b7a28d2..54c83678ea5d1 100644 --- 
a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -225,9 +225,9 @@ SELECT arrow_metadata(CAST(ts AS DATE), 'metadata_key') FROM table_with_metadata; ---- -2020-09-08 ts non-nullable field -2020-09-08 ts non-nullable field -2020-09-08 ts non-nullable field +2020-09-08 ts NULL +2020-09-08 ts NULL +2020-09-08 ts NULL # CAST should not preserve metadata on integer column query IT @@ -244,7 +244,7 @@ NULL NULL query ? select arrow_metadata(CAST(id AS BIGINT)) from table_with_metadata limit 1; ---- -NULL +{} # Regression test: distinct with cast query D From 000ccf6031b95fc7ba04354bccb807134bc4299d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 20:47:44 -0500 Subject: [PATCH 35/39] remove one more metadata test --- datafusion/expr/src/expr_schema.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 5b72cafecaa25..22dcb697dd03d 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -1042,17 +1042,9 @@ mod tests { .with_data_type(DataType::Int32) .with_metadata(meta.clone()); - // col, alias, and cast should be metadata-preserving + // col and alias should be metadata-preserving assert_eq!(meta, expr.metadata(&schema).unwrap()); assert_eq!(meta, expr.clone().alias("bar").metadata(&schema).unwrap()); - assert_eq!( - meta, - expr.clone() - .cast_to(&DataType::Int64, &schema) - .unwrap() - .metadata(&schema) - .unwrap() - ); let schema = DFSchema::from_unqualified_fields( vec![meta.add_to_field(Field::new("foo", DataType::Int32, true))].into(), From 40907ca6e4ddd65e33f30bdf6dce4debc056150e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 20:54:09 -0500 Subject: [PATCH 36/39] one more clippy thing --- datafusion/physical-expr/src/planner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index c0ddf4cd693b2..756fd853fafce 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -298,7 +298,7 @@ pub fn create_physical_expr( let (_, src_field) = expr.to_field(input_dfschema)?; let cast_extension = if let Some(extension_types) = &execution_props.extension_types { - extension_types.cast_extension(&src_field, &field) + extension_types.cast_extension(&src_field, field) } else { None }; From addddbb9498cf362fed7f98c3e1e46369ddcd631 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 21:08:54 -0500 Subject: [PATCH 37/39] benchark fix --- datafusion/sqllogictest/test_files/metadata.slt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index 54c83678ea5d1..1d2cdf494005b 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -225,9 +225,9 @@ SELECT arrow_metadata(CAST(ts AS DATE), 'metadata_key') FROM table_with_metadata; ---- -2020-09-08 ts NULL -2020-09-08 ts NULL -2020-09-08 ts NULL +2020-09-08 NULL +2020-09-08 NULL +2020-09-08 NULL # CAST should not preserve metadata on integer column query IT From 0f186eb9d10581b07b0f7a942d23234366d6cebb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 7 May 2026 21:09:52 -0500 Subject: [PATCH 38/39] more clippy --- datafusion/physical-expr/src/expressions/cast.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index b53533a225e07..189b75e5096e0 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -406,7 +406,7 @@ pub fn cast_with_target_field( return Ok(Arc::clone(&expr)); } - let can_build_cast = 
if requires_nested_struct_cast(&expr_type, cast_type) { + let can_build_cast = if requires_nested_struct_cast(expr_type, cast_type) { // Allow casts involving structs (including nested inside Lists, Dictionaries, // etc.) that pass name-based compatibility validation. This validation is // applied at planning time (now) to fail fast, rather than deferring errors @@ -414,9 +414,9 @@ pub fn cast_with_target_field( // via ColumnarValue::cast_to. // TODO: we can pass the cast extension here if we will end up using it for // the nested casting - can_cast_named_struct_types(&expr_type, cast_type, None) + can_cast_named_struct_types(expr_type, cast_type, None) } else { - can_cast_types(&expr_type, cast_type) + can_cast_types(expr_type, cast_type) }; let source_fmt = format_type_and_metadata(expr_type, Some(expr_field.metadata())); From c449973e4f732cecaeada7182e6b339708006ebd Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 8 May 2026 11:49:19 -0500 Subject: [PATCH 39/39] try a fix --- datafusion/expr/src/expr_rewriter/mod.rs | 70 +++++++++++- .../optimizer/src/analyzer/type_coercion.rs | 1 + datafusion/physical-plan/src/union.rs | 104 ++++++++++++++++++ 3 files changed, 169 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 32a88ab8cf310..5478607994df1 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -22,14 +22,18 @@ use std::collections::HashSet; use std::fmt::Debug; use std::sync::Arc; -use crate::expr::{Alias, Sort, Unnest}; +use arrow::compute::can_cast_types; +use arrow::datatypes::FieldRef; + +use crate::expr::{Alias, Cast, Sort, Unnest}; +use crate::expr_schema::cast_subquery; use crate::logical_plan::Projection; use crate::{Expr, ExprSchemable, LogicalPlan, LogicalPlanBuilder}; use datafusion_common::TableReference; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, 
TransformedResult, TreeNode}; -use datafusion_common::{Column, DFSchema, Result}; +use datafusion_common::{Column, DFSchema, ExprSchema, Result, plan_err}; mod guarantees; pub use guarantees::GuaranteeRewriter; @@ -252,11 +256,14 @@ fn coerce_exprs_for_schema( .into_iter() .enumerate() .map(|(idx, expr)| { - let new_type = dst_schema.field(idx).data_type(); + let dst_field = dst_schema.field(idx); + let new_type = dst_field.data_type(); if new_type != &expr.get_type(src_schema)? { match expr { Expr::Alias(Alias { expr, name, .. }) => { - Ok(expr.cast_to(new_type, src_schema)?.alias(name)) + // Use new_from_field to preserve metadata from dst_schema + Ok(cast_to_field(*expr, Arc::clone(dst_field), src_schema)? + .alias(name)) } #[expect(deprecated)] Expr::Wildcard { .. } => Ok(expr), @@ -267,9 +274,18 @@ fn coerce_exprs_for_schema( // (see: https://github.com/apache/datafusion/issues/18818) Expr::Column(ref column) => { let name = column.name().to_owned(); - Ok(expr.cast_to(new_type, src_schema)?.alias(name)) + // Use new_from_field to preserve metadata from dst_schema + Ok(cast_to_field( + expr, + Arc::clone(dst_field), + src_schema, + )? + .alias(name)) + } + _ => { + // Use new_from_field to preserve metadata from dst_schema + cast_to_field(expr, Arc::clone(dst_field), src_schema) } - _ => Ok(expr.cast_to(new_type, src_schema)?), } } } @@ -280,6 +296,48 @@ fn coerce_exprs_for_schema( .collect::>() } +// TODO: move to `ExprSchemable::cast_to_field`? + +/// Cast an expression to a target field, preserving field metadata. +/// This is similar to `ExprSchemable::cast_to` but uses the full field +/// (including metadata) rather than just the data type. 
+fn cast_to_field( + expr: Expr, + target_field: FieldRef, + schema: &dyn ExprSchema, +) -> Result { + use arrow::datatypes::DataType; + + let this_type = expr.get_type(schema)?; + let cast_to_type = target_field.data_type(); + if &this_type == cast_to_type { + return Ok(expr); + } + + // Special handling for struct-to-struct casts with name-based field matching + let can_cast = match (&this_type, cast_to_type) { + (DataType::Struct(_), DataType::Struct(_)) => { + // Always allow struct-to-struct casts; field matching happens at runtime + true + } + _ => can_cast_types(&this_type, cast_to_type), + }; + + if can_cast { + match expr { + Expr::ScalarSubquery(subquery) => { + Ok(Expr::ScalarSubquery(cast_subquery(subquery, cast_to_type)?)) + } + _ => Ok(Expr::Cast(Cast::new_from_field( + Box::new(expr), + target_field, + ))), + } + } else { + plan_err!("Cannot automatically convert {this_type} to {cast_to_type}") + } +} + /// Recursively un-alias an expressions #[inline] pub fn unalias(expr: Expr) -> Expr { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 7b81feab47a99..f55e86b63d2cb 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1223,6 +1223,7 @@ fn coerce_union_schema_with_schema( ); } + // TODO: this type coercion was causing an issue in one of the benchmark bits // coerce data type and nullability for each field for (union_datatype, union_nullable, union_field_map, plan_field) in izip!( union_datatypes.iter_mut(), diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index ec9ea376e0b6d..e17d363dd93b0 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -1307,4 +1307,108 @@ mod tests { )); Ok(()) } + + #[test] + fn test_union_schema_metadata_preservation() { + use crate::empty::EmptyExec; + use std::collections::HashMap; + + // Create
schemas - one with metadata, one without + let mut metadata = HashMap::new(); + metadata.insert("key".to_string(), "value".to_string()); + + let schema_with_metadata = Arc::new(Schema::new(vec![ + Field::new("name", DataType::Utf8, true).with_metadata(metadata.clone()), + ])); + + let schema_without_metadata = + Arc::new(Schema::new(vec![Field::new("name", DataType::Utf8, true)])); + + // Create two EmptyExec plans with different schemas + let input1 = Arc::new(EmptyExec::new(Arc::clone(&schema_with_metadata))); + let input2 = Arc::new(EmptyExec::new(Arc::clone(&schema_without_metadata))); + + // Test both orderings + let inputs_with_first = vec![ + Arc::clone(&input1) as Arc, + Arc::clone(&input2) as Arc, + ]; + let inputs_without_first = vec![ + Arc::clone(&input2) as Arc, + Arc::clone(&input1) as Arc, + ]; + + // Call union_schema directly + let result1 = union_schema(&inputs_with_first).unwrap(); + let result2 = union_schema(&inputs_without_first).unwrap(); + + // Both should have the metadata + assert!( + !result1.field(0).metadata().is_empty(), + "Expected metadata in result1 (with metadata first), got empty" + ); + assert!( + !result2.field(0).metadata().is_empty(), + "Expected metadata in result2 (without metadata first), got empty" + ); + assert_eq!( + result1.field(0).metadata().get("key"), + Some(&"value".to_string()) + ); + assert_eq!( + result2.field(0).metadata().get("key"), + Some(&"value".to_string()) + ); + } + + #[test] + fn test_union_schema_metadata_with_non_nullable() { + use crate::empty::EmptyExec; + use std::collections::HashMap; + + // Test case that matches the failing test: + // input 0: nonnull_name (NOT nullable, has metadata) + // input 1: NULL::string (nullable, no metadata) + + let mut metadata = HashMap::new(); + metadata.insert("key".to_string(), "value".to_string()); + + // input 0: NOT nullable, has metadata + let schema_with_metadata = Arc::new(Schema::new(vec![ + Field::new( + "name", + DataType::Utf8, + false, // NOT nullable 
+ ) + .with_metadata(metadata.clone()), + ])); + + // input 1: nullable, no metadata + let schema_without_metadata = + Arc::new(Schema::new(vec![Field::new("name", DataType::Utf8, true)])); // nullable + + let input1 = Arc::new(EmptyExec::new(Arc::clone(&schema_with_metadata))); + let input2 = Arc::new(EmptyExec::new(Arc::clone(&schema_without_metadata))); + + let inputs = vec![ + Arc::clone(&input1) as Arc, + Arc::clone(&input2) as Arc, + ]; + + let result = union_schema(&inputs).unwrap(); + + // The result should be nullable (since one input is nullable) and have metadata + assert!( + result.field(0).is_nullable(), + "Expected nullable field in union result" + ); + assert!( + !result.field(0).metadata().is_empty(), + "Expected metadata preserved from non-nullable input, got empty" + ); + assert_eq!( + result.field(0).metadata().get("key"), + Some(&"value".to_string()) + ); + } }