feat: incremental Snapshot update #651
base: main
@@ -166,21 +166,59 @@ impl LogSegment {
             .filter_ok(|x| x.is_commit())
             .try_collect()?;
 
+        // Return a FileNotFound error if there are no new commits.
+        // Callers can then decide how to handle this case.
+        require!(
+            !ascending_commit_files.is_empty(),
+            Error::file_not_found("No new commits.")
+        );
+

Review comment: I think the error should be more like "No commits in range", or "Failed to find commits in the range". "New commits" makes sense for an update operation, but CDF could operate over old commits.

         // - Here check that the start version is correct.
         // - [`LogSegment::try_new`] will verify that the `end_version` is correct if present.
         // - [`LogSegment::try_new`] also checks that there are no gaps between commits.
         // If all three are satisfied, this implies that all the desired commits are present.
         require!(
-            ascending_commit_files
-                .first()
-                .is_some_and(|first_commit| first_commit.version == start_version),
+            // safety: we validated the list is not empty above
+            ascending_commit_files[0].version == start_version,
             Error::generic(format!(
                 "Expected the first commit to have version {}",
                 start_version
             ))
         );
         LogSegment::try_new(ascending_commit_files, vec![], log_root, end_version)
     }
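The "callers can then decide" contract is exercised later in this PR: Snapshot::update treats an empty range as "already up to date" rather than an error. A minimal sketch of that caller-side handling, assuming the names from the diff below (`start_version` and `target_version` are illustrative placeholders):

    // Sketch: a caller that treats "no commits in range" as "nothing to do".
    let log_segment = match LogSegment::for_table_changes(
        fs_client.as_ref(),
        log_root,
        start_version,
        target_version,
    ) {
        Ok(segment) => segment,
        // No new commits: report "not updated" instead of failing.
        Err(Error::FileNotFound(_)) => return Ok(false),
        Err(e) => return Err(e),
    };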
 
+    /// Extends this LogSegment with the contents of another LogSegment.
+    ///
+    /// The other LogSegment must be contiguous with this one, i.e. the end
+    /// version of this LogSegment must be one less than the start version of
+    /// the other LogSegment and contain only commit files.
+    pub(crate) fn extend(&mut self, other: &LogSegment) -> DeltaResult<()> {
+        require!(
+            self.log_root == other.log_root,
+            Error::generic("Cannot merge LogSegments with different log roots")
+        );
+        require!(
+            other.checkpoint_parts.is_empty(),
+            Error::generic("Cannot extend by LogSegments with checkpoint parts")
+        );
+
+        if other.ascending_commit_files.is_empty() {
+            return Ok(());
+        }
+
+        require!(
+            Some(self.end_version + 1) == other.ascending_commit_files.first().map(|f| f.version),
+            Error::generic("Cannot merge non contiguous LogSegments")
+        );
+
+        self.ascending_commit_files
+            .extend(other.ascending_commit_files.iter().cloned());
+        self.end_version = other.end_version;
+
+        Ok(())
+    }

Review comment (on the `other: &LogSegment` parameter): this passes by reference. I'm not sure when we'd need to keep around such a log segment besides extending an existing one. Should we pass ownership and avoid the clone?

Review comment: I'd go a step further -- most of this should probably be inlined at the call site, which eliminates many of the correctness checks that the caller already had to perform (or can prove are redundant for other reasons) (see also my other comment about making an entirely new snapshot instead of mutating an existing one).
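A hedged sketch of the reviewer's by-value suggestion: if extend took `other` by ownership, the commit files could be moved instead of cloned. This signature is hypothetical and not part of the PR; the checks are copied from the diff above:

    // Hypothetical by-value variant: `other` is consumed, so no clone is needed.
    pub(crate) fn extend(&mut self, other: LogSegment) -> DeltaResult<()> {
        require!(
            self.log_root == other.log_root,
            Error::generic("Cannot merge LogSegments with different log roots")
        );
        require!(
            other.checkpoint_parts.is_empty(),
            Error::generic("Cannot extend by LogSegments with checkpoint parts")
        );
        if other.ascending_commit_files.is_empty() {
            return Ok(());
        }
        require!(
            Some(self.end_version + 1) == other.ascending_commit_files.first().map(|f| f.version),
            Error::generic("Cannot merge non contiguous LogSegments")
        );
        // Moves the owned Vec's elements; `.iter().cloned()` is no longer needed.
        self.ascending_commit_files.extend(other.ascending_commit_files);
        self.end_version = other.end_version;
        Ok(())
    }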
 
     /// Read a stream of log data from this log segment.
     ///
     /// The log files will be read from most recent to oldest.
@@ -226,8 +264,11 @@ impl LogSegment {
         Ok(commit_stream.chain(checkpoint_stream))
     }
 
-    // Get the most up-to-date Protocol and Metadata actions
-    pub(crate) fn read_metadata(&self, engine: &dyn Engine) -> DeltaResult<(Metadata, Protocol)> {
+    // Scan the log segment for metadata and protocol actions
+    pub(crate) fn read_metadata_opt(
+        &self,
+        engine: &dyn Engine,
+    ) -> DeltaResult<(Option<Metadata>, Option<Protocol>)> {
         let data_batches = self.replay_for_metadata(engine)?;
         let (mut metadata_opt, mut protocol_opt) = (None, None);
         for batch in data_batches {

@@ -243,7 +284,12 @@ impl LogSegment {
                 break;
             }
         }
-        match (metadata_opt, protocol_opt) {
+        Ok((metadata_opt, protocol_opt))
+    }
+
+    // Get the most up-to-date Protocol and Metadata actions
+    pub(crate) fn read_metadata(&self, engine: &dyn Engine) -> DeltaResult<(Metadata, Protocol)> {
+        match self.read_metadata_opt(engine)? {
             (Some(m), Some(p)) => Ok((m, p)),
             (None, Some(_)) => Err(Error::MissingMetadata),
             (Some(_), None) => Err(Error::MissingProtocol),
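The split gives callers two contracts. A brief sketch, using the signatures from this diff (the segment and engine values are assumed to be in scope):

    // Incremental replay: a segment covering only new commits may contain
    // neither action, and absence simply means "unchanged since the base".
    let (metadata_opt, protocol_opt) = segment.read_metadata_opt(engine)?;

    // Full-snapshot replay: both actions must appear somewhere in the log,
    // so absence becomes MissingMetadata / MissingProtocol.
    let (metadata, protocol) = segment.read_metadata(engine)?;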
@@ -59,7 +59,7 @@ impl Snapshot {
     pub fn try_new(
         table_root: Url,
         engine: &dyn Engine,
-        version: Option<Version>,
+        version: impl Into<Option<Version>>,
     ) -> DeltaResult<Self> {
         let fs_client = engine.get_file_system_client();
         let log_root = table_root.join("_delta_log/")?;
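The impl Into<Option<Version>> bound is purely ergonomic: a bare Version coerces to Some(version), so callers no longer wrap it themselves. A small usage sketch (URL and engine construction assumed):

    // Pin to a specific version: `0` becomes `Some(0)` via `Into`.
    let pinned = Snapshot::try_new(table_url.clone(), &engine, 0)?;
    // Latest version: `None` still works exactly as before.
    let latest = Snapshot::try_new(table_url, &engine, None)?;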
@@ -152,6 +152,64 @@ impl Snapshot {
     pub fn into_scan_builder(self) -> ScanBuilder {
         ScanBuilder::new(self)
     }
 
+    /// Update the `Snapshot` to the target version.
+    ///
+    /// # Parameters
+    /// - `target_version`: desired version of the `Snapshot` after update, defaults to latest.
+    /// - `engine`: Implementation of [`Engine`] apis.
+    ///
+    /// # Returns
+    /// - boolean flag indicating if the `Snapshot` was updated.
+    pub fn update(
+        &mut self,
+        target_version: impl Into<Option<Version>>,
+        engine: &dyn Engine,
+    ) -> DeltaResult<bool> {
+        let fs_client = engine.get_file_system_client();
+        let log_root = self.table_root.join("_delta_log/")?;
+        let log_segment = match LogSegment::for_table_changes(
+            fs_client.as_ref(),
+            log_root,
+            self.version() + 1,
+            target_version,
+        ) {
+            Ok(segment) => segment,
+            Err(Error::FileNotFound(_)) => return Ok(false),
+            Err(e) => return Err(e),
+        };
+
+        let (metadata, protocol) = log_segment.read_metadata_opt(engine)?;
+        if let Some(p) = &protocol {
+            p.ensure_read_supported()?;
+        }
+        let (schema, table_properties) = if let Some(m) = &metadata {
+            let schema = m.parse_schema()?;
+            let table_properties = m.parse_table_properties();
+            let column_mapping_mode = column_mapping_mode(
+                protocol.as_ref().unwrap_or(&self.protocol),
+                &table_properties,
+            );
+            validate_schema_column_mapping(&schema, column_mapping_mode)?;
+            (Some(schema), Some(table_properties))
+        } else {
+            (None, None)
+        };
+
+        // NOTE: we try to extend the log segment first, so that if it fails, we don't update the
+        // snapshot. Otherwise callers might end up with an inconsistent snapshot.
+        self.log_segment.extend(&log_segment)?;
+        if let Some(p) = protocol {
+            self.protocol = p;
+        }
+        if let (Some(m), Some(s), Some(t)) = (metadata, schema, table_properties) {
+            self.metadata = m;
+            self.schema = s;
+            self.table_properties = t;
+        }
+
+        Ok(true)
+    }
 }

Review comments on lines +164 to +165 (the `&mut self` receiver):

Review comment: It's not clear to me that we want a `mut` here. Either way, it seems better to return an entirely new Snapshot? Especially since that makes this code exception safe (an error partway through doesn't leave any visible changes).

Review comment: Also -- if we do take ...

Review comment: yea perhaps we need some discussion on what API we want here? I had a PR that gives a new `Snapshot` ...

Review comments on `p.ensure_read_supported()?`:

Review comment: Should the column mapping mode also be checked in the case only Protocol is updated? ...

Review comment: In fact this is another usecase for my TableConfiguration PR where we do all the validations in one place without having to think about these cases.
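Putting the pieces together, a hedged usage sketch of the new API (table URL and engine are assumed; error handling elided):

    let mut snapshot = Snapshot::try_new(table_url, &engine, None)?;
    // ... later, after new commits land in _delta_log/ ...
    if snapshot.update(None, &engine)? {
        // Log segment, protocol, metadata, schema, and table properties
        // now reflect the newer version.
        println!("updated to version {}", snapshot.version());
    } else {
        // No new commits; the snapshot is unchanged.
    }

The reviewers' alternative would return a fresh Snapshot instead of mutating in place, which makes exception safety automatic: an error partway through leaves the original snapshot untouched.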
 
 #[derive(Debug, Deserialize, Serialize)]
@@ -217,7 +275,7 @@ mod tests {
     use crate::engine::default::filesystem::ObjectStoreFileSystemClient;
     use crate::engine::sync::SyncEngine;
     use crate::path::ParsedLogPath;
-    use crate::schema::StructType;
+    use crate::schema::{DataType, PrimitiveType, StructType};
 
     #[test]
     fn test_snapshot_read_metadata() {
@@ -348,4 +406,35 @@ mod tests {
             3,
         );
     }
 
+    #[test]
+    fn test_snapshot_update() {
+        let path = std::fs::canonicalize(PathBuf::from("./tests/data/type-widening/")).unwrap();
+
+        let location = url::Url::from_directory_path(path).unwrap();
+        let engine = SyncEngine::new();
+        let mut snapshot = Snapshot::try_new(location, &engine, 0).unwrap();
+
+        assert_eq!(snapshot.protocol().min_reader_version(), 1);
+        assert_eq!(snapshot.table_properties.enable_type_widening, None);
+
+        snapshot.update(1, &engine).unwrap();
+
+        assert_eq!(snapshot.protocol().min_reader_version(), 3);
+        assert_eq!(snapshot.table_properties.enable_type_widening, Some(true));
+        assert!(matches!(
+            snapshot.schema().field("int_decimal").unwrap().data_type(),
+            &DataType::Primitive(PrimitiveType::Integer)
+        ));
+
+        snapshot.update(None, &engine).unwrap();
+        assert_eq!(snapshot.protocol().min_reader_version(), 3);
+        assert_eq!(snapshot.table_properties.enable_type_widening, Some(true));
+        assert!(matches!(
+            snapshot.schema().field("int_decimal").unwrap().data_type(),
+            &DataType::Primitive(PrimitiveType::Decimal(11, 1))
+        ));
+
+        // update returns false if there is no new version
+        assert!(!snapshot.update(None, &engine).unwrap());
+    }
 }

Review comment (on `snapshot.update(1, &engine)`): Such a clean n simple api to update! 👌
@@ -137,6 +137,9 @@ pub struct TableProperties {
     /// whether to enable row tracking during writes.
     pub enable_row_tracking: Option<bool>,
 
+    /// whether to enable type widening
+    pub enable_type_widening: Option<bool>,
+
     /// any unrecognized properties are passed through and ignored by the parser
     pub unknown_properties: HashMap<String, String>,
 }

Review comment (on the new doc comment): I don't see ...

Review comment: Looks like it's still an RFC at this point: https://github.com/delta-io/delta/blob/master/protocol_rfcs/type-widening.md. We can totally implement support for it, but details of the feature could technically change before the RFC is merged into the actual spec. I believe the feature name also needs to be ...

Review comment (on `pub enable_type_widening`): The semver check fails here with the message: ... That seems like a legit (tho annoying) callout of a breaking change?

Review comment: yep +1 came here to comment the same thing (this is a breaking change) -- also this seems a tangential change, do we want to include it here?
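For context, a hedged sketch of how an engine might consult the new property once a snapshot is loaded (the table_properties() accessor is assumed here; the tests below use direct field access):

    // "delta.enableTypeWidening" = "true" parses to Some(true); absent keys stay None.
    if snapshot.table_properties().enable_type_widening == Some(true) {
        // Readers must be prepared for columns whose types were widened,
        // e.g. int -> decimal as exercised in test_snapshot_update.
    }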
@@ -268,6 +271,7 @@ mod tests {
             ("delta.tuneFileSizesForRewrites", "true"),
             ("delta.checkpointPolicy", "v2"),
             ("delta.enableRowTracking", "true"),
+            ("delta.enableTypeWidening", "true"),
         ];
         let actual = TableProperties::from(properties.into_iter());
         let expected = TableProperties {

@@ -293,6 +297,7 @@ mod tests {
             tune_file_sizes_for_rewrites: Some(true),
             checkpoint_policy: Some(CheckpointPolicy::V2),
             enable_row_tracking: Some(true),
+            enable_type_widening: Some(true),
             unknown_properties: HashMap::new(),
         };
         assert_eq!(actual, expected);
Review comment (on the split `require!` checks): Splitting it up makes it clearer that we want nonempty 👌