diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py
index 470c00f464..52ff152c93 100644
--- a/pyiceberg/table/inspect.py
+++ b/pyiceberg/table/inspect.py
@@ -17,13 +17,14 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, PartitionFieldSummary
 from pyiceberg.partitioning import PartitionSpec
 from pyiceberg.table.snapshots import Snapshot, ancestors_of
 from pyiceberg.types import PrimitiveType
+from pyiceberg.utils.concurrent import ExecutorFactory
 from pyiceberg.utils.singleton import _convert_to_hashable_type
 
 if TYPE_CHECKING:
@@ -32,6 +33,41 @@
     from pyiceberg.table import Table
 
 
+def get_manifests_schema() -> "pa.Schema":
+    import pyarrow as pa
+
+    partition_summary_schema = pa.struct([
+        pa.field("contains_null", pa.bool_(), nullable=False),
+        pa.field("contains_nan", pa.bool_(), nullable=True),
+        pa.field("lower_bound", pa.string(), nullable=True),
+        pa.field("upper_bound", pa.string(), nullable=True),
+    ])
+
+    manifest_schema = pa.schema([
+        pa.field("content", pa.int8(), nullable=False),
+        pa.field("path", pa.string(), nullable=False),
+        pa.field("length", pa.int64(), nullable=False),
+        pa.field("partition_spec_id", pa.int32(), nullable=False),
+        pa.field("added_snapshot_id", pa.int64(), nullable=False),
+        pa.field("added_data_files_count", pa.int32(), nullable=False),
+        pa.field("existing_data_files_count", pa.int32(), nullable=False),
+        pa.field("deleted_data_files_count", pa.int32(), nullable=False),
+        pa.field("added_delete_files_count", pa.int32(), nullable=False),
+        pa.field("existing_delete_files_count", pa.int32(), nullable=False),
+        pa.field("deleted_delete_files_count", pa.int32(), nullable=False),
+        pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False),
+    ])
+    return manifest_schema
+
+
+def get_all_manifests_schema() -> "pa.Schema":
+    import pyarrow as pa
+
+    all_manifests_schema = get_manifests_schema()
+    all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False))
+    return all_manifests_schema
+
+
 class InspectTable:
     tbl: Table
 
@@ -326,31 +362,9 @@ def update_partitions_map(
             schema=table_schema,
         )
 
-    def manifests(self) -> "pa.Table":
+    def _generate_manifests_table(self, snapshot: Optional[Snapshot], is_all_manifests_table: bool = False) -> "pa.Table":
         import pyarrow as pa
 
-        partition_summary_schema = pa.struct([
-            pa.field("contains_null", pa.bool_(), nullable=False),
-            pa.field("contains_nan", pa.bool_(), nullable=True),
-            pa.field("lower_bound", pa.string(), nullable=True),
-            pa.field("upper_bound", pa.string(), nullable=True),
-        ])
-
-        manifest_schema = pa.schema([
-            pa.field("content", pa.int8(), nullable=False),
-            pa.field("path", pa.string(), nullable=False),
-            pa.field("length", pa.int64(), nullable=False),
-            pa.field("partition_spec_id", pa.int32(), nullable=False),
-            pa.field("added_snapshot_id", pa.int64(), nullable=False),
-            pa.field("added_data_files_count", pa.int32(), nullable=False),
-            pa.field("existing_data_files_count", pa.int32(), nullable=False),
-            pa.field("deleted_data_files_count", pa.int32(), nullable=False),
-            pa.field("added_delete_files_count", pa.int32(), nullable=False),
-            pa.field("existing_delete_files_count", pa.int32(), nullable=False),
-            pa.field("deleted_delete_files_count", pa.int32(), nullable=False),
-            pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False),
-        ])
-
         def _partition_summaries_to_rows(
             spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary]
         ) -> List[Dict[str, Any]]:
@@ -386,11 +400,11 @@ def _partition_summaries_to_rows(
 
         specs = self.tbl.metadata.specs()
         manifests = []
-        if snapshot := self.tbl.metadata.current_snapshot():
+        if snapshot:
             for manifest in snapshot.manifests(self.tbl.io):
                 is_data_file = manifest.content == ManifestContent.DATA
                 is_delete_file = manifest.content == ManifestContent.DELETES
-                manifests.append({
+                manifest_row = {
                     "content": manifest.content,
                     "path": manifest.manifest_path,
                     "length": manifest.manifest_length,
@@ -405,13 +419,19 @@ def _partition_summaries_to_rows(
                     "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
                     if manifest.partitions
                     else [],
-                })
+                }
+                if is_all_manifests_table:
+                    manifest_row["reference_snapshot_id"] = snapshot.snapshot_id
+                manifests.append(manifest_row)
 
         return pa.Table.from_pylist(
             manifests,
-            schema=manifest_schema,
+            schema=get_all_manifests_schema() if is_all_manifests_table else get_manifests_schema(),
         )
 
+    def manifests(self) -> "pa.Table":
+        return self._generate_manifests_table(self.tbl.current_snapshot())
+
     def metadata_log_entries(self) -> "pa.Table":
         import pyarrow as pa
 
@@ -586,3 +606,16 @@ def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
 
     def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
         return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES})
+
+    def all_manifests(self) -> "pa.Table":
+        import pyarrow as pa
+
+        snapshots = self.tbl.snapshots()
+        if not snapshots:
+            return pa.Table.from_pylist([], schema=get_all_manifests_schema())
+
+        executor = ExecutorFactory.get_or_create()
+        manifests_by_snapshots: Iterator["pa.Table"] = executor.map(
+            lambda args: self._generate_manifests_table(*args), [(snapshot, True) for snapshot in snapshots]
+        )
+        return pa.concat_tables(manifests_by_snapshots)
diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py
index 68b10f3262..6eb630eb27 100644
--- a/tests/integration/test_inspect_table.py
+++ b/tests/integration/test_inspect_table.py
@@ -846,3 +846,95 @@ def inspect_files_asserts(df: pa.Table) -> None:
     inspect_files_asserts(files_df)
     inspect_files_asserts(data_files_df)
     inspect_files_asserts(delete_files_df)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_all_manifests(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
+    identifier = "default.table_metadata_all_manifests"
+    try:
+        session_catalog.drop_table(identifier=identifier)
+    except NoSuchTableError:
+        pass
+
+    spark.sql(
+        f"""
+        CREATE TABLE {identifier} (
+            id int,
+            data string
+        )
+        PARTITIONED BY (data)
+        TBLPROPERTIES ('write.update.mode'='merge-on-read',
+                       'write.delete.mode'='merge-on-read')
+    """
+    )
+    tbl = session_catalog.load_table(identifier)
+
+    # check all_manifests when there are no snapshots
+    lhs = tbl.inspect.all_manifests().to_pandas()
+    rhs = spark.table(f"{identifier}.all_manifests").toPandas()
+    assert lhs.empty
+    assert rhs.empty
+
+    spark.sql(f"INSERT INTO {identifier} VALUES (1, 'a')")
+
+    spark.sql(f"INSERT INTO {identifier} VALUES (2, 'b')")
+
+    spark.sql(f"UPDATE {identifier} SET data = 'c' WHERE id = 1")
+
+    spark.sql(f"DELETE FROM {identifier} WHERE id = 2")
+
+    spark.sql(f"INSERT OVERWRITE {identifier} VALUES (1, 'a')")
+
+    df = tbl.inspect.all_manifests()
+
+    assert df.column_names == [
+        "content",
+        "path",
+        "length",
+        "partition_spec_id",
+        "added_snapshot_id",
+        "added_data_files_count",
+        "existing_data_files_count",
+        "deleted_data_files_count",
+        "added_delete_files_count",
+        "existing_delete_files_count",
+        "deleted_delete_files_count",
+        "partition_summaries",
+        "reference_snapshot_id",
+    ]
+
+    int_cols = [
+        "content",
+        "length",
+        "partition_spec_id",
+        "added_snapshot_id",
+        "added_data_files_count",
+        "existing_data_files_count",
+        "deleted_data_files_count",
+        "added_delete_files_count",
+        "existing_delete_files_count",
+        "deleted_delete_files_count",
+        "reference_snapshot_id",
+    ]
+
+    for column in int_cols:
+        for value in df[column]:
+            assert isinstance(value.as_py(), int)
+
+    for value in df["path"]:
+        assert isinstance(value.as_py(), str)
+
+    for value in df["partition_summaries"]:
+        assert isinstance(value.as_py(), list)
+        for row in value:
+            assert isinstance(row["contains_null"].as_py(), bool)
+            assert isinstance(row["contains_nan"].as_py(), (bool, type(None)))
+            assert isinstance(row["lower_bound"].as_py(), (str, type(None)))
+            assert isinstance(row["upper_bound"].as_py(), (str, type(None)))
+
+    lhs = spark.table(f"{identifier}.all_manifests").toPandas()
+    rhs = df.to_pandas()
+    for column in df.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            assert left == right, f"Difference in column {column}: {left} != {right}"