diff --git a/pyproject.toml b/pyproject.toml
index efa77cfdab..9aa299032e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
   "Programming Language :: Python :: 3.8",
   "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
-  "Programming Language :: Python :: 3.11"
+  "Programming Language :: Python :: 3.11",
 ]
 packages = [
   { include = "pyiceberg" },
@@ -37,7 +37,10 @@ packages = [
   { from = "vendor", include = "hive_metastore" },
   { include = "tests", format = "sdist" },
   { include = "Makefile", format = "sdist" },
-  { include = "NOTICE", format = ["sdist", "wheel"] }
+  { include = "NOTICE", format = [
+    "sdist",
+    "wheel",
+  ] },
 ]
 include = [
   { path = "dev", format = "sdist" },
@@ -62,8 +65,8 @@ pyarrow = { version = ">=9.0.0,<18.0.0", optional = true }
 pandas = { version = ">=1.0.0,<3.0.0", optional = true }
 duckdb = { version = ">=0.5.0,<2.0.0", optional = true }
 ray = [
-  { version = "==2.10.0", python = "<3.9", optional = true},
-  { version = ">=2.10.0,<3.0.0", python = ">=3.9", optional = true}
+  { version = "==2.10.0", python = "<3.9", optional = true },
+  { version = ">=2.10.0,<3.0.0", python = ">=3.9", optional = true },
 ]
 python-snappy = { version = ">=0.6.0,<1.0.0", optional = true }
 thrift = { version = ">=0.13.0,<1.0.0", optional = true }
@@ -599,13 +602,17 @@ markers = [
   "s3: marks a test as requiring access to s3 compliant storage (use with --aws-access-key-id, --aws-secret-access-key, and --endpoint args)",
   "adlfs: marks a test as requiring access to adlfs compliant storage (use with --adlfs.account-name, --adlfs.account-key, and --adlfs.endpoint args)",
   "integration: marks integration tests against Apache Spark",
-  "gcs: marks a test as requiring access to gcs compliant storage (use with --gs.token, --gs.project, and --gs.endpoint)"
+  "gcs: marks a test as requiring access to gcs compliant storage (use with --gs.token, --gs.project, and --gs.endpoint)",
 ]
 
 # Turns a warning into an error
-#filterwarnings = [
-#    "error"
-#]
+filterwarnings = [
+  "error",
+  "ignore:A plugin raised an exception during an old-style hookwrapper teardown.",
+  "ignore:unclosed <socket.socket",
+]
diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py
--- a/tests/integration/test_deletes.py
+++ b/tests/integration/test_deletes.py
@@ ... @@
+@pytest.mark.filterwarnings("ignore:Merge on read is not yet supported, falling back to copy-on-write")
 def ...(...) -> None:
     identifier = "default.table_partitioned_delete"
@@ -175,6 +176,7 @@ def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCa
 
 
 @pytest.mark.integration
+@pytest.mark.filterwarnings("ignore:Merge on read is not yet supported, falling back to copy-on-write")
 def test_delete_partitioned_table_positional_deletes(spark: SparkSession, session_catalog: RestCatalog) -> None:
     identifier = "default.table_partitioned_delete"
 
@@ -223,6 +225,7 @@ def test_delete_partitioned_table_positional_deletes(spark: SparkSession, sessio
 
 
 @pytest.mark.integration
+@pytest.mark.filterwarnings("ignore:Merge on read is not yet supported, falling back to copy-on-write")
 def test_overwrite_partitioned_table(spark: SparkSession, session_catalog: RestCatalog) -> None:
     identifier = "default.table_partitioned_delete"
 
@@ -274,6 +277,7 @@ def test_overwrite_partitioned_table(spark: SparkSession, session_catalog: RestC
 
 
 @pytest.mark.integration
+@pytest.mark.filterwarnings("ignore:Merge on read is not yet supported, falling back to copy-on-write")
 def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSession, session_catalog: RestCatalog) -> None:
     identifier = "default.table_partitioned_delete_sequence_number"
 
diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py
index 9415d7146d..9f63225846 100644
--- a/tests/integration/test_inspect_table.py
+++ b/tests/integration/test_inspect_table.py
@@ -79,7 +79,7 @@ def test_inspect_snapshots(
     identifier = "default.table_metadata_snapshots"
     tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
 
-    tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
     # should produce a DELETE entry
     tbl.overwrite(arrow_table_with_null)
     # Since we don't rewrite, this should produce a new manifest with an ADDED entry
@@ -295,7 +295,7 @@ def test_inspect_refs(
     tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
 
     # write data to create snapshot
-    tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
 
     # create a test branch
     spark.sql(
@@ -667,7 +667,7 @@ def test_inspect_files(
 
     tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
 
-    tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
 
     # append more data
     tbl.append(arrow_table_with_null)
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index 8ea51f4bc7..3aaafa85fe 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -256,7 +256,7 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
     identifier = "default.arrow_data_files"
     tbl = _create_table(session_catalog, identifier, {"format-version": "1"}, [])
 
-    tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
     # should produce a DELETE entry
     tbl.overwrite(arrow_table_with_null)
     # Since we don't rewrite, this should produce a new manifest with an ADDED entry
@@ -288,7 +288,7 @@ def get_current_snapshot_id(identifier: str) -> int:
             .snapshot_id
         )
 
-    tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
     assert tbl.current_snapshot().snapshot_id == get_current_snapshot_id(identifier)  # type: ignore
     tbl.overwrite(arrow_table_with_null)
     assert tbl.current_snapshot().snapshot_id == get_current_snapshot_id(identifier)  # type: ignore
@@ -330,7 +330,7 @@ def test_python_writes_special_character_column_with_spark_reads(
     arrow_table_with_special_character_column = pa.Table.from_pydict(TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN, schema=pa_schema)
     tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema)
 
-    tbl.overwrite(arrow_table_with_special_character_column)
+    tbl.append(arrow_table_with_special_character_column)
     spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas()
     pyiceberg_df = tbl.scan().to_pandas()
     assert spark_df.equals(pyiceberg_df)
@@ -354,7 +354,7 @@ def test_python_writes_dictionary_encoded_column_with_spark_reads(
 
     tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema)
 
-    tbl.overwrite(arrow_table)
+    tbl.append(arrow_table)
     spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas()
     pyiceberg_df = tbl.scan().to_pandas()
     assert spark_df.equals(pyiceberg_df)
@@ -393,7 +393,7 @@ def test_python_writes_with_small_and_large_types_spark_reads(
     arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
     tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema)
 
-    tbl.overwrite(arrow_table)
+    tbl.append(arrow_table)
     spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas()
     pyiceberg_df = tbl.scan().to_pandas()
     assert spark_df.equals(pyiceberg_df)
@@ -429,7 +429,7 @@ def get_data_files_count(identifier: str) -> int:
 
     # writes 1 data file since the table is smaller than default target file size
     assert arrow_table_with_null.nbytes < TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT
-    tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
     assert get_data_files_count(identifier) == 1
 
     # writes 1 data file as long as table is smaller than default target file size
@@ -820,7 +820,7 @@ def test_inspect_snapshots(
     identifier = "default.table_metadata_snapshots"
     tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
 
-    tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
     # should produce a DELETE entry
     tbl.overwrite(arrow_table_with_null)
     # Since we don't rewrite, this should produce a new manifest with an ADDED entry
@@ -979,6 +979,7 @@ def test_table_write_subset_of_schema(session_catalog: Catalog, arrow_table_with
 
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
+@pytest.mark.filterwarnings("ignore:Delete operation did not match any records")
 def test_table_write_out_of_order_schema(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None:
     identifier = "default.test_table_write_out_of_order_schema"
     # rotate the schema fields by 1
@@ -989,6 +990,7 @@ def test_table_write_out_of_order_schema(session_catalog: Catalog, arrow_table_w
     tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=rotated_schema)
 
     tbl.overwrite(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)
     # overwrite and then append should produce twice the data
     assert len(tbl.scan().to_arrow()) == len(arrow_table_with_null) * 2
 