From 1441d0ac1cad5f72ba9eb3c417989bed8fb51487 Mon Sep 17 00:00:00 2001 From: Ryan Eakman <6326532+eakmanrq@users.noreply.github.com> Date: Wed, 22 May 2024 21:28:25 -0700 Subject: [PATCH] feat: properly handle filename duckdb csv (#23) --- sqlframe/duckdb/readwriter.py | 5 +- .../engines/duck/test_duckdb_reader.py | 55 +++++++++++++++++-- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/sqlframe/duckdb/readwriter.py b/sqlframe/duckdb/readwriter.py index 609509b..e357951 100644 --- a/sqlframe/duckdb/readwriter.py +++ b/sqlframe/duckdb/readwriter.py @@ -74,7 +74,10 @@ def load( """ if schema: column_mapping = ensure_column_mapping(schema) - select_columns = [x.expression for x in self._to_casted_columns(column_mapping)] + select_column_mapping = column_mapping.copy() + if options.get("filename"): + select_column_mapping["filename"] = "VARCHAR" + select_columns = [x.expression for x in self._to_casted_columns(select_column_mapping)] if format == "csv": duckdb_columns = ", ".join( [f"'{column}': '{dtype}'" for column, dtype in column_mapping.items()] diff --git a/tests/integration/engines/duck/test_duckdb_reader.py b/tests/integration/engines/duck/test_duckdb_reader.py index 1e487fe..eb4d570 100644 --- a/tests/integration/engines/duck/test_duckdb_reader.py +++ b/tests/integration/engines/duck/test_duckdb_reader.py @@ -17,13 +17,56 @@ def test_employee_extra_line_csv(duckdb_session: DuckDBSession): auto_detect=False, ) assert df.collect() == [ - Row(**{"employee_id": 1, "fname": "Jack", "lname": "Shephard", "age": 37, "store_id": 1}), - Row(**{"employee_id": 2, "fname": "John", "lname": "Locke", "age": 65, "store_id": 1}), - Row(**{"employee_id": 3, "fname": "Kate", "lname": "Austen", "age": 37, "store_id": 2}), Row( - **{"employee_id": 4, "fname": "Claire", "lname": "Littleton", "age": 27, "store_id": 2} + **{ + "employee_id": 1, + "fname": "Jack", + "lname": "Shephard", + "age": 37, + "store_id": 1, + "filename": "tests/fixtures/employee_extra_line.csv", + } + ), + Row( + **{ + "employee_id": 2, + "fname": "John", + "lname": "Locke", + "age": 65, + "store_id": 1, + "filename": "tests/fixtures/employee_extra_line.csv", + } + ), + Row( + **{ + "employee_id": 3, + "fname": "Kate", + "lname": "Austen", + "age": 37, + "store_id": 2, + "filename": "tests/fixtures/employee_extra_line.csv", + } + ), + Row( + **{ + "employee_id": 4, + "fname": "Claire", + "lname": "Littleton", + "age": 27, + "store_id": 2, + "filename": "tests/fixtures/employee_extra_line.csv", + } + ), + Row( + **{ + "employee_id": 5, + "fname": "Hugo", + "lname": "Reyes", + "age": 29, + "store_id": 100, + "filename": "tests/fixtures/employee_extra_line.csv", + } ), - Row(**{"employee_id": 5, "fname": "Hugo", "lname": "Reyes", "age": 29, "store_id": 100}), ] @@ -34,7 +77,7 @@ def test_employee_extra_line_csv_multiple(duckdb_session: DuckDBSession): schema="employee_id INT, fname STRING, lname STRING, age INT, store_id INT", skip=1, header=1, - filename=1, + filename=0, null_padding=True, ignore_errors=1, auto_detect=False,